Nash-pAnDiTa committed on
Commit
b47e16a
·
verified ·
1 Parent(s): f9a3149

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -65
app.py CHANGED
@@ -1,77 +1,110 @@
 
1
  import gradio as gr
 
 
2
  import pandas as pd
3
- from datasets import load_dataset, Dataset, Audio
4
  from huggingface_hub import login
5
 
6
# Module-level state shared by the Gradio callbacks in this file.
editable_df = pd.DataFrame()  # table currently being edited; empty until a dataset is loaded
dataset_name = ""  # repository name used when pushing the edited data back
hub_token = ""  # Hub token entered in the UI
10
-
11
-
12
def load_hf_dataset(dataset_url, token):
    """Log in to the Hub, load the dataset's ``train`` split and return it as a DataFrame.

    Args:
        dataset_url: Repository id of the dataset, e.g. ``"username/dataset_name"``.
        token: Hugging Face Hub token used for authentication.

    Returns:
        The ``train`` split as a pandas DataFrame. The same object is stored in
        the module-level ``editable_df`` so the other callbacks can edit it.
    """
    global editable_df, dataset_name, hub_token
    # Keep the full "owner/name" repository id. Keeping only the last path
    # segment would make the later push target a different repository than
    # the one that was loaded.
    dataset_name = dataset_url.strip().strip("/")
    hub_token = token

    # Authenticate first so that private datasets can be loaded.
    login(token)
    dataset = load_dataset(dataset_name)
    editable_df = pd.DataFrame(dataset["train"])
    return editable_df
23
-
24
def update_row(row_index, column_name, new_value):
    """Update a single cell of the module-level ``editable_df``.

    Args:
        row_index: Zero-based position of the row to change.
        column_name: Name of the column to change.
        new_value: Replacement text; double quotes are stripped before storing.

    Returns:
        ``editable_df`` after the update. The frame is returned unchanged when
        the row position is out of range (including negative values) or the
        column does not exist.
    """
    try:
        position = int(row_index)
    except (TypeError, ValueError):
        return editable_df

    # The lower bound matters: a negative label passed to ``.at`` would
    # append a brand-new row instead of editing an existing one.
    if 0 <= position < len(editable_df) and column_name in editable_df.columns:
        row_label = editable_df.index[position]
        editable_df.at[row_label, column_name] = new_value.replace('"', '')
    return editable_df
31
-
32
def save_and_upload():
    """Push the edited ``editable_df`` back to the Hugging Face Hub.

    Returns:
        A status message: either a notice that nothing is loaded, or a
        confirmation naming the repository that was pushed to.
    """
    # Nothing has been loaded yet (or the table is empty): do not push.
    if editable_df.empty:
        return "No dataset loaded; nothing to upload."

    # Convert DataFrame to Dataset
    updated_dataset = Dataset.from_pandas(editable_df)

    # Only restore the audio feature when the dataset actually has that
    # column; casting a missing column would raise.
    if "audio" in updated_dataset.column_names:
        updated_dataset = updated_dataset.cast_column("audio", Audio(sampling_rate=16000))

    # Push updated dataset to Hugging Face
    updated_dataset.push_to_hub(dataset_name, token=hub_token)
    return f"Updated dataset successfully pushed to: {dataset_name}"
43
-
44
def handle_row_selection(selected_row, evt: gr.SelectData):
    """Return the clicked row position and that row's transcription.

    ``selected_row`` is the table value handed over by Gradio and ``evt``
    carries the selection; the first element of ``evt.index`` is used as the
    row position.
    """
    clicked_row = evt.index[0]
    current_text = selected_row.transcription[clicked_row]
    return clicked_row, current_text
48
-
49
- # Gradio interface
50
# NOTE(review): the nesting below was reconstructed from a flattened diff view;
# confirm which widgets belong inside each gr.Row().
with gr.Blocks() as app:
    gr.Markdown("### Hugging Face Dataset Editor")

    # Dataset location and credentials.
    with gr.Row():
        dataset_url_input = gr.Textbox(
            label="Dataset URL",
            placeholder="username/dataset_name",
        )
        token_input = gr.Textbox(
            label="Hub Token",
            placeholder="Enter your Hugging Face Hub token",
            type="password",
        )
        load_btn = gr.Button("Load Dataset")

    # Table showing the current contents of the loaded dataset.
    data_table = gr.DataFrame(value=editable_df)

    # Single-cell editor: row and column are read-only, only the value is typed.
    with gr.Row():
        row_input = gr.Number(
            label="Row Index",
            value=0,
            precision=0,
            interactive=False,
        )
        col_input = gr.Text(
            label="Column Name",
            value="transcription",
            interactive=False,
        )
        new_value_input = gr.Text(
            label="New Value",
            value="new_value",
            interactive=True,
        )
        update_btn = gr.Button("Update Row")

    # Selecting a cell fills in the row index and the current transcription.
    data_table.select(
        fn=handle_row_selection,
        inputs=data_table,
        outputs=[row_input, new_value_input],
    )

    save_btn = gr.Button("Save and Upload")
    status_output = gr.Textbox(label="Status", interactive=False)

    # Button wiring.
    load_btn.click(
        fn=load_hf_dataset,
        inputs=[dataset_url_input, token_input],
        outputs=data_table,
    )
    update_btn.click(
        fn=update_row,
        inputs=[row_input, col_input, new_value_input],
        outputs=data_table,
    )
    save_btn.click(fn=save_and_upload, outputs=status_output)

app.launch(share=True)
 
 
1
+
2
  import gradio as gr
3
+ import os
4
+ from datasets import load_dataset, Dataset
5
  import pandas as pd
 
6
  from huggingface_hub import login
7
 
8
def load_huggingface_dataset(dataset_link, token):
    """Load the ``train`` split of a Hub dataset and build the editable table.

    Args:
        dataset_link: Repository id passed straight to ``load_dataset``.
        token: Hugging Face Hub token. When non-empty it is used to log in
            before loading, so private datasets can be read.

    Returns:
        A ``(df, dataset)`` tuple. ``df`` is a DataFrame with an ``index``
        column (row position) and a ``transcription`` column; ``dataset`` is
        the loaded split itself.

    Raises:
        KeyError: If the split has no ``transcription`` column.
    """
    # Authenticate up front; without this the token argument was unused and
    # loading relied on whatever credentials happened to be cached.
    if token:
        login(token)

    # Load the dataset
    dataset = load_dataset(dataset_link, split="train")

    if "transcription" not in dataset.column_names:
        raise KeyError(
            f"Dataset '{dataset_link}' has no 'transcription' column "
            f"(available columns: {dataset.column_names})"
        )

    # Build the table from the text column only, instead of converting every
    # column of the split to pandas just to display one of them.
    transcriptions = list(dataset["transcription"])
    df = pd.DataFrame(
        {
            "index": range(len(transcriptions)),
            "transcription": transcriptions,
        }
    )
    return df, dataset
25
 
26
def update_transcriptions(df, dataset, token, dataset_link):
    """Write the edited transcriptions into ``dataset`` and push it to the Hub.

    Args:
        df: Edited table; its ``transcription`` column is applied row by row,
            in order, to ``dataset``.
        dataset: The dataset object the table was built from.
        token: Hugging Face Hub token used to log in before pushing.
        dataset_link: Repository id to push the updated dataset to.

    Returns:
        A status message. If the table has no ``transcription`` column, or its
        row count differs from the dataset's, an explanatory message is
        returned and nothing is uploaded.
    """
    if "transcription" not in df.columns:
        return "The edited table has no 'transcription' column; nothing was uploaded."

    # Read the edited column once. Looking the column up inside the map
    # callback would re-read it for every row.
    # Missing cells (NaN) are stored as None.
    new_transcriptions = [
        None if pd.isna(value) else value
        for value in df["transcription"].tolist()
    ]

    # Rows are matched by position, so added or deleted table rows would
    # silently misalign (or overrun) the dataset.
    if len(new_transcriptions) != len(dataset):
        return (
            f"Row count mismatch: the table has {len(new_transcriptions)} rows "
            f"but the dataset has {len(dataset)}; nothing was uploaded."
        )

    # Replace the original transcription column in the dataset
    dataset = dataset.map(
        lambda example, idx: {"transcription": new_transcriptions[idx]},
        with_indices=True,
    )

    login(token)
    dataset.push_to_hub(dataset_link)

    return "Dataset updated and changes submitted to the Hugging Face Hub!"
 
 
 
 
48
 
49
+ # Gradio Interface
50
def main():
    """Build the Gradio transcription editor and launch it."""
    # Dataset object from the most recent "Load Dataset" click. It lives in
    # this closure (not at module level) and is rebound via ``nonlocal``.
    # NOTE(review): a single closure variable is presumably shared by all
    # browser sessions; confirm whether per-session state (gr.State) is needed.
    dataset = None

    def load_dataset_and_show_table(dataset_link, token):
        """
        Load the dataset and return the DataFrame to display in Gradio.
        """
        nonlocal dataset
        df, dataset = load_huggingface_dataset(dataset_link, token)
        return df

    def submit_changes(df, token, dataset_link):
        """
        Submit updated changes to the Hugging Face Hub.
        """
        if dataset is None:
            return "No dataset loaded to update."

        # The token is a secret: it must never be printed or logged.
        return update_transcriptions(df, dataset, token, dataset_link)

    # Gradio Interface
    with gr.Blocks(css=".dataframe-row { height: 200px; }") as interface:
        gr.Markdown("## Hugging Face Audio Dataset Editor")

        # Input fields for dataset link and token
        dataset_link = gr.Textbox(label="Hugging Face Dataset Link")
        hf_token = gr.Textbox(label="Hugging Face Token", type="password")

        # Button to load dataset
        load_button = gr.Button("Load Dataset")

        # Table to display and edit dataset
        table = gr.Dataframe(
            headers=["index", "transcription"],
            datatype=["number", "str"],
            interactive=True,
            label="Edit Dataset (Transcriptions are RTL)",
        )

        # Button to submit changes
        submit_button = gr.Button("Submit Changes")
        output_message = gr.Textbox(label="Message")

        # RTL styling for transcription column
        # NOTE(review): this only sets an attribute on the component; confirm
        # it actually affects rendering, otherwise move the rule into `css`.
        table.style = {"transcription": {"direction": "rtl"}}

        # Button functionality
        load_button.click(load_dataset_and_show_table, [dataset_link, hf_token], table)
        submit_button.click(submit_changes, [table, hf_token, dataset_link], output_message)

    # Launch Gradio Interface
    interface.launch(share=True)
106
+
107
+
108
 
109
# Start the editor only when this file is executed as a script.
if __name__ == "__main__":
    main()