Spaces:
Sleeping
Sleeping
File size: 4,979 Bytes
b47e16a e1f36d5 b47e16a e1f36d5 b47e16a 6bf9f6c b47e16a 7c83e84 4165a1d b47e16a ee1dc7d b47e16a 9178174 b47e16a e1f36d5 b47e16a e1f36d5 b47e16a e1f36d5 b47e16a e1f36d5 b47e16a 6bf9f6c b47e16a 6bf9f6c b47e16a 6bf9f6c b47e16a e1f36d5 b47e16a 6bf9f6c b47e16a 6bf9f6c b47e16a 6bf9f6c b47e16a 6bf9f6c b47e16a 18555fc b47e16a e1f36d5 b47e16a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import gradio as gr
import os
from datasets import load_dataset, Dataset
import pandas as pd
from huggingface_hub import login
def load_huggingface_dataset(dataset_link, token):
    """
    Load the "train" split of a Hugging Face dataset for editing.

    Args:
        dataset_link: Dataset identifier handed to ``load_dataset``
            (for example a Hub repository id).
        token: Hugging Face access token used to authenticate the download.
            An empty value is treated as "no explicit token".

    Returns:
        A ``(table, dataset)`` tuple. ``table`` is a DataFrame holding only
        the ``index`` column (row position produced by ``reset_index``) and
        the ``transcription`` column; ``dataset`` is the loaded dataset
        object itself.

    Raises:
        KeyError: If the loaded split has no ``transcription`` column.
    """
    # Forward the token so private or gated datasets can be read. The UI
    # textbox yields "" when left blank, which is normalized to None here.
    dataset = load_dataset(dataset_link, split="train", token=token or None)

    # reset_index() turns the positional row index into an explicit "index"
    # column, which the editor later uses to match edited rows back.
    df = dataset.to_pandas().reset_index()
    return df[["index", "transcription"]], dataset
def update_transcriptions(df, dataset, token, dataset_link):
    """
    Write edited transcriptions into the dataset and push it to the Hub.

    Rows are matched by position: row ``i`` of ``df`` supplies the new
    transcription for example ``i`` of ``dataset``. Examples whose
    transcription is empty, whitespace-only or missing are dropped before
    the push.

    Args:
        df: DataFrame with a ``transcription`` column, in dataset row order.
        dataset: Dataset object to update; must support ``map``, ``filter``
            and ``push_to_hub``.
        token: Hugging Face access token passed to ``login``.
        dataset_link: Hub repository id the updated dataset is pushed to.

    Returns:
        A confirmation message string.

    Raises:
        KeyError: If ``df`` has no ``transcription`` column.
        ValueError: If ``df`` has fewer rows than ``dataset``.
    """
    # Materialize the edited column exactly once. Indexing a converted
    # Dataset column inside the map callback would rebuild the whole column
    # for every row. Missing cells (None/NaN) become "" so the filter below
    # removes them instead of crashing on `.strip()`.
    edited = [
        "" if pd.isna(value) else str(value)
        for value in df["transcription"]
    ]

    if len(edited) < len(dataset):
        raise ValueError(
            f"Expected at least {len(dataset)} transcriptions, got {len(edited)}."
        )

    # Replace the original transcription column, row by row.
    dataset = dataset.map(
        lambda example, idx: {"transcription": edited[idx]},
        with_indices=True,
    )

    # Keep only examples with a non-empty transcription.
    dataset = dataset.filter(
        lambda example: example["transcription"].strip() != "",
    )

    # NOTE(review): `login` presumably stores the credential beyond this
    # call; consider passing the token to `push_to_hub` directly - confirm
    # against the huggingface_hub documentation before changing.
    login(token)
    dataset.push_to_hub(dataset_link)
    return "Dataset updated and changes submitted to the Hugging Face Hub!"
# Gradio Interface
def main():
    """
    Build and launch the Gradio UI for editing dataset transcriptions.

    The loaded dataset and its editable DataFrame are kept in closure
    variables that every event handler below reads and/or rebinds.
    NOTE(review): this state is therefore shared by all handler calls;
    confirm whether per-session state is needed for multi-user use.
    """
    dataset = None  # Loaded dataset object, set by the "Load Dataset" handler
    original_df = None  # Full (unfiltered) table; search results are views of it

    def load_dataset_and_show_table(dataset_link, token):
        """
        Load the dataset and return the DataFrame to display in Gradio.
        """
        nonlocal dataset, original_df
        original_df, dataset = load_huggingface_dataset(dataset_link, token)
        return original_df

    def search_transcriptions(search_term):
        """
        Return the rows whose transcription contains the search term.

        The match is a case-insensitive literal substring test. Rows with a
        missing transcription never match. An empty table is returned when
        no dataset has been loaded yet.
        """
        if original_df is None:
            return pd.DataFrame(columns=["index", "transcription"])
        # regex=False: the term is user-typed text, not a pattern. Treating
        # it as a regex makes inputs such as "(" or "*" raise re.error.
        matches = original_df["transcription"].str.contains(
            search_term or "", case=False, na=False, regex=False
        )
        return original_df[matches]

    def update_original(df):
        """
        Merge the (possibly filtered) edited table back into the full table.

        Rows are matched on the ``index`` column, so a filtered view can be
        merged without disturbing rows that are not shown.
        """
        if original_df is None:
            return "No dataset loaded to update."
        for row_id, text in zip(df["index"], df["transcription"]):
            # Locate the row in the full table with the same index value.
            original_df.loc[original_df["index"] == row_id, "transcription"] = text
        return "update Successful"

    def submit_changes(df, token, dataset_link):
        """
        Submit updated changes to the Hugging Face Hub.
        """
        if dataset is None:
            return "No dataset loaded to update."
        print(len(dataset))
        print(len(df))
        if len(df) < len(dataset):
            # The table shows a filtered subset: merge it into the full table
            # first so the positional update covers every dataset row.
            update_original(df)
            return update_transcriptions(original_df, dataset, token, dataset_link)
        return update_transcriptions(df, dataset, token, dataset_link)

    # Gradio Interface
    with gr.Blocks(css=".dataframe-row { height: 200px; }") as interface:
        gr.Markdown("## Hugging Face Audio Dataset Editor")

        # Input fields for dataset link and token
        dataset_link = gr.Textbox(label="Hugging Face Dataset Link")
        hf_token = gr.Textbox(label="Hugging Face Token", type="password")

        # Button to load dataset
        load_button = gr.Button("Load Dataset")

        # Search bar
        search_box = gr.Textbox(label="Search Transcriptions", placeholder="Enter a search term...")

        # Table to display and edit dataset
        table = gr.Dataframe(
            headers=["index", "transcription"],
            datatype=["number", "str"],
            interactive=True,
            label="Edit Dataset (Transcriptions are RTL)",
        )
        update_button = gr.Button("Update Table")

        # Button to submit changes
        submit_button = gr.Button("Submit Changes")
        update_message = gr.Textbox(label="update message")
        output_message = gr.Textbox(label="Message")

        # RTL styling for transcription column
        # NOTE(review): kept from the original; confirm that assigning to
        # `table.style` actually affects rendering in the Gradio version used.
        table.style = {"transcription": {"direction": "rtl"}}

        # Button functionality
        load_button.click(load_dataset_and_show_table, [dataset_link, hf_token], table)
        search_box.change(search_transcriptions, search_box, table)
        update_button.click(update_original, [table], update_message)
        submit_button.click(submit_changes, [table, hf_token, dataset_link], output_message)

    # Launch Gradio Interface
    interface.launch()
# Start the editor only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|