import os

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import login

_NO_DATASET_MESSAGE = "No dataset loaded to update."


def load_huggingface_dataset(dataset_link, token):
    """Load the ``train`` split of a Hub dataset and expose its transcriptions.

    Args:
        dataset_link: Dataset identifier handed to ``load_dataset``.
        token: Hugging Face access token. An empty value is passed as
            ``None`` so the library uses its default credential lookup.

    Returns:
        A ``(table, dataset)`` tuple: ``table`` is a DataFrame with the
        columns ``index`` (row position in the split) and ``transcription``;
        ``dataset`` is the loaded ``Dataset`` object itself.
    """
    # Forward the token so private/gated datasets can be loaded.
    dataset = load_dataset(dataset_link, split="train", token=token or None)

    # reset_index() materialises the row position as an "index" column, which
    # is later used as the key when merging edits back.
    df = dataset.to_pandas().reset_index()
    return df[["index", "transcription"]], dataset


def update_transcriptions(df, dataset, token, dataset_link):
    """Write edited transcriptions into the dataset and push it to the Hub.

    Rows of ``df`` are matched to rows of ``dataset`` by position, so both
    must have the same length and order. Rows whose transcription is empty
    or whitespace-only after the update are dropped before pushing.

    Args:
        df: DataFrame with a ``transcription`` column, one row per dataset row.
        dataset: The ``Dataset`` whose ``transcription`` column is replaced.
        token: Hugging Face access token used for the push only.
        dataset_link: Repository id the dataset is pushed to.

    Returns:
        A human-readable status message.
    """
    if len(df) != len(dataset):
        # Positional matching would misassign (or run out of) rows.
        return (
            f"Cannot update: the table has {len(df)} rows "
            f"but the dataset has {len(dataset)}."
        )

    # Materialise the column once; missing values become "" so they are
    # removed by the filter below instead of crashing on .strip().
    new_texts = [
        "" if pd.isna(text) else str(text) for text in df["transcription"]
    ]

    dataset = dataset.map(
        lambda example, idx: {"transcription": new_texts[idx]},
        with_indices=True,
    )

    # Keep only rows with a non-empty transcription.
    dataset = dataset.filter(
        lambda example: example["transcription"].strip() != "",
    )

    # Pass the token to this call only rather than logging in globally, so a
    # user's token is not stored on the machine running the app.
    dataset.push_to_hub(dataset_link, token=token or None)
    return "Dataset updated and changes submitted to the Hugging Face Hub!"


def main():
    """Build and launch the Gradio editor for dataset transcriptions."""
    # State shared by the callbacks below (one copy per running app).
    dataset = None  # The loaded Dataset object.
    original_df = None  # Full index/transcription table; edits are merged here.

    def load_dataset_and_show_table(dataset_link, token):
        """Load the dataset and return the table to display in Gradio."""
        nonlocal dataset, original_df
        original_df, dataset = load_huggingface_dataset(dataset_link, token)
        return original_df

    def search_transcriptions(search_term):
        """Return the rows whose transcription contains ``search_term``.

        The match is a case-insensitive literal substring match. An empty
        search term restores the full table; an empty table is returned when
        no dataset has been loaded.
        """
        if original_df is None:
            return pd.DataFrame(columns=["index", "transcription"])
        if not search_term:
            return original_df

        # regex=False: user input such as "(" or "?" must not be parsed as a
        # regular expression (it would raise or match unexpectedly).
        matches = original_df["transcription"].str.contains(
            search_term, case=False, na=False, regex=False
        )
        return original_df[matches]

    def update_original(df):
        """Merge the edited table rows into the full table, keyed on ``index``.

        Rows whose ``index`` is missing or not present in the full table are
        ignored. If an index appears more than once, the last row wins.
        """
        if original_df is None:
            return _NO_DATASET_MESSAGE

        keys = pd.to_numeric(df["index"], errors="coerce")
        usable = keys.notna()
        edits = dict(zip(keys[usable], df.loc[usable, "transcription"]))

        # Select by membership rather than by "mapped value is not null" so
        # that an edit which clears a transcription is still applied.
        touched = original_df["index"].isin(list(edits))
        replacements = original_df["index"].map(edits)
        original_df.loc[touched, "transcription"] = replacements[touched]
        return "update Successful"

    def submit_changes(df, token, dataset_link):
        """Merge the visible table into the full table and push to the Hub.

        The visible table may be a filtered subset of the dataset, so it is
        always merged by ``index`` instead of being trusted positionally.
        """
        if dataset is None:
            return _NO_DATASET_MESSAGE

        update_original(df)
        return update_transcriptions(original_df, dataset, token, dataset_link)

    # Gradio Interface
    with gr.Blocks(css=".dataframe-row { height: 200px; }") as interface:
        gr.Markdown("## Hugging Face Audio Dataset Editor")

        # Input fields for dataset link and token
        dataset_link = gr.Textbox(label="Hugging Face Dataset Link")
        hf_token = gr.Textbox(label="Hugging Face Token", type="password")

        # Button to load dataset
        load_button = gr.Button("Load Dataset")

        # Search bar
        search_box = gr.Textbox(
            label="Search Transcriptions",
            placeholder="Enter a search term...",
        )

        # Table to display and edit dataset
        table = gr.Dataframe(
            headers=["index", "transcription"],
            datatype=["number", "str"],
            interactive=True,
            label="Edit Dataset (Transcriptions are RTL)",
        )

        update_button = gr.Button("Update Table")

        # Button to submit changes
        submit_button = gr.Button("Submit Changes")

        update_message = gr.Textbox(label="update message")
        output_message = gr.Textbox(label="Message")

        # RTL styling for transcription column
        # NOTE(review): this only assigns an attribute on the component;
        # confirm against the Gradio docs that it has any visual effect.
        table.style = {"transcription": {"direction": "rtl"}}

        # Button functionality
        load_button.click(
            load_dataset_and_show_table, [dataset_link, hf_token], table
        )
        search_box.change(search_transcriptions, search_box, table)
        update_button.click(update_original, [table], update_message)
        submit_button.click(
            submit_changes, [table, hf_token, dataset_link], output_message
        )

    # Launch Gradio Interface
    interface.launch()


if __name__ == "__main__":
    main()