# Web file-viewer residue kept for provenance (not part of the program):
# Nash-pAnDiTa's picture
# Update app.py
# 9178174 verified
import gradio as gr
import os
from datasets import load_dataset, Dataset
import pandas as pd
from huggingface_hub import login
def load_huggingface_dataset(dataset_link, token):
    """
    Load the ``train`` split of a Hugging Face dataset for editing.

    Args:
        dataset_link: Repository id or path accepted by ``load_dataset``.
        token: Hugging Face access token. It is forwarded to ``load_dataset``
            so private or gated repositories can be read. An empty value is
            passed as ``None`` so the library uses its default credentials.

    Returns:
        A ``(df, dataset)`` tuple. ``df`` is a DataFrame holding the row
        position in an ``index`` column plus the ``transcription`` column;
        ``dataset`` is the object returned by ``load_dataset``.

    Raises:
        KeyError: If the split has no ``transcription`` column.
    """
    # An empty textbox yields "", which is not a usable token; send None instead.
    dataset = load_dataset(dataset_link, split="train", token=token or None)
    # reset_index() turns the positional row number into an explicit "index"
    # column, which the UI later uses to match edited rows back to the table.
    df = dataset.to_pandas().reset_index()
    return df[["index", "transcription"]], dataset
def _clean_transcription(value):
    """Return ``value`` as text, mapping missing values (None/NaN) to ""."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ""
    return str(value)


def update_transcriptions(df, dataset, token, dataset_link):
    """
    Update the transcriptions in the dataset and push it back to the Hugging Face Hub.

    Rows are matched by position: row ``i`` of ``df`` supplies the new
    transcription for example ``i`` of ``dataset``, so ``df`` must have at
    least as many rows as ``dataset``. Examples whose transcription ends up
    empty, whitespace-only, or missing are dropped before the upload.

    Args:
        df: DataFrame with a ``transcription`` column holding the edited text.
        dataset: Loaded dataset object whose ``transcription`` column is replaced.
        token: Hugging Face access token passed to ``login``.
        dataset_link: Repository id the updated dataset is pushed to.

    Returns:
        A status message for display in the UI.
    """
    # Build the column once up front instead of re-reading it for every
    # example inside map(); missing cells become "" so the filter below can
    # call .strip() safely and drop them.
    transcriptions = [
        _clean_transcription(value) for value in df["transcription"].tolist()
    ]

    # Replace the original transcription column in the dataset
    dataset = dataset.map(
        lambda example, idx: {"transcription": transcriptions[idx]},
        with_indices=True,
    )

    # Keep only examples with a non-empty transcription
    dataset = dataset.filter(
        lambda example: example["transcription"].strip() != "",
    )

    # NOTE(review): login() presumably stores the token for the whole process;
    # confirm that is acceptable when several users share one running app.
    login(token)
    dataset.push_to_hub(dataset_link)
    return "Dataset updated and changes submitted to the Hugging Face Hub!"
# Gradio Interface
def main():
    """
    Build the Gradio interface for editing dataset transcriptions and launch it.

    The nested callbacks share two pieces of state through closure variables:
    the loaded dataset object and the full, unfiltered table of transcriptions.
    """
    dataset = None  # Loaded dataset object, shared by the callbacks below
    original_df = None  # Full table; search results are filtered from it

    def load_dataset_and_show_table(dataset_link, token):
        """
        Load the dataset and return the DataFrame to display in Gradio.
        """
        nonlocal dataset, original_df
        original_df, dataset = load_huggingface_dataset(dataset_link, token)
        return original_df

    def search_transcriptions(search_term):
        """
        Return the rows whose transcription contains the search term.

        Matching is case-insensitive and literal; rows with a missing
        transcription never match.
        """
        if original_df is None:
            # Empty table if no dataset is loaded
            return pd.DataFrame(columns=["index", "transcription"])
        # regex=False: the box holds a plain search term, and characters such
        # as "(" or "*" would otherwise raise a regex compilation error.
        matches = original_df["transcription"].str.contains(
            search_term or "", case=False, na=False, regex=False
        )
        return original_df[matches]

    def update_original(df):
        """
        Merge the (possibly filtered) table ``df`` into the full table.

        Rows are matched on the ``index`` column; when ``df`` repeats an
        index, the last occurrence wins.
        """
        if original_df is None:
            return "No dataset loaded to update."
        # One lookup table instead of scanning the full table once per edited row.
        edits = dict(zip(df["index"], df["transcription"]))
        original_df["transcription"] = [
            edits.get(row_index, current)
            for row_index, current in zip(
                original_df["index"], original_df["transcription"]
            )
        ]
        return "update Successful"

    def submit_changes(df, token, dataset_link):
        """
        Submit updated changes to the Hugging Face Hub.
        """
        if dataset is None:
            return "No dataset loaded to update."
        print(len(dataset))
        print(len(df))
        # Always fold the visible table into the full table and push the full
        # table. Pushing an unmerged table would leave original_df stale, and a
        # later submit from a filtered view would then upload the stale text
        # and revert these edits.
        update_original(df)
        return update_transcriptions(original_df, dataset, token, dataset_link)

    # Gradio Interface
    with gr.Blocks(css=".dataframe-row { height: 200px; }") as interface:
        gr.Markdown("## Hugging Face Audio Dataset Editor")

        # Input fields for dataset link and token
        dataset_link = gr.Textbox(label="Hugging Face Dataset Link")
        hf_token = gr.Textbox(label="Hugging Face Token", type="password")

        # Button to load dataset
        load_button = gr.Button("Load Dataset")

        # Search bar
        search_box = gr.Textbox(label="Search Transcriptions", placeholder="Enter a search term...")

        # Table to display and edit dataset
        table = gr.Dataframe(
            headers=["index", "transcription"],
            datatype=["number", "str"],
            interactive=True,
            label="Edit Dataset (Transcriptions are RTL)",
        )
        update_button = gr.Button("Update Table")

        # Button to submit changes
        submit_button = gr.Button("Submit Changes")
        update_message = gr.Textbox(label="update message")
        output_message = gr.Textbox(label="Message")

        # RTL styling for transcription column
        # NOTE(review): this is a plain attribute assignment on the component;
        # confirm that Gradio actually reads it when rendering the table.
        table.style = {"transcription": {"direction": "rtl"}}

        # Button functionality
        load_button.click(load_dataset_and_show_table, [dataset_link, hf_token], table)
        search_box.change(search_transcriptions, search_box, table)
        update_button.click(update_original, [table], update_message)
        submit_button.click(submit_changes, [table, hf_token, dataset_link], output_message)

    # Launch Gradio Interface
    interface.launch()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()