|
import os |
|
import requests |
|
import streamlit as st |
|
from huggingface_hub import InferenceClient |
|
from prompt import default_prompt, prompt_enhanced |
|
|
|
|
|
|
|
def load_readme(dataset_name: str):
    """Fetch the raw README.md of a Hugging Face dataset.

    Args:
        dataset_name: Repo path on the Hub, e.g. "amirveyseh/acronym_identification".

    Returns:
        The README.md text, or None on any request failure (an error is
        shown in the Streamlit UI instead of raising).
    """
    api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"

    try:
        # Fix: a timeout is required so a slow/unreachable host cannot hang
        # the Streamlit session forever (requests has no default timeout).
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading the README.md: {e}")
        return None

    return response.text
|
|
|
|
|
def check_token_limit(content: str, max_tokens: int = 7500):
    """Truncate README content that likely exceeds the model's context budget.

    Token count is estimated with the rough heuristic of ~4 characters per
    token. Content estimated above `max_tokens` is cut down and a Streamlit
    warning is shown; otherwise the content is returned unchanged.

    Args:
        content: Raw README text.
        max_tokens: Approximate token budget for the README portion.

    Returns:
        The (possibly truncated) content string.
    """
    chars_per_token = 4  # same heuristic used for both the check and the cut
    if len(content) // chars_per_token > max_tokens:
        # Fix: the original sliced content[:max_tokens] — i.e. max_tokens
        # CHARACTERS (~max_tokens/4 tokens), truncating ~4x more than needed.
        # Slice in characters consistent with the token estimate above.
        truncated_content = content[:max_tokens * chars_per_token]
        st.warning("Warning: The README.md exceeds 8192 tokens. It has been truncated for evaluation. This may affect the quality of the evaluation results.")
        return truncated_content
    return content
|
|
|
|
|
def evaluate_readme(readme_content: str, user_prompt: str):
    """Send the README to a hosted LLM and return its evaluation.

    Args:
        readme_content: The (possibly truncated) README.md text.
        user_prompt: A format string containing a `{readme_content}` placeholder.

    Returns:
        The model's reply text, or None if HF_TOKEN_INFERENCE is not set
        (an error is shown in the Streamlit UI in that case).
    """
    hf_token = os.getenv('HF_TOKEN_INFERENCE')

    if not hf_token:
        st.error("The Hugging Face inference token is not configured. Please ensure HF_TOKEN_INFERENCE is set.")
        return None

    client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct", token=hf_token)

    prompt = user_prompt.format(readme_content=readme_content)

    messages = [
        {'role': 'system', 'content': "You are an expert in legal field especially in Artificial Intelligence and data privacy."},
        {'role': 'user', 'content': prompt}
    ]

    # Fix: removed tool_choice="auto" — no `tools` are supplied, so the
    # parameter was invalid for this call. Also dropped the redundant
    # model=... argument: the model is already bound on the client above.
    response = client.chat_completion(
        messages=messages,
        max_tokens=500,
    )

    # NOTE(review): dict-style access kept as-is; huggingface_hub output
    # dataclasses also expose .choices[0].message.content — confirm if the
    # installed version ever drops __getitem__ support.
    return response['choices'][0]['message']['content']
|
|
|
|
|
def main():
    """Streamlit entry point: ask for a dataset path, display its README, and run the audit."""
    from dotenv import load_dotenv
    load_dotenv()

    st.title("Dataset Card Evaluator")

    dataset_name = st.text_input("Path to HF Dataset (e.g., amirveyseh/acronym_identification)")
    # Guard clauses: nothing to do until the user supplies a dataset path
    # and its README could actually be fetched.
    if not dataset_name:
        return

    readme = load_readme(dataset_name)
    if not readme:
        return

    # Trim oversized cards before showing/evaluating them.
    readme = check_token_limit(readme)

    st.subheader("README.md content:")
    st.text_area("README.md", readme, height=200)

    if st.button("Evaluate dataset documentation"):
        with st.spinner("Audit in progress..."):
            evaluation_result = evaluate_readme(readme, prompt_enhanced)
            if evaluation_result:
                st.subheader("Evaluation Result:")
                st.write(evaluation_result)
|
|
|
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":

    main()
|
|