# bagbreizh — v0.0.1 (commit 640ae52)
# Streamlit app that audits the quality of a Hugging Face dataset card (README.md).
import os
import requests
import streamlit as st
from huggingface_hub import InferenceClient
from prompt import default_prompt, prompt_enhanced
# Function to load the README.md directly from the Hugging Face API
def load_readme(dataset_name: str, timeout: float = 10.0):
    """Fetch the raw README.md of a dataset from the Hugging Face Hub.

    Args:
        dataset_name: Repo path of the dataset (e.g. "amirveyseh/acronym_identification").
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout, a stalled host would block the Streamlit app forever.

    Returns:
        The README.md text, or None on any HTTP/network error (the error is
        surfaced to the user via st.error).
    """
    api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading the README.md: {e}")
        return None
    return response.text
# Function to check if the README content exceeds the token limit
def check_token_limit(content: str, max_tokens: int = 7500, chars_per_token: int = 4):
    """Truncate README content that exceeds an approximate token budget.

    Tokens are estimated with the rough heuristic of ``chars_per_token``
    characters per token. If the estimate exceeds ``max_tokens``, the content
    is cut to ``max_tokens * chars_per_token`` characters so the kept text
    actually matches the token budget.

    BUG FIXED: the original sliced to ``content[:max_tokens]`` *characters*,
    keeping only ~max_tokens/4 tokens — a quarter of the intended budget.

    Args:
        content: Raw README.md text.
        max_tokens: Approximate token budget for the evaluation model.
        chars_per_token: Heuristic characters-per-token ratio.

    Returns:
        The original content, or a truncated copy if it was over budget.
    """
    if len(content) // chars_per_token > max_tokens:
        truncated_content = content[:max_tokens * chars_per_token]
        st.warning(
            f"Warning: The README.md exceeds {max_tokens} tokens. It has been "
            "truncated for evaluation. This may affect the quality of the "
            "evaluation results."
        )
        return truncated_content
    return content
# Function to evaluate the quality of the dataset card
def evaluate_readme(readme_content: str, user_prompt: str):
    """Ask a hosted LLM to evaluate the quality of a dataset card.

    Args:
        readme_content: The (possibly truncated) README.md text to audit.
        user_prompt: A format string containing a ``{readme_content}``
            placeholder, e.g. ``prompt_enhanced`` from ``prompt``.

    Returns:
        The model's evaluation text, or None if the inference token is missing.
    """
    # Retrieve the inference token from environment variables
    hf_token = os.getenv('HF_TOKEN_INFERENCE')
    # Ensure the token is available
    if not hf_token:
        st.error("The Hugging Face inference token is not configured. Please ensure HF_TOKEN_INFERENCE is set.")
        return None
    # Initialize the inference client with the specified model.
    # The model is bound here once; repeating model= in chat_completion
    # (as the original did) is redundant.
    client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct", token=hf_token)
    # User-customizable prompt
    prompt = user_prompt.format(readme_content=readme_content)
    messages = [
        {'role': 'system', 'content': "You are an expert in legal field especially in Artificial Intelligence and data privacy."},
        {'role': 'user', 'content': prompt}
    ]
    # Call the model to get an evaluation.
    # Dropped tool_choice="auto": no tools are supplied, so tool selection
    # is meaningless and rejected by some providers.
    response = client.chat_completion(
        messages=messages,
        max_tokens=500,
    )
    # Use attribute access on the response dataclass (dict-style indexing
    # is deprecated in huggingface_hub).
    return response.choices[0].message.content
# Streamlit Interface
def main():
    """Render the dataset-card evaluator UI and drive the audit workflow."""
    from dotenv import load_dotenv
    load_dotenv()

    st.title("Dataset Card Evaluator")

    # Dataset name input — nothing to do until the user provides one.
    dataset_name = st.text_input("Path to HF Dataset (e.g., amirveyseh/acronym_identification)")
    if not dataset_name:
        return

    # Load the dataset's README.md; load_readme reports errors itself.
    readme = load_readme(dataset_name)
    if not readme:
        return

    # Enforce the token budget, then show the (possibly truncated) content.
    readme = check_token_limit(readme)
    st.subheader("README.md content:")
    st.text_area("README.md", readme, height=200)

    # Run the audit only when the user asks for it.
    if not st.button("Evaluate dataset documentation"):
        return
    with st.spinner("Audit in progress..."):
        evaluation_result = evaluate_readme(readme, prompt_enhanced)
    if evaluation_result:
        st.subheader("Evaluation Result:")
        st.write(evaluation_result)


if __name__ == "__main__":
    main()