import os

import requests
import streamlit as st
from huggingface_hub import InferenceClient

from prompt import default_prompt, prompt_enhanced


def load_readme(dataset_name: str):
    """Fetch the README.md of a Hugging Face dataset via the raw-file endpoint.

    Args:
        dataset_name: Dataset path on the Hub, e.g. "amirveyseh/acronym_identification".

    Returns:
        The README.md text, or None (after showing a Streamlit error) on failure.
    """
    api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"
    try:
        # Timeout added so a dead connection cannot hang the Streamlit app forever.
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading the README.md: {e}")
        return None
    return response.text


def check_token_limit(content: str, max_tokens: int = 7500):
    """Truncate `content` when its estimated token count exceeds `max_tokens`.

    Tokens are estimated at ~4 characters each. The same ratio is used for
    truncation so the retained text actually corresponds to `max_tokens` tokens.

    Returns:
        The (possibly truncated) content.
    """
    if len(content) // 4 > max_tokens:
        # BUG FIX: the original sliced content[:max_tokens] *characters*,
        # which keeps only ~max_tokens/4 tokens under the 4-chars-per-token
        # estimate used in the check above. Slice in characters = tokens * 4.
        truncated_content = content[: max_tokens * 4]
        st.warning(
            "Warning: The README.md exceeds 8192 tokens. It has been truncated "
            "for evaluation. This may affect the quality of the evaluation results."
        )
        return truncated_content
    return content


def evaluate_readme(readme_content: str, user_prompt: str):
    """Run an LLM-based legal audit of a dataset card.

    Args:
        readme_content: The dataset README.md text (already length-checked).
        user_prompt: Prompt template containing a `{readme_content}` placeholder.

    Returns:
        The model's evaluation text, or None if the inference token is missing.
    """
    # Retrieve the inference token from environment variables.
    hf_token = os.getenv("HF_TOKEN_INFERENCE")
    if not hf_token:
        st.error(
            "The Hugging Face inference token is not configured. "
            "Please ensure HF_TOKEN_INFERENCE is set."
        )
        return None

    # BUG FIX: the original built the client with one model
    # ("meta-llama/Llama-3.3-70B-Instruct") but called chat_completion with a
    # different one ("meta-llama/Meta-Llama-3-70B-Instruct"). Use a single
    # model id for both so the request is unambiguous.
    model_id = "meta-llama/Llama-3.3-70B-Instruct"
    client = InferenceClient(model=model_id, token=hf_token)

    # User-customizable prompt with the README injected.
    prompt = user_prompt.format(readme_content=readme_content)
    messages = [
        {
            "role": "system",
            "content": "You are an expert in legal field especially in Artificial Intelligence and data privacy.",
        },
        {"role": "user", "content": prompt},
    ]

    # BUG FIX: dropped tool_choice="auto" — no tools were declared, so the
    # parameter was at best ignored and at worst rejected by the endpoint.
    response = client.chat_completion(
        model=model_id,
        messages=messages,
        max_tokens=500,
    )
    # chat_completion returns a ChatCompletionOutput which supports both
    # attribute and dict-style access; keep the original subscript form.
    return response["choices"][0]["message"]["content"]


def main():
    """Streamlit entry point: collect a dataset path, show its README, run the audit."""
    st.title("Legal Audit of Dataset Cards")
    # BUG FIX: "perspective.It" -> "perspective. It" (missing space in the
    # user-facing description).
    st.write(
        "This Space provides an automated tool for auditing dataset cards from "
        "a legal perspective. It evaluates dataset documentation based on key "
        "legal criteria, such as compliance with data privacy regulations, "
        "ethical considerations, and transparency of information."
    )
    dataset_name = st.text_input(
        "Path to HF Dataset (e.g., amirveyseh/acronym_identification)"
    )

    if dataset_name:
        # Load and display the dataset's README.md.
        readme = load_readme(dataset_name)
        if readme:
            # Check for token limit and truncate if necessary.
            readme = check_token_limit(readme)
            st.subheader("README.md content:")
            st.text_area("README.md", readme, height=200)

            # Button to evaluate the documentation.
            if st.button("Evaluate dataset documentation"):
                with st.spinner("Audit in progress..."):
                    evaluation_result = evaluate_readme(readme, prompt_enhanced)
                    if evaluation_result:
                        st.subheader("Evaluation Result:")
                        st.write(evaluation_result)


if __name__ == "__main__":
    main()