|
import os |
|
import requests |
|
import streamlit as st |
|
from huggingface_hub import InferenceClient |
|
from prompt import default_prompt, prompt_enhanced |
|
|
|
|
|
|
|
def load_readme(dataset_name: str):
    """Fetch the raw README.md of a Hugging Face dataset.

    Args:
        dataset_name: Repo path on the Hub, e.g. "amirveyseh/acronym_identification".

    Returns:
        The README.md text, or None on any request failure (an error is
        shown in the Streamlit UI instead of raising).
    """
    api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"

    try:
        # Fix: a timeout is required so a slow/unreachable host cannot hang
        # the Streamlit session forever (requests has no default timeout).
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading the README.md: {e}")
        return None

    return response.text
|
|
|
|
|
def check_token_limit(content: str, max_tokens: int = 7500):
    """Truncate README content that likely exceeds the model's context budget.

    Token count is estimated with the rough heuristic of ~4 characters per
    token. Content estimated above `max_tokens` is cut down and a Streamlit
    warning is shown; otherwise the content is returned unchanged.

    Args:
        content: Raw README text.
        max_tokens: Approximate token budget for the README portion.

    Returns:
        The (possibly truncated) content string.
    """
    chars_per_token = 4  # same heuristic used for both the check and the cut
    if len(content) // chars_per_token > max_tokens:
        # Fix: the original sliced content[:max_tokens] — i.e. max_tokens
        # CHARACTERS (~max_tokens/4 tokens), truncating ~4x more than needed.
        # Slice in characters consistent with the token estimate above.
        truncated_content = content[:max_tokens * chars_per_token]
        st.warning("Warning: The README.md exceeds 8192 tokens. It has been truncated for evaluation. This may affect the quality of the evaluation results.")
        return truncated_content
    return content
|
|
|
|
|
def evaluate_readme(readme_content: str, user_prompt: str):
    """Send the README to a hosted LLM and return its evaluation.

    Args:
        readme_content: The (possibly truncated) README.md text.
        user_prompt: A format string containing a `{readme_content}` placeholder.

    Returns:
        The model's reply text, or None if HF_TOKEN_INFERENCE is not set
        (an error is shown in the Streamlit UI in that case).
    """
    hf_token = os.getenv('HF_TOKEN_INFERENCE')

    if not hf_token:
        st.error("The Hugging Face inference token is not configured. Please ensure HF_TOKEN_INFERENCE is set.")
        return None

    client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct", token=hf_token)

    prompt = user_prompt.format(readme_content=readme_content)

    messages = [
        {'role': 'system', 'content': "You are an expert in legal field especially in Artificial Intelligence and data privacy."},
        {'role': 'user', 'content': prompt}
    ]

    # Fix: removed tool_choice="auto" — no `tools` are supplied, so the
    # parameter was invalid for this call. Also dropped the redundant
    # model=... argument: the model is already bound on the client above.
    response = client.chat_completion(
        messages=messages,
        max_tokens=500,
    )

    # NOTE(review): dict-style access kept as-is; huggingface_hub output
    # dataclasses also expose .choices[0].message.content — confirm if the
    # installed version ever drops __getitem__ support.
    return response['choices'][0]['message']['content']
|
|
|
|
|
def main():
    """Streamlit entry point: ask for a dataset path, display its README, and run the audit."""
    from dotenv import load_dotenv
    load_dotenv()

    st.title("Dataset Card Evaluator")

    dataset_name = st.text_input("Path to HF Dataset (e.g., amirveyseh/acronym_identification)")
    # Guard clauses: nothing to do until the user supplies a dataset path
    # and its README could actually be fetched.
    if not dataset_name:
        return

    readme = load_readme(dataset_name)
    if not readme:
        return

    # Trim oversized cards before showing/evaluating them.
    readme = check_token_limit(readme)

    st.subheader("README.md content:")
    st.text_area("README.md", readme, height=200)

    if st.button("Evaluate dataset documentation"):
        with st.spinner("Audit in progress..."):
            evaluation_result = evaluate_readme(readme, prompt_enhanced)
            if evaluation_result:
                st.subheader("Evaluation Result:")
                st.write(evaluation_result)
|
|
|
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":

    main()
|
|