librarian-bot davanstrien HF staff committed on
Commit
745ebf9
·
0 Parent(s):

Duplicate from davanstrien/webhook_metadata_reviewer

Browse files

Co-authored-by: Daniel van Strien <[email protected]>

Files changed (5) hide show
  1. .gitattributes +34 -0
  2. Dockerfile +27 -0
  3. README.md +11 -0
  4. main.py +166 -0
  5. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the official Python 3.11 slim image (Debian bullseye)
2
+ FROM python:3.11-slim-bullseye
3
+
4
+ # Set the working directory to /code
5
+ WORKDIR /code
6
+
7
+ # Copy requirements.txt into the container at /code
8
+ COPY ./requirements.txt /code/requirements.txt
9
+
10
+ # Install requirements.txt
11
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
12
+
13
+ # Set up a new user named "user" with user ID 1000
14
+ RUN useradd -m -u 1000 user
15
+ # Switch to the "user" user
16
+ USER user
17
+ # Set home to the user's home directory
18
+ ENV HOME=/home/user \
19
+ PATH=/home/user/.local/bin:$PATH
20
+
21
+ # Set the working directory to the app directory inside the user's home directory
22
+ WORKDIR $HOME/app
23
+
24
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
25
+ COPY --chown=user . $HOME/app
26
+
27
+ CMD ["uvicorn", "main:app","--proxy-headers", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Test Webhooks
3
+ emoji: 👀
4
+ colorFrom: blue
5
+ colorTo: pink
6
+ sdk: docker
7
+ pinned: false
8
+ duplicated_from: davanstrien/webhook_metadata_reviewer
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from difflib import SequenceMatcher
4
+ from typing import Any, Dict, Optional, Tuple
5
+
6
+ from fastapi import FastAPI, Request, Response
7
+ from huggingface_hub import (DatasetCard, HfApi, ModelCard, comment_discussion,
8
+ create_discussion, get_discussion_details,
9
+ get_repo_discussions, login)
10
+ from huggingface_hub.utils import EntryNotFoundError
11
+ from tabulate import tabulate
12
+ from toolz import valmap
13
+
14
# Shared secret that incoming requests must present in the
# "X-Webhook-Secret" header (checked in the /webhook handler).
KEY = os.getenv("WEBHOOK_SECRET")
# Hugging Face access token used for the Hub client and the login call below.
HF_TOKEN = os.getenv("HF_TOKEN")

# Hub client built with the token above.
api = HfApi(token=HF_TOKEN)
# Log in so the module-level huggingface_hub helpers are authenticated.
login(HF_TOKEN)

# The ASGI application that uvicorn serves as "main:app".
app = FastAPI()
21
+
22
+
23
@app.get("/")
def read_root():
    """Answer GET / with a fixed greeting payload."""
    greeting = {"Hello": "World!"}
    return greeting
26
+
27
+
28
def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) of two sequences."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
31
+
32
+
33
def create_metadata_key_dict(card_data, repo_type: str):
    """Pick the metadata fields of interest out of a repo card.

    Args:
        card_data: Mapping of card metadata; looked up with ``.get``.
        repo_type: ``"model"`` or ``"dataset"``.

    Returns:
        A dict mapping each field of interest to its value in ``card_data``
        (``None`` when the field is absent). The shared fields come first,
        followed by the type-specific ones. Returns ``None`` for any other
        ``repo_type``.
    """
    type_specific_fields = {
        "model": ("library_name", "datasets", "metrics", "co2", "pipeline_tag"),
        "dataset": (
            "pretty_name",
            "size_categories",
            "task_categories",
            "task_ids",
            "source_datasets",
        ),
    }
    extra_fields = type_specific_fields.get(repo_type)
    if extra_fields is None:
        return None
    wanted_fields = ("tags", "license", *extra_fields)
    return {field: card_data.get(field) for field in wanted_fields}
51
+
52
+
53
def create_metadata_breakdown_table(desired_metadata_dictionary):
    """Render the metadata fields as a GitHub-flavoured markdown table.

    Each row is a field name and its value; any falsy value is shown as
    "Field Missing".
    """
    rows = [
        (field_name, field_value or "Field Missing")
        for field_name, field_value in desired_metadata_dictionary.items()
    ]
    column_titles = ("Metadata Field", "Provided Value")
    return tabulate(rows, tablefmt="github", headers=column_titles)
61
+
62
+
63
def calculate_grade(desired_metadata_dictionary):
    """Return the fraction of metadata fields that are filled in.

    Args:
        desired_metadata_dictionary: Mapping of field name to value. A field
            counts as filled when its value is truthy.

    Returns:
        A float between 0.0 and 1.0, rounded to two decimals. An empty or
        missing mapping scores 0.0 instead of raising ZeroDivisionError or
        AttributeError.
    """
    if not desired_metadata_dictionary:
        # Nothing to grade: avoid dividing by zero / calling .values() on None.
        return 0.0
    filled = sum(1 for value in desired_metadata_dictionary.values() if value)
    return round(filled / len(desired_metadata_dictionary), 2)
67
+
68
+
69
def create_markdown_report(
    desired_metadata_dictionary, repo_name, repo_type, score, update: bool = False
):
    """Build the markdown body of the metadata report card.

    Args:
        desired_metadata_dictionary: Mapping of metadata field to value,
            rendered as a table by ``create_metadata_breakdown_table``.
        repo_name: Repository id mentioned in the introduction.
        repo_type: ``"model"`` or ``"dataset"``; used in headings and text.
        score: Coverage as a fraction between 0 and 1.
        update: When true, the title is suffixed with "(updated)".

    Returns:
        The report as a markdown string. Sections are separated by blank
        lines so the headings and the table render correctly.
    """
    heading = f"# {repo_type.title()} metadata report card"
    if update:
        heading += " (updated)"

    if score <= 0.5:
        verdict = (
            f"We're not angry we're just disappointed! {repo_type.title()} "
            "metadata is super important. Please try harder..."
        )
    else:
        verdict = f"Not too shabby! Make sure you also fill in a {repo_type} card too!"

    sections = [
        heading,
        "This is an automatically produced metadata quality report card for "
        f"{repo_name}. This report is meant as a POC!",
        # The original text ran "your" and the repo type together.
        f"## Breakdown of metadata fields for your {repo_type}",
        create_metadata_breakdown_table(desired_metadata_dictionary),
        # score is a 0-1 fraction; format it as a real percentage (0.43 -> 43%)
        # instead of printing the raw fraction followed by a percent sign.
        f"You scored a metadata coverage grade of: **{score:.0%}**",
        verdict,
    ]
    return "\n\n".join(sections) + "\n"
84
+
85
+
86
def parse_webhook_post(data: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Extract the repo type and name from a webhook payload.

    Args:
        data: Decoded webhook payload with an ``event`` entry and, for
            repo-scoped events, a ``repo`` entry.

    Returns:
        ``(repo_type, repo_name)`` for repo-scoped events, otherwise ``None``.

    Raises:
        ValueError: If the repo type is neither "model" nor "dataset".
    """
    scope = data["event"]["scope"]
    if scope == "repo":
        repo_info = data["repo"]
        name = repo_info["name"]
        kind = repo_info["type"]
        if kind in {"model", "dataset"}:
            return kind, name
        raise ValueError("Unknown hub type")
    return None
96
+
97
+
98
def load_repo_card(repo_type, repo_name):
    """Load a repo's card metadata as a plain dict.

    Uses ``DatasetCard`` for "dataset" repos and ``ModelCard`` for "model"
    repos. Returns an empty dict when loading raises ``EntryNotFoundError``,
    and ``None`` for any other repo type.
    """
    if repo_type == "dataset":
        card_class = DatasetCard
    elif repo_type == "model":
        card_class = ModelCard
    else:
        return None
    try:
        card = card_class.load(repo_name)
        return card.data.to_dict()
    except EntryNotFoundError:
        return {}
109
+
110
+
111
def create_or_update_report(data):
    """Create the metadata report discussion for a repo, or refresh it.

    If the repo already has an open discussion titled "Metadata Report Card",
    a new comment is posted there only when the freshly generated report
    differs from the most recent comment. Otherwise a new discussion is
    opened with the report as its description.

    Args:
        data: Decoded webhook payload (see ``parse_webhook_post``).

    Returns:
        ``True`` once the report has been handled, or a 400 ``Response`` when
        the payload cannot be parsed (non-repo event, missing keys, or an
        unsupported repo type).
    """
    try:
        parsed_post = parse_webhook_post(data)
    except (KeyError, ValueError):
        # Malformed payloads and unsupported repo types are client errors,
        # not server crashes.
        parsed_post = None
    if not parsed_post:
        return Response("Unable to parse webhook data", status_code=400)
    repo_type, repo_name = parsed_post

    card_data = load_repo_card(repo_type, repo_name)
    desired_metadata_dictionary = create_metadata_key_dict(card_data, repo_type)
    score = calculate_grade(desired_metadata_dictionary)
    report = create_markdown_report(
        desired_metadata_dictionary, repo_name, repo_type, score, update=False
    )

    for discussion in get_repo_discussions(repo_name, repo_type=repo_type):
        if discussion.title != "Metadata Report Card" or discussion.status != "open":
            continue
        # An existing open report card thread.
        discussion_details = get_discussion_details(
            repo_name, discussion.num, repo_type=repo_type
        )
        # Not every event is a comment (e.g. status changes carry no content),
        # so look for the most recent event that actually has text.
        last_comment = ""
        for event in reversed(discussion_details.events):
            content = getattr(event, "content", None)
            if content:
                last_comment = content
                break
        updated_report = create_markdown_report(
            desired_metadata_dictionary,
            repo_name,
            repo_type,
            score,
            update=True,
        )
        # The last comment may be either the initial report or a previous
        # "(updated)" one. Comparing against both avoids re-posting an
        # identical update on every webhook.
        unchanged = any(
            similar(candidate, last_comment) > 0.999
            for candidate in (report, updated_report)
        )
        if not unchanged:
            comment_discussion(
                repo_name,
                discussion.num,
                comment=updated_report,
                repo_type=repo_type,
            )
        return True

    create_discussion(
        repo_name,
        "Metadata Report Card",
        description=report,
        repo_type=repo_type,
    )
    return True
156
+
157
+
158
@app.post("/webhook")
async def webhook(request: Request):
    """Handle a Hub webhook call and create or update the report card.

    The request must carry the shared secret in the "X-Webhook-Secret"
    header. Requests are rejected with 401 when the secret is wrong or when
    no secret is configured on the server, and with 400 when the body is not
    a JSON object or cannot be interpreted as a supported repo event.

    Returns:
        The string "Webhook received!" on success, or an error ``Response``.
    """
    # Local import so this handler does not depend on the file's import block.
    import hmac

    provided_secret = request.headers.get("X-Webhook-Secret")
    # Fail closed: with KEY unset, a request without the header used to pass
    # the check because None != None is False.
    if (
        not KEY
        or provided_secret is None
        or not hmac.compare_digest(
            provided_secret.encode("utf-8"), KEY.encode("utf-8")
        )
    ):
        return Response("Invalid secret", status_code=401)

    payload = await request.body()
    try:
        data = json.loads(payload)
    except ValueError:  # includes json.JSONDecodeError and bad encodings
        return Response("Invalid JSON payload", status_code=400)
    if not isinstance(data, dict):
        return Response("Unable to parse webhook data", status_code=400)

    result = create_or_update_report(data)
    # A Response object is truthy, so it must be forwarded explicitly;
    # otherwise the 400 would be masked by the success message.
    if isinstance(result, Response):
        return result
    return "Webhook received!"
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ huggingface_hub==0.12.0
2
+ tabulate==0.9.0
3
+ toolz==0.12.0
4
+ fastapi==0.89.1
5
+ uvicorn==0.20.0