Spaces:
Runtime error
Runtime error
import os | |
from pathlib import Path | |
import gradio as gr | |
import requests | |
from fastapi import BackgroundTasks, Response, status | |
from huggingface_hub import WebhookPayload, WebhooksServer | |
from huggingface_hub.utils import build_hf_headers, get_session | |
from src.build_nomic import build_nomic | |
from src.my_logger import setup_logger | |
from src.readme_update import update_dataset_readme | |
from src.utilities import load_datasets, merge_and_update_datasets | |
from src.visualize_logs import log_file_to_html_string | |
# Directory that contains this file.  ``__file__`` is used rather than
# ``__name__``: ``Path(__name__)`` would build a path out of the module *name*
# (e.g. "__main__"), whose parent is always "." regardless of where the file
# actually lives.
proj_dir = Path(__file__).resolve().parent
logger = setup_logger(__name__)
logger.info("Starting Application...")

# Required settings: a missing variable raises KeyError while the module loads.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
# HF_TOKEN = os.environ["HF_TOKEN"]

# NOTE(security): when HF_WEBHOOK_SECRET is unset this falls back to the
# literal 'secret', which is trivially guessable.  The fallback is kept for
# backward compatibility; set the variable explicitly in any real deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
# Markdown rendered at the top of the "Application" tab, above the embedded map.
# NOTE(review): the three "Filtering ..." bullets read like sub-items of the
# "filter" bullet; no nesting indentation is visible here -- confirm the
# intended markdown structure.
intro_md = """
# Processing BORU
This is a space to visually search the subreddit [/r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/).
Have you ever been curious to search for stories that are similar to one of your favorites? This can help!
- Each dot represents a post (try clicking on one)
- Closer dots are similar in topic
- Use the filters on the left to help you narrow down what you are looking for
- The lasso can help you search in a smaller range that you drag with your mouse
- The filter can help you narrow by field,
- Filtering posts that are `CONCLUDED`
- Filtering popular posts
- Filtering by date
- The search can help you look by keyword
Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)
"""
# Markdown rendered in the "Details" tab: a numbered walk-through of the
# pipeline (webhook trigger -> embeddings -> processed dataset -> Nomic Atlas).
# NOTE(review): the two "-" bullets after item 2 look like sub-items of that
# step; no nesting indentation is visible here -- confirm the intended
# markdown structure.
details_md = """
# Details
## Creation Details
1. This space is triggered by a webhook for changes on [reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates).
2. It then takes the updates from that dataset and get embeddings by making leveraging [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings)
- [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) is using [zero-spaces](https://huggingface.co/zero-gpu-explorers) a free GPU service to compute the model [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
- Im calling this via [gradio_client](https://www.gradio.app/docs/client) which allows any space to be used as an API
3. The calculated embeddings are stored in this dataset [reddit-tools-HF/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/reddit-tools-HF/reddit-bestofredditorupdates-processed)
4. These get visualized by [nomic atlas](https://docs.nomic.ai/atlas/introduction/quick-start). You can see how I process it in [build_nomic.py](https://huggingface.co/spaces/reddit-tools-HF/processing-bestofredditorupdates/blob/main/src/build_nomic.py)
"""
# Public Nomic Atlas map embedded in the "Application" tab.
url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map"
# The ``src`` attribute is quoted and the element is explicitly closed:
# ``<iframe>`` is not a void element, so without ``</iframe>`` any markup that
# follows would be parsed as the frame's fallback content.
html_str = (
    f'<iframe src="{url}" style="border:none;height:1024px;width:100%" '
    'allow="clipboard-read; clipboard-write" title="Nomic Atlas"></iframe>'
)
# Gradio layout: three tabs -- the embedded map, a live log view, and a static
# description of the pipeline.
with gr.Blocks() as ui:
    with gr.Tab(label="Application"):
        gr.Markdown(value=intro_md)
        gr.HTML(value=html_str)

    with gr.Tab(label="Logs"):
        gr.Markdown(value="# Logs")
        # The value is a callable, not a string; ``every=1`` is passed so the
        # component keeps re-evaluating it instead of rendering it once.
        output = gr.HTML(value=log_file_to_html_string, every=1)

    with gr.Tab(label="Details"):
        gr.Markdown(value=details_md)

# Webhook server that serves the UI above; the queue is enabled on the Blocks
# app before it is handed over.
app = WebhooksServer(
    ui=ui.queue(),
    webhook_secret=WEBHOOK_SECRET,
)
# NOTE(review): no route registration for this handler (for example an
# ``@app.add_webhook(...)`` decorator) is visible in this file; confirm it is
# registered somewhere, otherwise the server never invokes it.
async def handle_repository_changes(payload: WebhookPayload, task_queue: BackgroundTasks):
    """Decide whether an incoming repository webhook should trigger processing.

    A background run of ``_process_webhook`` is scheduled only when the event
    scope starts with ``"repo"``, the first updated ref is the ``main`` branch,
    and the commit range touches something other than ``README.md``.

    Args:
        payload: Webhook payload describing the repository event.
        task_queue: Background task queue the processing job is added to.

    Returns:
        A ``Response`` with status 200 when nothing is scheduled, 202 when the
        processing task has been queued, and 501 when the list of changed
        files could not be determined.
    """
    if not payload.event.scope.startswith("repo"):
        return Response("No task scheduled", status_code=status.HTTP_200_OK)

    # Only run if change is on main branch
    try:
        updated_ref = payload.updatedRefs[0]
        on_main_branch = updated_ref.ref == 'refs/heads/main'
    except (AttributeError, IndexError, TypeError):
        # ``updatedRefs`` is missing, ``None`` or empty: there is no ref to
        # inspect, so nothing is scheduled.  Only these lookup failures are
        # caught; a bare ``except`` would also swallow task cancellation.
        response_content = "No task scheduled"
        logger.info(response_content)
        return Response(response_content, status_code=status.HTTP_200_OK)

    if not on_main_branch:
        response_content = "No task scheduled: Change not on main branch"
        logger.info(response_content)
        return Response(response_content, status_code=status.HTTP_200_OK)

    # No need to run for README only updates
    try:
        commit_files_url = (
            f"{payload.repo.url.api}/compare/"
            f"{updated_ref.oldSha}..{updated_ref.newSha}?raw=true"
        )
        # Logged before the request so the URL is available even if it fails.
        logger.info(f"Git Compare URl: {commit_files_url}")
        response_text = get_session().get(commit_files_url, headers=build_hf_headers()).text

        # One non-blank line per change; the path is the last tab-separated field.
        changed_files = [
            line.split('\t')[-1]
            for line in response_text.split('\n')
            if line.strip()
        ]
        logger.info(f"Changed files: {changed_files}")

        # ``all`` over an empty list is True, so an empty compare output is
        # treated the same way as a README-only change.
        if all('README.md' in changed_file for changed_file in changed_files):
            response_content = "No task scheduled: its a README only update."
            logger.info(response_content)
            return Response(response_content, status_code=status.HTTP_200_OK)
    except Exception as e:
        # Boundary handler: keep the traceback in the log for diagnosis.
        logger.exception(str(e))
        response_content = "Unexpected issue :'("
        logger.info(response_content)
        # NOTE(review): 500 would describe this failure more accurately than
        # 501; the status code is left unchanged here.
        return Response(response_content, status_code=status.HTTP_501_NOT_IMPLEMENTED)

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
    task_queue.add_task(_process_webhook, payload=payload)
    return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)
def _process_webhook(payload: WebhookPayload):
    """Run the full refresh: load, merge, publish, update README, rebuild map.

    Args:
        payload: The webhook payload that triggered the run.  It is not read
            by the body; it is accepted because the task is scheduled with
            ``payload=payload``.
    """
    logger.info("Loading new dataset...")
    current, original = load_datasets()
    logger.info("Loaded new dataset")

    logger.info("Merging and Updating rows...")
    merged, new_row_count = merge_and_update_datasets(current, original)
    logger.info("Merged and Updated rows")

    # Publish the merged result to the Hub before touching its README.
    logger.info("Pushing processed data to the Hugging Face Hub...")
    merged.push_to_hub(PROCESSED_DATASET)
    logger.info("Pushed processed data to the Hugging Face Hub")

    update_dataset_readme(
        dataset_name=PROCESSED_DATASET,
        subreddit=SUBREDDIT,
        new_rows=new_row_count,
    )
    logger.info("Updated README.")

    # Rebuild the Nomic map from the merged dataset.
    logger.info("Building Nomic...")
    build_nomic(dataset=merged)
    logger.info("Built Nomic")

    logger.info("Update from webhook completed!")
if __name__ == "__main__":
    # Script entry point: listen on every interface, port 7860, and surface
    # errors in the UI.
    app.launch(
        server_port=7860,
        server_name="0.0.0.0",
        show_error=True,
    )