# Spaces: Running  (page-header residue captured with the source; not part of the program)
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
# Load the LaBSE model
@st.cache_resource
def load_model():
    """Return the LaBSE sentence-embedding model.

    Streamlit re-executes the whole script on every widget interaction, so
    the loader is wrapped in ``st.cache_resource``: the model is built on the
    first call and the same instance is handed back on later reruns instead
    of being loaded again.

    Returns:
        SentenceTransformer: the ``sentence-transformers/LaBSE`` model.
    """
    return SentenceTransformer("sentence-transformers/LaBSE")


model = load_model()
def fetch_sitemap_urls(domain):
    """Collect page URLs for ``domain`` from its sitemaps.

    Candidate sitemaps are ``/sitemap.xml``, ``/sitemap_index.xml`` and every
    ``Sitemap:`` entry found in ``/robots.txt``. Each candidate is handed to
    ``fetch_sitemap_urls_from_xml`` with ``recursive=True`` so nested sitemap
    indexes are followed.

    Args:
        domain: Domain name, with or without an ``http(s)://`` prefix and
            trailing slash.

    Returns:
        list: De-duplicated URLs in first-seen order. Empty when nothing
        could be fetched; network failures on robots.txt are treated as
        "no extra sitemaps" rather than raised.
    """
    domain = domain.strip().replace("https://", "").replace("http://", "").strip("/")

    sitemap_locations = [
        f"https://{domain}/sitemap.xml",
        f"https://{domain}/sitemap_index.xml",
    ]

    # robots.txt is the only resource downloaded here. The XML sitemaps are
    # downloaded by fetch_sitemap_urls_from_xml, so requesting them here as
    # well would fetch every sitemap twice.
    try:
        response = requests.get(
            f"https://{domain}/robots.txt",
            headers={"User-Agent": "SiteFocusTool/1.0"},
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException:
        pass  # Best effort: a missing robots.txt just means no extra sitemaps.
    else:
        for line in response.text.splitlines():
            directive = line.strip()
            if directive.lower().startswith("sitemap:"):
                nested_sitemap_url = directive.split(":", 1)[1].strip()
                if nested_sitemap_url:
                    sitemap_locations.append(nested_sitemap_url)

    all_urls = []
    # dict.fromkeys drops repeated locations (robots.txt commonly lists
    # /sitemap.xml again) while keeping the order they were discovered in.
    for sitemap_url in dict.fromkeys(sitemap_locations):
        all_urls.extend(fetch_sitemap_urls_from_xml(sitemap_url, domain, recursive=True))

    # Order-preserving de-duplication keeps results stable between runs.
    return list(dict.fromkeys(all_urls))
def fetch_sitemap_urls_from_xml(sitemap_url, domain, recursive=False, _visited=None):
    """Fetch URLs from a sitemap XML file.

    A sitemap index (a document containing ``<sitemap>`` entries) contributes
    the URLs of its child sitemaps when ``recursive`` is true and nothing
    otherwise. A regular sitemap contributes the text of every ``<loc>``
    element that does not end in a common image extension.

    Args:
        sitemap_url: Address of the sitemap to download.
        domain: Accepted for interface compatibility; not used here.
        recursive: Follow child sitemaps listed in a sitemap index.
        _visited: Internal set of sitemap addresses already processed in this
            traversal; callers should leave it unset.

    Returns:
        list: URLs found (possibly with duplicates). Empty on any request
        failure.
    """
    if _visited is None:
        _visited = set()
    # A sitemap index that (directly or indirectly) lists itself would
    # otherwise recurse until the interpreter's recursion limit is hit.
    if sitemap_url in _visited:
        return []
    _visited.add(sitemap_url)

    try:
        response = requests.get(
            sitemap_url,
            headers={"User-Agent": "SiteFocusTool/1.0"},
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException:
        return []

    soup = BeautifulSoup(response.content, "lxml-xml")  # Use lxml parser
    urls = []

    child_sitemaps = soup.find_all("sitemap")
    if child_sitemaps:
        if recursive:
            for entry in child_sitemaps:
                loc = entry.find("loc")
                # Skip malformed entries instead of failing on a missing <loc>.
                child_url = loc.get_text(strip=True) if loc is not None else ""
                if child_url:
                    urls.extend(
                        fetch_sitemap_urls_from_xml(
                            child_url, domain, recursive=True, _visited=_visited
                        )
                    )
        return urls

    image_suffix = re.compile(r"\.(jpg|jpeg|png|gif|svg|webp|bmp|tif|tiff)$", re.IGNORECASE)
    for loc in soup.find_all("loc"):
        # <loc> text is often padded with whitespace or newlines; stripping
        # keeps the URLs clean and lets the extension check anchor correctly.
        url = loc.get_text(strip=True)
        if url and not image_suffix.search(url):
            urls.append(url)
    return urls
def clean_text_from_url(url, domain):
    """Turn a page URL into plain words by dropping the site root and punctuation.

    The ``http(s)://<domain>/`` prefix is removed wherever it occurs, then
    every character that is not a letter, digit or whitespace (underscores
    included) is replaced by a single space. Runs of spaces are not collapsed.

    Args:
        url: Full page URL.
        domain: Site domain, with or without scheme and surrounding slashes.

    Returns:
        str: The remaining text with leading/trailing whitespace removed.
    """
    bare_domain = domain
    for scheme_prefix in ("https://", "http://"):
        bare_domain = bare_domain.replace(scheme_prefix, "")
    bare_domain = bare_domain.strip("/")

    remainder = url
    for scheme in ("https", "http"):
        remainder = remainder.replace(f"{scheme}://{bare_domain}/", "")

    # "/" and "-" are already non-word characters, so one pattern that also
    # matches "_" covers every separator.
    return re.sub(r"[^\w\s]|_", " ", remainder).strip()
def compute_embeddings(data):
    """Add a column of unit-length embeddings for the cleaned text.

    All texts are encoded in one batched ``model.encode`` call rather than
    one call per row, then scaled to unit L2 norm.

    Args:
        data: DataFrame with a ``"Cleaned Text"`` column. It is modified in
            place: an ``"Embedding"`` column holding one 1-D array per row is
            added or overwritten.

    Returns:
        The same DataFrame, for chaining.
    """
    texts = data["Cleaned Text"].tolist()
    if not texts:
        # Nothing to encode; still create the column so callers can rely on it.
        data["Embedding"] = pd.Series([], index=data.index, dtype=object)
        return data

    vectors = np.asarray(model.encode(texts))
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Guard against division by zero: an all-zero vector is left unchanged.
    lengths[lengths == 0] = 1.0
    unit_vectors = vectors / lengths  # Normalize

    data["Embedding"] = pd.Series(list(unit_vectors), index=data.index)
    return data
def calculate_site_focus_and_radius(embeddings):
    """Calculate site focus score and site radius.

    The centroid is the arithmetic mean of the page embeddings. Each page's
    deviation is ``1 - cosine_similarity(page, centroid)``; the site radius
    is the mean deviation and the focus score is ``1 - radius`` floored at 0.
    The similarities are computed in a single vectorised pass.

    Args:
        embeddings: Array-like of shape ``(n_pages, dim)`` with at least one
            row.

    Returns:
        tuple: ``(site_focus_score, site_radius, centroid_embedding,
        deviations)`` where ``deviations`` is a list with one float per page.

    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional.
    """
    matrix = np.asarray(embeddings)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_pages, dim)"
        )

    centroid_embedding = matrix.mean(axis=0)

    row_norms = np.linalg.norm(matrix, axis=1)
    scale = row_norms * np.linalg.norm(centroid_embedding)
    dots = matrix @ centroid_embedding
    # A zero-length page vector or centroid has no direction; treat its
    # similarity as 0 instead of dividing by zero.
    similarities = np.divide(
        dots, scale, out=np.zeros(len(matrix), dtype=float), where=scale > 0
    )

    deviations = (1.0 - similarities).tolist()
    site_radius = float(np.mean(deviations))
    site_focus_score = max(0.0, 1.0 - site_radius)
    return site_focus_score, site_radius, centroid_embedding, deviations
def plot_gradient_strip_with_indicator(score, title, cmap="RdYlGn_r"):
    """Visualize the score as a gradient strip with an indicator.

    Draws a horizontal colour gradient and a dashed vertical line at the
    position corresponding to ``score``, then renders the figure in the
    Streamlit app.

    Args:
        score: Value expected in ``[0, 1]``. The indicator position is
            clamped to that range; the title shows the unclamped value as a
            percentage.
        title: Label shown before the percentage in the figure title.
        cmap: Matplotlib colormap name. The default ``"RdYlGn_r"`` runs from
            green at 0 to red at 1, which suits lower-is-better metrics; pass
            ``"RdYlGn"`` for metrics where a higher value is better.
    """
    # Use an explicit figure rather than pyplot's global state so it can be
    # handed to Streamlit and closed afterwards; otherwise one figure is
    # left open per call.
    fig, ax = plt.subplots(figsize=(8, 1))

    strip_width = 256
    gradient = np.tile(np.linspace(0, 1, strip_width), (2, 1))
    ax.imshow(gradient, aspect="auto", cmap=cmap)

    # Pixel columns are indexed 0..strip_width-1, so a score of 1.0 maps to
    # the last column instead of falling just outside the image.
    indicator_x = min(max(score, 0.0), 1.0) * (strip_width - 1)
    ax.axvline(x=indicator_x, color="black", linestyle="--", linewidth=2)

    ax.set_axis_off()
    ax.set_title(f"{title}: {score * 100:.2f}%")

    st.pyplot(fig)
    plt.close(fig)
def plot_3d_tsne(embeddings, urls, centroid, deviations):
    """Interactive 3D t-SNE scatter plot with hover labels.

    The centroid is appended to the page embeddings so that it is projected
    together with them, then drawn as a separate, larger green marker.

    Args:
        embeddings: Page embeddings, one row per page.
        urls: Hover label for each page, in the same order as ``embeddings``.
        centroid: Centroid embedding with the same width as a page embedding.
        deviations: Per-page value used to colour the points.
    """
    page_count = len(embeddings)
    if page_count < 2:
        # perplexity would be min(30, page_count - 1) <= 0, which TSNE rejects.
        st.info("At least two pages are needed for a t-SNE projection.")
        return

    tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, page_count - 1))
    projected = tsne.fit_transform(np.vstack([embeddings, centroid]))
    centroid_tsne = projected[-1]  # Last point is the centroid
    page_points = projected[:-1]  # Remaining points are pages

    fig = px.scatter_3d(
        x=page_points[:, 0],
        y=page_points[:, 1],
        z=page_points[:, 2],
        color=deviations,
        color_continuous_scale="RdYlGn_r",
        hover_name=urls,
        labels={"color": "Deviation"},
        title="3D t-SNE Projection of Page Embeddings",
    )
    fig.add_scatter3d(
        x=[centroid_tsne[0]],
        y=[centroid_tsne[1]],
        z=[centroid_tsne[2]],
        mode="markers",
        marker=dict(size=15, color="green"),
        name="Centroid",
    )
    st.plotly_chart(fig)
def plot_spherical_distances_optimized(deviations, embeddings, urls):
    """Polar scatter plot of each page's distance from the centroid.

    The radial coordinate is the page's deviation. The angle carries no
    meaning: pages are spread evenly around the circle, in input order, so
    the points do not overlap.

    Args:
        deviations: Per-page distance from the centroid; also used for colour.
        embeddings: Accepted for interface compatibility; not used by the plot.
        urls: Hover label for each page, in the same order as ``deviations``.
    """
    num_points = len(deviations)
    angles = np.linspace(0, 2 * np.pi, num_points, endpoint=False)  # Spread angles evenly

    # Create polar scatter plot
    fig = px.scatter_polar(
        r=deviations,
        theta=np.degrees(angles),
        color=deviations,
        color_continuous_scale="RdYlGn_r",
        title="Optimized Spherical Plot of Page Distances from Centroid",
        labels={"color": "Deviation"},
    )
    # Update traces to show text (labels) only on hover
    fig.update_traces(
        mode="markers",  # Display only markers by default
        hovertemplate="%{text}<extra></extra>",  # Show text on hover
        text=urls,  # Set URLs as hover labels
    )
    st.plotly_chart(fig)
# Streamlit Interface
# Flow: read a domain, collect its sitemap URLs, embed the text derived from
# each URL, then report how tightly the pages cluster around their centroid.
st.title("SiteFocus Tool")
domain = st.text_input("Enter domain:", placeholder="example.com")
if st.button("START"):
    if not domain.strip():
        st.warning("Please enter a domain first.")
    else:
        urls = fetch_sitemap_urls(domain)
        if not urls:
            st.error("No URLs found. Please check the domain and try again.")
        else:
            cleaned_texts = [clean_text_from_url(url, domain) for url in urls]
            # Encode every text in one batched call (each text is encoded
            # exactly once), then scale each row to unit length.
            raw_embeddings = np.asarray(model.encode(cleaned_texts))
            embeddings = raw_embeddings / norm(raw_embeddings, axis=1, keepdims=True)
            site_focus_score, site_radius, centroid, deviations = calculate_site_focus_and_radius(embeddings)

            # Visualize siteFocusScore
            st.subheader("siteFocusScore")
            st.markdown("**Description:** The siteFocusScore reflects how tightly aligned a site's content is to a single thematic area. A higher score indicates greater thematic focus, which can improve topical authority in SEO.")
            plot_gradient_strip_with_indicator(site_focus_score, "siteFocusScore")

            # Visualize siteRadius
            st.subheader("siteRadius")
            st.markdown("**Description:** The siteRadius measures how far individual pages deviate from the site's central theme. A smaller radius indicates higher consistency across the site, which is beneficial for SEO.")
            plot_gradient_strip_with_indicator(site_radius, "siteRadius")

            # Sorted dataframe by closeness to centroid
            st.subheader("Pages Closest to Centroid")
            distances = [1 - dev for dev in deviations]
            df = pd.DataFrame({"URL": urls, "Proximity to Centroid": distances})
            df_sorted = df.sort_values(by="Proximity to Centroid", ascending=False)
            st.dataframe(df_sorted)

            # Interactive 3D t-SNE plot
            st.subheader("3D t-SNE Projection")
            plot_3d_tsne(embeddings, urls, centroid, deviations)

            # Optimized spherical distance plot
            st.subheader("Spherical Distance Plot")
            plot_spherical_distances_optimized(deviations, embeddings, urls)