Spaces:

dejanseo
/

siteFocusScore

Running

App Files Files Community

siteFocusScore / app.py

dejanseo

Update app.py

f6b3355 verified about 1 month ago

raw

history blame contribute delete

8.25 kB

	import streamlit as st
	import pandas as pd
	import requests
	from bs4 import BeautifulSoup
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.manifold import TSNE
	import numpy as np
	from numpy.linalg import norm
	import matplotlib.pyplot as plt
	import plotly.express as px
	import re

	# Load the LaBSE model
	@st.cache_resource
	def load_model():
	    """Build and return the LaBSE sentence-embedding model.

	    The function is wrapped in ``st.cache_resource`` so Streamlit can hand
	    back the already-constructed model instead of rebuilding it.
	    """
	    labse = SentenceTransformer("sentence-transformers/LaBSE")
	    return labse

	# Module-level model handle shared by the embedding code below.
	model = load_model()

	def fetch_sitemap_urls(domain):
	    """Collect page URLs for a domain from its sitemaps.

	    The candidate sitemaps are ``/sitemap.xml``, ``/sitemap_index.xml`` and
	    every ``Sitemap:`` entry found in ``/robots.txt``. Each distinct sitemap
	    is handed to ``fetch_sitemap_urls_from_xml`` exactly once (with
	    ``recursive=True`` so sitemap indexes are followed).

	    Args:
	        domain: Domain name, with or without a scheme / trailing slash.

	    Returns:
	        De-duplicated list of URLs in first-seen order. Empty when nothing
	        could be fetched.
	    """
	    domain = domain.replace("https://", "").replace("http://", "").strip("/")
	    candidates = [
	        f"https://{domain}/sitemap.xml",
	        f"https://{domain}/sitemap_index.xml",
	    ]

	    # robots.txt is the only document fetched here; the sitemap files are
	    # downloaded by the helper, so fetching them here too would double
	    # every request.
	    try:
	        response = requests.get(
	            f"https://{domain}/robots.txt",
	            headers={"User-Agent": "SiteFocusTool/1.0"},
	            timeout=10,
	        )
	        response.raise_for_status()
	    except requests.RequestException:
	        robots_lines = []
	    else:
	        robots_lines = response.text.splitlines()

	    for line in robots_lines:
	        stripped = line.strip()  # tolerate indented directives
	        if stripped.lower().startswith("sitemap:"):
	            nested_sitemap_url = stripped.split(":", 1)[1].strip()
	            if nested_sitemap_url:
	                candidates.append(nested_sitemap_url)

	    all_urls = []
	    # dict.fromkeys drops repeated sitemap locations while keeping order,
	    # e.g. when robots.txt points at /sitemap.xml again.
	    for sitemap_url in dict.fromkeys(candidates):
	        all_urls.extend(fetch_sitemap_urls_from_xml(sitemap_url, domain, recursive=True))

	    # Order-preserving de-duplication keeps results stable between runs.
	    return list(dict.fromkeys(all_urls))

	# Matches URLs whose path ends in a common image extension.
	_IMAGE_URL_RE = re.compile(r"\.(?:jpg|jpeg|png|gif|svg|webp|bmp|tif|tiff)$", re.IGNORECASE)


	def fetch_sitemap_urls_from_xml(sitemap_url, domain, recursive=False, _visited=None):
	    """Fetch URLs from a sitemap XML file.

	    If the document contains ``<sitemap>`` entries it is treated as a sitemap
	    index: with ``recursive=True`` each referenced sitemap is fetched in
	    turn, otherwise nothing is returned for it. For any other document,
	    every ``<loc>`` value that does not end in an image extension is
	    returned.

	    Args:
	        sitemap_url: Location of the sitemap to download.
	        domain: Unused here; kept so existing callers keep working.
	        recursive: Follow sitemap-index entries when True.
	        _visited: Internal set of sitemap URLs already requested in this
	            crawl; guards against indexes that reference themselves.

	    Returns:
	        List of URL strings (possibly empty). Request failures yield ``[]``.
	    """
	    visited = set() if _visited is None else _visited
	    if sitemap_url in visited:
	        return []
	    visited.add(sitemap_url)

	    try:
	        response = requests.get(sitemap_url, headers={"User-Agent": "SiteFocusTool/1.0"}, timeout=10)
	        response.raise_for_status()
	    except requests.RequestException:
	        return []

	    soup = BeautifulSoup(response.content, "lxml-xml")  # Use lxml parser
	    urls = []

	    index_entries = soup.find_all("sitemap")
	    if index_entries:
	        if recursive:
	            for entry in index_entries:
	                loc = entry.find("loc")
	                # Skip malformed index entries instead of raising on .text
	                if loc is None or not loc.text.strip():
	                    continue
	                urls.extend(
	                    fetch_sitemap_urls_from_xml(
	                        loc.text.strip(), domain, recursive=True, _visited=visited
	                    )
	                )
	        return urls

	    for loc in soup.find_all("loc"):
	        url = loc.text.strip()  # sitemap generators often pad <loc> with whitespace
	        if url and not _IMAGE_URL_RE.search(url):
	            urls.append(url)
	    return urls

	def clean_text_from_url(url, domain):
	    """Turn a page URL into space-separated words taken from its path.

	    The scheme and the site's host (with or without a leading ``www.``) are
	    removed, then every run of non-word characters or underscores becomes a
	    single space.

	    Args:
	        url: Absolute page URL.
	        domain: Site domain, with or without scheme / trailing slash.

	    Returns:
	        The cleaned text; an empty string for the site root.
	    """
	    host = domain.replace("https://", "").replace("http://", "").strip("/")
	    host = re.sub(r"^www\.", "", host, flags=re.IGNORECASE)

	    remainder = re.sub(r"^https?://(?:www\.)?", "", url.strip(), flags=re.IGNORECASE)
	    if remainder.lower().startswith(host.lower()):
	        tail = remainder[len(host):]
	        # Only drop the host when it is followed by a URL boundary, so
	        # "example.com" does not eat the start of "example.community".
	        if tail == "" or tail[0] in "/?#":
	            remainder = tail

	    # \W covers punctuation, slashes, hyphens and whitespace; "_" is a word
	    # character and needs to be listed explicitly.
	    return re.sub(r"[\W_]+", " ", remainder).strip()

	def compute_embeddings(data):
	    """Generate normalized embeddings for the cleaned text.

	    Encodes the whole ``"Cleaned Text"`` column in a single ``model.encode``
	    call, scales every vector to unit length and stores the result in a new
	    ``"Embedding"`` column.

	    Args:
	        data: DataFrame with a ``"Cleaned Text"`` column. Modified in place.

	    Returns:
	        The same DataFrame, with the ``"Embedding"`` column added.
	    """
	    texts = data["Cleaned Text"].tolist()
	    if not texts:
	        data["Embedding"] = pd.Series([], index=data.index, dtype=object)
	        return data

	    vectors = np.asarray(model.encode(texts))
	    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
	    lengths[lengths == 0] = 1.0  # leave all-zero vectors untouched instead of dividing by 0
	    data["Embedding"] = pd.Series(list(vectors / lengths), index=data.index)
	    return data

	def calculate_site_focus_and_radius(embeddings):
	    """Calculate site focus score and site radius.

	    The centroid is the mean of all page vectors. Each page's deviation is
	    ``1 - cosine_similarity(page, centroid)``; the site radius is the mean
	    deviation and the focus score is ``max(0, 1 - radius)``.

	    Args:
	        embeddings: 2-D array-like, one row per page.

	    Returns:
	        Tuple ``(site_focus_score, site_radius, centroid_embedding,
	        deviations)`` where ``deviations`` is a list with one float per row.

	    Raises:
	        ValueError: If ``embeddings`` is empty or not two-dimensional.
	    """
	    matrix = np.asarray(embeddings)
	    if matrix.ndim != 2 or matrix.shape[0] == 0:
	        raise ValueError("embeddings must be a non-empty 2-D array of page vectors")

	    centroid_embedding = np.mean(matrix, axis=0)

	    # One vectorised pass instead of a pairwise call per row.
	    rows = matrix.astype(np.float64)
	    center = centroid_embedding.astype(np.float64)
	    scale = np.linalg.norm(rows, axis=1) * np.linalg.norm(center)
	    # A zero-length row or centroid gets similarity 0 rather than NaN.
	    similarities = np.divide(rows @ center, scale, out=np.zeros_like(scale), where=scale > 0)

	    deviations = (1.0 - similarities).tolist()
	    site_radius = float(np.mean(deviations))
	    site_focus_score = max(0.0, 1.0 - site_radius)
	    return site_focus_score, site_radius, centroid_embedding, deviations

	def plot_gradient_strip_with_indicator(score, title):
	    """Visualize the score as a gradient strip with an indicator.

	    Draws a 256-column colour strip and a dashed vertical marker at the
	    position corresponding to ``score`` on a 0..1 scale, then renders the
	    figure in the Streamlit page.

	    Args:
	        score: Value to mark; positions outside 0..1 are pinned to the
	            nearest end of the strip (the title still shows the raw value).
	        title: Label shown above the strip, followed by the percentage.
	    """
	    fig, ax = plt.subplots(figsize=(8, 1))
	    gradient = np.linspace(0, 1, 256).reshape(1, -1)
	    gradient = np.vstack((gradient, gradient))
	    # Reversed colormap: 0 is drawn green, 1 is drawn red.
	    # NOTE(review): that suits siteRadius (lower is better); for
	    # siteFocusScore a high (good) score lands in the red end - confirm intent.
	    ax.imshow(gradient, aspect="auto", cmap="RdYlGn_r")

	    # Pixel columns are indexed 0..255, so scale by width - 1 to keep a
	    # score of 1.0 on the strip.
	    marker_x = float(np.clip(score, 0.0, 1.0)) * (gradient.shape[1] - 1)
	    ax.axvline(x=marker_x, color="black", linestyle="--", linewidth=2)
	    ax.set_axis_off()
	    ax.set_title(f"{title}: {score * 100:.2f}%")

	    st.pyplot(fig)
	    plt.close(fig)  # release the figure once Streamlit has rendered it

	def plot_3d_tsne(embeddings, urls, centroid, deviations):
	    """Interactive 3D t-SNE scatter plot with hover labels.

	    The centroid is appended to the page embeddings so that it is projected
	    into the same 3-D space, then drawn as a separate large green marker.

	    Args:
	        embeddings: 2-D array, one row per page.
	        urls: Page URLs, used as hover labels (same order as ``embeddings``).
	        centroid: Centroid vector of ``embeddings``.
	        deviations: Per-page deviation from the centroid; drives point colour.
	    """
	    page_count = len(embeddings)
	    if page_count < 2:
	        # perplexity would be 0 here, which TSNE rejects.
	        st.info("At least two pages are needed for the t-SNE projection.")
	        return

	    tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, page_count - 1))
	    projected = tsne.fit_transform(np.vstack([embeddings, centroid]))
	    centroid_tsne = projected[-1]  # Last point is the centroid
	    page_points = projected[:-1]  # Remaining points are pages

	    fig = px.scatter_3d(
	        x=page_points[:, 0],
	        y=page_points[:, 1],
	        z=page_points[:, 2],
	        color=deviations,
	        color_continuous_scale="RdYlGn_r",
	        hover_name=urls,
	        labels={"color": "Deviation"},
	        title="3D t-SNE Projection of Page Embeddings",
	    )
	    fig.add_scatter3d(
	        x=[centroid_tsne[0]],
	        y=[centroid_tsne[1]],
	        z=[centroid_tsne[2]],
	        mode="markers",
	        marker=dict(size=15, color="green"),
	        name="Centroid",
	    )
	    st.plotly_chart(fig)

	def plot_spherical_distances_optimized(deviations, embeddings, urls):
	    """Polar scatter plot of each page's distance from the centroid.

	    Every page gets its own evenly spaced angle; the radius and colour of a
	    point are its deviation. The angle carries no meaning beyond separating
	    the points.

	    Args:
	        deviations: Per-page deviation from the centroid.
	        embeddings: Not used by the plot; kept so existing callers keep
	            working.
	        urls: Page URLs shown on hover (same order as ``deviations``).
	    """
	    num_points = len(deviations)
	    angles = np.linspace(0, 2 * np.pi, num_points, endpoint=False)  # Spread angles evenly

	    # Create polar scatter plot
	    fig = px.scatter_polar(
	        r=deviations,
	        theta=np.degrees(angles),
	        color=deviations,
	        color_continuous_scale="RdYlGn_r",
	        title="Optimized Spherical Plot of Page Distances from Centroid",
	        labels={"color": "Deviation"},
	    )
	    # Update traces to show text (labels) only on hover
	    fig.update_traces(
	        mode="markers",  # Display only markers by default
	        hovertemplate="%{text}<extra></extra>",  # Show text on hover
	        text=urls,  # Set URLs as hover labels
	    )
	    st.plotly_chart(fig)

	# Streamlit Interface
	# Flow: read a domain, collect its sitemap URLs, embed the URL paths, then
	# show the focus score, the radius, a proximity table and two plots.
	st.title("SiteFocus Tool")

	domain = st.text_input("Enter domain:", placeholder="example.com")

	if st.button("START"):
	    site = domain.strip() if domain else ""
	    if not site:
	        # Give feedback instead of silently ignoring the click.
	        st.warning("Please enter a domain.")
	    else:
	        urls = fetch_sitemap_urls(site)
	        if not urls:
	            st.error("No URLs found. Please check the domain and try again.")
	        else:
	            cleaned_texts = [clean_text_from_url(url, site) for url in urls]

	            # Encode all texts in one call (each text used to be encoded
	            # twice: once for the vector, once for its norm).
	            raw_embeddings = np.asarray(model.encode(cleaned_texts))
	            lengths = norm(raw_embeddings, axis=1, keepdims=True)
	            lengths[lengths == 0] = 1.0  # avoid dividing an all-zero vector by 0
	            embeddings = raw_embeddings / lengths

	            site_focus_score, site_radius, centroid, deviations = calculate_site_focus_and_radius(embeddings)

	            # Visualize siteFocusScore
	            st.subheader("siteFocusScore")
	            st.markdown("Description: The siteFocusScore reflects how tightly aligned a site's content is to a single thematic area. A higher score indicates greater thematic focus, which can improve topical authority in SEO.")
	            plot_gradient_strip_with_indicator(site_focus_score, "siteFocusScore")

	            # Visualize siteRadius
	            st.subheader("siteRadius")
	            st.markdown("Description: The siteRadius measures how far individual pages deviate from the site's central theme. A smaller radius indicates higher consistency across the site, which is beneficial for SEO.")
	            plot_gradient_strip_with_indicator(site_radius, "siteRadius")

	            # Sorted dataframe by closeness to centroid
	            st.subheader("Pages Closest to Centroid")
	            distances = [1 - dev for dev in deviations]
	            df = pd.DataFrame({"URL": urls, "Proximity to Centroid": distances})
	            df_sorted = df.sort_values(by="Proximity to Centroid", ascending=False)
	            st.dataframe(df_sorted)

	            # Interactive 3D t-SNE plot
	            st.subheader("3D t-SNE Projection")
	            plot_3d_tsne(embeddings, urls, centroid, deviations)

	            # Optimized spherical distance plot
	            st.subheader("Spherical Distance Plot")
	            plot_spherical_distances_optimized(deviations, embeddings, urls)