Spaces:

merve
/

streamlit-dataset-demo

Build error

App Files Files Community

streamlit-dataset-demo / app.py

merve HF staff

Upload app.py

143d008 over 3 years ago

raw

history blame

2.22 kB


	import streamlit as st
	import pandas as pd
	import re
	import nltk
	from PIL import Image
	import os
	import numpy as np
	import seaborn as sns
	from wordcloud import WordCloud, STOPWORDS
	from nltk.corpus import stopwords
	import datasets
	from datasets import load_dataset
	import sklearn
	from sklearn.preprocessing import LabelEncoder

	# Fetch the "merve/poetry" dataset in streaming mode and materialise its
	# "train" split as a pandas DataFrame.
	dataset = load_dataset("merve/poetry", streaming=True)
	df = pd.DataFrame.from_dict(dataset["train"])

	# Directory used to locate bundled assets: the script's own folder when
	# __file__ is defined, otherwise the current working directory.
	if "__file__" in locals():
	    d = os.path.dirname(__file__)
	else:
	    d = os.getcwd()

	# Make sure the NLTK stopword corpus is available, then read the English list.
	nltk.download("stopwords")
	stop = stopwords.words('english')

	# standardizing dataset by removing special characters and lowercasing

	def standardize(text, remove_digits=True):
	    """Strip special characters from *text* and lowercase it.

	    Every character that is not an ASCII letter, a digit or whitespace is
	    removed; the remaining text is then lowercased.

	    Args:
	        text: The string to clean.
	        remove_digits: Accepted for backward compatibility only. The
	            function never reads it, so digits are always kept.

	    Returns:
	        The cleaned, lowercased string.
	    """
	    # Raw string: in a plain literal "\d" and "\s" are invalid escape
	    # sequences, which recent Python versions warn about.
	    cleaned = re.sub(r'[^a-zA-Z\d\s]', '', text)
	    return cleaned.lower()

	st.write("Poetry dataset, content column cleaned from special characters and lowercased")
	# The stopword filter runs before the text is lowercased, so compare the
	# lowercased token; otherwise capitalised stopwords ("The", "And", ...)
	# are not removed. A frozenset makes each membership test O(1) instead
	# of a scan over the whole stopword list.
	stop_words = frozenset(stop)
	df.content = df.content.apply(
	    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words)
	)
	# Strip special characters and lowercase what is left.
	df.content = df.content.apply(standardize)
	st.dataframe(df)

	# most appearing words including stopwords
	# NOTE(review): the content column has already had NLTK stopwords filtered
	# out at this point, so the "including stopwords" label looks inaccurate --
	# confirm the intended wording.
	st.write("Most appearing words including stopwords")
	# Split every poem into words, flatten them into one Series and count
	# occurrences (value_counts sorts from most to least frequent).
	words = df.content.str.split(expand=True).unstack().value_counts()
	st.bar_chart(words[0:50])
	# Newer Streamlit releases no longer know this option and raise
	# StreamlitAPIException for it; on those versions there is nothing to
	# silence, so ignoring the failure keeps the app running everywhere.
	from streamlit.errors import StreamlitAPIException
	try:
	    st.set_option('deprecation.showPyplotGlobalUse', False)
	except StreamlitAPIException:
	    pass



	# Load the image used as the word-cloud mask. The context manager closes
	# the underlying file once the pixels have been copied into the array.
	with Image.open(os.path.join(d, "poet.png")) as mask_image:
	    mask = np.array(mask_image)

	# distributions of poem types according to ages and authors
	st.write("Distributions of poem types according to ages and authors")
	le = LabelEncoder()

	# Replace author names with the integer codes produced by the encoder.
	df.author = le.fit_transform(df.author)
	# catplot returns a FacetGrid; hand its figure to Streamlit explicitly
	# instead of relying on pyplot's global figure, which st.pyplot()
	# without arguments depends on and Streamlit has deprecated.
	author_grid = sns.catplot(x="age", y="author", hue="type", data=df)
	st.pyplot(author_grid.fig)

	# most appearing words other than stop words

	import matplotlib.pyplot as plt
	def word_cloud(content, title):
	    """Draw a masked word cloud and display it in the Streamlit app.

	    Args:
	        content: Object whose ``index.values`` holds the words to draw
	            (the caller passes a ``value_counts()`` Series).
	        title: Title rendered above the cloud.

	    Returns:
	        None. The figure is sent to Streamlit via ``st.pyplot``.
	    """
	    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
	                   stopwords=STOPWORDS, mask=mask, max_font_size=50)
	    # NOTE(review): only the index (the distinct words) is fed in, not the
	    # counts, so word sizes likely do not reflect frequency -- consider
	    # WordCloud.generate_from_frequencies if that is the intent.
	    wc.generate(" ".join(content.index.values))
	    fig = plt.figure(figsize=(10, 10))
	    plt.title(title, fontsize=20)
	    plt.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray, interpolation="bilinear", alpha=0.98)
	    plt.axis('off')
	    # Pass the figure explicitly: st.pyplot() with no argument relies on
	    # pyplot's global figure, which Streamlit has deprecated.
	    st.pyplot(fig)

	# Caption the final section, then draw the cloud from the word counts.
	st.write("Most appearing words excluding stopwords")
	word_cloud(content=words, title="Word Cloud")