TroglodyteDerivations
committed on
Upload bluesky_languages.py
Browse files- bluesky_languages.py +383 -0
bluesky_languages.py
ADDED
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# bluesky_languages.py
|
2 |
+
|
3 |
+
from datasets import load_dataset
|
4 |
+
from langdetect import detect
|
5 |
+
import pandas as pd
|
6 |
+
import plotly.express as px
|
7 |
+
import folium
|
8 |
+
|
9 |
+
|
10 |
+
# Fetch the Bluesky post corpus; the object returned is indexed by split
# name (only 'train' is used further down).
BLUESKY_DATASET_ID = "alpindale/two-million-bluesky-posts"
dataset = load_dataset(BLUESKY_DATASET_ID)
|
12 |
+
|
13 |
+
|
14 |
+
def detect_language(text):
    """Return the language code detected for *text*, or 'unknown'.

    Args:
        text: Post body to classify. Anything that is not a non-blank string
            is reported as 'unknown' without calling the detector.

    Returns:
        The code returned by ``detect`` (for example 'en'), or the string
        'unknown' when the input is unusable or detection fails.
    """
    # Missing, non-string and whitespace-only posts cannot be classified, so
    # skip the detector instead of paying for an exception on each of them.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Best effort: one undetectable post must not abort the whole run.
        # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit and
        # make the long-running job impossible to cancel.
        return 'unknown'
|
20 |
+
|
21 |
+
# Tag every post in the train split with the language detected in its text.
def _add_language(post):
    return {'language': detect_language(post['text'])}


dataset['train'] = dataset['train'].map(_add_language)

# Pull the new column out as a pandas Series so it can be tallied.
languages = pd.Series(dataset['train']['language'])

# One row per language with the number of posts detected in it,
# most frequent first (value_counts ordering).
language_counts = (
    languages.value_counts()
    .rename_axis('language')
    .reset_index(name='count')
)
|
33 |
+
|
34 |
+
# Representative country for each detected language code.
# This is a deliberate simplification: every language is pinned to a single
# country, and several languages may share one (e.g. the Indian languages).
language_to_country = {
    'en': 'United States',
    'ja': 'Japan',
    'unknown': 'Unknown',
    'es': 'Spain',
    'pt': 'Portugal',
    'fr': 'France',
    'de': 'Germany',
    'ko': 'South Korea',
    'nl': 'Netherlands',
    'it': 'Italy',
    'pl': 'Poland',
    'so': 'Somalia',
    'af': 'South Africa',
    'ru': 'Russia',
    'ca': 'Spain',  # 'ca' is the ISO 639-1 code for Catalan, not Canada
    'tr': 'Turkey',
    'no': 'Norway',
    'id': 'Indonesia',
    'fi': 'Finland',
    'da': 'Denmark',
    'cy': 'United Kingdom',  # 'cy' is Welsh (Cymraeg), not Cyprus
    'tl': 'Philippines',
    'sv': 'Sweden',
    'th': 'Thailand',
    'ro': 'Romania',
    'et': 'Estonia',
    'sw': 'Kenya',
    'vi': 'Vietnam',
    'el': 'Greece',
    'zh-cn': 'China',
    'hr': 'Croatia',
    'cs': 'Czech Republic',
    'ur': 'Pakistan',
    'sk': 'Slovakia',
    'sl': 'Slovenia',
    'uk': 'Ukraine',
    'hu': 'Hungary',
    'he': 'Israel',
    'sq': 'Albania',
    'bg': 'Bulgaria',
    'lt': 'Lithuania',
    'lv': 'Latvia',
    'ar': 'Saudi Arabia',
    'fa': 'Iran',
    'zh-tw': 'Taiwan',
    'mk': 'North Macedonia',
    'hi': 'India',
    'bn': 'Bangladesh',
    'ne': 'Nepal',
    'ml': 'India',
    'ta': 'India',
    'kn': 'India',
    'pa': 'India',
    'mr': 'India',
    'te': 'India',
    'gu': 'India',
}

# Map languages to countries; any code missing from the table becomes
# 'Unknown'. The mapping is applied exactly once: repeating it without
# fillna() would put NaN back into the column and undo the default.
language_counts['country'] = (
    language_counts['language'].map(language_to_country).fillna('Unknown')
)
|
100 |
+
|
101 |
+
|
102 |
+
# Share of posts per detected language.
fig_languages = px.pie(
    language_counts,
    names='language',
    values='count',
    title='Language Distribution of Posts',
)
fig_languages.show()

# Share of posts per mapped country.
fig_countries = px.pie(
    language_counts,
    names='country',
    values='count',
    title='Country Distribution of Posts',
)
fig_countries.show()

# Persist both charts as standalone HTML pages (languages first).
for chart, html_path in (
    (fig_languages, 'fig_languages.html'),
    (fig_countries, 'fig_countries.html'),
):
    chart.write_html(html_path)

# Dump the underlying table for inspection.
print(language_counts)
|
115 |
+
|
116 |
+
import folium
|
117 |
+
import pandas as pd
|
118 |
+
|
119 |
+
# Base map opened on a whole-world view.
INITIAL_MAP_CENTER = [20, 0]
world_map = folium.Map(location=INITIAL_MAP_CENTER, zoom_start=2)
|
121 |
+
|
122 |
+
# Marker colour used for each continent.
continent_colors = dict(
    [
        ('Africa', 'red'),
        ('Asia', 'green'),
        ('Europe', 'blue'),
        ('North America', 'purple'),
        ('Oceania', 'orange'),
        ('South America', 'black'),
    ]
)

# Marker glyph used for each continent; at present they all share the globe.
continent_icons = {continent: 'fa-globe' for continent in continent_colors}
|
141 |
+
|
142 |
+
# Simplified country -> continent lookup. Built once at import time rather
# than on every get_continent() call.
_COUNTRY_TO_CONTINENT = {
    'United States': 'North America',
    'Japan': 'Asia',
    'Unknown': 'Unknown',
    'Spain': 'Europe',
    'Portugal': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'United Kingdom': 'Europe',
    'South Korea': 'Asia',
    'Netherlands': 'Europe',
    'Italy': 'Europe',
    'Poland': 'Europe',
    'Somalia': 'Africa',
    'South Africa': 'Africa',
    'Russia': 'Europe',
    'Canada': 'North America',
    'Turkey': 'Asia',
    'Norway': 'Europe',
    'Indonesia': 'Asia',
    'Finland': 'Europe',
    'Denmark': 'Europe',
    'Cyprus': 'Asia',
    'Philippines': 'Asia',
    'Sweden': 'Europe',
    'Thailand': 'Asia',
    'Romania': 'Europe',
    'Estonia': 'Europe',
    'Kenya': 'Africa',
    'Vietnam': 'Asia',
    'Greece': 'Europe',
    'China': 'Asia',
    'Croatia': 'Europe',
    'Czech Republic': 'Europe',
    'Pakistan': 'Asia',
    'Slovakia': 'Europe',
    'Slovenia': 'Europe',
    'Ukraine': 'Europe',
    'Hungary': 'Europe',
    'Israel': 'Asia',
    'Albania': 'Europe',
    'Bulgaria': 'Europe',
    'Lithuania': 'Europe',
    'Latvia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Iran': 'Asia',
    'Taiwan': 'Asia',
    'North Macedonia': 'Europe',
    'India': 'Asia',
    'Bangladesh': 'Asia',
    'Nepal': 'Asia',
    'Malaysia': 'Asia',
    'Singapore': 'Asia',
    'Brunei': 'Asia',
    'Cambodia': 'Asia',
    'Laos': 'Asia',
    'Myanmar': 'Asia',
    'Timor-Leste': 'Asia',
    'Papua New Guinea': 'Oceania',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Fiji': 'Oceania',
    'Solomon Islands': 'Oceania',
    'Vanuatu': 'Oceania',
    'Samoa': 'Oceania',
    'Tonga': 'Oceania',
    'Kiribati': 'Oceania',
    'Tuvalu': 'Oceania',
    'Nauru': 'Oceania',
    'Palau': 'Oceania',
    'Marshall Islands': 'Oceania',
    'Micronesia': 'Oceania',
    'Guam': 'Oceania',
    'Northern Mariana Islands': 'Oceania',
    'Puerto Rico': 'North America',
    'Dominican Republic': 'North America',
    'Haiti': 'North America',
    'Jamaica': 'North America',
    'Cuba': 'North America',
    'Bahamas': 'North America',
    'Barbados': 'North America',
    'Trinidad and Tobago': 'North America',
    'Grenada': 'North America',
    'Saint Vincent and the Grenadines': 'North America',
    'Antigua and Barbuda': 'North America',
    'Saint Kitts and Nevis': 'North America',
    'Belize': 'North America',
    'Costa Rica': 'North America',
    'El Salvador': 'North America',
    'Guatemala': 'North America',
    'Honduras': 'North America',
    'Nicaragua': 'North America',
    'Panama': 'North America',
    'Argentina': 'South America',
    'Bolivia': 'South America',
    'Brazil': 'South America',
    'Chile': 'South America',
    'Colombia': 'South America',
    'Ecuador': 'South America',
    'Guyana': 'South America',
    'Paraguay': 'South America',
    'Peru': 'South America',
    'Suriname': 'South America',
    'Uruguay': 'South America',
    'Venezuela': 'South America',
}


def get_continent(country):
    """Return the continent name for *country*.

    Args:
        country: Country name spelled as in the lookup table
            (e.g. 'United States').

    Returns:
        The continent name, or 'Unknown' for any value that is not in the
        table (including None and NaN).
    """
    return _COUNTRY_TO_CONTINENT.get(country, 'Unknown')
|
251 |
+
|
252 |
+
# Approximate [latitude, longitude] of each country (simplified).
# Defined once, outside the loop, instead of being rebuilt per marker.
country_coordinates = {
    'United States': [37.0902, -95.7129],
    'Japan': [36.2048, 138.2529],
    'Unknown': [0, 0],
    'Spain': [40.4637, -3.7492],
    'Portugal': [39.3999, -8.2245],
    'France': [46.6034, 1.8883],
    'Germany': [51.1657, 10.4515],
    'United Kingdom': [55.3781, -3.4360],
    'South Korea': [35.9078, 127.7669],
    'Netherlands': [52.1326, 5.2913],
    'Italy': [41.8719, 12.5674],
    'Poland': [51.9194, 19.1451],
    'Somalia': [5.1521, 46.1996],
    'South Africa': [-30.5595, 22.9375],
    'Russia': [61.5240, 105.3188],
    'Canada': [56.1304, -106.3468],
    'Turkey': [38.9637, 35.2433],
    'Norway': [60.4720, 8.4689],
    'Indonesia': [-0.7893, 113.9213],
    'Finland': [61.9241, 25.7482],
    'Denmark': [56.2639, 9.5018],
    'Cyprus': [35.1264, 33.4299],
    'Philippines': [12.8797, 121.7740],
    'Sweden': [60.1282, 18.6435],
    'Thailand': [15.8700, 100.9925],
    'Romania': [45.9432, 24.9668],
    'Estonia': [58.5953, 25.0136],
    'Kenya': [0.0236, 37.9062],
    'Vietnam': [14.0583, 108.2772],
    'Greece': [39.0742, 21.8243],
    'China': [35.8617, 104.1954],
    'Croatia': [45.1000, 15.2000],
    'Czech Republic': [49.8175, 15.4730],
    'Pakistan': [30.3753, 69.3451],
    'Slovakia': [48.6690, 19.6990],
    'Slovenia': [46.1512, 14.9955],
    'Ukraine': [48.3794, 31.1656],
    'Hungary': [47.1625, 19.5033],
    'Israel': [31.0461, 34.8516],
    'Albania': [41.1533, 20.1683],
    'Bulgaria': [42.7339, 25.4858],
    'Lithuania': [55.1694, 23.8813],
    'Latvia': [56.8796, 24.6032],
    'Saudi Arabia': [23.8859, 45.0792],
    'Iran': [32.4279, 53.6880],
    'Taiwan': [23.6978, 120.9605],
    'North Macedonia': [41.6086, 21.7453],
    'India': [20.5937, 78.9629],
    'Bangladesh': [23.6850, 90.3563],
    'Nepal': [28.3949, 84.1240],
    'Malaysia': [4.2105, 101.9758],
    'Singapore': [1.3521, 103.8198],
    'Brunei': [4.5353, 114.7277],
    'Cambodia': [12.5657, 104.9910],
    'Laos': [19.8563, 102.4955],
    'Myanmar': [21.9162, 95.9560],
    'Timor-Leste': [-8.8742, 125.7275],
    'Papua New Guinea': [-6.3149, 143.9555],
    'Australia': [-25.2744, 133.7751],
    'New Zealand': [-40.9006, 174.8860],
    'Fiji': [-17.7134, 178.0650],
    'Solomon Islands': [-9.6457, 160.1562],
    'Vanuatu': [-15.3767, 166.9592],
    'Samoa': [-13.7590, -172.1046],
    'Tonga': [-21.1790, -175.1982],
    'Kiribati': [1.4518, 172.9717],
    'Tuvalu': [-7.1095, 177.6493],
    'Nauru': [-0.5228, 166.9315],
    'Palau': [7.5150, 134.5825],
    'Marshall Islands': [7.1315, 171.1845],
    'Micronesia': [7.4256, 150.5508],
    'Guam': [13.4443, 144.7937],
    'Northern Mariana Islands': [15.0979, 145.6739],
    'Puerto Rico': [18.2208, -66.5901],
    'Dominican Republic': [18.7357, -70.1627],
    'Haiti': [18.9712, -72.2852],
    'Jamaica': [18.1096, -77.2975],
    'Cuba': [21.5218, -77.7812],
    'Bahamas': [25.0343, -77.3963],
    'Barbados': [13.1939, -59.5432],
    'Trinidad and Tobago': [10.6918, -61.2225],
    'Grenada': [12.2627, -61.6041],
    'Saint Vincent and the Grenadines': [12.9843, -61.2872],
    'Antigua and Barbuda': [17.0608, -61.7964],
    'Saint Kitts and Nevis': [17.3578, -62.7830],
    'Belize': [17.1899, -88.4976],
    'Costa Rica': [9.7489, -83.7534],
    'El Salvador': [13.7942, -88.8965],
    'Guatemala': [15.7835, -90.2308],
    'Honduras': [15.1997, -86.2419],
    'Nicaragua': [12.8654, -85.2072],
    'Panama': [8.5380, -80.7821],
    'Argentina': [-38.4161, -63.6167],
    'Bolivia': [-16.2902, -63.5887],
    'Brazil': [-14.2350, -51.9253],
    'Chile': [-35.6751, -71.5430],
    'Colombia': [4.5709, -74.2973],
    'Ecuador': [-1.8312, -78.1834],
    'Guyana': [4.8604, -58.9302],
    'Paraguay': [-23.4425, -58.4438],
    'Peru': [-9.1900, -75.0152],
    'Suriname': [3.9193, -56.0278],
    'Uruguay': [-32.5228, -55.7658],
    'Venezuela': [6.4238, -66.5897],
}

# Several languages can map to the same country, so total the post counts per
# country first. Otherwise those countries would get a pile of overlapping
# markers, each showing only one language's share. Rows whose country is
# missing (NaN) are grouped under 'Unknown' rather than silently dropped.
country_totals = (
    language_counts.assign(country=language_counts['country'].fillna('Unknown'))
    .groupby('country', sort=False)['count']
    .sum()
)

# Add one marker per country
for country, count in country_totals.items():
    continent = get_continent(country)
    color = continent_colors.get(continent, 'gray')
    icon = continent_icons.get(continent, 'fa-globe')

    # Countries without known coordinates fall back to [0, 0].
    coordinates = country_coordinates.get(country, [0, 0])

    # Create a marker with pop-up information
    folium.Marker(
        location=coordinates,
        popup=f"Country: {country}<br>Count: {count}",
        icon=folium.Icon(color=color, icon=icon, prefix='fa'),
    ).add_to(world_map)
|
376 |
+
|
377 |
+
# Save the map as an HTML file. One write is enough: the map is not modified
# after this point, so a second identical save would only rewrite the file.
world_map.save('world_map.html')
|
383 |
+
|