TroglodyteDerivations committed on
Commit
f624440
·
verified ·
1 Parent(s): f2b83f7

Upload bluesky_languages.py

Browse files
Files changed (1) hide show
  1. bluesky_languages.py +383 -0
bluesky_languages.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # bluesky_languages.py
2
+
3
+ from datasets import load_dataset
4
+ from langdetect import detect
5
+ import pandas as pd
6
+ import plotly.express as px
7
+ import folium
8
+
9
+
10
# Fetch the Bluesky post corpus by its dataset repository id.
DATASET_REPO = "alpindale/two-million-bluesky-posts"
dataset = load_dataset(DATASET_REPO)
12
+
13
+
14
+ # Function to detect language
15
def detect_language(text):
    """Return the language code that ``detect`` assigns to ``text``.

    Args:
        text: The post body to classify.

    Returns:
        The code returned by ``detect``, or ``'unknown'`` when ``text`` is
        not a non-blank string or when detection raises.
    """
    # Blank or non-string input cannot be classified; answer directly
    # instead of paying for an exception raised inside the detector.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Best effort: one undetectable post must not abort the whole run.
        # Catching Exception (not a bare ``except``) still lets
        # KeyboardInterrupt and SystemExit stop a long-running job.
        return 'unknown'
20
+
21
# Run the detector over every post in the 'train' split. Each call returns
# a {'language': ...} dict; that key is read back as a column just below.
train_split = dataset['train']
dataset['train'] = train_split.map(
    lambda post: {'language': detect_language(post['text'])}
)

# Collect the detected codes into a pandas Series.
languages = pd.Series(dataset['train']['language'])

# Tally posts per language as a two-column frame: 'language' and 'count'.
language_counts = (
    languages.value_counts()
    .rename_axis('language')
    .reset_index(name='count')
)
33
+
34
# Simplified lookup from a detected language code to ONE representative
# country. This is an approximation: most languages are spoken in several
# countries, and the code identifies a language, not where a post was written.
# Keys are language codes (ISO 639-1 style), not country codes -- e.g. 'ca'
# is Catalan (mapped to Spain) and 'cy' is Welsh (mapped to the United
# Kingdom), not Canada / Cyprus.
language_to_country = {
    'en': 'United States',
    'ja': 'Japan',
    'unknown': 'Unknown',
    'es': 'Spain',
    'pt': 'Portugal',
    'fr': 'France',
    'de': 'Germany',
    'ko': 'South Korea',
    'nl': 'Netherlands',
    'it': 'Italy',
    'pl': 'Poland',
    'so': 'Somalia',
    'af': 'South Africa',
    'ru': 'Russia',
    'ca': 'Spain',
    'tr': 'Turkey',
    'no': 'Norway',
    'id': 'Indonesia',
    'fi': 'Finland',
    'da': 'Denmark',
    'cy': 'United Kingdom',
    'tl': 'Philippines',
    'sv': 'Sweden',
    'th': 'Thailand',
    'ro': 'Romania',
    'et': 'Estonia',
    'sw': 'Kenya',
    'vi': 'Vietnam',
    'el': 'Greece',
    'zh-cn': 'China',
    'hr': 'Croatia',
    'cs': 'Czech Republic',
    'ur': 'Pakistan',
    'sk': 'Slovakia',
    'sl': 'Slovenia',
    'uk': 'Ukraine',
    'hu': 'Hungary',
    'he': 'Israel',
    'sq': 'Albania',
    'bg': 'Bulgaria',
    'lt': 'Lithuania',
    'lv': 'Latvia',
    'ar': 'Saudi Arabia',
    'fa': 'Iran',
    'zh-tw': 'Taiwan',
    'mk': 'North Macedonia',
    'hi': 'India',
    'bn': 'Bangladesh',
    'ne': 'Nepal',
    'ml': 'India',
    'ta': 'India',
    'kn': 'India',
    'pa': 'India',
    'mr': 'India',
    'te': 'India',
    'gu': 'India',
}
94
+
95
# Attach a country to every language row; codes missing from the lookup
# become 'Unknown'. The column is assigned exactly once: re-assigning it
# from a plain .map() without .fillna() would put NaN back for unmapped codes.
language_counts['country'] = (
    language_counts['language'].map(language_to_country).fillna('Unknown')
)
100
+
101
+
102
# Build both pie charts from the same counts table: one sliced by language
# code, the other by mapped country.
fig_languages = px.pie(
    language_counts,
    names='language',
    values='count',
    title='Language Distribution of Posts',
)
fig_countries = px.pie(
    language_counts,
    names='country',
    values='count',
    title='Country Distribution of Posts',
)

# Display each chart, languages first.
for chart in (fig_languages, fig_countries):
    chart.show()

# Write each chart out as an HTML file.
for chart, html_path in (
    (fig_languages, 'fig_languages.html'),
    (fig_countries, 'fig_countries.html'),
):
    chart.write_html(html_path)

# Dump the underlying table for inspection.
print(language_counts)
115
+
116
+ import folium
117
+ import pandas as pd
118
+
119
# Base map that the markers are added to: start location [20, 0] at
# zoom level 2, i.e. a whole-world view.
world_map = folium.Map(
    zoom_start=2,
    location=[20, 0],
)
121
+
122
# Marker colour used for each continent.
continent_colors = dict([
    ('Africa', 'red'),
    ('Asia', 'green'),
    ('Europe', 'blue'),
    ('North America', 'purple'),
    ('Oceania', 'orange'),
    ('South America', 'black'),
])

# Marker icon used for each continent; every continent currently shares
# the same globe icon.
continent_icons = {continent: 'fa-globe' for continent in continent_colors}
141
+
142
# Simplified country -> continent lookup. Built once at import time rather
# than inside get_continent(), which is called once per table row.
_COUNTRY_TO_CONTINENT = {
    'United States': 'North America',
    'Japan': 'Asia',
    'Unknown': 'Unknown',
    'Spain': 'Europe',
    'Portugal': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'South Korea': 'Asia',
    'Netherlands': 'Europe',
    'Italy': 'Europe',
    'Poland': 'Europe',
    'Somalia': 'Africa',
    'South Africa': 'Africa',
    'Russia': 'Europe',
    'Canada': 'North America',
    'Turkey': 'Asia',
    'Norway': 'Europe',
    'Indonesia': 'Asia',
    'Finland': 'Europe',
    'Denmark': 'Europe',
    'Cyprus': 'Asia',
    'Philippines': 'Asia',
    'Sweden': 'Europe',
    'Thailand': 'Asia',
    'Romania': 'Europe',
    'Estonia': 'Europe',
    'Kenya': 'Africa',
    'Vietnam': 'Asia',
    'Greece': 'Europe',
    'China': 'Asia',
    'Croatia': 'Europe',
    'Czech Republic': 'Europe',
    'Pakistan': 'Asia',
    'Slovakia': 'Europe',
    'Slovenia': 'Europe',
    'Ukraine': 'Europe',
    'Hungary': 'Europe',
    'Israel': 'Asia',
    'Albania': 'Europe',
    'Bulgaria': 'Europe',
    'Lithuania': 'Europe',
    'Latvia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Iran': 'Asia',
    'Taiwan': 'Asia',
    'North Macedonia': 'Europe',
    'India': 'Asia',
    'Bangladesh': 'Asia',
    'Nepal': 'Asia',
    'Malaysia': 'Asia',
    'Singapore': 'Asia',
    'Brunei': 'Asia',
    'Cambodia': 'Asia',
    'Laos': 'Asia',
    'Myanmar': 'Asia',
    'Timor-Leste': 'Asia',
    'Papua New Guinea': 'Oceania',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Fiji': 'Oceania',
    'Solomon Islands': 'Oceania',
    'Vanuatu': 'Oceania',
    'Samoa': 'Oceania',
    'Tonga': 'Oceania',
    'Kiribati': 'Oceania',
    'Tuvalu': 'Oceania',
    'Nauru': 'Oceania',
    'Palau': 'Oceania',
    'Marshall Islands': 'Oceania',
    'Micronesia': 'Oceania',
    'Guam': 'Oceania',
    'Northern Mariana Islands': 'Oceania',
    'Puerto Rico': 'North America',
    'Dominican Republic': 'North America',
    'Haiti': 'North America',
    'Jamaica': 'North America',
    'Cuba': 'North America',
    'Bahamas': 'North America',
    'Barbados': 'North America',
    'Trinidad and Tobago': 'North America',
    'Grenada': 'North America',
    'Saint Vincent and the Grenadines': 'North America',
    'Antigua and Barbuda': 'North America',
    'Saint Kitts and Nevis': 'North America',
    'Belize': 'North America',
    'Costa Rica': 'North America',
    'El Salvador': 'North America',
    'Guatemala': 'North America',
    'Honduras': 'North America',
    'Nicaragua': 'North America',
    'Panama': 'North America',
    'Argentina': 'South America',
    'Bolivia': 'South America',
    'Brazil': 'South America',
    'Chile': 'South America',
    'Colombia': 'South America',
    'Ecuador': 'South America',
    'Guyana': 'South America',
    'Paraguay': 'South America',
    'Peru': 'South America',
    'Suriname': 'South America',
    'Uruguay': 'South America',
    'Venezuela': 'South America',
    'United Kingdom': 'Europe',
}


def get_continent(country):
    """Return the continent name for ``country``.

    Args:
        country: Country name as used in the lookup table above.

    Returns:
        The continent name, or ``'Unknown'`` for any value that is not in
        the table (including the literal ``'Unknown'`` and NaN).
    """
    return _COUNTRY_TO_CONTINENT.get(country, 'Unknown')
251
+
252
# Approximate centre point ([latitude, longitude]) used to place each
# country's marker. Defined once, before the loop, instead of being rebuilt
# on every iteration. 'Unknown' and any country missing from this table are
# placed at [0, 0].
country_coordinates = {
    'United States': [37.0902, -95.7129],
    'Japan': [36.2048, 138.2529],
    'Unknown': [0, 0],
    'Spain': [40.4637, -3.7492],
    'Portugal': [39.3999, -8.2245],
    'France': [46.6034, 1.8883],
    'Germany': [51.1657, 10.4515],
    'South Korea': [35.9078, 127.7669],
    'Netherlands': [52.1326, 5.2913],
    'Italy': [41.8719, 12.5674],
    'Poland': [51.9194, 19.1451],
    'Somalia': [5.1521, 46.1996],
    'South Africa': [-30.5595, 22.9375],
    'Russia': [61.5240, 105.3188],
    'Canada': [56.1304, -106.3468],
    'Turkey': [38.9637, 35.2433],
    'Norway': [60.4720, 8.4689],
    'Indonesia': [-0.7893, 113.9213],
    'Finland': [61.9241, 25.7482],
    'Denmark': [56.2639, 9.5018],
    'Cyprus': [35.1264, 33.4299],
    'Philippines': [12.8797, 121.7740],
    'Sweden': [60.1282, 18.6435],
    'Thailand': [15.8700, 100.9925],
    'Romania': [45.9432, 24.9668],
    'Estonia': [58.5953, 25.0136],
    'Kenya': [0.0236, 37.9062],
    'Vietnam': [14.0583, 108.2772],
    'Greece': [39.0742, 21.8243],
    'China': [35.8617, 104.1954],
    'Croatia': [45.1000, 15.2000],
    'Czech Republic': [49.8175, 15.4730],
    'Pakistan': [30.3753, 69.3451],
    'Slovakia': [48.6690, 19.6990],
    'Slovenia': [46.1512, 14.9955],
    'Ukraine': [48.3794, 31.1656],
    'Hungary': [47.1625, 19.5033],
    'Israel': [31.0461, 34.8516],
    'Albania': [41.1533, 20.1683],
    'Bulgaria': [42.7339, 25.4858],
    'Lithuania': [55.1694, 23.8813],
    'Latvia': [56.8796, 24.6032],
    'Saudi Arabia': [23.8859, 45.0792],
    'Iran': [32.4279, 53.6880],
    'Taiwan': [23.6978, 120.9605],
    'North Macedonia': [41.6086, 21.7453],
    'India': [20.5937, 78.9629],
    'Bangladesh': [23.6850, 90.3563],
    'Nepal': [28.3949, 84.1240],
    'Malaysia': [4.2105, 101.9758],
    'Singapore': [1.3521, 103.8198],
    'Brunei': [4.5353, 114.7277],
    'Cambodia': [12.5657, 104.9910],
    'Laos': [19.8563, 102.4955],
    'Myanmar': [21.9162, 95.9560],
    'Timor-Leste': [-8.8742, 125.7275],
    'Papua New Guinea': [-6.3149, 143.9555],
    'Australia': [-25.2744, 133.7751],
    'New Zealand': [-40.9006, 174.8860],
    'Fiji': [-17.7134, 178.0650],
    'Solomon Islands': [-9.6457, 160.1562],
    'Vanuatu': [-15.3767, 166.9592],
    'Samoa': [-13.7590, -172.1046],
    'Tonga': [-21.1790, -175.1982],
    'Kiribati': [1.4518, 172.9717],
    'Tuvalu': [-7.1095, 177.6493],
    'Nauru': [-0.5228, 166.9315],
    'Palau': [7.5150, 134.5825],
    'Marshall Islands': [7.1315, 171.1845],
    'Micronesia': [7.4256, 150.5508],
    'Guam': [13.4443, 144.7937],
    'Northern Mariana Islands': [15.0979, 145.6739],
    'Puerto Rico': [18.2208, -66.5901],
    'Dominican Republic': [18.7357, -70.1627],
    'Haiti': [18.9712, -72.2852],
    'Jamaica': [18.1096, -77.2975],
    'Cuba': [21.5218, -77.7812],
    'Bahamas': [25.0343, -77.3963],
    'Barbados': [13.1939, -59.5432],
    'Trinidad and Tobago': [10.6918, -61.2225],
    'Grenada': [12.2627, -61.6041],
    'Saint Vincent and the Grenadines': [12.9843, -61.2872],
    'Antigua and Barbuda': [17.0608, -61.7964],
    'Saint Kitts and Nevis': [17.3578, -62.7830],
    'Belize': [17.1899, -88.4976],
    'Costa Rica': [9.7489, -83.7534],
    'El Salvador': [13.7942, -88.8965],
    'Guatemala': [15.7835, -90.2308],
    'Honduras': [15.1997, -86.2419],
    'Nicaragua': [12.8654, -85.2072],
    'Panama': [8.5380, -80.7821],
    'Argentina': [-38.4161, -63.6167],
    'Bolivia': [-16.2902, -63.5887],
    'Brazil': [-14.2350, -51.9253],
    'Chile': [-35.6751, -71.5430],
    'Colombia': [4.5709, -74.2973],
    'Ecuador': [-1.8312, -78.1834],
    'Guyana': [4.8604, -58.9302],
    'Paraguay': [-23.4425, -58.4438],
    'Peru': [-9.1900, -75.0152],
    'Suriname': [3.9193, -56.0278],
    'Uruguay': [-32.5228, -55.7658],
    'Venezuela': [6.4238, -66.5897],
    'United Kingdom': [55.3781, -3.4360],
}

# Several language codes can map to the same country (e.g. 'hi' and 'ta'
# both map to India). One marker per table row would stack markers on the
# exact same point and hide all but the top one, so counts are summed per
# country first. Missing countries are folded into 'Unknown' so they are not
# dropped by the grouping; sort=False keeps the table's original order.
country_totals = (
    language_counts
    .assign(country=language_counts['country'].fillna('Unknown'))
    .groupby('country', sort=False)['count']
    .sum()
)

# Add one marker per country, coloured by continent.
for country, count in country_totals.items():
    continent = get_continent(country)
    # Continents without a configured style (e.g. 'Unknown') fall back to a
    # gray globe.
    color = continent_colors.get(continent, 'gray')
    icon = continent_icons.get(continent, 'fa-globe')

    coordinates = country_coordinates.get(country, [0, 0])

    # The popup shows the country name and its total post count.
    folium.Marker(
        location=coordinates,
        popup=f"Country: {country}<br>Count: {count}",
        icon=folium.Icon(color=color, icon=icon, prefix='fa'),
    ).add_to(world_map)
376
+
377
# Write the finished map to an HTML file. A single save is sufficient;
# saving the unchanged map a second time only rewrites the same file.
world_map.save('world_map.html')
383
+