Spaces:
Runtime error
Runtime error
HugoLaurencon
committed on
Commit
·
f217a73
1
Parent(s):
bfbcd60
rename badwords to flagged words + new flagged words list of 68 words
Browse files- app.py +17 -16
- en_examples_with_stats.json +2 -2
- explanation_filtering_pipeline.pdf +0 -0
- filtering.py +33 -36
- badwords.py → flagged_words.py +29 -444
- languages_id.py +25 -25
- parameters_filtering.py +52 -52
app.py
CHANGED
@@ -7,6 +7,7 @@ import os
|
|
7 |
import base64
|
8 |
import json
|
9 |
import pandas as pd
|
|
|
10 |
pd.options.mode.chained_assignment = None
|
11 |
|
12 |
import numpy as np
|
@@ -40,7 +41,7 @@ class Visualization:
|
|
40 |
self.lang_dataset_id = lang_dataset_id
|
41 |
self.param = LoadParameters.load_parameters(lang_dataset_id)
|
42 |
self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
|
43 |
-
self.
|
44 |
self.model_lang_id = LoadParameters.load_model_lang_id(
|
45 |
lang_dataset_id, path_fasttext_model
|
46 |
)
|
@@ -222,16 +223,16 @@ class Visualization:
|
|
222 |
print_discared_by_cond(cond)
|
223 |
conds["stopwords_ratio"] = [cond]
|
224 |
|
225 |
-
if "
|
226 |
-
cutoff_def = "If the
|
227 |
-
|
228 |
cutoff_def, 0.0, 1.0, 1.0, step=0.01
|
229 |
)
|
230 |
-
new_key = ("
|
231 |
keys.append(new_key)
|
232 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
233 |
print_discared_by_cond(cond)
|
234 |
-
conds["
|
235 |
|
236 |
if "lang_id_score" in columns:
|
237 |
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
|
@@ -316,11 +317,11 @@ class Visualization:
|
|
316 |
"Discarded documents for the filter on the stop words ratio",
|
317 |
)
|
318 |
|
319 |
-
if "
|
320 |
-
cond_filter = np.invert(np.all(conds["
|
321 |
display_dataset(
|
322 |
cond_filter,
|
323 |
-
"Discarded documents for the filter on the
|
324 |
)
|
325 |
|
326 |
if "lang_id_score" in columns:
|
@@ -504,19 +505,19 @@ class Visualization:
|
|
504 |
if is_doc_discarded(key, stopwords_ratio):
|
505 |
is_discarded = True
|
506 |
|
507 |
-
elif key[0] == "
|
508 |
-
|
509 |
personal_doc,
|
510 |
self.sentencepiece_model_tok,
|
511 |
self.param["strip_characters"],
|
512 |
self.param["cond_words_augmentation"],
|
513 |
self.param["words_augmentation_group_sizes"],
|
514 |
self.param["words_augmentation_join_char"],
|
515 |
-
self.
|
516 |
)
|
517 |
-
|
518 |
-
st.markdown(f"Flagged words ratio: {
|
519 |
-
if is_doc_discarded(key,
|
520 |
is_discarded = True
|
521 |
|
522 |
elif key[0] == "lang_id_score":
|
@@ -530,7 +531,7 @@ class Visualization:
|
|
530 |
st.markdown(
|
531 |
f"Language identification confidence score: {lang_id_score}"
|
532 |
)
|
533 |
-
if is_doc_discarded(key,
|
534 |
self.lang_dataset_id != lang_pred_dataset_id
|
535 |
):
|
536 |
is_discarded = True
|
|
|
7 |
import base64
|
8 |
import json
|
9 |
import pandas as pd
|
10 |
+
|
11 |
pd.options.mode.chained_assignment = None
|
12 |
|
13 |
import numpy as np
|
|
|
41 |
self.lang_dataset_id = lang_dataset_id
|
42 |
self.param = LoadParameters.load_parameters(lang_dataset_id)
|
43 |
self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
|
44 |
+
self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
|
45 |
self.model_lang_id = LoadParameters.load_model_lang_id(
|
46 |
lang_dataset_id, path_fasttext_model
|
47 |
)
|
|
|
223 |
print_discared_by_cond(cond)
|
224 |
conds["stopwords_ratio"] = [cond]
|
225 |
|
226 |
+
if "flagged_words_ratio" in columns:
|
227 |
+
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
|
228 |
+
cutoff_flagged_words_ratio = st.sidebar.slider(
|
229 |
cutoff_def, 0.0, 1.0, 1.0, step=0.01
|
230 |
)
|
231 |
+
new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
|
232 |
keys.append(new_key)
|
233 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
234 |
print_discared_by_cond(cond)
|
235 |
+
conds["flagged_words_ratio"] = [cond]
|
236 |
|
237 |
if "lang_id_score" in columns:
|
238 |
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
|
|
|
317 |
"Discarded documents for the filter on the stop words ratio",
|
318 |
)
|
319 |
|
320 |
+
if "flagged_words_ratio" in columns:
|
321 |
+
cond_filter = np.invert(np.all(conds["flagged_words_ratio"], axis=0))
|
322 |
display_dataset(
|
323 |
cond_filter,
|
324 |
+
"Discarded documents for the filter on the flagged words ratio",
|
325 |
)
|
326 |
|
327 |
if "lang_id_score" in columns:
|
|
|
505 |
if is_doc_discarded(key, stopwords_ratio):
|
506 |
is_discarded = True
|
507 |
|
508 |
+
elif key[0] == "flagged_words_ratio":
|
509 |
+
flagged_words_ratio = Filtering.compute_flagged_words_ratio(
|
510 |
personal_doc,
|
511 |
self.sentencepiece_model_tok,
|
512 |
self.param["strip_characters"],
|
513 |
self.param["cond_words_augmentation"],
|
514 |
self.param["words_augmentation_group_sizes"],
|
515 |
self.param["words_augmentation_join_char"],
|
516 |
+
self.flagged_words,
|
517 |
)
|
518 |
+
flagged_words_ratio = round(flagged_words_ratio, 3)
|
519 |
+
st.markdown(f"Flagged words ratio: {flagged_words_ratio}")
|
520 |
+
if is_doc_discarded(key, flagged_words_ratio):
|
521 |
is_discarded = True
|
522 |
|
523 |
elif key[0] == "lang_id_score":
|
|
|
531 |
st.markdown(
|
532 |
f"Language identification confidence score: {lang_id_score}"
|
533 |
)
|
534 |
+
if is_doc_discarded(key, flagged_words_ratio) or (
|
535 |
self.lang_dataset_id != lang_pred_dataset_id
|
536 |
):
|
537 |
is_discarded = True
|
en_examples_with_stats.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ffbb8afeba42822e4b10341112999321e0e14a19a5eeebc342dc68a9f65d3c7f
|
3 |
+
size 237426014
|
explanation_filtering_pipeline.pdf
CHANGED
Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ
|
|
filtering.py
CHANGED
@@ -13,7 +13,7 @@ from languages_id import langs_id
|
|
13 |
from parameters_filtering import parameters_filtering
|
14 |
from normalization import normalization
|
15 |
from stopwords import stopwords
|
16 |
-
from
|
17 |
|
18 |
|
19 |
class LoadParameters:
|
@@ -37,15 +37,15 @@ class LoadParameters:
|
|
37 |
return stopwords_lang
|
38 |
|
39 |
@staticmethod
|
40 |
-
def
|
41 |
-
|
42 |
-
langs_id["dataset_id"] == lang_dataset_id, "
|
43 |
].iloc[0]
|
44 |
-
if
|
45 |
-
|
46 |
else:
|
47 |
-
|
48 |
-
return
|
49 |
|
50 |
@staticmethod
|
51 |
def load_model_lang_id(lang_dataset_id, path_fasttext_model):
|
@@ -533,14 +533,14 @@ class Filtering:
|
|
533 |
return cond
|
534 |
|
535 |
@staticmethod
|
536 |
-
def
|
537 |
document,
|
538 |
sentencepiece_model_tok,
|
539 |
strip_characters,
|
540 |
cond_words_augmentation,
|
541 |
words_augmentation_group_sizes,
|
542 |
words_augmentation_join_char,
|
543 |
-
|
544 |
):
|
545 |
words = ModifyingDocuments.get_words_from_document(
|
546 |
document,
|
@@ -559,39 +559,36 @@ class Filtering:
|
|
559 |
for group_size in words_augmentation_group_sizes
|
560 |
]
|
561 |
augmentation = [word for augm in augmentation for word in augm]
|
562 |
-
|
563 |
-
[word for word in words + augmentation if word in
|
564 |
) / len(words)
|
565 |
-
if
|
566 |
-
|
567 |
-
|
568 |
-
if word in badwords:
|
569 |
-
print(word)
|
570 |
-
return badwords_ratio
|
571 |
|
572 |
@staticmethod
|
573 |
-
def
|
574 |
document,
|
575 |
sentencepiece_model_tok,
|
576 |
strip_characters,
|
577 |
cond_words_augmentation,
|
578 |
words_augmentation_group_sizes,
|
579 |
words_augmentation_join_char,
|
580 |
-
|
581 |
-
|
582 |
):
|
583 |
cond = True
|
584 |
-
if
|
585 |
-
|
586 |
document,
|
587 |
sentencepiece_model_tok,
|
588 |
strip_characters,
|
589 |
cond_words_augmentation,
|
590 |
words_augmentation_group_sizes,
|
591 |
words_augmentation_join_char,
|
592 |
-
|
593 |
)
|
594 |
-
cond =
|
595 |
return cond
|
596 |
|
597 |
@staticmethod
|
@@ -685,9 +682,9 @@ class Filtering:
|
|
685 |
cond_check_stopwords,
|
686 |
stopwords,
|
687 |
stopwords_min_cutoff,
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
cond_check_lang_id,
|
692 |
lang_dataset_id,
|
693 |
model_lang_id,
|
@@ -732,16 +729,16 @@ class Filtering:
|
|
732 |
stopwords_min_cutoff,
|
733 |
):
|
734 |
return False
|
735 |
-
if
|
736 |
-
if not Filtering.
|
737 |
document,
|
738 |
sentencepiece_model_tok,
|
739 |
strip_characters,
|
740 |
cond_words_augmentation,
|
741 |
words_augmentation_group_sizes,
|
742 |
words_augmentation_join_char,
|
743 |
-
|
744 |
-
|
745 |
):
|
746 |
return False
|
747 |
if cond_check_lang_id:
|
@@ -778,7 +775,7 @@ class FunctionDatasetFiltering:
|
|
778 |
|
779 |
self.param = LoadParameters.load_parameters(lang_dataset_id)
|
780 |
self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
|
781 |
-
self.
|
782 |
self.model_lang_id = LoadParameters.load_model_lang_id(
|
783 |
lang_dataset_id, path_fasttext_model
|
784 |
)
|
@@ -812,9 +809,9 @@ class FunctionDatasetFiltering:
|
|
812 |
cond_check_stopwords=self.param["cond_check_stopwords"],
|
813 |
stopwords=self.stopwords,
|
814 |
stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
cond_check_lang_id=self.param["cond_check_lang_id"],
|
819 |
lang_dataset_id=self.lang_dataset_id,
|
820 |
model_lang_id=self.model_lang_id,
|
|
|
13 |
from parameters_filtering import parameters_filtering
|
14 |
from normalization import normalization
|
15 |
from stopwords import stopwords
|
16 |
+
from flagged_words import flagged_words
|
17 |
|
18 |
|
19 |
class LoadParameters:
|
|
|
37 |
return stopwords_lang
|
38 |
|
39 |
@staticmethod
|
40 |
+
def load_flagged_words(lang_dataset_id):
|
41 |
+
flagged_words_lang_id = langs_id.loc[
|
42 |
+
langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id"
|
43 |
].iloc[0]
|
44 |
+
if flagged_words_lang_id:
|
45 |
+
flagged_words_lang = set(flagged_words[flagged_words_lang_id])
|
46 |
else:
|
47 |
+
flagged_words_lang = None
|
48 |
+
return flagged_words_lang
|
49 |
|
50 |
@staticmethod
|
51 |
def load_model_lang_id(lang_dataset_id, path_fasttext_model):
|
|
|
533 |
return cond
|
534 |
|
535 |
@staticmethod
|
536 |
+
def compute_flagged_words_ratio(
|
537 |
document,
|
538 |
sentencepiece_model_tok,
|
539 |
strip_characters,
|
540 |
cond_words_augmentation,
|
541 |
words_augmentation_group_sizes,
|
542 |
words_augmentation_join_char,
|
543 |
+
flagged_words,
|
544 |
):
|
545 |
words = ModifyingDocuments.get_words_from_document(
|
546 |
document,
|
|
|
559 |
for group_size in words_augmentation_group_sizes
|
560 |
]
|
561 |
augmentation = [word for augm in augmentation for word in augm]
|
562 |
+
flagged_words_ratio = len(
|
563 |
+
[word for word in words + augmentation if word in flagged_words]
|
564 |
) / len(words)
|
565 |
+
if flagged_words_ratio > 1.0:
|
566 |
+
flagged_words_ratio = 1.0
|
567 |
+
return flagged_words_ratio
|
|
|
|
|
|
|
568 |
|
569 |
@staticmethod
|
570 |
+
def check_flagged_words(
|
571 |
document,
|
572 |
sentencepiece_model_tok,
|
573 |
strip_characters,
|
574 |
cond_words_augmentation,
|
575 |
words_augmentation_group_sizes,
|
576 |
words_augmentation_join_char,
|
577 |
+
flagged_words,
|
578 |
+
flagged_words_max_cutoff,
|
579 |
):
|
580 |
cond = True
|
581 |
+
if flagged_words:
|
582 |
+
flagged_words_ratio = Filtering.compute_flagged_words_ratio(
|
583 |
document,
|
584 |
sentencepiece_model_tok,
|
585 |
strip_characters,
|
586 |
cond_words_augmentation,
|
587 |
words_augmentation_group_sizes,
|
588 |
words_augmentation_join_char,
|
589 |
+
flagged_words,
|
590 |
)
|
591 |
+
cond = flagged_words_ratio <= flagged_words_max_cutoff
|
592 |
return cond
|
593 |
|
594 |
@staticmethod
|
|
|
682 |
cond_check_stopwords,
|
683 |
stopwords,
|
684 |
stopwords_min_cutoff,
|
685 |
+
cond_check_flagged_words,
|
686 |
+
flagged_words,
|
687 |
+
flagged_words_max_cutoff,
|
688 |
cond_check_lang_id,
|
689 |
lang_dataset_id,
|
690 |
model_lang_id,
|
|
|
729 |
stopwords_min_cutoff,
|
730 |
):
|
731 |
return False
|
732 |
+
if cond_check_flagged_words:
|
733 |
+
if not Filtering.check_flagged_words(
|
734 |
document,
|
735 |
sentencepiece_model_tok,
|
736 |
strip_characters,
|
737 |
cond_words_augmentation,
|
738 |
words_augmentation_group_sizes,
|
739 |
words_augmentation_join_char,
|
740 |
+
flagged_words,
|
741 |
+
flagged_words_max_cutoff,
|
742 |
):
|
743 |
return False
|
744 |
if cond_check_lang_id:
|
|
|
775 |
|
776 |
self.param = LoadParameters.load_parameters(lang_dataset_id)
|
777 |
self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
|
778 |
+
self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
|
779 |
self.model_lang_id = LoadParameters.load_model_lang_id(
|
780 |
lang_dataset_id, path_fasttext_model
|
781 |
)
|
|
|
809 |
cond_check_stopwords=self.param["cond_check_stopwords"],
|
810 |
stopwords=self.stopwords,
|
811 |
stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
|
812 |
+
cond_check_flagged_words=self.param["cond_check_flagged_words"],
|
813 |
+
flagged_words=self.flagged_words,
|
814 |
+
flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"],
|
815 |
cond_check_lang_id=self.param["cond_check_lang_id"],
|
816 |
lang_dataset_id=self.lang_dataset_id,
|
817 |
model_lang_id=self.model_lang_id,
|
badwords.py → flagged_words.py
RENAMED
@@ -6,89 +6,21 @@
|
|
6 |
# https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
|
7 |
|
8 |
|
9 |
-
|
10 |
-
"abuse",
|
11 |
"anal",
|
12 |
-
"anilingus",
|
13 |
-
"anus",
|
14 |
-
"aroused",
|
15 |
-
"arse",
|
16 |
-
"arsehole",
|
17 |
-
"ass",
|
18 |
-
"asses",
|
19 |
-
"assfuck",
|
20 |
-
"asshat",
|
21 |
-
"asshole",
|
22 |
-
"assholes",
|
23 |
-
"autoerotic",
|
24 |
-
"bangbros",
|
25 |
-
"banging",
|
26 |
"bareback",
|
27 |
-
"bastard",
|
28 |
-
"bastards",
|
29 |
-
"bazongas",
|
30 |
"bbw",
|
31 |
"bdsm",
|
32 |
-
"biatch",
|
33 |
-
"bicurious",
|
34 |
-
"bigass",
|
35 |
-
"bigtits",
|
36 |
-
"bimbo",
|
37 |
-
"bimbos",
|
38 |
-
"bitch",
|
39 |
-
"bitches",
|
40 |
-
"bitching",
|
41 |
"blowjob",
|
42 |
"blowjobs",
|
43 |
-
"
|
44 |
-
"boner",
|
45 |
-
"boners",
|
46 |
-
"boob",
|
47 |
-
"boobies",
|
48 |
-
"boobs",
|
49 |
-
"booty",
|
50 |
-
"brothel",
|
51 |
-
"buceta",
|
52 |
-
"bugger",
|
53 |
-
"buggered",
|
54 |
-
"buggery",
|
55 |
"bukkake",
|
56 |
-
"bule",
|
57 |
-
"buttcheeks",
|
58 |
-
"buttfuck",
|
59 |
-
"butthead",
|
60 |
-
"butthole",
|
61 |
-
"buttplug",
|
62 |
-
"cameltoe",
|
63 |
"camgirl",
|
64 |
"camwhore",
|
65 |
-
"chink",
|
66 |
-
"chinks",
|
67 |
-
"cialis",
|
68 |
-
"clit",
|
69 |
-
"clitoris",
|
70 |
-
"clits",
|
71 |
-
"clitty",
|
72 |
-
"clusterfuck",
|
73 |
-
"cock",
|
74 |
-
"cock-head",
|
75 |
-
"cockblock",
|
76 |
-
"cockfight",
|
77 |
-
"cockhead",
|
78 |
-
"cocks",
|
79 |
-
"cocksman",
|
80 |
-
"cocksucker",
|
81 |
"cocksucking",
|
82 |
-
"
|
83 |
-
"coitus",
|
84 |
-
"coochie",
|
85 |
-
"cooly",
|
86 |
-
"coon",
|
87 |
-
"coons",
|
88 |
-
"copulate",
|
89 |
-
"cowgirl",
|
90 |
-
"crabs",
|
91 |
"creampie",
|
|
|
92 |
"cum",
|
93 |
"cumming",
|
94 |
"cums",
|
@@ -96,399 +28,58 @@ english_badwords = [
|
|
96 |
"cumshots",
|
97 |
"cumslut",
|
98 |
"cunnilingus",
|
99 |
-
"cunny",
|
100 |
-
"cunt",
|
101 |
-
"cunts",
|
102 |
-
"cybersex",
|
103 |
-
"darkey",
|
104 |
-
"darkie",
|
105 |
-
"darkies",
|
106 |
-
"darky",
|
107 |
"deepthroat",
|
108 |
"deepthroating",
|
109 |
-
"dick",
|
110 |
-
"dickhole",
|
111 |
-
"dicks",
|
112 |
"dildo",
|
113 |
"dildos",
|
114 |
"dogging",
|
115 |
-
"doggy-style",
|
116 |
"doggystyle",
|
117 |
"dominatrix",
|
118 |
-
"dommes",
|
119 |
-
"dong",
|
120 |
-
"dp",
|
121 |
-
"dupa",
|
122 |
-
"dyke",
|
123 |
-
"dykes",
|
124 |
-
"ecchi",
|
125 |
-
"ejaculate",
|
126 |
-
"ejaculated",
|
127 |
-
"ejaculates",
|
128 |
-
"ejaculating",
|
129 |
-
"ejaculation",
|
130 |
-
"ejaculations",
|
131 |
-
"enema",
|
132 |
-
"erect",
|
133 |
-
"erection",
|
134 |
-
"ero",
|
135 |
"erotic",
|
136 |
-
"erotism",
|
137 |
-
"escort",
|
138 |
-
"fag",
|
139 |
-
"fagging",
|
140 |
-
"faggot",
|
141 |
-
"fagot",
|
142 |
-
"fagots",
|
143 |
-
"fags",
|
144 |
-
"felch",
|
145 |
-
"fellate",
|
146 |
"fellatio",
|
147 |
"femdom",
|
148 |
-
"fetish",
|
149 |
-
"figging",
|
150 |
-
"fingerbang",
|
151 |
"fingering",
|
152 |
-
"fisted",
|
153 |
-
"fister",
|
154 |
"fisting",
|
155 |
-
"floozy",
|
156 |
-
"fondle",
|
157 |
-
"footfetish",
|
158 |
"footjob",
|
159 |
-
"foreskin",
|
160 |
-
"fornicate",
|
161 |
-
"foursome",
|
162 |
-
"fuck",
|
163 |
-
"fuckable",
|
164 |
-
"fuckbook",
|
165 |
-
"fuckboy",
|
166 |
-
"fuckbuddy",
|
167 |
-
"fucked",
|
168 |
-
"fucker",
|
169 |
-
"fuckers",
|
170 |
-
"fuckfest",
|
171 |
-
"fuckhole",
|
172 |
-
"fuckin",
|
173 |
-
"fucking",
|
174 |
-
"fucks",
|
175 |
-
"fuk",
|
176 |
-
"fukin",
|
177 |
-
"fuking",
|
178 |
-
"g-spot",
|
179 |
"gangbang",
|
180 |
-
"gangbanged",
|
181 |
-
"gangbanger",
|
182 |
-
"gangbangs",
|
183 |
-
"genital",
|
184 |
-
"genitals",
|
185 |
-
"gigolo",
|
186 |
-
"glans",
|
187 |
-
"gonad",
|
188 |
-
"gonads",
|
189 |
-
"gook",
|
190 |
-
"gringo",
|
191 |
-
"gringos",
|
192 |
-
"grope",
|
193 |
-
"gspot",
|
194 |
-
"guido",
|
195 |
"handjob",
|
196 |
-
"haole",
|
197 |
-
"hapa",
|
198 |
-
"hardcore",
|
199 |
-
"hardon",
|
200 |
-
"harem",
|
201 |
"hentai",
|
202 |
-
"hindoo",
|
203 |
-
"hoe",
|
204 |
-
"hoes",
|
205 |
-
"honky",
|
206 |
-
"hooker",
|
207 |
-
"hookers",
|
208 |
-
"hooter",
|
209 |
-
"hooters",
|
210 |
-
"hori",
|
211 |
-
"horndog",
|
212 |
"horney",
|
213 |
"horniest",
|
214 |
"horny",
|
215 |
-
"humped",
|
216 |
-
"humper",
|
217 |
-
"humping",
|
218 |
-
"hussy",
|
219 |
-
"hymen",
|
220 |
-
"ikey",
|
221 |
-
"incest",
|
222 |
-
"injun",
|
223 |
-
"intercourse",
|
224 |
-
"interracial",
|
225 |
-
"jack-off",
|
226 |
-
"jackoff",
|
227 |
-
"jailbait",
|
228 |
-
"jerk-off",
|
229 |
-
"jerkoff",
|
230 |
-
"jiggy",
|
231 |
"jism",
|
232 |
"jizz",
|
233 |
-
"jizzed",
|
234 |
-
"kaffir",
|
235 |
-
"kafir",
|
236 |
-
"kike",
|
237 |
-
"kikes",
|
238 |
-
"kinkster",
|
239 |
-
"kinky",
|
240 |
-
"kkk",
|
241 |
-
"klan",
|
242 |
-
"kraut",
|
243 |
-
"labia",
|
244 |
-
"lapdance",
|
245 |
-
"libido",
|
246 |
-
"licker",
|
247 |
-
"licking",
|
248 |
-
"limey",
|
249 |
-
"lingerie",
|
250 |
-
"livesex",
|
251 |
-
"lolita",
|
252 |
-
"lovemaking",
|
253 |
-
"lust",
|
254 |
-
"lusting",
|
255 |
-
"masochist",
|
256 |
-
"masterbate",
|
257 |
"masterbating",
|
258 |
-
"masterbation",
|
259 |
"masturbate",
|
260 |
"masturbating",
|
261 |
"masturbation",
|
262 |
"milf",
|
263 |
-
"minge",
|
264 |
-
"missionary",
|
265 |
-
"molest",
|
266 |
-
"molestation",
|
267 |
-
"molester",
|
268 |
-
"munging",
|
269 |
-
"muschi",
|
270 |
-
"nads",
|
271 |
-
"naked",
|
272 |
-
"necked",
|
273 |
-
"necro",
|
274 |
-
"negress",
|
275 |
-
"negro",
|
276 |
-
"negroes",
|
277 |
-
"negroid",
|
278 |
-
"negros",
|
279 |
-
"nig",
|
280 |
-
"nigar",
|
281 |
-
"nigga",
|
282 |
-
"niggas",
|
283 |
-
"niggaz",
|
284 |
-
"nigger",
|
285 |
-
"niggers",
|
286 |
-
"nigra",
|
287 |
-
"nipple",
|
288 |
-
"nipples",
|
289 |
-
"nookie",
|
290 |
-
"nooky",
|
291 |
-
"nooner",
|
292 |
-
"nude",
|
293 |
-
"nudie",
|
294 |
-
"nudity",
|
295 |
-
"nymph",
|
296 |
-
"nympho",
|
297 |
-
"nymphomania",
|
298 |
-
"orgasim",
|
299 |
-
"orgasm",
|
300 |
-
"orgasms",
|
301 |
"orgies",
|
302 |
"orgy",
|
303 |
-
"orifice",
|
304 |
-
"p0rn",
|
305 |
-
"paedophile",
|
306 |
-
"pantie",
|
307 |
-
"panties",
|
308 |
-
"panty",
|
309 |
-
"pastie",
|
310 |
-
"pecker",
|
311 |
-
"pedo",
|
312 |
-
"pedophile",
|
313 |
-
"pedophilia",
|
314 |
-
"pedophiliac",
|
315 |
-
"peeper",
|
316 |
-
"peepshow",
|
317 |
"pegging",
|
318 |
-
"penetrate",
|
319 |
-
"penetration",
|
320 |
-
"penile",
|
321 |
-
"penis",
|
322 |
-
"penises",
|
323 |
-
"penus",
|
324 |
-
"perv",
|
325 |
-
"phallic",
|
326 |
-
"phonesex",
|
327 |
-
"pickaninnies",
|
328 |
-
"pimp",
|
329 |
-
"playboy",
|
330 |
-
"playgirl",
|
331 |
-
"poontang",
|
332 |
"porn",
|
|
|
333 |
"porno",
|
334 |
-
"pornography",
|
335 |
"pornos",
|
336 |
-
"
|
337 |
-
"
|
338 |
-
"
|
339 |
-
"pron",
|
340 |
-
"prostitute",
|
341 |
-
"pube",
|
342 |
-
"pubes",
|
343 |
-
"pubic",
|
344 |
-
"pubis",
|
345 |
-
"punani",
|
346 |
-
"pussies",
|
347 |
-
"pussy",
|
348 |
-
"pussys",
|
349 |
-
"pusy",
|
350 |
-
"puta",
|
351 |
-
"puto",
|
352 |
-
"queef",
|
353 |
-
"quickie",
|
354 |
-
"quicky",
|
355 |
-
"quim",
|
356 |
-
"randy",
|
357 |
-
"rape",
|
358 |
-
"raped",
|
359 |
-
"raper",
|
360 |
-
"raping",
|
361 |
-
"rapist",
|
362 |
-
"rectum",
|
363 |
-
"redneck",
|
364 |
-
"rednecks",
|
365 |
-
"redskin",
|
366 |
-
"redskins",
|
367 |
-
"rimjob",
|
368 |
"rimming",
|
369 |
-
"russki",
|
370 |
-
"s&m",
|
371 |
-
"sadism",
|
372 |
-
"sadist",
|
373 |
-
"sambo",
|
374 |
-
"santorum",
|
375 |
-
"schlong",
|
376 |
-
"scissoring",
|
377 |
-
"semen",
|
378 |
-
"sex",
|
379 |
-
"sexed",
|
380 |
-
"sexi",
|
381 |
-
"sexing",
|
382 |
-
"sexo",
|
383 |
-
"sexpot",
|
384 |
-
"sextoy",
|
385 |
-
"sexual",
|
386 |
-
"sexually",
|
387 |
-
"sexx",
|
388 |
-
"sexxx",
|
389 |
-
"sexxxy",
|
390 |
-
"sexxy",
|
391 |
-
"sexy",
|
392 |
-
"sh!t",
|
393 |
-
"sh1t",
|
394 |
-
"shagging",
|
395 |
-
"shemale",
|
396 |
-
"sissy",
|
397 |
-
"skank",
|
398 |
-
"skanks",
|
399 |
-
"slapper",
|
400 |
-
"slut",
|
401 |
-
"sluts",
|
402 |
-
"slutting",
|
403 |
"slutty",
|
404 |
-
"smut",
|
405 |
-
"smutty",
|
406 |
-
"sodomise",
|
407 |
-
"sodomite",
|
408 |
-
"sodomize",
|
409 |
-
"sodomy",
|
410 |
-
"spank",
|
411 |
-
"sperm",
|
412 |
-
"spic",
|
413 |
-
"spick",
|
414 |
-
"splooge",
|
415 |
-
"spooge",
|
416 |
-
"squaw",
|
417 |
"squirting",
|
418 |
-
"steamy",
|
419 |
-
"stiffy",
|
420 |
"strapon",
|
421 |
-
"suck",
|
422 |
-
"sucked",
|
423 |
-
"sucker",
|
424 |
-
"sucking",
|
425 |
-
"sucks",
|
426 |
-
"swallow",
|
427 |
-
"swallower",
|
428 |
-
"swinger",
|
429 |
-
"teabagging",
|
430 |
-
"testical",
|
431 |
-
"testicle",
|
432 |
-
"testicles",
|
433 |
-
"testis",
|
434 |
"threesome",
|
435 |
-
"threeway",
|
436 |
-
"titfuck",
|
437 |
-
"titjob",
|
438 |
-
"tits",
|
439 |
-
"tittie",
|
440 |
-
"titties",
|
441 |
-
"titty",
|
442 |
-
"tittyfuck",
|
443 |
-
"tity",
|
444 |
-
"toots",
|
445 |
-
"topless",
|
446 |
-
"trannie",
|
447 |
-
"tranny",
|
448 |
-
"tribadism",
|
449 |
-
"twat",
|
450 |
-
"twats",
|
451 |
-
"undies",
|
452 |
-
"undressing",
|
453 |
-
"upskirt",
|
454 |
-
"vag",
|
455 |
-
"vagina",
|
456 |
-
"vaginal",
|
457 |
-
"viagra",
|
458 |
"vibrator",
|
459 |
-
"
|
460 |
-
"
|
461 |
-
"
|
462 |
-
"vulva",
|
463 |
-
"wank",
|
464 |
-
"wanker",
|
465 |
-
"wanking",
|
466 |
-
"wazoo",
|
467 |
-
"wedgie",
|
468 |
-
"wench",
|
469 |
-
"wetback",
|
470 |
-
"whore",
|
471 |
-
"whored",
|
472 |
-
"whorehouse",
|
473 |
-
"whores",
|
474 |
-
"whoring",
|
475 |
-
"wigger",
|
476 |
-
"willie",
|
477 |
-
"willies",
|
478 |
-
"willy",
|
479 |
-
"wog",
|
480 |
-
"wop",
|
481 |
-
"x-rated",
|
482 |
"xxx",
|
483 |
-
"
|
484 |
-
"yaoi",
|
485 |
-
"yid",
|
486 |
-
"zoophile",
|
487 |
-
"zoophilia",
|
488 |
]
|
489 |
|
490 |
-
|
491 |
-
|
|
|
492 |
+ [
|
493 |
"احتلام",
|
494 |
"اغتصاب",
|
@@ -529,9 +120,8 @@ badwords = {
|
|
529 |
"مفلقسة",
|
530 |
"نيك",
|
531 |
],
|
532 |
-
"ca":
|
533 |
+ [
|
534 |
-
"avortament",
|
535 |
"anal",
|
536 |
"anus",
|
537 |
"cul",
|
@@ -664,8 +254,8 @@ badwords = {
|
|
664 |
"x classificat",
|
665 |
"xxx",
|
666 |
],
|
667 |
-
"en":
|
668 |
-
"es":
|
669 |
+ [
|
670 |
"Asesinato",
|
671 |
"Bollera",
|
@@ -846,7 +436,6 @@ badwords = {
|
|
846 |
"tirón",
|
847 |
"tizón",
|
848 |
"tonto",
|
849 |
-
"transexual",
|
850 |
"vagina",
|
851 |
"vete a la mierda",
|
852 |
"viagra",
|
@@ -858,7 +447,7 @@ badwords = {
|
|
858 |
"xxx",
|
859 |
"zurullo",
|
860 |
],
|
861 |
-
"eu":
|
862 |
+ [
|
863 |
"abortu",
|
864 |
"anal",
|
@@ -899,13 +488,12 @@ badwords = {
|
|
899 |
"bagina",
|
900 |
"viagra",
|
901 |
],
|
902 |
-
"fr":
|
903 |
+ [
|
904 |
"MALPT",
|
905 |
"anal",
|
906 |
"anus",
|
907 |
"arracher",
|
908 |
-
"avortement",
|
909 |
"baise",
|
910 |
"baiser",
|
911 |
"baiseur de chien",
|
@@ -1078,7 +666,6 @@ badwords = {
|
|
1078 |
"testicule",
|
1079 |
"teuch",
|
1080 |
"titt",
|
1081 |
-
"transexuelle",
|
1082 |
"tremper",
|
1083 |
"tringler",
|
1084 |
"trique",
|
@@ -1102,7 +689,7 @@ badwords = {
|
|
1102 |
"éjaculé",
|
1103 |
"étron",
|
1104 |
],
|
1105 |
-
"hi":
|
1106 |
+ [
|
1107 |
"aand",
|
1108 |
"aandu",
|
@@ -1163,7 +750,6 @@ badwords = {
|
|
1163 |
"fagging",
|
1164 |
"fagots",
|
1165 |
"felching",
|
1166 |
-
"fuckers",
|
1167 |
"fuckings",
|
1168 |
"fucks",
|
1169 |
"gaand",
|
@@ -1361,7 +947,7 @@ badwords = {
|
|
1361 |
"होमोसेक्सुअल",
|
1362 |
"होर",
|
1363 |
],
|
1364 |
-
"id":
|
1365 |
+ [
|
1366 |
"abortus",
|
1367 |
"anal",
|
@@ -1479,7 +1065,7 @@ badwords = {
|
|
1479 |
"x diberi peringkat",
|
1480 |
"xxx",
|
1481 |
],
|
1482 |
-
"kn":
|
1483 |
+ [
|
1484 |
"ಗರ್ಭಪಾತ",
|
1485 |
"ಗುದ",
|
@@ -1622,7 +1208,7 @@ badwords = {
|
|
1622 |
"x ರೇಟೆಡ್",
|
1623 |
"xxx",
|
1624 |
],
|
1625 |
-
"ml":
|
1626 |
+ [
|
1627 |
"ഗർഭഛിദ്രം",
|
1628 |
"വിശപ്പ്",
|
@@ -1759,7 +1345,7 @@ badwords = {
|
|
1759 |
"x റേറ്റുചെയ്തു",
|
1760 |
"xxx",
|
1761 |
],
|
1762 |
-
"mr":
|
1763 |
+ [
|
1764 |
"गर्भपात",
|
1765 |
"गुदा",
|
@@ -1898,7 +1484,7 @@ badwords = {
|
|
1898 |
"एक्स रेट केले",
|
1899 |
"xxx",
|
1900 |
],
|
1901 |
-
"pt":
|
1902 |
+ [
|
1903 |
"aborto",
|
1904 |
"amador",
|
@@ -2000,7 +1586,6 @@ badwords = {
|
|
2000 |
"gozar",
|
2001 |
"grelho",
|
2002 |
"heroína",
|
2003 |
-
"homem gay",
|
2004 |
"homoerótico",
|
2005 |
"homosexual",
|
2006 |
"hore",
|
@@ -2097,7 +1682,7 @@ badwords = {
|
|
2097 |
"xxx",
|
2098 |
"ânus",
|
2099 |
],
|
2100 |
-
"te":
|
2101 |
+ [
|
2102 |
"గర్భస్రావం",
|
2103 |
"అంగ",
|
@@ -2244,7 +1829,7 @@ badwords = {
|
|
2244 |
"x రేట్",
|
2245 |
"xxx",
|
2246 |
],
|
2247 |
-
"vi":
|
2248 |
+ [
|
2249 |
"sự phá thai",
|
2250 |
"hậu môn",
|
@@ -2362,7 +1947,7 @@ badwords = {
|
|
2362 |
"x đánh giá",
|
2363 |
"xxx",
|
2364 |
],
|
2365 |
-
"zh":
|
2366 |
+ [
|
2367 |
"13.",
|
2368 |
"13点",
|
|
|
6 |
# https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
|
7 |
|
8 |
|
9 |
+
english_flagged_words = [
|
|
|
10 |
"anal",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"bareback",
|
|
|
|
|
|
|
12 |
"bbw",
|
13 |
"bdsm",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
"blowjob",
|
15 |
"blowjobs",
|
16 |
+
"brazzers",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
"bukkake",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
"camgirl",
|
19 |
"camwhore",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
"cocksucking",
|
21 |
+
"cougar",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
"creampie",
|
23 |
+
"cuckold",
|
24 |
"cum",
|
25 |
"cumming",
|
26 |
"cums",
|
|
|
28 |
"cumshots",
|
29 |
"cumslut",
|
30 |
"cunnilingus",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
"deepthroat",
|
32 |
"deepthroating",
|
|
|
|
|
|
|
33 |
"dildo",
|
34 |
"dildos",
|
35 |
"dogging",
|
|
|
36 |
"doggystyle",
|
37 |
"dominatrix",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
"erotic",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
"fellatio",
|
40 |
"femdom",
|
|
|
|
|
|
|
41 |
"fingering",
|
|
|
|
|
42 |
"fisting",
|
|
|
|
|
|
|
43 |
"footjob",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
"gangbang",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
"handjob",
|
|
|
|
|
|
|
|
|
|
|
46 |
"hentai",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
"horney",
|
48 |
"horniest",
|
49 |
"horny",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
"jism",
|
51 |
"jizz",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
"masterbating",
|
|
|
53 |
"masturbate",
|
54 |
"masturbating",
|
55 |
"masturbation",
|
56 |
"milf",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
"orgies",
|
58 |
"orgy",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
"pegging",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
"porn",
|
61 |
+
"pornhub",
|
62 |
"porno",
|
|
|
63 |
"pornos",
|
64 |
+
"pornstar",
|
65 |
+
"pornstars",
|
66 |
+
"redtube",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
"rimming",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
"slutty",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
"squirting",
|
|
|
|
|
70 |
"strapon",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
"threesome",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
"vibrator",
|
73 |
+
"xhamster",
|
74 |
+
"xnxx",
|
75 |
+
"xvideos",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
"xxx",
|
77 |
+
"youporn",
|
|
|
|
|
|
|
|
|
78 |
]
|
79 |
|
80 |
+
|
81 |
+
flagged_words = {
|
82 |
+
"ar": english_flagged_words
|
83 |
+ [
|
84 |
"احتلام",
|
85 |
"اغتصاب",
|
|
|
120 |
"مفلقسة",
|
121 |
"نيك",
|
122 |
],
|
123 |
+
"ca": english_flagged_words
|
124 |
+ [
|
|
|
125 |
"anal",
|
126 |
"anus",
|
127 |
"cul",
|
|
|
254 |
"x classificat",
|
255 |
"xxx",
|
256 |
],
|
257 |
+
"en": english_flagged_words,
|
258 |
+
"es": english_flagged_words
|
259 |
+ [
|
260 |
"Asesinato",
|
261 |
"Bollera",
|
|
|
436 |
"tirón",
|
437 |
"tizón",
|
438 |
"tonto",
|
|
|
439 |
"vagina",
|
440 |
"vete a la mierda",
|
441 |
"viagra",
|
|
|
447 |
"xxx",
|
448 |
"zurullo",
|
449 |
],
|
450 |
+
"eu": english_flagged_words
|
451 |
+ [
|
452 |
"abortu",
|
453 |
"anal",
|
|
|
488 |
"bagina",
|
489 |
"viagra",
|
490 |
],
|
491 |
+
"fr": english_flagged_words
|
492 |
+ [
|
493 |
"MALPT",
|
494 |
"anal",
|
495 |
"anus",
|
496 |
"arracher",
|
|
|
497 |
"baise",
|
498 |
"baiser",
|
499 |
"baiseur de chien",
|
|
|
666 |
"testicule",
|
667 |
"teuch",
|
668 |
"titt",
|
|
|
669 |
"tremper",
|
670 |
"tringler",
|
671 |
"trique",
|
|
|
689 |
"éjaculé",
|
690 |
"étron",
|
691 |
],
|
692 |
+
"hi": english_flagged_words
|
693 |
+ [
|
694 |
"aand",
|
695 |
"aandu",
|
|
|
750 |
"fagging",
|
751 |
"fagots",
|
752 |
"felching",
|
|
|
753 |
"fuckings",
|
754 |
"fucks",
|
755 |
"gaand",
|
|
|
947 |
"होमोसेक्सुअल",
|
948 |
"होर",
|
949 |
],
|
950 |
+
"id": english_flagged_words
|
951 |
+ [
|
952 |
"abortus",
|
953 |
"anal",
|
|
|
1065 |
"x diberi peringkat",
|
1066 |
"xxx",
|
1067 |
],
|
1068 |
+
"kn": english_flagged_words
|
1069 |
+ [
|
1070 |
"ಗರ್ಭಪಾತ",
|
1071 |
"ಗುದ",
|
|
|
1208 |
"x ರೇಟೆಡ್",
|
1209 |
"xxx",
|
1210 |
],
|
1211 |
+
"ml": english_flagged_words
|
1212 |
+ [
|
1213 |
"ഗർഭഛിദ്രം",
|
1214 |
"വിശപ്പ്",
|
|
|
1345 |
"x റേറ്റുചെയ്തു",
|
1346 |
"xxx",
|
1347 |
],
|
1348 |
+
"mr": english_flagged_words
|
1349 |
+ [
|
1350 |
"गर्भपात",
|
1351 |
"गुदा",
|
|
|
1484 |
"एक्स रेट केले",
|
1485 |
"xxx",
|
1486 |
],
|
1487 |
+
"pt": english_flagged_words
|
1488 |
+ [
|
1489 |
"aborto",
|
1490 |
"amador",
|
|
|
1586 |
"gozar",
|
1587 |
"grelho",
|
1588 |
"heroína",
|
|
|
1589 |
"homoerótico",
|
1590 |
"homosexual",
|
1591 |
"hore",
|
|
|
1682 |
"xxx",
|
1683 |
"ânus",
|
1684 |
],
|
1685 |
+
"te": english_flagged_words
|
1686 |
+ [
|
1687 |
"గర్భస్రావం",
|
1688 |
"అంగ",
|
|
|
1829 |
"x రేట్",
|
1830 |
"xxx",
|
1831 |
],
|
1832 |
+
"vi": english_flagged_words
|
1833 |
+ [
|
1834 |
"sự phá thai",
|
1835 |
"hậu môn",
|
|
|
1947 |
"x đánh giá",
|
1948 |
"xxx",
|
1949 |
],
|
1950 |
+
"zh": english_flagged_words
|
1951 |
+ [
|
1952 |
"13.",
|
1953 |
"13点",
|
languages_id.py
CHANGED
@@ -6,7 +6,7 @@ langs_id = [
|
|
6 |
"lang": "Afrikaans",
|
7 |
"dataset_id": "af",
|
8 |
"stopwords_id": "af",
|
9 |
-
"
|
10 |
"fasttext_id": "af",
|
11 |
"sentencepiece_id": "af",
|
12 |
"kenlm_id": "af",
|
@@ -15,7 +15,7 @@ langs_id = [
|
|
15 |
"lang": "Arabic",
|
16 |
"dataset_id": "ar",
|
17 |
"stopwords_id": "ar",
|
18 |
-
"
|
19 |
"fasttext_id": "ar",
|
20 |
"sentencepiece_id": "ar",
|
21 |
"kenlm_id": "ar",
|
@@ -24,7 +24,7 @@ langs_id = [
|
|
24 |
"lang": "Egyptian Arabic",
|
25 |
"dataset_id": "arz",
|
26 |
"stopwords_id": None,
|
27 |
-
"
|
28 |
"fasttext_id": "arz",
|
29 |
"sentencepiece_id": None,
|
30 |
"kenlm_id": None,
|
@@ -33,7 +33,7 @@ langs_id = [
|
|
33 |
"lang": "Assamese",
|
34 |
"dataset_id": "as",
|
35 |
"stopwords_id": None,
|
36 |
-
"
|
37 |
"fasttext_id": "as",
|
38 |
"sentencepiece_id": None,
|
39 |
"kenlm_id": None,
|
@@ -42,7 +42,7 @@ langs_id = [
|
|
42 |
"lang": "Bengali",
|
43 |
"dataset_id": "bn",
|
44 |
"stopwords_id": "bn",
|
45 |
-
"
|
46 |
"fasttext_id": "bn",
|
47 |
"sentencepiece_id": "bn",
|
48 |
"kenlm_id": "bn",
|
@@ -51,7 +51,7 @@ langs_id = [
|
|
51 |
"lang": "Catalan",
|
52 |
"dataset_id": "ca",
|
53 |
"stopwords_id": "ca",
|
54 |
-
"
|
55 |
"fasttext_id": "ca",
|
56 |
"sentencepiece_id": "ca",
|
57 |
"kenlm_id": "ca",
|
@@ -60,7 +60,7 @@ langs_id = [
|
|
60 |
"lang": "English",
|
61 |
"dataset_id": "en",
|
62 |
"stopwords_id": "en",
|
63 |
-
"
|
64 |
"fasttext_id": "en",
|
65 |
"sentencepiece_id": "en",
|
66 |
"kenlm_id": "en",
|
@@ -69,7 +69,7 @@ langs_id = [
|
|
69 |
"lang": "Spanish",
|
70 |
"dataset_id": "es",
|
71 |
"stopwords_id": "es",
|
72 |
-
"
|
73 |
"fasttext_id": "es",
|
74 |
"sentencepiece_id": "es",
|
75 |
"kenlm_id": "es",
|
@@ -78,7 +78,7 @@ langs_id = [
|
|
78 |
"lang": "Basque",
|
79 |
"dataset_id": "eu",
|
80 |
"stopwords_id": "eu",
|
81 |
-
"
|
82 |
"fasttext_id": "eu",
|
83 |
"sentencepiece_id": None,
|
84 |
"kenlm_id": None,
|
@@ -87,7 +87,7 @@ langs_id = [
|
|
87 |
"lang": "French",
|
88 |
"dataset_id": "fr",
|
89 |
"stopwords_id": "fr",
|
90 |
-
"
|
91 |
"fasttext_id": "fr",
|
92 |
"sentencepiece_id": "fr",
|
93 |
"kenlm_id": "fr",
|
@@ -96,7 +96,7 @@ langs_id = [
|
|
96 |
"lang": "Gujarati",
|
97 |
"dataset_id": "gu",
|
98 |
"stopwords_id": None,
|
99 |
-
"
|
100 |
"fasttext_id": "gu",
|
101 |
"sentencepiece_id": "gu",
|
102 |
"kenlm_id": "gu",
|
@@ -105,7 +105,7 @@ langs_id = [
|
|
105 |
"lang": "Hindi",
|
106 |
"dataset_id": "hi",
|
107 |
"stopwords_id": "hi",
|
108 |
-
"
|
109 |
"fasttext_id": "hi",
|
110 |
"sentencepiece_id": "hi",
|
111 |
"kenlm_id": "hi",
|
@@ -114,7 +114,7 @@ langs_id = [
|
|
114 |
"lang": "Indonesian",
|
115 |
"dataset_id": "id",
|
116 |
"stopwords_id": "id",
|
117 |
-
"
|
118 |
"fasttext_id": "id",
|
119 |
"sentencepiece_id": "id",
|
120 |
"kenlm_id": "id",
|
@@ -123,7 +123,7 @@ langs_id = [
|
|
123 |
"lang": "Kannada",
|
124 |
"dataset_id": "kn",
|
125 |
"stopwords_id": None,
|
126 |
-
"
|
127 |
"fasttext_id": "kn",
|
128 |
"sentencepiece_id": "kn",
|
129 |
"kenlm_id": "kn",
|
@@ -132,7 +132,7 @@ langs_id = [
|
|
132 |
"lang": "Malayalam",
|
133 |
"dataset_id": "ml",
|
134 |
"stopwords_id": None,
|
135 |
-
"
|
136 |
"fasttext_id": "ml",
|
137 |
"sentencepiece_id": "ml",
|
138 |
"kenlm_id": "ml",
|
@@ -141,7 +141,7 @@ langs_id = [
|
|
141 |
"lang": "Marathi",
|
142 |
"dataset_id": "mr",
|
143 |
"stopwords_id": "mr",
|
144 |
-
"
|
145 |
"fasttext_id": "mr",
|
146 |
"sentencepiece_id": "mr",
|
147 |
"kenlm_id": "mr",
|
@@ -150,7 +150,7 @@ langs_id = [
|
|
150 |
"lang": "Portuguese",
|
151 |
"dataset_id": "pt",
|
152 |
"stopwords_id": "pt",
|
153 |
-
"
|
154 |
"fasttext_id": "pt",
|
155 |
"sentencepiece_id": "pt",
|
156 |
"kenlm_id": "pt",
|
@@ -159,7 +159,7 @@ langs_id = [
|
|
159 |
"lang": "Somali",
|
160 |
"dataset_id": "so",
|
161 |
"stopwords_id": "so",
|
162 |
-
"
|
163 |
"fasttext_id": "so",
|
164 |
"sentencepiece_id": None,
|
165 |
"kenlm_id": None,
|
@@ -168,7 +168,7 @@ langs_id = [
|
|
168 |
"lang": "Swahili",
|
169 |
"dataset_id": "sw",
|
170 |
"stopwords_id": "sw",
|
171 |
-
"
|
172 |
"fasttext_id": "sw",
|
173 |
"sentencepiece_id": None,
|
174 |
"kenlm_id": None,
|
@@ -177,7 +177,7 @@ langs_id = [
|
|
177 |
"lang": "Tamil",
|
178 |
"dataset_id": "ta",
|
179 |
"stopwords_id": None,
|
180 |
-
"
|
181 |
"fasttext_id": "ta",
|
182 |
"sentencepiece_id": None,
|
183 |
"kenlm_id": None,
|
@@ -186,7 +186,7 @@ langs_id = [
|
|
186 |
"lang": "Telugu",
|
187 |
"dataset_id": "te",
|
188 |
"stopwords_id": None,
|
189 |
-
"
|
190 |
"fasttext_id": "te",
|
191 |
"sentencepiece_id": None,
|
192 |
"kenlm_id": None,
|
@@ -195,7 +195,7 @@ langs_id = [
|
|
195 |
"lang": "Urdu",
|
196 |
"dataset_id": "ur",
|
197 |
"stopwords_id": "ur",
|
198 |
-
"
|
199 |
"fasttext_id": "ur",
|
200 |
"sentencepiece_id": None,
|
201 |
"kenlm_id": None,
|
@@ -204,7 +204,7 @@ langs_id = [
|
|
204 |
"lang": "Vietnamese",
|
205 |
"dataset_id": "vi",
|
206 |
"stopwords_id": "vi",
|
207 |
-
"
|
208 |
"fasttext_id": "vi",
|
209 |
"sentencepiece_id": None,
|
210 |
"kenlm_id": None,
|
@@ -213,7 +213,7 @@ langs_id = [
|
|
213 |
"lang": "Yoruba",
|
214 |
"dataset_id": "yo",
|
215 |
"stopwords_id": "yo",
|
216 |
-
"
|
217 |
"fasttext_id": "yo",
|
218 |
"sentencepiece_id": None,
|
219 |
"kenlm_id": None,
|
@@ -222,7 +222,7 @@ langs_id = [
|
|
222 |
"lang": "Chinese",
|
223 |
"dataset_id": "zh",
|
224 |
"stopwords_id": "zh",
|
225 |
-
"
|
226 |
"fasttext_id": "zh",
|
227 |
"sentencepiece_id": "zh",
|
228 |
"kenlm_id": "zh",
|
|
|
6 |
"lang": "Afrikaans",
|
7 |
"dataset_id": "af",
|
8 |
"stopwords_id": "af",
|
9 |
+
"flagged_words_id": None,
|
10 |
"fasttext_id": "af",
|
11 |
"sentencepiece_id": "af",
|
12 |
"kenlm_id": "af",
|
|
|
15 |
"lang": "Arabic",
|
16 |
"dataset_id": "ar",
|
17 |
"stopwords_id": "ar",
|
18 |
+
"flagged_words_id": "ar",
|
19 |
"fasttext_id": "ar",
|
20 |
"sentencepiece_id": "ar",
|
21 |
"kenlm_id": "ar",
|
|
|
24 |
"lang": "Egyptian Arabic",
|
25 |
"dataset_id": "arz",
|
26 |
"stopwords_id": None,
|
27 |
+
"flagged_words_id": None,
|
28 |
"fasttext_id": "arz",
|
29 |
"sentencepiece_id": None,
|
30 |
"kenlm_id": None,
|
|
|
33 |
"lang": "Assamese",
|
34 |
"dataset_id": "as",
|
35 |
"stopwords_id": None,
|
36 |
+
"flagged_words_id": None,
|
37 |
"fasttext_id": "as",
|
38 |
"sentencepiece_id": None,
|
39 |
"kenlm_id": None,
|
|
|
42 |
"lang": "Bengali",
|
43 |
"dataset_id": "bn",
|
44 |
"stopwords_id": "bn",
|
45 |
+
"flagged_words_id": None,
|
46 |
"fasttext_id": "bn",
|
47 |
"sentencepiece_id": "bn",
|
48 |
"kenlm_id": "bn",
|
|
|
51 |
"lang": "Catalan",
|
52 |
"dataset_id": "ca",
|
53 |
"stopwords_id": "ca",
|
54 |
+
"flagged_words_id": "ca",
|
55 |
"fasttext_id": "ca",
|
56 |
"sentencepiece_id": "ca",
|
57 |
"kenlm_id": "ca",
|
|
|
60 |
"lang": "English",
|
61 |
"dataset_id": "en",
|
62 |
"stopwords_id": "en",
|
63 |
+
"flagged_words_id": "en",
|
64 |
"fasttext_id": "en",
|
65 |
"sentencepiece_id": "en",
|
66 |
"kenlm_id": "en",
|
|
|
69 |
"lang": "Spanish",
|
70 |
"dataset_id": "es",
|
71 |
"stopwords_id": "es",
|
72 |
+
"flagged_words_id": "es",
|
73 |
"fasttext_id": "es",
|
74 |
"sentencepiece_id": "es",
|
75 |
"kenlm_id": "es",
|
|
|
78 |
"lang": "Basque",
|
79 |
"dataset_id": "eu",
|
80 |
"stopwords_id": "eu",
|
81 |
+
"flagged_words_id": "eu",
|
82 |
"fasttext_id": "eu",
|
83 |
"sentencepiece_id": None,
|
84 |
"kenlm_id": None,
|
|
|
87 |
"lang": "French",
|
88 |
"dataset_id": "fr",
|
89 |
"stopwords_id": "fr",
|
90 |
+
"flagged_words_id": "fr",
|
91 |
"fasttext_id": "fr",
|
92 |
"sentencepiece_id": "fr",
|
93 |
"kenlm_id": "fr",
|
|
|
96 |
"lang": "Gujarati",
|
97 |
"dataset_id": "gu",
|
98 |
"stopwords_id": None,
|
99 |
+
"flagged_words_id": None,
|
100 |
"fasttext_id": "gu",
|
101 |
"sentencepiece_id": "gu",
|
102 |
"kenlm_id": "gu",
|
|
|
105 |
"lang": "Hindi",
|
106 |
"dataset_id": "hi",
|
107 |
"stopwords_id": "hi",
|
108 |
+
"flagged_words_id": "hi",
|
109 |
"fasttext_id": "hi",
|
110 |
"sentencepiece_id": "hi",
|
111 |
"kenlm_id": "hi",
|
|
|
114 |
"lang": "Indonesian",
|
115 |
"dataset_id": "id",
|
116 |
"stopwords_id": "id",
|
117 |
+
"flagged_words_id": "id",
|
118 |
"fasttext_id": "id",
|
119 |
"sentencepiece_id": "id",
|
120 |
"kenlm_id": "id",
|
|
|
123 |
"lang": "Kannada",
|
124 |
"dataset_id": "kn",
|
125 |
"stopwords_id": None,
|
126 |
+
"flagged_words_id": "kn",
|
127 |
"fasttext_id": "kn",
|
128 |
"sentencepiece_id": "kn",
|
129 |
"kenlm_id": "kn",
|
|
|
132 |
"lang": "Malayalam",
|
133 |
"dataset_id": "ml",
|
134 |
"stopwords_id": None,
|
135 |
+
"flagged_words_id": "ml",
|
136 |
"fasttext_id": "ml",
|
137 |
"sentencepiece_id": "ml",
|
138 |
"kenlm_id": "ml",
|
|
|
141 |
"lang": "Marathi",
|
142 |
"dataset_id": "mr",
|
143 |
"stopwords_id": "mr",
|
144 |
+
"flagged_words_id": "mr",
|
145 |
"fasttext_id": "mr",
|
146 |
"sentencepiece_id": "mr",
|
147 |
"kenlm_id": "mr",
|
|
|
150 |
"lang": "Portuguese",
|
151 |
"dataset_id": "pt",
|
152 |
"stopwords_id": "pt",
|
153 |
+
"flagged_words_id": "pt",
|
154 |
"fasttext_id": "pt",
|
155 |
"sentencepiece_id": "pt",
|
156 |
"kenlm_id": "pt",
|
|
|
159 |
"lang": "Somali",
|
160 |
"dataset_id": "so",
|
161 |
"stopwords_id": "so",
|
162 |
+
"flagged_words_id": None,
|
163 |
"fasttext_id": "so",
|
164 |
"sentencepiece_id": None,
|
165 |
"kenlm_id": None,
|
|
|
168 |
"lang": "Swahili",
|
169 |
"dataset_id": "sw",
|
170 |
"stopwords_id": "sw",
|
171 |
+
"flagged_words_id": None,
|
172 |
"fasttext_id": "sw",
|
173 |
"sentencepiece_id": None,
|
174 |
"kenlm_id": None,
|
|
|
177 |
"lang": "Tamil",
|
178 |
"dataset_id": "ta",
|
179 |
"stopwords_id": None,
|
180 |
+
"flagged_words_id": None,
|
181 |
"fasttext_id": "ta",
|
182 |
"sentencepiece_id": None,
|
183 |
"kenlm_id": None,
|
|
|
186 |
"lang": "Telugu",
|
187 |
"dataset_id": "te",
|
188 |
"stopwords_id": None,
|
189 |
+
"flagged_words_id": "te",
|
190 |
"fasttext_id": "te",
|
191 |
"sentencepiece_id": None,
|
192 |
"kenlm_id": None,
|
|
|
195 |
"lang": "Urdu",
|
196 |
"dataset_id": "ur",
|
197 |
"stopwords_id": "ur",
|
198 |
+
"flagged_words_id": None,
|
199 |
"fasttext_id": "ur",
|
200 |
"sentencepiece_id": None,
|
201 |
"kenlm_id": None,
|
|
|
204 |
"lang": "Vietnamese",
|
205 |
"dataset_id": "vi",
|
206 |
"stopwords_id": "vi",
|
207 |
+
"flagged_words_id": "vi",
|
208 |
"fasttext_id": "vi",
|
209 |
"sentencepiece_id": None,
|
210 |
"kenlm_id": None,
|
|
|
213 |
"lang": "Yoruba",
|
214 |
"dataset_id": "yo",
|
215 |
"stopwords_id": "yo",
|
216 |
+
"flagged_words_id": None,
|
217 |
"fasttext_id": "yo",
|
218 |
"sentencepiece_id": None,
|
219 |
"kenlm_id": None,
|
|
|
222 |
"lang": "Chinese",
|
223 |
"dataset_id": "zh",
|
224 |
"stopwords_id": "zh",
|
225 |
+
"flagged_words_id": "zh",
|
226 |
"fasttext_id": "zh",
|
227 |
"sentencepiece_id": "zh",
|
228 |
"kenlm_id": "zh",
|
parameters_filtering.py
CHANGED
@@ -39,8 +39,8 @@ parameters_filtering_default = {
|
|
39 |
"words_augmentation_join_char": "",
|
40 |
"cond_check_stopwords": False,
|
41 |
"stopwords_min_cutoff": 0,
|
42 |
-
"
|
43 |
-
"
|
44 |
"cond_check_lang_id": True,
|
45 |
"lang_id_min_cutoff": 0.70,
|
46 |
"cond_check_perplexity": False,
|
@@ -70,8 +70,8 @@ parameters_filtering_af = {
|
|
70 |
"words_augmentation_join_char": "",
|
71 |
"cond_check_stopwords": True,
|
72 |
"stopwords_min_cutoff": 0,
|
73 |
-
"
|
74 |
-
"
|
75 |
"cond_check_lang_id": True,
|
76 |
"lang_id_min_cutoff": 0.6,
|
77 |
"cond_check_perplexity": True,
|
@@ -101,8 +101,8 @@ parameters_filtering_ar = {
|
|
101 |
"words_augmentation_join_char": "",
|
102 |
"cond_check_stopwords": True,
|
103 |
"stopwords_min_cutoff": 0,
|
104 |
-
"
|
105 |
-
"
|
106 |
"cond_check_lang_id": True,
|
107 |
"lang_id_min_cutoff": 0.75,
|
108 |
"cond_check_perplexity": True,
|
@@ -132,8 +132,8 @@ parameters_filtering_arz = {
|
|
132 |
"words_augmentation_join_char": "",
|
133 |
"cond_check_stopwords": True,
|
134 |
"stopwords_min_cutoff": 0,
|
135 |
-
"
|
136 |
-
"
|
137 |
"cond_check_lang_id": True,
|
138 |
"lang_id_min_cutoff": 0.75,
|
139 |
"cond_check_perplexity": False,
|
@@ -163,8 +163,8 @@ parameters_filtering_as = {
|
|
163 |
"words_augmentation_join_char": "",
|
164 |
"cond_check_stopwords": True,
|
165 |
"stopwords_min_cutoff": 0,
|
166 |
-
"
|
167 |
-
"
|
168 |
"cond_check_lang_id": True,
|
169 |
"lang_id_min_cutoff": 0.75,
|
170 |
"cond_check_perplexity": False,
|
@@ -194,8 +194,8 @@ parameters_filtering_bn = {
|
|
194 |
"words_augmentation_join_char": "",
|
195 |
"cond_check_stopwords": True,
|
196 |
"stopwords_min_cutoff": 0.05,
|
197 |
-
"
|
198 |
-
"
|
199 |
"cond_check_lang_id": True,
|
200 |
"lang_id_min_cutoff": 0.75,
|
201 |
"cond_check_perplexity": False,
|
@@ -225,8 +225,8 @@ parameters_filtering_ca = {
|
|
225 |
"words_augmentation_join_char": "",
|
226 |
"cond_check_stopwords": True,
|
227 |
"stopwords_min_cutoff": 0,
|
228 |
-
"
|
229 |
-
"
|
230 |
"cond_check_lang_id": True,
|
231 |
"lang_id_min_cutoff": 0.75,
|
232 |
"cond_check_perplexity": True,
|
@@ -256,8 +256,8 @@ parameters_filtering_en = {
|
|
256 |
"words_augmentation_join_char": "",
|
257 |
"cond_check_stopwords": True,
|
258 |
"stopwords_min_cutoff": 0.3,
|
259 |
-
"
|
260 |
-
"
|
261 |
"cond_check_lang_id": True,
|
262 |
"lang_id_min_cutoff": 0.80,
|
263 |
"cond_check_perplexity": True,
|
@@ -287,8 +287,8 @@ parameters_filtering_es = {
|
|
287 |
"words_augmentation_join_char": "",
|
288 |
"cond_check_stopwords": True,
|
289 |
"stopwords_min_cutoff": 0.2,
|
290 |
-
"
|
291 |
-
"
|
292 |
"cond_check_lang_id": True,
|
293 |
"lang_id_min_cutoff": 0.75,
|
294 |
"cond_check_perplexity": True,
|
@@ -318,8 +318,8 @@ parameters_filtering_eu = {
|
|
318 |
"words_augmentation_join_char": "",
|
319 |
"cond_check_stopwords": True,
|
320 |
"stopwords_min_cutoff": 0,
|
321 |
-
"
|
322 |
-
"
|
323 |
"cond_check_lang_id": True,
|
324 |
"lang_id_min_cutoff": 0.75,
|
325 |
"cond_check_perplexity": False,
|
@@ -349,8 +349,8 @@ parameters_filtering_fr = {
|
|
349 |
"words_augmentation_join_char": "",
|
350 |
"cond_check_stopwords": True,
|
351 |
"stopwords_min_cutoff": 0.15,
|
352 |
-
"
|
353 |
-
"
|
354 |
"cond_check_lang_id": True,
|
355 |
"lang_id_min_cutoff": 0.75,
|
356 |
"cond_check_perplexity": True,
|
@@ -380,8 +380,8 @@ parameters_filtering_gu = {
|
|
380 |
"words_augmentation_join_char": "",
|
381 |
"cond_check_stopwords": True,
|
382 |
"stopwords_min_cutoff": 0,
|
383 |
-
"
|
384 |
-
"
|
385 |
"cond_check_lang_id": True,
|
386 |
"lang_id_min_cutoff": 0.75,
|
387 |
"cond_check_perplexity": True,
|
@@ -411,8 +411,8 @@ parameters_filtering_hi = {
|
|
411 |
"words_augmentation_join_char": "",
|
412 |
"cond_check_stopwords": True,
|
413 |
"stopwords_min_cutoff": 0,
|
414 |
-
"
|
415 |
-
"
|
416 |
"cond_check_lang_id": True,
|
417 |
"lang_id_min_cutoff": 0.75,
|
418 |
"cond_check_perplexity": True,
|
@@ -442,8 +442,8 @@ parameters_filtering_id = {
|
|
442 |
"words_augmentation_join_char": "",
|
443 |
"cond_check_stopwords": True,
|
444 |
"stopwords_min_cutoff": 0.25,
|
445 |
-
"
|
446 |
-
"
|
447 |
"cond_check_lang_id": True,
|
448 |
"lang_id_min_cutoff": 0.75,
|
449 |
"cond_check_perplexity": True,
|
@@ -473,8 +473,8 @@ parameters_filtering_kn = {
|
|
473 |
"words_augmentation_join_char": "",
|
474 |
"cond_check_stopwords": True,
|
475 |
"stopwords_min_cutoff": 0,
|
476 |
-
"
|
477 |
-
"
|
478 |
"cond_check_lang_id": True,
|
479 |
"lang_id_min_cutoff": 0.75,
|
480 |
"cond_check_perplexity": True,
|
@@ -504,8 +504,8 @@ parameters_filtering_ml = {
|
|
504 |
"words_augmentation_join_char": "",
|
505 |
"cond_check_stopwords": True,
|
506 |
"stopwords_min_cutoff": 0,
|
507 |
-
"
|
508 |
-
"
|
509 |
"cond_check_lang_id": True,
|
510 |
"lang_id_min_cutoff": 0.75,
|
511 |
"cond_check_perplexity": True,
|
@@ -535,8 +535,8 @@ parameters_filtering_mr = {
|
|
535 |
"words_augmentation_join_char": "",
|
536 |
"cond_check_stopwords": True,
|
537 |
"stopwords_min_cutoff": 0,
|
538 |
-
"
|
539 |
-
"
|
540 |
"cond_check_lang_id": True,
|
541 |
"lang_id_min_cutoff": 0.75,
|
542 |
"cond_check_perplexity": True,
|
@@ -566,8 +566,8 @@ parameters_filtering_pt = {
|
|
566 |
"words_augmentation_join_char": "",
|
567 |
"cond_check_stopwords": True,
|
568 |
"stopwords_min_cutoff": 0.15,
|
569 |
-
"
|
570 |
-
"
|
571 |
"cond_check_lang_id": True,
|
572 |
"lang_id_min_cutoff": 0.75,
|
573 |
"cond_check_perplexity": True,
|
@@ -597,8 +597,8 @@ parameters_filtering_so = {
|
|
597 |
"words_augmentation_join_char": "",
|
598 |
"cond_check_stopwords": False,
|
599 |
"stopwords_min_cutoff": 0,
|
600 |
-
"
|
601 |
-
"
|
602 |
"cond_check_lang_id": True,
|
603 |
"lang_id_min_cutoff": 0.75,
|
604 |
"cond_check_perplexity": False,
|
@@ -628,8 +628,8 @@ parameters_filtering_sw = {
|
|
628 |
"words_augmentation_join_char": "",
|
629 |
"cond_check_stopwords": True,
|
630 |
"stopwords_min_cutoff": 0,
|
631 |
-
"
|
632 |
-
"
|
633 |
"cond_check_lang_id": True,
|
634 |
"lang_id_min_cutoff": 0.75,
|
635 |
"cond_check_perplexity": False,
|
@@ -659,8 +659,8 @@ parameters_filtering_ta = {
|
|
659 |
"words_augmentation_join_char": "",
|
660 |
"cond_check_stopwords": True,
|
661 |
"stopwords_min_cutoff": 0,
|
662 |
-
"
|
663 |
-
"
|
664 |
"cond_check_lang_id": True,
|
665 |
"lang_id_min_cutoff": 0.75,
|
666 |
"cond_check_perplexity": False,
|
@@ -690,8 +690,8 @@ parameters_filtering_te = {
|
|
690 |
"words_augmentation_join_char": "",
|
691 |
"cond_check_stopwords": True,
|
692 |
"stopwords_min_cutoff": 0,
|
693 |
-
"
|
694 |
-
"
|
695 |
"cond_check_lang_id": True,
|
696 |
"lang_id_min_cutoff": 0.75,
|
697 |
"cond_check_perplexity": False,
|
@@ -721,8 +721,8 @@ parameters_filtering_ur = {
|
|
721 |
"words_augmentation_join_char": "",
|
722 |
"cond_check_stopwords": True,
|
723 |
"stopwords_min_cutoff": 0,
|
724 |
-
"
|
725 |
-
"
|
726 |
"cond_check_lang_id": True,
|
727 |
"lang_id_min_cutoff": 0.75,
|
728 |
"cond_check_perplexity": False,
|
@@ -752,8 +752,8 @@ parameters_filtering_vi = {
|
|
752 |
"words_augmentation_join_char": " ",
|
753 |
"cond_check_stopwords": True,
|
754 |
"stopwords_min_cutoff": 0,
|
755 |
-
"
|
756 |
-
"
|
757 |
"cond_check_lang_id": True,
|
758 |
"lang_id_min_cutoff": 0.75,
|
759 |
"cond_check_perplexity": False,
|
@@ -783,8 +783,8 @@ parameters_filtering_yo = {
|
|
783 |
"words_augmentation_join_char": "",
|
784 |
"cond_check_stopwords": True,
|
785 |
"stopwords_min_cutoff": 0,
|
786 |
-
"
|
787 |
-
"
|
788 |
"cond_check_lang_id": True,
|
789 |
"lang_id_min_cutoff": 0.75,
|
790 |
"cond_check_perplexity": False,
|
@@ -814,8 +814,8 @@ parameters_filtering_zh = {
|
|
814 |
"words_augmentation_join_char": "",
|
815 |
"cond_check_stopwords": False,
|
816 |
"stopwords_min_cutoff": 0,
|
817 |
-
"
|
818 |
-
"
|
819 |
"cond_check_lang_id": True,
|
820 |
"lang_id_min_cutoff": 0.75,
|
821 |
"cond_check_perplexity": False,
|
|
|
39 |
"words_augmentation_join_char": "",
|
40 |
"cond_check_stopwords": False,
|
41 |
"stopwords_min_cutoff": 0,
|
42 |
+
"cond_check_flagged_words": False,
|
43 |
+
"flagged_words_max_cutoff": 0.2,
|
44 |
"cond_check_lang_id": True,
|
45 |
"lang_id_min_cutoff": 0.70,
|
46 |
"cond_check_perplexity": False,
|
|
|
70 |
"words_augmentation_join_char": "",
|
71 |
"cond_check_stopwords": True,
|
72 |
"stopwords_min_cutoff": 0,
|
73 |
+
"cond_check_flagged_words": False,
|
74 |
+
"flagged_words_max_cutoff": 0.2,
|
75 |
"cond_check_lang_id": True,
|
76 |
"lang_id_min_cutoff": 0.6,
|
77 |
"cond_check_perplexity": True,
|
|
|
101 |
"words_augmentation_join_char": "",
|
102 |
"cond_check_stopwords": True,
|
103 |
"stopwords_min_cutoff": 0,
|
104 |
+
"cond_check_flagged_words": False,
|
105 |
+
"flagged_words_max_cutoff": 0.2,
|
106 |
"cond_check_lang_id": True,
|
107 |
"lang_id_min_cutoff": 0.75,
|
108 |
"cond_check_perplexity": True,
|
|
|
132 |
"words_augmentation_join_char": "",
|
133 |
"cond_check_stopwords": True,
|
134 |
"stopwords_min_cutoff": 0,
|
135 |
+
"cond_check_flagged_words": False,
|
136 |
+
"flagged_words_max_cutoff": 0.2,
|
137 |
"cond_check_lang_id": True,
|
138 |
"lang_id_min_cutoff": 0.75,
|
139 |
"cond_check_perplexity": False,
|
|
|
163 |
"words_augmentation_join_char": "",
|
164 |
"cond_check_stopwords": True,
|
165 |
"stopwords_min_cutoff": 0,
|
166 |
+
"cond_check_flagged_words": False,
|
167 |
+
"flagged_words_max_cutoff": 0.2,
|
168 |
"cond_check_lang_id": True,
|
169 |
"lang_id_min_cutoff": 0.75,
|
170 |
"cond_check_perplexity": False,
|
|
|
194 |
"words_augmentation_join_char": "",
|
195 |
"cond_check_stopwords": True,
|
196 |
"stopwords_min_cutoff": 0.05,
|
197 |
+
"cond_check_flagged_words": False,
|
198 |
+
"flagged_words_max_cutoff": 0.2,
|
199 |
"cond_check_lang_id": True,
|
200 |
"lang_id_min_cutoff": 0.75,
|
201 |
"cond_check_perplexity": False,
|
|
|
225 |
"words_augmentation_join_char": "",
|
226 |
"cond_check_stopwords": True,
|
227 |
"stopwords_min_cutoff": 0,
|
228 |
+
"cond_check_flagged_words": False,
|
229 |
+
"flagged_words_max_cutoff": 0.2,
|
230 |
"cond_check_lang_id": True,
|
231 |
"lang_id_min_cutoff": 0.75,
|
232 |
"cond_check_perplexity": True,
|
|
|
256 |
"words_augmentation_join_char": "",
|
257 |
"cond_check_stopwords": True,
|
258 |
"stopwords_min_cutoff": 0.3,
|
259 |
+
"cond_check_flagged_words": True,
|
260 |
+
"flagged_words_max_cutoff": 0.045,
|
261 |
"cond_check_lang_id": True,
|
262 |
"lang_id_min_cutoff": 0.80,
|
263 |
"cond_check_perplexity": True,
|
|
|
287 |
"words_augmentation_join_char": "",
|
288 |
"cond_check_stopwords": True,
|
289 |
"stopwords_min_cutoff": 0.2,
|
290 |
+
"cond_check_flagged_words": False,
|
291 |
+
"flagged_words_max_cutoff": 0.2,
|
292 |
"cond_check_lang_id": True,
|
293 |
"lang_id_min_cutoff": 0.75,
|
294 |
"cond_check_perplexity": True,
|
|
|
318 |
"words_augmentation_join_char": "",
|
319 |
"cond_check_stopwords": True,
|
320 |
"stopwords_min_cutoff": 0,
|
321 |
+
"cond_check_flagged_words": False,
|
322 |
+
"flagged_words_max_cutoff": 0.2,
|
323 |
"cond_check_lang_id": True,
|
324 |
"lang_id_min_cutoff": 0.75,
|
325 |
"cond_check_perplexity": False,
|
|
|
349 |
"words_augmentation_join_char": "",
|
350 |
"cond_check_stopwords": True,
|
351 |
"stopwords_min_cutoff": 0.15,
|
352 |
+
"cond_check_flagged_words": False,
|
353 |
+
"flagged_words_max_cutoff": 0.2,
|
354 |
"cond_check_lang_id": True,
|
355 |
"lang_id_min_cutoff": 0.75,
|
356 |
"cond_check_perplexity": True,
|
|
|
380 |
"words_augmentation_join_char": "",
|
381 |
"cond_check_stopwords": True,
|
382 |
"stopwords_min_cutoff": 0,
|
383 |
+
"cond_check_flagged_words": False,
|
384 |
+
"flagged_words_max_cutoff": 0.2,
|
385 |
"cond_check_lang_id": True,
|
386 |
"lang_id_min_cutoff": 0.75,
|
387 |
"cond_check_perplexity": True,
|
|
|
411 |
"words_augmentation_join_char": "",
|
412 |
"cond_check_stopwords": True,
|
413 |
"stopwords_min_cutoff": 0,
|
414 |
+
"cond_check_flagged_words": False,
|
415 |
+
"flagged_words_max_cutoff": 0.2,
|
416 |
"cond_check_lang_id": True,
|
417 |
"lang_id_min_cutoff": 0.75,
|
418 |
"cond_check_perplexity": True,
|
|
|
442 |
"words_augmentation_join_char": "",
|
443 |
"cond_check_stopwords": True,
|
444 |
"stopwords_min_cutoff": 0.25,
|
445 |
+
"cond_check_flagged_words": False,
|
446 |
+
"flagged_words_max_cutoff": 0.2,
|
447 |
"cond_check_lang_id": True,
|
448 |
"lang_id_min_cutoff": 0.75,
|
449 |
"cond_check_perplexity": True,
|
|
|
473 |
"words_augmentation_join_char": "",
|
474 |
"cond_check_stopwords": True,
|
475 |
"stopwords_min_cutoff": 0,
|
476 |
+
"cond_check_flagged_words": False,
|
477 |
+
"flagged_words_max_cutoff": 0.2,
|
478 |
"cond_check_lang_id": True,
|
479 |
"lang_id_min_cutoff": 0.75,
|
480 |
"cond_check_perplexity": True,
|
|
|
504 |
"words_augmentation_join_char": "",
|
505 |
"cond_check_stopwords": True,
|
506 |
"stopwords_min_cutoff": 0,
|
507 |
+
"cond_check_flagged_words": False,
|
508 |
+
"flagged_words_max_cutoff": 0.2,
|
509 |
"cond_check_lang_id": True,
|
510 |
"lang_id_min_cutoff": 0.75,
|
511 |
"cond_check_perplexity": True,
|
|
|
535 |
"words_augmentation_join_char": "",
|
536 |
"cond_check_stopwords": True,
|
537 |
"stopwords_min_cutoff": 0,
|
538 |
+
"cond_check_flagged_words": False,
|
539 |
+
"flagged_words_max_cutoff": 0.2,
|
540 |
"cond_check_lang_id": True,
|
541 |
"lang_id_min_cutoff": 0.75,
|
542 |
"cond_check_perplexity": True,
|
|
|
566 |
"words_augmentation_join_char": "",
|
567 |
"cond_check_stopwords": True,
|
568 |
"stopwords_min_cutoff": 0.15,
|
569 |
+
"cond_check_flagged_words": False,
|
570 |
+
"flagged_words_max_cutoff": 0.2,
|
571 |
"cond_check_lang_id": True,
|
572 |
"lang_id_min_cutoff": 0.75,
|
573 |
"cond_check_perplexity": True,
|
|
|
597 |
"words_augmentation_join_char": "",
|
598 |
"cond_check_stopwords": False,
|
599 |
"stopwords_min_cutoff": 0,
|
600 |
+
"cond_check_flagged_words": False,
|
601 |
+
"flagged_words_max_cutoff": 0.2,
|
602 |
"cond_check_lang_id": True,
|
603 |
"lang_id_min_cutoff": 0.75,
|
604 |
"cond_check_perplexity": False,
|
|
|
628 |
"words_augmentation_join_char": "",
|
629 |
"cond_check_stopwords": True,
|
630 |
"stopwords_min_cutoff": 0,
|
631 |
+
"cond_check_flagged_words": False,
|
632 |
+
"flagged_words_max_cutoff": 0.2,
|
633 |
"cond_check_lang_id": True,
|
634 |
"lang_id_min_cutoff": 0.75,
|
635 |
"cond_check_perplexity": False,
|
|
|
659 |
"words_augmentation_join_char": "",
|
660 |
"cond_check_stopwords": True,
|
661 |
"stopwords_min_cutoff": 0,
|
662 |
+
"cond_check_flagged_words": False,
|
663 |
+
"flagged_words_max_cutoff": 0.2,
|
664 |
"cond_check_lang_id": True,
|
665 |
"lang_id_min_cutoff": 0.75,
|
666 |
"cond_check_perplexity": False,
|
|
|
690 |
"words_augmentation_join_char": "",
|
691 |
"cond_check_stopwords": True,
|
692 |
"stopwords_min_cutoff": 0,
|
693 |
+
"cond_check_flagged_words": False,
|
694 |
+
"flagged_words_max_cutoff": 0.2,
|
695 |
"cond_check_lang_id": True,
|
696 |
"lang_id_min_cutoff": 0.75,
|
697 |
"cond_check_perplexity": False,
|
|
|
721 |
"words_augmentation_join_char": "",
|
722 |
"cond_check_stopwords": True,
|
723 |
"stopwords_min_cutoff": 0,
|
724 |
+
"cond_check_flagged_words": False,
|
725 |
+
"flagged_words_max_cutoff": 0.2,
|
726 |
"cond_check_lang_id": True,
|
727 |
"lang_id_min_cutoff": 0.75,
|
728 |
"cond_check_perplexity": False,
|
|
|
752 |
"words_augmentation_join_char": " ",
|
753 |
"cond_check_stopwords": True,
|
754 |
"stopwords_min_cutoff": 0,
|
755 |
+
"cond_check_flagged_words": False,
|
756 |
+
"flagged_words_max_cutoff": 0.2,
|
757 |
"cond_check_lang_id": True,
|
758 |
"lang_id_min_cutoff": 0.75,
|
759 |
"cond_check_perplexity": False,
|
|
|
783 |
"words_augmentation_join_char": "",
|
784 |
"cond_check_stopwords": True,
|
785 |
"stopwords_min_cutoff": 0,
|
786 |
+
"cond_check_flagged_words": False,
|
787 |
+
"flagged_words_max_cutoff": 0.2,
|
788 |
"cond_check_lang_id": True,
|
789 |
"lang_id_min_cutoff": 0.75,
|
790 |
"cond_check_perplexity": False,
|
|
|
814 |
"words_augmentation_join_char": "",
|
815 |
"cond_check_stopwords": False,
|
816 |
"stopwords_min_cutoff": 0,
|
817 |
+
"cond_check_flagged_words": False,
|
818 |
+
"flagged_words_max_cutoff": 0.2,
|
819 |
"cond_check_lang_id": True,
|
820 |
"lang_id_min_cutoff": 0.75,
|
821 |
"cond_check_perplexity": False,
|