File size: 1,370 Bytes
4b10007
 
 
 
 
e318707
6f7b5ee
07dd827
4b10007
 
9a63d60
c83f950
e318707
c83f950
4b10007
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61d120e
4b10007
 
 
6f7b5ee
 
 
8afa1c0
6f7b5ee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from sklearn.feature_extraction.text import TfidfVectorizer
from TurkishStemmer import TurkishStemmer
import string
# import for loading python objects (scikit-learn models)
import pickle
import nltk
import streamlit as st
import sklearn

def custom_tokenizer_with_Turkish_stemmer(text):
    """Tokenize *text* with NLTK and return lowercased, stemmed tokens.

    Relies on the module-level ``stemmerTR`` (a ``TurkishStemmer``
    instance) defined further down in this file.

    Args:
        text: Raw input string (a Turkish SMS message).

    Returns:
        list[str]: One stem per token, in original token order.
    """
    # word_tokenize handles punctuation better than a plain split(" ")
    tokens = nltk.word_tokenize(text)
    return [stemmerTR.stem(token.lower()) for token in tokens]

def predictSMSdata(test_text):
    """Classify a single SMS message as "legitimate" or "spam".

    Loads a pickled LinearSVC classifier and its fitted TF-IDF
    vectorizer from the current working directory on every call,
    vectorizes the input, and returns the predicted category label.

    Args:
        test_text: The SMS message text to classify.

    Returns:
        str: "legitimate" or "spam".

    Raises:
        FileNotFoundError: If either pickle file is missing.
    """
    # Sorted so label indices line up with the order used at training time.
    categories = sorted(["legitimate", "spam"])

    # NOTE(security): pickle.load executes arbitrary code during
    # deserialization — only load these model files from a trusted source.
    # "with" guarantees the handles are closed even if loading raises.
    with open("LinearSVC_SMS_spam_TR.pickle", "rb") as model_file:
        classifier = pickle.load(model_file)

    with open("tfidf_vectorizer_TR.pickle", "rb") as vectorizer_file:
        tfidf_vectorizer = pickle.load(vectorizer_file)

    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)
    return categories[predicted[0]]

# Shared stemmer instance used by custom_tokenizer_with_Turkish_stemmer.
stemmerTR = TurkishStemmer()

# Minimal Streamlit UI: classify whatever the user types in.
user_text = st.text_area("enter some text!")
if user_text:
    category = predictSMSdata(user_text)
    st.write("The category of SMS = " + category.upper())