from sklearn.feature_extraction.text import TfidfVectorizer
from TurkishStemmer import TurkishStemmer
import nltk
import string
# pickle is used for loading Python objects (the saved scikit-learn models)
import pickle
import streamlit as st
import sklearn


def custom_tokenizer_with_Turkish_stemmer(text):
    # The text is a (unicode) str, so str.translate() takes a mapping of code points.
    # trans_table maps every punctuation and digit character to None, which simply
    # removes those characters before tokenization.
    tokens = [word for word in nltk.word_tokenize(text.translate(trans_table))]
    stems = [stemmerTR.stem(item.lower()) for item in tokens]
    return stems


def predictSMSdata(test_text):
    categories = ["legitimate", "spam"]
    categories.sort()

    # load the trained classifier
    filename1 = "LinearSVC_SMS_spam_TR.pickle"
    with open(filename1, "rb") as file_handle1:
        classifier = pickle.load(file_handle1)

    # load the fitted TF-IDF vectorizer for transforming the test text
    filename2 = "tfidf_vectorizer_TR.pickle"
    with open(filename2, "rb") as file_handle2:
        tfidf_vectorizer = pickle.load(file_handle2)

    test_list = [test_text]
    tfidf_vectorizer_vectors_test = tfidf_vectorizer.transform(test_list)
    predicted = classifier.predict(tfidf_vectorizer_vectors_test)
    # return the predicted category so the caller can display it
    return categories[predicted[0]]


trans_table = {ord(c): None for c in string.punctuation + string.digits}
stemmerTR = TurkishStemmer()

text = st.text_area("enter some text!")
if text:
    out = predictSMSdata(text)
    st.json(out)
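# --- Hedged sketch, not part of the original app: one plausible way the two pickles
# --- loaded above could have been produced. The dataset file name, column names, and
# --- label encoding (0 = legitimate, 1 = spam, matching the sorted category list) are
# --- assumptions for illustration only, not taken from the original source.
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Hypothetical labeled SMS dataset with a "text" column and a numeric "label" column.
df = pd.read_csv("sms_spam_TR.csv")  # assumed file name

# The vectorizer must be fitted with the same custom tokenizer used at prediction time,
# otherwise the unpickled vectorizer will not reproduce the training-time features.
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer_with_Turkish_stemmer)
X = tfidf_vectorizer.fit_transform(df["text"])
y = df["label"]  # assumed: 0 = legitimate, 1 = spam

classifier = LinearSVC()
classifier.fit(X, y)

# Persist both objects under the file names the Streamlit app expects to load.
with open("LinearSVC_SMS_spam_TR.pickle", "wb") as f:
    pickle.dump(classifier, f)
with open("tfidf_vectorizer_TR.pickle", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)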