srivarshan committed on
Commit
a90b4b9
·
1 Parent(s): 876b82d

Add text preprocessing

Browse files
Files changed (2) hide show
  1. app.py +2 -0
  2. preprocess.py +23 -0
app.py CHANGED
@@ -1,11 +1,13 @@
1
  import gradio as gr
2
  from model import CustomModel
 
3
  import os
4
 
5
  os.system("cp -r ./nltk_data/ /home/user/nltk_data")
6
 
7
  def analyze(text):
8
  model = CustomModel()
 
9
  return text
10
 
11
  app = gr.Interface(fn=analyze, inputs="text", outputs="text")
 
1
  import gradio as gr
2
  from model import CustomModel
3
+ from preprocess import preprocess_pipeline
4
  import os
5
 
6
  os.system("cp -r ./nltk_data/ /home/user/nltk_data")
7
 
8
  def analyze(text):
9
  model = CustomModel()
10
+ text = preprocess_pipeline(text)
11
  return text
12
 
13
  app = gr.Interface(fn=analyze, inputs="text", outputs="text")
preprocess.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from nltk.corpus import stopwords
3
+ from nltk.stem import SnowballStemmer
4
+
5
+
6
def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        # Loaded lazily so importing this module does not require the NLTK
        # corpus to be available yet, and so the corpus is read only once.
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached


def clean_text(text):
    """Normalize a string into space-separated, stemmed, stop-word-free tokens.

    Every non-word character is treated as a separator, the remaining words
    are filtered against the English stop-word list, and the survivors are
    stemmed with the English Snowball stemmer.

    Args:
        text: The string to clean.

    Returns:
        The stemmed tokens joined by single spaces ("" when nothing remains).
    """
    stop_words = _english_stop_words()
    stemmer = SnowballStemmer("english")
    # Replacing symbols with spaces and then splitting on whitespace also
    # collapses repeated spaces and drops leading/trailing whitespace, so no
    # separate whitespace clean-up passes are needed.
    words = re.sub(r"\W", " ", text).split()
    # The stop-word list is lowercase; compare case-insensitively so that
    # capitalised stop words such as "The" are removed as well.
    return " ".join(
        stemmer.stem(word) for word in words if word.lower() not in stop_words
    )
20
+
21
def preprocess_pipeline(text):
    """Clean one document or a collection of documents with ``clean_text``.

    Args:
        text: Either a single string or an iterable of strings.

    Returns:
        A list of cleaned strings, one per input document. A single string
        is treated as one document and yields a one-element list.
    """
    # A bare string is itself iterable; without this guard it would be
    # cleaned one character at a time.
    documents = [text] if isinstance(text, str) else text
    return [clean_text(document) for document in documents]