Update utils.py
utils.py CHANGED
@@ -63,12 +63,15 @@ import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
-nltk.download('punkt')
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 
+# for the normalisation
+nltk.download('punkt')
+nltk.download('stopwords')
+german_stopwords = set(stopwords.words('german'))
 
 
 ################################################
@@ -109,9 +112,7 @@ def normalise_prompt (prompt):
     tokens = [word for word in tokens if word.isalnum()]
 
     # Stop word removal
-
-    stop_words = set(stopwords.words('deutsch'))
-    tokens = [word for word in tokens if not word in stop_words]
+    tokens = [word for word in tokens if not word in german_stopwords]
     # 5. Lemmatisation: bring words into their base form so texts can be compared more easily
     nltk.download('wordnet')
     lemmatizer = WordNetLemmatizer()