DiegoTheExplorar committed on
Commit
9da3c99
·
verified ·
1 Parent(s): 3a3c829

Update DataPPwithspecial.py

Browse files
Files changed (1) hide show
  1. DataPPwithspecial.py +55 -55
DataPPwithspecial.py CHANGED
@@ -1,55 +1,55 @@
1
- import pandas as pd
2
- import numpy as np
3
- from sklearn.model_selection import train_test_split
4
- import tensorflow as tf
5
-
6
- def preprocess():
7
- # Load dataset
8
- data = pd.read_csv('./backend/English_To_Klingon.csv')
9
-
10
-
11
- # Append <BOS> and <EOS> tags to the Klingon sentences
12
- data['klingon'] = data['klingon'].apply(lambda x: '<BOS> ' + x + ' <EOS>')
13
-
14
- # Separate the sentences
15
- english_sentences = data['english'].values
16
- klingon_sentences = data['klingon'].values
17
-
18
- # Split data into training and testing sets. An 80 - 20 split is used here
19
- english_train, english_test, klingon_train, klingon_test = train_test_split(
20
- english_sentences, klingon_sentences, test_size=0.2, random_state=42)
21
-
22
- # Initialize tokenizers with specified vocabulary size
23
- english_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<UNK>')
24
- klingon_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<UNK>')
25
-
26
- # Fit tokenizers on training data
27
- english_tokenizer.fit_on_texts(english_train)
28
- klingon_tokenizer.fit_on_texts(klingon_train)
29
-
30
- # Tokenize the sentences
31
- english_train_sequences = english_tokenizer.texts_to_sequences(english_train)
32
- klingon_train_sequences = klingon_tokenizer.texts_to_sequences(klingon_train)
33
- english_test_sequences = english_tokenizer.texts_to_sequences(english_test)
34
- klingon_test_sequences = klingon_tokenizer.texts_to_sequences(klingon_test)
35
-
36
- # Padding sequences to a fixed length
37
- english_train_padded = tf.keras.preprocessing.sequence.pad_sequences(english_train_sequences, maxlen=50, padding='post')
38
- klingon_train_padded = tf.keras.preprocessing.sequence.pad_sequences(klingon_train_sequences, maxlen=50, padding='post')
39
- english_test_padded = tf.keras.preprocessing.sequence.pad_sequences(english_test_sequences, maxlen=50, padding='post')
40
- klingon_test_padded = tf.keras.preprocessing.sequence.pad_sequences(klingon_test_sequences, maxlen=50, padding='post')
41
-
42
- # Prepare target data for training
43
- klingon_train_input = klingon_train_padded[:, :-1] # The decoder input, which is the Klingon sentence shifted by one position to the right for training data.
44
- klingon_train_target = klingon_train_padded[:, 1:] # The target output, which is the same sentence shifted by one position to the left for training data.
45
- klingon_train_target = np.expand_dims(klingon_train_target, -1)
46
-
47
- # Prepare target data for testing
48
- klingon_test_input = klingon_test_padded[:, :-1] # The decoder input for testing data.
49
- klingon_test_target = klingon_test_padded[:, 1:] # The target output for testing data.
50
- klingon_test_target = np.expand_dims(klingon_test_target, -1)
51
-
52
- return (english_tokenizer, klingon_tokenizer, 50, # max_length
53
- english_train_padded, klingon_train_input, klingon_train_target,
54
- english_test_padded, klingon_test_input, klingon_test_target)
55
-
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ import tensorflow as tf
5
+
6
def preprocess(csv_path='English_To_Klingon.csv', max_length=50,
               vocab_size=5000, test_size=0.2, random_state=42):
    """Load the English-to-Klingon corpus and build padded seq2seq arrays.

    Called with no arguments this uses the same file, sizes and seed that
    were previously hard-coded.

    Args:
        csv_path: CSV file containing an ``english`` and a ``klingon`` column.
        max_length: Number of positions every token sequence is padded or
            truncated to. Must be at least 2.
        vocab_size: ``num_words`` limit passed to both tokenizers.
        test_size: Fraction of the sentence pairs held out for testing.
        random_state: Seed for the train/test shuffle.

    Returns:
        A 9-tuple ``(english_tokenizer, klingon_tokenizer, max_length,
        english_train, klingon_train_input, klingon_train_target,
        english_test, klingon_test_input, klingon_test_target)``.
        The English arrays have ``max_length`` columns; the decoder inputs
        and targets have ``max_length - 1`` positions, and the targets carry
        an extra trailing axis of size 1.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """
    if max_length < 2:
        raise ValueError(
            'max_length must be at least 2 so that decoder input and '
            'target are non-empty')

    data = pd.read_csv(csv_path)

    # A missing cell is read as a float NaN, which cannot be concatenated
    # with the marker strings below; drop such pairs instead of crashing.
    data = data.dropna(subset=['english', 'klingon'])

    # astype(str) also covers cells that the CSV reader parsed as numbers.
    english_sentences = data['english'].astype(str).values
    # NOTE(review): with the Tokenizer's default filters and lowercasing the
    # markers are presumably stored in the vocabulary as 'bos' / 'eos' --
    # confirm before looking them up by name downstream.
    klingon_sentences = ('<BOS> ' + data['klingon'].astype(str) + ' <EOS>').values

    english_train, english_test, klingon_train, klingon_test = train_test_split(
        english_sentences, klingon_sentences,
        test_size=test_size, random_state=random_state)

    english_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=vocab_size, oov_token='<UNK>')
    klingon_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=vocab_size, oov_token='<UNK>')

    # Fit on the training split only; test words outside the fitted
    # vocabulary are then mapped to the '<UNK>' token.
    english_tokenizer.fit_on_texts(english_train)
    klingon_tokenizer.fit_on_texts(klingon_train)

    english_train_padded = _encode_and_pad(english_tokenizer, english_train, max_length)
    klingon_train_padded = _encode_and_pad(klingon_tokenizer, klingon_train, max_length)
    english_test_padded = _encode_and_pad(english_tokenizer, english_test, max_length)
    klingon_test_padded = _encode_and_pad(klingon_tokenizer, klingon_test, max_length)

    klingon_train_input, klingon_train_target = _split_decoder_pairs(klingon_train_padded)
    klingon_test_input, klingon_test_target = _split_decoder_pairs(klingon_test_padded)

    return (english_tokenizer, klingon_tokenizer, max_length,
            english_train_padded, klingon_train_input, klingon_train_target,
            english_test_padded, klingon_test_input, klingon_test_target)


def _encode_and_pad(tokenizer, sentences, max_length):
    """Convert sentences to integer ids and pad them at the end to max_length."""
    sequences = tokenizer.texts_to_sequences(sentences)
    # NOTE(review): pad_sequences truncates from the front by default, so a
    # sentence longer than max_length would lose its leading tokens
    # (including <BOS>) -- confirm whether truncating='post' is wanted.
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_length, padding='post')


def _split_decoder_pairs(padded):
    """Return (decoder input, decoder target) built from one padded id matrix."""
    decoder_input = padded[:, :-1]  # every position except the last
    decoder_target = np.expand_dims(padded[:, 1:], -1)  # every position except the first
    return decoder_input, decoder_target
55
+