File size: 15,173 Bytes
cfe9686 c3994cd cfe9686 c3994cd cfe9686 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 |
import streamlit as st
import numpy as np
import cv2
import tensorflow as tf
from PIL import Image
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
import pickle
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from PIL import Image
# from google.colab.patches import cv2_imshow
def label_smoothing(y_true, y_pred):
    """Binary cross-entropy loss computed with a label-smoothing factor of 0.1.

    :param y_true: Ground-truth targets.
    :param y_pred: Model predictions.
    :return: The value of ``tf.keras.losses.binary_crossentropy``.
    """
    smoothed_loss = tf.keras.losses.binary_crossentropy(
        y_true,
        y_pred,
        label_smoothing=0.1,
    )
    return smoothed_loss
def sparse_cross_entropy(y_true, y_pred):
    """Return the mean sparse softmax cross-entropy between labels and logits.

    :param y_true: Integer labels, passed as ``labels``.
    :param y_pred: Model outputs, passed as ``logits``.
    :return: The per-element losses reduced to their mean.
    """
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y_true,
            logits=y_pred,
        )
    )
# Load the three saved Keras models at import time.
# model1: its prediction is turned into the displayed category in main();
# the custom `label_smoothing` loss is supplied through custom_objects.
model1 = load_model('densenet.h5',custom_objects={'label_smoothing': label_smoothing})
# image_model_transfer: produces the transfer-values consumed by the caption decoder.
image_model_transfer=load_model("image_model_transfer.h5")
# decoder_model: caption decoder; the custom `sparse_cross_entropy` loss is
# supplied through custom_objects.
decoder_model=load_model("Final_ISRO_DenseNet201_Epoch50.h5",custom_objects={'sparse_cross_entropy': sparse_cross_entropy})
class TokenizerWrap(Tokenizer):
    """Wrap the Tokenizer-class from Keras with more functionality."""

    def __init__(self, texts, num_words=None):
        """
        Fit the vocabulary on the texts and build the inverse lookup.

        :param texts: List of strings with the data-set.
        :param num_words: Max number of words to use.
        """
        # The constructor must be spelled __init__ (double underscores),
        # otherwise Python never calls it when the class is instantiated.
        super().__init__(num_words=num_words)

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)

        # word_index maps word -> integer-token; index_to_word is the
        # inverse mapping from integer-token -> word.
        self.index_to_word = {
            token: word for word, token in self.word_index.items()
        }

    def token_to_word(self, token):
        """Lookup a single word from an integer-token.

        Token 0 has no entry in the lookup and is returned as a single space.
        """
        return " " if token == 0 else self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Convert a list of integer-tokens to a string.

        Tokens equal to 0 are skipped; the remaining words are joined
        with single spaces.
        """
        return " ".join(
            self.index_to_word[token] for token in tokens if token != 0
        )

    def captions_to_tokens(self, captions_listlist):
        """
        Convert a list-of-list with text-captions to
        a list-of-list of integer-tokens.
        """
        # Note that texts_to_sequences() takes a list of texts.
        return [
            self.texts_to_sequences(captions_list)
            for captions_list in captions_listlist
        ]
# Load the pickled training labels.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load these artifacts from a trusted source.
with open('Train_Label.pickle', 'rb') as efile:
    labels=pickle.load(efile)
# Load the pickled caption tokenizer (same trust caveat as above).
with open('tokenizer.pkl', 'rb') as efile:
    tokenizer=pickle.load(efile)
# Fit a label encoder on the training labels; `labels` is rebound to the
# encoded values and `le` is used in main() to map indices back to labels.
le=LabelEncoder()
labels=le.fit_transform(labels)
def framing(video):
    """Read every frame of a video and compute its Canny edge image.

    :param video: Video source handed to ``cv2.VideoCapture``.
    :return: Tuple ``(fr_pre, fr)``: ``fr`` holds the frames as read and
        ``fr_pre`` the edge image of each frame, in the same order.
    """
    fr = []      # frames as read from the video
    fr_pre = []  # edge-detected version of each frame
    cap = cv2.VideoCapture(video)
    try:
        while cap.isOpened():
            # ret reports whether a frame was returned; frame is its data.
            ret, frame = cap.read()
            if not ret:
                break
            # Edges are extracted from the grayscale version of the frame.
            grayed = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            canned = cv2.Canny(grayed, 320, 320)
            fr.append(frame)
            fr_pre.append(canned)
            # NOTE(review): left over from an interactive preview (no window
            # is shown any more); kept so behaviour is unchanged. Pressing
            # 'q' stops reading early.
            k = cv2.waitKey(10) & 0XFF
            if k == ord('q'):
                break
    finally:
        # Release the capture even if frame processing raised.
        cap.release()
        cv2.destroyAllWindows()
    return fr_pre, fr
def difference_of_frames(frames):
    """Return the absolute difference of every pair of consecutive frames.

    :param frames: Indexable sequence of images.
    :return: List with one ``cv2.absdiff`` result per consecutive pair
        (``len(frames) - 1`` entries; empty for fewer than two frames).
    """
    pair_count = len(frames) - 1
    return [
        cv2.absdiff(frames[idx], frames[idx + 1])
        for idx in range(pair_count)
    ]
def cal_threshold(diff, std_multiplier=4):
    """Compute the global threshold over all frame differences.

    The threshold is ``mean + std_multiplier * standard deviation`` taken
    over every value in ``diff``.

    :param diff: Frame differences (array-like accepted by NumPy).
    :param std_multiplier: Weight of the standard deviation; defaults to
        the previously hard-coded value 4.
    :return: The threshold value.
    """
    return np.mean(diff) + (std_multiplier * np.std(diff))
def imp_frames(diff, ts, ogframes, std_multiplier=4):
    """Select the original frames whose difference image passes the threshold.

    For every entry of ``diff`` a per-frame threshold
    ``mean + std_multiplier * standard deviation`` is computed; when it is
    at least ``ts`` the frame of ``ogframes`` at the same index is kept.

    :param diff: Sequence of frame-difference images.
    :param ts: Global threshold to compare each per-frame value against.
    :param ogframes: Original frames, indexed like ``diff``.
    :param std_multiplier: Weight of the standard deviation; defaults to
        the previously hard-coded value 4.
    :return: List of selected frames, in their original order.
    """
    key_fr = []
    for i, frame_diff in enumerate(diff):
        fr_ts = np.mean(frame_diff) + (std_multiplier * np.std(frame_diff))
        if fr_ts >= ts:
            key_fr.append(ogframes[i])
    return key_fr
def final_image(video, output_path="Testing1.jpg"):
    """Extract one representative key frame from a video and save it.

    The frames are read with ``framing``, consecutive edge images are
    differenced, and the frames passing the global threshold are collected;
    the middle one of those is written to ``output_path``.

    :param video: Video source handed to ``framing``.
    :param output_path: Where the chosen frame is written; defaults to the
        previously hard-coded ``"Testing1.jpg"``.
    :return: ``output_path``.
    :raises IndexError: If no frame could be read from the video.
    :raises OSError: If the frame could not be written.
    """
    frames, ogframes = framing(video)
    if not ogframes:
        raise IndexError("no frames could be read from video: %r" % (video,))

    key_fr = []
    diff = difference_of_frames(frames)
    if diff:
        ts = cal_threshold(diff)
        key_fr = imp_frames(diff, ts, ogframes)

    # No frame passing the threshold used to crash with an IndexError;
    # fall back to the middle frame of the whole video instead.
    candidates = key_fr if key_fr else ogframes
    frame_no = candidates[len(candidates) // 2]

    # imwrite reports failure through its return value; without this check
    # a stale file from an earlier run could be picked up by the caller.
    if not cv2.imwrite(output_path, frame_no):
        raise OSError("could not write key frame to %r" % (output_path,))
    return output_path
def image_test(image_path):
    """Load an image, resize it to 224x224 and add a leading batch axis.

    :param image_path: Path handed to ``Image.open``.
    :return: NumPy array of the resized image with one extra axis at
        position 0.
    """
    resized = Image.open(image_path).resize((224, 224))
    return np.expand_dims(np.array(resized), axis=0)
def largest_indices(ary, n):
    """Return the flat indices of the ``n`` largest values, largest first.

    :param ary: Array-like of scores; it is flattened before ranking.
    :param n: Number of indices wanted. Values above the number of
        elements are clamped; ``n <= 0`` yields an empty result.
    :return: 1-D integer array of indices into the flattened input,
        ordered by descending value.
    """
    flat = np.asarray(ary).ravel()
    n = min(int(n), flat.size)
    if n <= 0:
        # Without this guard ``[-0:]`` would select every element.
        return np.empty(0, dtype=np.intp)
    # Partition first so only the top-n candidates need a full sort.
    indices = np.argpartition(flat, -n)[-n:]
    return indices[np.argsort(-flat[indices])]
# Marker words for the start and end of a caption (note the leading space
# in mark_end; both are stripped before the vocabulary lookup below).
mark_start = 'ssss'
mark_end = ' eeee'
# Integer-tokens of the two markers in the loaded tokenizer's vocabulary.
token_start = tokenizer.word_index[mark_start.strip()]
token_end = tokenizer.word_index[mark_end.strip()]
def load_image(path, size=None):
    """
    Open the image at the given file-path with PIL, optionally resize it,
    and return it as a NumPy array scaled by 1/255.

    :param path: File-path of the image.
    :param size: Target size passed to ``resize`` (LANCZOS resampling);
        the image is left at its own size when None.
    :return: Array of pixel values divided by 255.0. A 2-dim (gray-scale)
        array is expanded to 3 identical channels on a new last axis.
    """
    picture = Image.open(path)
    if size is not None:
        picture = picture.resize(size=size, resample=Image.LANCZOS)

    pixels = np.array(picture) / 255.0

    # Anything that is not a 2-dim gray-scale array is returned unchanged.
    if pixels.ndim != 2:
        return pixels
    return np.repeat(pixels[:, :, np.newaxis], 3, axis=2)
def greedy_search(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path by always taking
    the highest-scoring next token.

    :param image_path: Path of the image to caption.
    :param max_tokens: Upper limit on the number of decoding steps.
    :return: List of caption words, without the start and end markers.
    """
    # ---------------------------ENCODE IMAGE--------------------------------
    # Load and resize the image, then add a batch axis because the
    # image-model is fed a batch (here: a batch with just one image).
    image = load_image(image_path, size=(224, 224))
    image_batch = np.expand_dims(image, axis=0)

    # Process the image with the pre-trained image-model
    # to get the transfer-values.
    transfer_values = image_model_transfer.predict(image_batch)
    # -------------------------------------------------------------------

    # Pre-allocate the 2-dim array used as input to the decoder: a batch
    # holding a single sequence of integer-tokens.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    # The first input-token is the special start-token.
    token_int = token_start
    predicted_caption = []

    for count_tokens in range(max_tokens):
        # Feed back the token sampled in the previous step
        # (the start-token in the first iteration).
        decoder_input_data[0, count_tokens] = token_int

        # Wrap the input-data in a dict so the inputs are matched by name.
        x_data = {
            'transfer_values_input': transfer_values,
            'decoder_input': decoder_input_data
        }

        # The entire sequence is fed to the decoder on every step although
        # only the output at the current position is used.
        decoder_output = decoder_model.predict(x_data)

        # Only the index of the largest score is needed.
        token_int = np.argmax(decoder_output[0, count_tokens, :])

        # Stop at the end-token without adding it to the caption. Previously
        # the last word was deleted unconditionally, which dropped a real
        # word whenever decoding stopped at max_tokens instead.
        if token_int == token_end:
            break

        # split() discards the blank placeholder returned for token 0.
        predicted_caption.extend(tokenizer.token_to_word(token_int).split())

    return predicted_caption
def beam_search(beam_index, image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path with beam search.

    :param beam_index: Number of hypotheses kept after every step, and
        number of continuations tried per hypothesis.
    :param image_path: Path of the image to caption.
    :param max_tokens: Length the token sequences are grown to
        (start-token included).
    :return: List of caption words of the best-scoring hypothesis, cut at
        the first 'eeee' word and without the leading start word.
    """
    # Encode the image: load, add a batch axis, run the image-model.
    picture = load_image(image_path, size=(224,224))
    transfer_values = image_model_transfer.predict(
        np.expand_dims(picture, axis=0)
    )

    # Each hypothesis is [token list, accumulated score].
    # NOTE(review): the score is the running sum of the decoder outputs of
    # the chosen tokens, and hypotheses keep growing after the end marker.
    beams = [[[token_start], 0.0]]
    step = 0
    while len(beams[0][0]) < max_tokens:
        candidates = []
        for tokens, score in beams:
            padded = pad_sequences([tokens], maxlen=max_tokens, padding='post')
            preds = decoder_model.predict([transfer_values, padded], verbose=0)
            step_scores = preds[0, step, :]
            # Extend the hypothesis with each of the best-scoring tokens.
            for w in np.argsort(step_scores)[-beam_index:]:
                candidates.append([tokens + [w], score + step_scores[w]])
        step += 1
        # Ascending sort by score, then keep the tail (the best ones).
        beams = sorted(candidates, key=lambda entry: entry[1])[-beam_index:]

    # The last entry carries the highest accumulated score.
    best_tokens = beams[-1][0]
    words = [tokenizer.token_to_word(tok) for tok in best_tokens]
    if 'eeee' in words:
        words = words[:words.index('eeee')]
    # Drop the leading start word.
    return words[1:]
def generate_caption_any(image_path):
    """Return the caption for an image as a single space-joined string.

    The caption comes from ``beam_search`` with a beam width of 3. The
    greedy and width-5 captions that used to be computed alongside were
    never used, so those decoder passes are no longer run.

    :param image_path: Path of the image to caption.
    :return: Caption words joined with single spaces.
    """
    return ' '.join(beam_search(beam_index=3, image_path=image_path))
def main():
    """Render the Streamlit page and, on submit, classify and caption a video.

    The user picks one of the bundled example videos. After "Submit" the
    video is shown, a key frame is extracted and saved, the frame is
    classified with ``model1`` and captioned, and both results are written
    to the page.
    """
    st.title("ISRO Video Classification & Captioning")
    st.write('In this project, we introduce a technique for video classification and captioning, harnessing a keyframe extraction method to streamline the process. Utilizing Densenet 201, our model is designed to classify videos by focusing on the most crucial frame, optimizing efficiency and performance. Users can experience our innovative approach by employing any of the provided three videos which have provided as an example')

    # Display name -> video file.
    video_options = {
        "Video 1": "Video001-Scene-001.mp4",
        "Video 2": "Video015-Scene-074.mp4",
        "Video 3": "Video005-Scene-043.mp4",
        "Video 4": "Video002-Scene-023.mp4",
    }
    chosen_name = st.selectbox("Select a video to submit", list(video_options))
    video_path = video_options[chosen_name]

    # Nothing more to do until the user submits.
    if not st.button("Submit"):
        return

    st.video(video_path)

    frame_path = final_image(video_path)
    class_scores = model1.predict(image_test(frame_path))
    caption_text = generate_caption_any(frame_path)
    top_classes = largest_indices(class_scores, 3)

    st.title('The predicted category is:')
    # Only the best of the three top-ranked classes is displayed.
    st.write(le.inverse_transform(top_classes)[0])
    st.title('Caption:')
    st.write(caption_text.capitalize())
# Build the page only when this file is run as the main script.
if __name__ == "__main__":
    main()
|