kastan committed on
Commit
c0e0a13
·
1 Parent(s): b85aa9e

remove unnecessary pptx dependency

Browse files
Files changed (1) hide show
  1. clip_for_ppts.py +138 -146
clip_for_ppts.py CHANGED
@@ -1,157 +1,149 @@
1
  import os
2
- import sys
3
- import torch
4
  import clip
 
5
  from PIL import Image
6
- from pptx import Presentation
7
- from pptx.enum.shapes import MSO_SHAPE_TYPE
8
- import time
9
 
 
 
 
 
10
 
11
 
12
  class ClipImage:
13
- def __init__(self, path_of_ppt_folders, path_to_save_image_features, mode='image', device='cuda'):
14
- """
15
- :param input_image_path: path of the input image (mode = 'image') or the actual text to be searched (mode='text')
16
- :param path_of_ppt_folders: path of the folder containing all the ppt folders
17
- :param path_to_save_image_features: path to save the image features
18
- :param mode: 'image' or 'text' based on the type of input
19
- :param device: device to run the model on
20
- """
21
- # Path
22
- directory = 'input_features'
23
- path = os.path.join(path_to_save_image_features, directory)
24
- if not os.path.exists(path):
25
- # Create the directory
26
- os.mkdir(path)
27
- print("Directory '% s' created" % directory)
28
-
29
- self.res = []
30
- if not os.path.isdir(path_of_ppt_folders):
31
- raise TypeError(
32
- f"{path_of_ppt_folders} is not a directory. Please only enter a directory")
33
-
34
- # if mode == 'image' and not os.path.exists(input_image_path):
35
- # raise FileNotFoundError(f"{input_image_path} does not exist.")
36
- if not os.path.exists(path_to_save_image_features) or not os.path.isdir(path_to_save_image_features):
37
- raise FileNotFoundError(
38
- f"{path_to_save_image_features} is not a directory or doesn't exist.")
39
- self.mode = mode
40
- self.path_of_ppt_folders = path_of_ppt_folders
41
- self.path_to_save_image_features = path_to_save_image_features
42
- self.device = device
43
-
44
- # consider ViT-L/14 should be the best one
45
- self.model, self.preprocess = clip.load('ViT-B/32', self.device)
46
-
47
- #print("👉 RUNNING CLIP'S ONE-TIME ENCODING STEP... will be slow the first time, and hopefully only the first time.")
48
- # passing in an image as a cheap hack, to make one funciton work for initial embedding.
49
- #self.calculate_similarity('/home/rsalvi/chatbotai/rohan/ai-teaching-assistant-uiuc/lecture_slides/001/Slide1.jpeg')
50
- #print("🔥 DONE with CLIP's ONE TIME ENCODING")
51
-
52
- def text_to_image_search(self, search_text: str, top_k_to_return: int = 4):
53
- """ Written after the fact by kastan, so that we don't have to call init every time. """
54
- assert type(search_text) == str, f"Must provide a single string, instead I got type {type(search_text)}"
55
- # self.create_input_features(search_text, mode='text')
56
- self.mode = 'text'
57
- return self.calculate_similarity(search_text, top_k_to_return)
58
-
59
- # TODO: WIP.
60
- def image_to_images_search(self, input_image, top_k_to_return: int = 4):
61
- """ Written after the fact by kastan, so that we don't have to call init every time. """
62
- self.mode = 'image'
63
- return self.calculate_similarity(input_image, top_k_to_return)
64
-
65
-
66
- def create_input_features(self, input_text_or_img):
67
- if self.mode == 'image':
68
- # Load the image
69
- #input_image = Image.open(input_text_or_img) # Not needed as image comes from gradio in PIL format
70
- # Preprocess the image
71
- input_arr = torch.cat(
72
- [self.preprocess(input_text_or_img).unsqueeze(0)]).to(self.device)
73
-
74
- elif self.mode == 'text':
75
- # Preprocess the text
76
- input_arr = torch.cat(
77
- [clip.tokenize(f"{input_text_or_img}")]).to(self.device)
78
-
79
- # Encode the image or text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  with torch.no_grad():
81
- if self.mode == 'image':
82
- input_features = self.model.encode_image(input_arr)
83
- elif self.mode == 'text':
84
- input_features = self.model.encode_text(input_arr)
85
- input_features /= input_features.norm(dim=-1, keepdim=True)
86
- return input_features
87
-
88
- def new_most_similar_slide_file(self, top_k: int):
89
- # Sort the results
90
- ans = sorted(self.res, key=lambda x: x[2], reverse=True)
91
- return ans[:top_k]
92
-
93
- def calculate_similarity(self, input_text_or_img, topk_val: int = 4):
94
- ## Similarities across folders
95
- self.res = []
96
- all_similarities = []
97
- slide_numbers = []
98
- # Create the input features
99
- input_features = self.create_input_features(input_text_or_img)
100
-
101
- # Iterate through all the folders
102
- ppts = list(os.listdir(self.path_of_ppt_folders))
103
- #start_time = time.monotonic()
104
- for i in ppts:
105
- # Get the path of the folder containing the ppt images
106
- imgs = list(os.listdir(os.path.join(self.path_of_ppt_folders, i)))
107
- slide_numbers.append(imgs)
108
- # Iterate through all the images and preprocess them
109
-
110
-
111
- # Check if the preprocessed file exists and load it
112
- img_flag = os.path.exists(
113
- self.path_to_save_image_features+'/input_features'+"/slides_"+i+"_tensor.pt")
114
- if img_flag:
115
- image_features = torch.load(
116
- self.path_to_save_image_features+'/input_features'+"/slides_"+i+"_tensor.pt", map_location=self.device)
117
- else:
118
- # Encode the images and save the encoding
119
- with torch.no_grad():
120
- image_input = torch.cat([self.preprocess(Image.open(os.path.join(
121
- self.path_of_ppt_folders, i, image))).unsqueeze(0) for image in imgs]).to(self.device)
122
- image_features = self.model.encode_image(image_input)
123
- image_features /= image_features.norm(dim=-1, keepdim=True)
124
- torch.save(image_features,
125
- self.path_to_save_image_features+'/input_features'+"/slides_"+i+"_tensor.pt")
126
- print("Saved the image features (for faster future loading) to: ",
127
- self.path_to_save_image_features+"/slides_"+i+"_tensor.pt")
128
-
129
- # Calculate the similarity between the input image and the images in the folder
130
-
131
- # TODO: THIS REQUIRES REFACTOR. We're only looking in a SINGLE FOLDER. need to APPEND to similarity.
132
- if self.mode == 'image':
133
- similarity = (100.0 * input_features @
134
- image_features.T).softmax(dim=-1)
135
- all_similarities.append((i,similarity))
136
- elif self.mode == 'text':
137
- similarity = (100.0 * input_features @
138
- image_features.T).softmax(dim=-1)
139
- all_similarities.append((i,similarity))
140
-
141
-
142
- ## Looking over all the folders
143
- similarity_results = []
144
-
145
- for j in range(0,len(all_similarities)):
146
- folder_name = all_similarities[j][0]
147
- folder_values = all_similarities[j][1][0]
148
- for i in range(0,len(folder_values)):
149
- self.res.append((folder_name,slide_numbers[j][i],folder_values[i]))
150
-
151
- #print(self.res)
152
-
153
- return self.new_most_similar_slide_file(topk_val)
154
- # Return the sorted results
155
 
156
  # if __name__ == "__main__":
157
 
 
1
  import os
2
+
 
3
  import clip
4
+ import torch
5
  from PIL import Image
 
 
 
6
 
7
+ # import sys
8
+ # from pptx import Presentation
9
+ # from pptx.enum.shapes import MSO_SHAPE_TYPE
10
+ # import time
11
 
12
 
13
  class ClipImage:
14
+
15
+ def __init__(self, path_of_ppt_folders, path_to_save_image_features, mode='image', device='cuda'):
16
+ """
17
+ :param input_image_path: path of the input image (mode = 'image') or the actual text to be searched (mode='text')
18
+ :param path_of_ppt_folders: path of the folder containing all the ppt folders
19
+ :param path_to_save_image_features: path to save the image features
20
+ :param mode: 'image' or 'text' based on the type of input
21
+ :param device: device to run the model on
22
+ """
23
+ # Path
24
+ directory = 'input_features'
25
+ path = os.path.join(path_to_save_image_features, directory)
26
+ if not os.path.exists(path):
27
+ # Create the directory
28
+ os.mkdir(path)
29
+ print("Directory '% s' created" % directory)
30
+
31
+ self.res = []
32
+ if not os.path.isdir(path_of_ppt_folders):
33
+ raise TypeError(f"{path_of_ppt_folders} is not a directory. Please only enter a directory")
34
+
35
+ # if mode == 'image' and not os.path.exists(input_image_path):
36
+ # raise FileNotFoundError(f"{input_image_path} does not exist.")
37
+ if not os.path.exists(path_to_save_image_features) or not os.path.isdir(path_to_save_image_features):
38
+ raise FileNotFoundError(f"{path_to_save_image_features} is not a directory or doesn't exist.")
39
+ self.mode = mode
40
+ self.path_of_ppt_folders = path_of_ppt_folders
41
+ self.path_to_save_image_features = path_to_save_image_features
42
+ self.device = device
43
+
44
+ # consider ViT-L/14 should be the best one
45
+ self.model, self.preprocess = clip.load('ViT-B/32', self.device)
46
+
47
+ #print("👉 RUNNING CLIP'S ONE-TIME ENCODING STEP... will be slow the first time, and hopefully only the first time.")
48
+ # passing in an image as a cheap hack, to make one funciton work for initial embedding.
49
+ #self.calculate_similarity('/home/rsalvi/chatbotai/rohan/ai-teaching-assistant-uiuc/lecture_slides/001/Slide1.jpeg')
50
+ #print("🔥 DONE with CLIP's ONE TIME ENCODING")
51
+
52
+ def text_to_image_search(self, search_text: str, top_k_to_return: int = 4):
53
+ """ Written after the fact by kastan, so that we don't have to call init every time. """
54
+ assert type(search_text) == str, f"Must provide a single string, instead I got type {type(search_text)}"
55
+ # self.create_input_features(search_text, mode='text')
56
+ self.mode = 'text'
57
+ return self.calculate_similarity(search_text, top_k_to_return)
58
+
59
+ # TODO: WIP.
60
+ def image_to_images_search(self, input_image, top_k_to_return: int = 4):
61
+ """ Written after the fact by kastan, so that we don't have to call init every time. """
62
+ self.mode = 'image'
63
+ return self.calculate_similarity(input_image, top_k_to_return)
64
+
65
+ def create_input_features(self, input_text_or_img):
66
+ if self.mode == 'image':
67
+ # Load the image
68
+ #input_image = Image.open(input_text_or_img) # Not needed as image comes from gradio in PIL format
69
+ # Preprocess the image
70
+ input_arr = torch.cat([self.preprocess(input_text_or_img).unsqueeze(0)]).to(self.device)
71
+
72
+ elif self.mode == 'text':
73
+ # Preprocess the text
74
+ input_arr = torch.cat([clip.tokenize(f"{input_text_or_img}")]).to(self.device)
75
+
76
+ # Encode the image or text
77
+ with torch.no_grad():
78
+ if self.mode == 'image':
79
+ input_features = self.model.encode_image(input_arr)
80
+ elif self.mode == 'text':
81
+ input_features = self.model.encode_text(input_arr)
82
+ input_features /= input_features.norm(dim=-1, keepdim=True)
83
+ return input_features
84
+
85
+ def new_most_similar_slide_file(self, top_k: int):
86
+ # Sort the results
87
+ ans = sorted(self.res, key=lambda x: x[2], reverse=True)
88
+ return ans[:top_k]
89
+
90
+ def calculate_similarity(self, input_text_or_img, topk_val: int = 4):
91
+ ## Similarities across folders
92
+ self.res = []
93
+ all_similarities = []
94
+ slide_numbers = []
95
+ # Create the input features
96
+ input_features = self.create_input_features(input_text_or_img)
97
+
98
+ # Iterate through all the folders
99
+ ppts = list(os.listdir(self.path_of_ppt_folders))
100
+ #start_time = time.monotonic()
101
+ for i in ppts:
102
+ # Get the path of the folder containing the ppt images
103
+ imgs = list(os.listdir(os.path.join(self.path_of_ppt_folders, i)))
104
+ slide_numbers.append(imgs)
105
+ # Iterate through all the images and preprocess them
106
+
107
+ # Check if the preprocessed file exists and load it
108
+ img_flag = os.path.exists(self.path_to_save_image_features + '/input_features' + "/slides_" + i + "_tensor.pt")
109
+ if img_flag:
110
+ image_features = torch.load(self.path_to_save_image_features + '/input_features' + "/slides_" + i + "_tensor.pt",
111
+ map_location=self.device)
112
+ else:
113
+ # Encode the images and save the encoding
114
  with torch.no_grad():
115
+ image_input = torch.cat([
116
+ self.preprocess(Image.open(os.path.join(self.path_of_ppt_folders, i, image))).unsqueeze(0) for image in imgs
117
+ ]).to(self.device)
118
+ image_features = self.model.encode_image(image_input)
119
+ image_features /= image_features.norm(dim=-1, keepdim=True)
120
+ torch.save(image_features, self.path_to_save_image_features + '/input_features' + "/slides_" + i + "_tensor.pt")
121
+ print("Saved the image features (for faster future loading) to: ", self.path_to_save_image_features + "/slides_" + i + "_tensor.pt")
122
+
123
+ # Calculate the similarity between the input image and the images in the folder
124
+
125
+ # TODO: THIS REQUIRES REFACTOR. We're only looking in a SINGLE FOLDER. need to APPEND to similarity.
126
+ if self.mode == 'image':
127
+ similarity = (100.0 * input_features @ image_features.T).softmax(dim=-1)
128
+ all_similarities.append((i, similarity))
129
+ elif self.mode == 'text':
130
+ similarity = (100.0 * input_features @ image_features.T).softmax(dim=-1)
131
+ all_similarities.append((i, similarity))
132
+
133
+ ## Looking over all the folders
134
+ similarity_results = []
135
+
136
+ for j in range(0, len(all_similarities)):
137
+ folder_name = all_similarities[j][0]
138
+ folder_values = all_similarities[j][1][0]
139
+ for i in range(0, len(folder_values)):
140
+ self.res.append((folder_name, slide_numbers[j][i], folder_values[i]))
141
+
142
+ #print(self.res)
143
+
144
+ return self.new_most_similar_slide_file(topk_val)
145
+ # Return the sorted results
146
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  # if __name__ == "__main__":
149