Spaces: Sleeping
shambhuDATA
committed on
Update app.py
app.py
CHANGED
@@ -1,79 +1,107 @@
-MODEL_NAME="allenai/MolmoE-1B-0924"
-from transformers import AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
 from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
 from PIL import Image
-import requests
+import torch
+import spaces

-# load the processor
+# Load the processor and model
 processor = AutoProcessor.from_pretrained(
-    'allenai/MolmoE-1B-0924',
+    'allenai/Molmo-1B-D-0924',
     trust_remote_code=True,
     torch_dtype='auto',
     device_map='auto'
 )

-# load the model
 model = AutoModelForCausalLM.from_pretrained(
-    'allenai/MolmoE-1B-0924',
+    'allenai/Molmo-1B-D-0924',
     trust_remote_code=True,
     torch_dtype='auto',
     device_map='auto'
 )

-# process the image and text
-inputs = processor.process(
-    images=[Image.open(requests.get("https://picsum.photos/id/237/536/354", stream=True).raw)],
-    text="Describe this image."
-)
-
-# move inputs to the correct device and make a batch of size 1
-inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
-
-# generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
-output = model.generate_from_batch(
-    inputs,
-    GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
-    tokenizer=processor.tokenizer
-)
-
-# only get generated tokens; decode them to text
-generated_tokens = output[0,inputs['input_ids'].size(1):]
-generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
-# print the generated text
-print(generated_text)
+@spaces.GPU(duration=120)
+def process_image_and_text(image, text):
+    # Process the image and text
+    inputs = processor.process(
+        images=[Image.fromarray(image)],
+        text=text
+    )
+
+    # Move inputs to the correct device and make a batch of size 1
+    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
+
+    # Generate output
+    output = model.generate_from_batch(
+        inputs,
+        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+        tokenizer=processor.tokenizer
+    )
+
+    # Only get generated tokens; decode them to text
+    generated_tokens = output[0, inputs['input_ids'].size(1):]
+    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+    return generated_text

 # >>> This photograph captures a small black puppy, likely a Labrador or a similar breed,
 #     sitting attentively on a weathered wooden deck. The deck, composed of three...

+import cv2
+
+class Video():
+    def __init__(self, prompt):
+        self.prompt = prompt
+        self.output_dir = None
+
+    # Read an .mp4 file and grab one frame per fixed interval.
+    def read_frame(self, file, interval=1):
+        video = cv2.VideoCapture(file)
+        fps = video.get(cv2.CAP_PROP_FPS)
+
+        frame_interval = int(fps * interval)  # e.g. fps = 24 and interval = 1 s -> every 24th frame
+        frame_index = 0
+        while True:
+            success, frame = video.read()
+            if not success:
+                break
+
+            if frame_index % frame_interval == 0:
+                # Send this frame to Molmo, which returns either coordinates (x, y) or None.
+                self.find(frame)
+            frame_index += 1
+
+    def find(self, frame):
+        """
+        Take a frame and the stored prompt, and point to the prompted object in the frame.
+        """
+        text = process_image_and_text(frame, self.prompt)
+        point = extract_coordinates(text)
+        if point is not None:
+            annotate_the_image_with_pointer(point[0], point[1], frame)
+        # otherwise move on to the next frame
+
+import re
+
+def extract_coordinates(text):
+    # Capture an "(x, y)" pair from the model's reply; return it, or None if absent.
+    pattern = re.compile(r"\(([^)]+)\)")
+    co_ord = pattern.search(text)
+    if co_ord:
+        # Split the captured text on the comma to get the x and y values
+        x, y = map(float, co_ord.group(1).split(','))
+        return (x, y)
+    return None
+
+def annotate_the_image_with_pointer(x, y, frame):
+    # cv2.circle needs integer pixel coordinates
+    return cv2.circle(frame, (int(x), int(y)), 2, (255, 0, 0), 2)
@@ -81,8 +109,8 @@ print(generated_text)



-
-
+# read a .mp4 file
+# get frames spaced N seconds apart



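The commit turns the README example into a ZeroGPU handler but does not show the Gradio interface that would call process_image_and_text. A minimal wiring sketch, with every UI label assumed rather than taken from the commit:

import gradio as gr

# Hypothetical UI wiring; not part of the commit.
# gr.Image hands the function a numpy array, matching Image.fromarray(image) above.
demo = gr.Interface(
    fn=process_image_and_text,
    inputs=[gr.Image(label="Image"), gr.Textbox(label="Prompt")],
    outputs=gr.Textbox(label="Molmo output"),
)

if __name__ == "__main__":
    demo.launch()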
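Nothing in the commit drives the Video class either; a minimal driver could look like this, with the file name and prompt as placeholders:

# Hypothetical usage; "sample.mp4" and the prompt are placeholders.
video = Video(prompt="Point to the dog.")
video.read_frame("sample.mp4", interval=1)  # analyse one frame per second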
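extract_coordinates assumes the model answers with a bare "(x, y)" pair. Molmo's pointing mode usually replies with an XML-like point tag whose coordinates are percentages of the image size, so a second parser may be needed; a sketch, assuming that output shape:

import re

# Molmo pointing replies typically look like:
#   <point x="61.5" y="40.2" alt="dog">dog</point>
# with x and y given as percentages of image width and height.
def extract_point_percent(text):
    match = re.search(r'<point\s+x="([\d.]+)"\s+y="([\d.]+)"', text)
    if match:
        return float(match.group(1)), float(match.group(2))
    return None

def to_pixels(point, width, height):
    # Convert percentage coordinates to the integer pixels cv2.circle expects.
    x_pct, y_pct = point
    return int(x_pct / 100 * width), int(y_pct / 100 * height)

print(extract_point_percent('<point x="61.5" y="40.2" alt="dog">dog</point>'))
# (61.5, 40.2)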
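read_frame decodes every frame and throws most of them away. Seeking by frame index gives the same interval sampling with less decode work; a sketch (the fps fallback covers containers that report 0):

import cv2

def sample_frames(path, interval=1):
    # Yield one frame every `interval` seconds by seeking on the capture.
    video = cv2.VideoCapture(path)
    fps = video.get(cv2.CAP_PROP_FPS) or 30.0  # assume 30 fps if the container reports 0
    step = max(1, int(fps * interval))
    index = 0
    while True:
        video.set(cv2.CAP_PROP_POS_FRAMES, index)
        success, frame = video.read()
        if not success:
            break
        yield frame
        index += step
    video.release()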