from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import torch
import spaces

# Load the processor and model
processor = AutoProcessor.from_pretrained(
    'allenai/Molmo-1B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

model = AutoModelForCausalLM.from_pretrained(
    'allenai/Molmo-1B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)


@spaces.GPU(duration=120)
def process_image_and_text(image, text):
    # Process the image and text
    inputs = processor.process(
        images=[Image.fromarray(image)],
        text=text
    )

    # Move inputs to the correct device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate output
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer
    )

    # Only get generated tokens; decode them to text
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text

# >>> This photograph captures a small black puppy, likely a Labrador or a similar breed,
#     sitting attentively on a weathered wooden deck. The deck, composed of three...
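
# A minimal usage sketch, assuming a local image file "puppy.jpg"
# (hypothetical path): the handler expects a NumPy array, so the PIL
# image is converted first.
import numpy as np

image = np.array(Image.open("puppy.jpg").convert("RGB"))
print(process_image_and_text(image, "Describe this image."))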


import re

import cv2


class Video:
    def __init__(self, prompt):
        self.prompt = prompt
        self.output_dir = None

    # Read a .mp4 file and process one frame every `interval` seconds.
    def read_frame(self, file, interval=1):
        video = cv2.VideoCapture(file)
        fps = video.get(cv2.CAP_PROP_FPS)

        # e.g. fps = 24 frames/sec and interval = 1 sec, so sample every 24th frame
        frame_interval = max(1, int(fps * interval))
        frame_count = 0
        while True:
            success, frame = video.read()
            if not success:
                break

            # Send every frame_interval-th frame to Molmo, which returns
            # either coordinates (x, y) or nothing.
            if frame_count % frame_interval == 0:
                self.find(frame)
            frame_count += 1
        video.release()

    def find(self, frame):
        """
        Run the model on the frame with the stored prompt and, if the output
        contains coordinates, annotate the frame with a pointer at that spot.
        Otherwise the caller simply moves on to the next frame.
        """
        # OpenCV frames are BGR; the processor expects RGB.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        text = process_image_and_text(rgb, self.prompt)
        coordinates = self.extract_coordinates(text)
        if coordinates is not None:
            x, y = coordinates
            self.annotate_the_image_with_pointer(x, y, frame)

    @staticmethod
    def extract_coordinates(text):
        # Assumes the model's answer contains a pair like "(12.3, 45.6)";
        # returns (x, y) as floats, or None if no pair is found.
        match = re.search(r"\(([^)]+)\)", text)
        if not match:
            return None
        # Split the captured text on the comma to get the x and y values.
        x, y = map(float, match.group(1).split(','))
        return (x, y)

    @staticmethod
    def annotate_the_image_with_pointer(x, y, frame):
        # Draw a small blue (BGR) circle at the predicted point,
        # treating the coordinates as pixel positions as in the original.
        return cv2.circle(frame, (int(x), int(y)), 2, (255, 0, 0), 2)
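

# A hedged usage sketch: "clip.mp4" is a hypothetical path. This samples
# one frame per second and points at whatever the prompt names.
video = Video("Point to the dog.")
video.read_frame("clip.mp4", interval=1)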