File size: 2,518 Bytes
921a194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Hugging Face Hub identifier of the checkpoint used by this script.
MODEL_NAME = "allenai/MolmoE-1B-0924"
from transformers import AutoModelForCausalLM
# NOTE: the model is deliberately NOT instantiated here. It is loaded once,
# further down, with explicit dtype/device arguments; an additional
# default-argument load at this point would be discarded as soon as `model`
# is rebound there, costing only load time and memory.
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import requests

# Both loaders take the checkpoint id from MODEL_NAME so the processor and the
# model always come from the same repository.
# trust_remote_code=True is passed because the checkpoint ships its own
# modelling/processing code, which transformers runs locally -- only use it
# with repositories you trust.

# load the processor
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

# load the model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto',
)

# download the demo image
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an immediate, readable exception
# instead of letting PIL fail on an error page.
response = requests.get(
    "https://picsum.photos/id/237/536/354",
    stream=True,
    timeout=30,
)
response.raise_for_status()

# process the image and text
inputs = processor.process(
    images=[Image.open(response.raw)],
    text="Describe this image."
)

# Put every input tensor on the model's device and add a leading batch
# dimension of size 1.
inputs = {
    name: tensor.to(model.device).unsqueeze(0)
    for name, tensor in inputs.items()
}

# Generation settings: at most 200 new tokens, stopping early when the
# <|endoftext|> string is produced.
generation_config = GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>")

output = model.generate_from_batch(
    inputs,
    generation_config,
    tokenizer=processor.tokenizer,
)

# Skip the first prompt-length positions so that only the newly generated
# tokens are decoded to text.
prompt_length = inputs['input_ids'].size(1)
generated_tokens = output[0, prompt_length:]
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

# Show the generated description.
print(generated_text)

# >>> This photograph captures a small black puppy, likely a Labrador or a similar breed,
#     sitting attentively on a weathered wooden deck. The deck, composed of three...


# import cv2


# class Solution():
#     def __init__(self,prompt):
#         self.prompt= prompt
#         self.output_dir=None
        
#     # Read an .mp4 file and get its frames at a fixed time interval.
#     def read_frame(self,file,interval=1):
#         video=cv2.VideoCapture(file)
#         fps= video.get(cv2.CAP_PROP_FPS)

#         frame_interval= fps*interval  # e.g. fps = 24 frames/sec and interval = 1 sec give a frame interval of 24 frames
#         while True:
#             success, frame=video.read()
#             if not success:
#                 break

#             if frame % frame_interval==0:
#                 # process this frame
#                 """
#                 .. to do 
#                 """
                
#     def find(self,input_message):
        
        
                
            



    # read an .mp4 file
    # get frames spaced at an interval of N