Update app.py
app.py
CHANGED
@@ -1,295 +1,368 @@
-import gradio as gr
-import cv2
-import time
 import openai
-import base64
-import pytz
-import uuid
-from threading import Thread
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from datetime import datetime
-import json
 import os
-
-from moviepy.editor import ImageSequenceClip
-import ffmpeg
-import
 
 api_key = os.getenv("OPEN_AI_KEY")
-user_name = os.getenv("USER_NAME")
-password = os.getenv("PASSWORD")
 
-
-
-
-
-
 """
-AVATARS = (
-    "https://assets-global.website-files.com/63d6dca820934a77a340f31e/63dfb7a21b4c08282d524010_pyramid.png",
-    "https://media.roboflow.com/spaces/openai-white-logomark.png"
-)
-
-# Set your OpenAI API key
-openai.api_key = api_key
-MODEL = "gpt-4o"
-client = openai.OpenAI(api_key=api_key)
-
-# Global variable to stop the video capture loop
-stop_capture = False
-alerts_mode = True
-
-def clip_video_segment_2(input_video_path, start_time, duration):
-    os.makedirs('videos', exist_ok=True)
-    output_video_path = f"videos/{uuid.uuid4()}.mp4"
-
-    # Use ffmpeg-python to clip the video
-    try:
-        (
-            ffmpeg
-            .input(input_video_path, ss=start_time)  # Seek to start_time
-            .output(output_video_path, t=duration, c='copy')  # Set the duration
-            .run(overwrite_output=True)
-        )
-        print('input_video_path', input_video_path, output_video_path)
-        return output_video_path
-    except ffmpeg.Error as e:
-        print(f"Error clipping video: {e}")
-        return None
 
-
-
-
 
-  [old lines 64-73 are not rendered in this diff view]
-    video_clip_path = f"videos/{uuid.uuid4()}.mp4"
-
-    # Get frame size
-    height, width, layers = frames[0].shape
-    size = (width, height)
-
-    # Define the codec and create VideoWriter object
-    fourcc = cv2.VideoWriter_fourcc(*'h264')  # You can also try 'XVID', 'MJPG', etc.
-    out = cv2.VideoWriter(video_clip_path, fourcc, fps, size)
-
-    for frame in frames:
-        out.write(frame)
-
-    out.release()
-
-    return video_clip_path
-
-
-def encode_to_video(frames, fps):
-    os.makedirs('videos', exist_ok=True)
-    video_clip_path = f"videos/{uuid.uuid4()}.mp4"
-
-    # Create a video clip from the frames using moviepy
-    clip = ImageSequenceClip([frame[:, :, ::-1] for frame in frames], fps=fps)  # Convert from BGR to RGB
-    clip.write_videofile(video_clip_path, codec="libx264")
-
-    # Convert the video file to base64
-    with open(video_clip_path, "rb") as video_file:
-        video_data = base64.b64encode(video_file.read()).decode('utf-8')
-
-    return video_clip_path
-
-# Function to process video frames using GPT-4 API
-def process_frames(frames, frames_to_skip=1):
-    os.makedirs('saved_frames', exist_ok=True)
-    curr_frame = 0
-    base64Frames = []
-    while curr_frame < len(frames) - 1:
-        _, buffer = cv2.imencode(".jpg", frames[curr_frame])
-        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
-        curr_frame += frames_to_skip
-    return base64Frames
-
-# Function to check condition using GPT-4 API
-def check_condition(prompt, base64Frames):
-    start_time = time.time()
-    print('checking condition for frames:', len(base64Frames))
-
-    # Save frames as images
-
-
-    messages = [
-        {"role": "system", "content": """You are analyzing video to check if the user's condition is met.
-Please respond with a JSON object in the following format:
-{"condition_met": true/false, "details": "optional details or summary. in the summary DON'T mention the words: image, images, frame, or frames. Instead, make it look like you were provided with video input and avoid referring to individual images or frames explicitly."}"""},
-        {"role": "user", "content": [prompt, *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames)]}
-    ]
 
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=messages,
-
-
     )
 
-
-
-
-
     try:
-  [old lines 144-262 are not rendered in this diff view]
-        yield result
-    return chatbot
-
-
-# Function to stop video capture
-def stop_capture_func():
-    global stop_capture
-    stop_capture = True
-
-# Gradio interface
-with gr.Blocks(title="Conntour", fill_height=True) as demo:
-    with gr.Tab("Analyze"):
-        with gr.Row():
-            video = gr.Video(label="Video Source")
-            with gr.Column():
-                chatbot = gr.Chatbot(label="Events", bubble_full_width=False, avatar_images=AVATARS)
-                prompt = gr.Textbox(label="Enter your prompt alert")
-                start_btn = gr.Button("Start")
-                stop_btn = gr.Button("Stop")
-        start_btn.click(analyze_video_file, inputs=[prompt, video, chatbot], outputs=[chatbot], queue=True)
-        stop_btn.click(stop_capture_func)
-    with gr.Tab("Alerts"):
-        with gr.Row():
-            stream = gr.Textbox(label="Video Source", value="https://streamapi2.eu.loclx.io/video_feed/101 OR rtsp://admin:[email protected]:5678/Streaming/Channels/101")
-            with gr.Column():
-                chatbot = gr.Chatbot(label="Events", bubble_full_width=False, avatar_images=AVATARS)
-                prompt = gr.Textbox(label="Enter your prompt alert")
-                start_btn = gr.Button("Start")
-                stop_btn = gr.Button("Stop")
-        start_btn.click(analyze_stream, inputs=[prompt, stream, chatbot], outputs=[chatbot], queue=True)
-        stop_btn.click(stop_capture_func)
-
-demo.launch(favicon_path='favicon.ico', auth=(user_name, password))
 
 import openai
 import os
+import gradio as gr
+from enum import Enum
+from dataclasses import dataclass, asdict, field
+from typing import List, Optional, Union, Dict, Any
+import json
 
 api_key = os.getenv("OPEN_AI_KEY")
 
+# Define the COCOClass enum
+class COCOClass(Enum):
+    person = 0
+    bicycle = 1
+    car = 2
+    motorcycle = 3
+    airplane = 4
+    bus = 5
+    train = 6
+    truck = 7
+    boat = 8
+    traffic_light = 9
+    fire_hydrant = 10
+    stop_sign = 11
+    parking_meter = 12
+    bench = 13
+    bird = 14
+    cat = 15
+    dog = 16
+    horse = 17
+    sheep = 18
+    cow = 19
+    elephant = 20
+    bear = 21
+    zebra = 22
+    giraffe = 23
+    backpack = 24
+    umbrella = 25
+    handbag = 26
+    tie = 27
+    suitcase = 28
+    frisbee = 29
+    skis = 30
+    snowboard = 31
+    sports_ball = 32
+    kite = 33
+    baseball_bat = 34
+    baseball_glove = 35
+    skateboard = 36
+    surfboard = 37
+    tennis_racket = 38
+    bottle = 39
+    wine_glass = 40
+    cup = 41
+    fork = 42
+    knife = 43
+    spoon = 44
+    bowl = 45
+    banana = 46
+    apple = 47
+    sandwich = 48
+    orange = 49
+    broccoli = 50
+    carrot = 51
+    hot_dog = 52
+    pizza = 53
+    donut = 54
+    cake = 55
+    chair = 56
+    couch = 57
+    potted_plant = 58
+    bed = 59
+    dining_table = 60
+    toilet = 61
+    tv = 62
+    laptop = 63
+    mouse = 64
+    remote = 65
+    keyboard = 66
+    cell_phone = 67
+    microwave = 68
+    oven = 69
+    toaster = 70
+    sink = 71
+    refrigerator = 72
+    book = 73
+    clock = 74
+    vase = 75
+    scissors = 76
+    teddy_bear = 77
+    hair_drier = 78
+    toothbrush = 79
+
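
A quick note on how this enum gets used downstream: parse_investigation (below) indexes it by name, so the mapping works in both directions. A minimal sketch:

# Sketch: COCOClass supports name lookup (what parse_investigation relies on)
# as well as value round-tripping.
assert COCOClass["car"] is COCOClass.car   # index by name
assert COCOClass.car.value == 2            # COCO class id
assert COCOClass(2) is COCOClass.car       # construct from id
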
+# Define data classes
+@dataclass
+class VehicleProps:
+    brand: Optional[str] = None
+    type: Optional[COCOClass] = None  # Should be a vehicle class
+    plate: Optional[str] = None
+
+@dataclass
+class PersonProps:
+    face_images: Optional[List[str]] = field(default_factory=list)
+    age: Optional[int] = None
+    race: Optional[str] = None  # Should be one of the specified races
+    gender: Optional[str] = None  # Male or Female
+    top_color: Optional[str] = None  # Changed from shirt_color
+    bottom_color: Optional[str] = None
+
+@dataclass
+class Activity:
+    prompt: Optional[str] = None
+    type: Optional[str] = None  # "full_screen" or "square"
+
+@dataclass
+class Investigation:
+    target: COCOClass
+    images: List[str]
+    activity: Optional[Activity] = None
+    complex_appearance: Optional[str] = None
+    props: Optional[Union[VehicleProps, PersonProps]] = None
+    primary_color: Optional[str] = None
+    secondary_color: Optional[str] = None
+
+# Default system message (moved to a global variable)
+DEFAULT_SYSTEM_MESSAGE = f"""
+You are a helpful assistant that extracts structured information from text descriptions.
+
+Your task is to parse the following text prompt and extract information to populate an Investigation JSON object as per the definitions provided.
+
+Definitions:
+
+Investigation:
+{{
+    "target": A COCO class name (from the COCOClass enum),
+    "images": List of image URLs,
+    "activity": {{
+        "prompt": A description of an activity, e.g., "crossing the street", "crossing red light", "holding a gun",
+        "type": Either "full_screen" or "square"
+            - "full_screen": When the activity requires the full scene for context (e.g., "seeing a movie").
+            - "square": When the activity context can be understood from a close-up image (e.g., "holding a cat").
+    }},
+    "complex_appearance": Description of appearance details that do not fit into other fields, e.g., "has a hat with Nike logo" or "Tattoo on left arm",
+    "props": Either VehicleProps or PersonProps (only if the target is vehicle or person),
+    "primary_color": Primary color mentioned in the prompt,
+    "secondary_color": Secondary color mentioned in the prompt
+}}
+
+VehicleProps:
+{{
+    "brand": Vehicle brand, e.g., "Mercedes",
+    "type": COCO class name of vehicles (e.g., "truck"),
+    "plate": License plate number, e.g., "123AB"
+}}
+
+PersonProps:
+{{
+    "face_images": List of face image URLs,
+    "age": Age as a number,
+    "race": Race or ethnicity (one of: asian, white, middle eastern, indian, latino, black),
+    "gender": Gender (Male or Female),
+    "top_color": Color of the top garment (e.g., shirt, blouse),  # Changed from shirt_color
+    "bottom_color": Color of the bottom garment (pants, skirt, etc.)
+}}
+
+COCOClass Enum:
+{{
+    {', '.join([f'"{member.name}"' for member in COCOClass])}
+}}
+
+Important Notes:
+
+1. The output JSON should be as minimal as possible. Do not include fields like 'primary_color' or 'secondary_color' if they are not mentioned in the prompt.
+
+2. Be especially careful with the 'activity' and 'complex_appearance' fields. Use them only if the prompt has data that does not fit elsewhere in the JSON. For example:
+    - "a guy with red shirt" -> Map 'red shirt' to 'top_color' in PersonProps.
+    - "a guy with a black hat" -> Since there isn't any field for 'hat', include "black hat" in 'complex_appearance'.
+
+3. Avoid using 'complex_appearance' and 'activity' fields unless absolutely necessary.
+
+4. Do not include undefined fields or fields not mentioned in the prompt.
+
+5. Use the COCOClass enum for the target class name.
+
+Now, process the following prompt:
+
+'''prompt_text'''
+
+Provide the Investigation JSON object, including only the relevant fields based on the prompt. Do not include any explanations.
 """
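
For a concrete sense of the format this prompt asks for, here is a minimal hand-written illustration (not captured model output) of what the model should return for the first default prompt used below:

# Illustration only: expected shape of the model's JSON for
# "A red sports car with a license plate reading 'FAST123'."
expected = {
    "target": "car",
    "props": {"plate": "FAST123"},  # VehicleProps, kept minimal per note 1
    "primary_color": "red",
}
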
 
+# Function to process the prompt
+def process_prompt(prompt_text: str, images: List[str], face_images: List[str],
+                   system_message: Optional[str] = None, user_message: Optional[str] = None,
+                   temperature: float = 0.0) -> Optional[Dict[str, Any]]:
+    client = openai.OpenAI(api_key=api_key)
 
+    # Fall back to the default messages when none are provided
+    if not system_message:
+        system_message = DEFAULT_SYSTEM_MESSAGE
+    if not user_message:
+        user_message = ""
+
+    # Prepare messages for the API
+    messages = []
+    if system_message.strip():
+        messages.append({"role": "system", "content": system_message.replace("prompt_text", prompt_text)})
+    if user_message.strip():
+        messages.append({"role": "user", "content": user_message.replace("prompt_text", prompt_text)})
 
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=messages,
+        response_format={"type": "json_object"},
+        temperature=temperature,
+        max_tokens=1000,
     )
 
+    # Extract the content
+    content = response.choices[0].message.content
+
+    # Parse the JSON output
+    try:
+        investigation_data = json.loads(content)
+    except json.JSONDecodeError as e:
+        print("Error parsing JSON:", e)
+        print("OpenAI response:", content)
+        return None
+
+    # Construct the Investigation object
+    investigation = parse_investigation(investigation_data, images, face_images)
+
+    # Convert the Investigation object to a dictionary
+    if investigation:
+        investigation_dict = asdict(investigation)
+        # Convert enums to their names
+        investigation_dict['target'] = investigation.target.name
+        if investigation.props:
+            if isinstance(investigation.props, VehicleProps) and investigation.props.type:
+                investigation_dict['props']['type'] = investigation.props.type.name
+            elif isinstance(investigation.props, PersonProps):
+                pass  # No enums in PersonProps
+        return investigation_dict
+    else:
+        return None
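
As a rough usage sketch (assuming OPEN_AI_KEY is set in the environment), process_prompt can be exercised directly:

# Sketch: one-off call with the default system message; the image URLs are
# placeholders, mirroring the ones hard-coded in gradio_app below.
result = process_prompt(
    prompt_text="A red sports car with a license plate reading 'FAST123'.",
    images=["http://example.com/image1.jpg"],
    face_images=[],
)
if result:
    print(json.dumps(result, indent=4))
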
+
+# Function to parse the Investigation data
+def parse_investigation(data: Dict[str, Any], images: List[str], face_images: List[str]) -> Optional[Investigation]:
+    # Parse target
+    target_name = data.get('target')
     try:
+        target_enum = COCOClass[target_name]
+    except KeyError:
+        print(f"Invalid COCO class name: {target_name}")
+        return None
+
+    # Parse activity
+    activity_data = data.get('activity')
+    if activity_data:
+        activity = Activity(
+            prompt=activity_data.get('prompt'),
+            type=activity_data.get('type')
+        )
+    else:
+        activity = None
+
+    # Parse props
+    props_data = data.get('props')
+    props = None
+    if props_data:
+        if 'face_images' in props_data:
+            # PersonProps
+            props = PersonProps(
+                face_images=face_images,
+                age=props_data.get('age'),
+                race=props_data.get('race'),
+                gender=props_data.get('gender'),
+                top_color=props_data.get('top_color'),  # Changed from shirt_color
+                bottom_color=props_data.get('bottom_color')
+            )
+        elif 'brand' in props_data:
+            # VehicleProps
+            vehicle_type_name = props_data.get('type')
+            if vehicle_type_name:
+                try:
+                    vehicle_type_enum = COCOClass[vehicle_type_name]
+                except KeyError:
+                    print(f"Invalid vehicle type: {vehicle_type_name}")
+                    vehicle_type_enum = None
+            else:
+                vehicle_type_enum = None
+
+            props = VehicleProps(
+                brand=props_data.get('brand'),
+                type=vehicle_type_enum,
+                plate=props_data.get('plate')
+            )
+
+    # Construct the Investigation object
+    investigation = Investigation(
+        target=target_enum,
+        images=images,
+        activity=activity,
+        complex_appearance=data.get('complex_appearance'),
+        props=props,
+        primary_color=data.get('primary_color'),
+        secondary_color=data.get('secondary_color')
+    )
+
+    return investigation
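
Because parse_investigation is pure Python, it can be sanity-checked offline with a hand-written dict (no API call); note that the person/vehicle branch is keyed on the presence of 'face_images' or 'brand':

# Sketch: offline round-trip through parse_investigation. The 'brand' key is
# what routes this dict into the VehicleProps branch.
sample = {
    "target": "car",
    "props": {"brand": "Ferrari", "type": "car", "plate": "FAST123"},
    "primary_color": "red",
}
inv = parse_investigation(sample, images=["http://example.com/image1.jpg"], face_images=[])
assert inv is not None and inv.target is COCOClass.car
assert isinstance(inv.props, VehicleProps) and inv.props.type is COCOClass.car
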
+
+# Gradio app
+def gradio_app(prompts_text, system_message, user_message, temperature):
+    # Split prompts by commas and strip whitespace
+    prompts = [p.strip() for p in prompts_text.split(',') if p.strip()]
+    images = ["http://example.com/image1.jpg", "http://example.com/image2.jpg"]
+    face_images = ["http://example.com/face1.jpg"]
+
+    results = []
+    for p in prompts:
+        investigation_dict = process_prompt(
+            prompt_text=p,
+            images=images,
+            face_images=face_images,
+            system_message=system_message if system_message else None,
+            user_message=user_message if user_message else None,
+            temperature=temperature if temperature else 0.0
+        )
+        if investigation_dict:
+            results.append(json.dumps(investigation_dict, indent=4))
+        else:
+            results.append("Failed to process prompt.")
+
+    return "\n\n".join(results)
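
The whole pipeline can also be driven without the UI; an empty system message falls back to DEFAULT_SYSTEM_MESSAGE inside process_prompt:

# Sketch: two comma-separated prompts, processed sequentially; prints one
# pretty-printed JSON object per prompt.
print(gradio_app(
    prompts_text="A red sports car, A woman in a blue swimsuit",
    system_message="",
    user_message="",
    temperature=0.0,
))
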
+
+if __name__ == "__main__":
+    # Default values
+    default_prompts = ", ".join([
+        "A red sports car with a license plate reading 'FAST123'.",
+        "An elderly woman wearing a green dress and a pearl necklace.",
+        "A cyclist in a yellow jersey riding a blue bicycle.",
+        "A group of people playing frisbee in the park.",
+        "A man with a large tattoo of a dragon on his right arm.",
+        "A black and white cat sitting on a red couch.",
+        "A delivery truck with the 'FedEx' logo on the side.",
+        "A child holding a red balloon shaped like a dog.",
+        "A person wearing a hoodie with the text 'OpenAI' on it.",
+        "A woman in a blue swimsuit swimming in the ocean."
+    ])
+
+    default_system_message = DEFAULT_SYSTEM_MESSAGE.replace("{{prompt_text}}", "{prompt_text}")  # Prepare for formatting
+    default_user_message = ""  # Optional user message
+    default_temperature = 0.0  # Default temperature
+
+    # Create Gradio interface
+    iface = gr.Interface(
+        fn=gradio_app,
+        inputs=[
+            gr.Textbox(lines=5, label="List of Prompts (comma-separated)", value=default_prompts),
+            gr.Textbox(lines=20, label="System Message (optional)", value=default_system_message),
+            gr.Textbox(lines=5, label="User Message (optional)", value=default_user_message),
+            gr.Slider(minimum=0, maximum=1, step=0.1, label="Temperature", value=default_temperature)
+        ],
+        outputs="text",
+        title="OpenAI Prompt Engineering Tester",
+        description="Test different prompts and messages with the OpenAI API."
+    )
+
+    # Launch the app
+    iface.launch()