Spaces:
Sleeping
Sleeping
Update model_loader.py
Browse files- model_loader.py +56 -64
model_loader.py
CHANGED
|
@@ -1,65 +1,57 @@
|
|
| 1 |
-
import google.generativeai as genai
|
| 2 |
-
from PIL import Image
|
| 3 |
-
import io
|
| 4 |
-
import base64
|
| 5 |
-
import os
|
| 6 |
-
from dotenv import load_dotenv
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
#
|
| 18 |
-
def
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
#
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
if prompt_type == "custom" and custom_prompt:
|
| 59 |
-
final_prompt = custom_prompt
|
| 60 |
-
else:
|
| 61 |
-
final_prompt = prompts.get(prompt_type, prompts["basic"])
|
| 62 |
-
|
| 63 |
-
# Generate response with the image and prompt
|
| 64 |
-
response = model.generate_content([final_prompt, {"mime_type": "image/png", "data": image_bytes}])
|
| 65 |
return response.text
|
|
|
|
| 1 |
+
import google.generativeai as genai
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import io
|
| 4 |
+
import base64
|
| 5 |
+
import os
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Helper: serialize an image to raw PNG bytes (Gemini expects binary image data)
def get_image_bytes(image):
    """Convert a PIL Image to PNG-encoded bytes for the Gemini API.

    The image object only needs a PIL-style ``save(buffer, format=...)``
    method; the PNG bytes are captured from an in-memory buffer.
    """
    with io.BytesIO() as buf:
        image.save(buf, format="PNG")
        # getvalue() must run before the buffer is closed by the context manager.
        return buf.getvalue()
|
| 16 |
+
|
| 17 |
+
# Load the Gemini vision-capable model client
def load_gemini_model():
    """Return a Gemini 1.5 Flash model handle.

    gemini-1.5-flash supports multimodal (text + image) inputs
    (as of April 2025).
    """
    return genai.GenerativeModel(model_name="gemini-1.5-flash")
|
| 22 |
+
|
| 23 |
+
# Produce a short, single-sentence caption for an image via Gemini
def generate_caption_with_gemini(image, model):
    """Return a one-sentence caption of *image* produced by *model*.

    The image is PNG-encoded and sent alongside a fixed captioning
    prompt; the model's text reply is returned unchanged.
    """
    png_data = get_image_bytes(image)

    caption_prompt = "Describe what you see in this image in a single sentence."

    reply = model.generate_content(
        [caption_prompt, {"mime_type": "image/png", "data": png_data}]
    )
    return reply.text
|
| 34 |
+
|
| 35 |
+
# Produce a detailed, prompt-driven description of an image via Gemini
def generate_detailed_description(image, model, prompt_type, custom_prompt=None):
    """Return a detailed description of *image* using a prompt strategy.

    Parameters:
        image: PIL-style image object (PNG-serializable via get_image_bytes).
        model: Gemini model handle exposing ``generate_content``.
        prompt_type: key selecting one of the built-in prompt templates;
            unknown keys fall back to the "basic" template.
        custom_prompt: used instead of the templates when *prompt_type*
            is "custom" and a non-empty prompt is supplied.

    Returns:
        The model's text response.
    """
    png_data = get_image_bytes(image)

    # Built-in prompt templates, keyed by strategy name.
    prompt_library = {
        "basic": "Please provide a description of the objects and their relationships in the image. Focus on identifying the most prominent objects and their actions.",
        "chain_of_thought": "Step 1: Look at the image carefully and identify all visible objects. Step 2: Consider what actions are occurring in the scene, if any. Step 3: Conclude with a description of the environment, highlighting any emotions or atmosphere present in the image.",
        "story": "Look at the image carefully. Based on what you see, create an imaginative story that could explain the scene in the picture. Include details that make the story engaging and creative.",
        "emotional": "Analyze the image and describe the emotions conveyed by the scene. Focus on the mood, facial expressions (if any), and the overall atmosphere.",
        "object": "List and describe all objects visible in the image in detail. Mention their characteristics, colors, and any notable features.",
        "context": "Describe the context of the image. What could be the location, time of day, or situation in the scene? Try to infer as much information as possible based on the image.",
        "action": "Look at the image and describe the main action or movement happening. Who or what is performing the action, and what is the outcome?",
    }

    # A truthy custom prompt wins only when explicitly requested via "custom";
    # otherwise fall back to the library, defaulting to "basic".
    if prompt_type == "custom" and custom_prompt:
        chosen_prompt = custom_prompt
    else:
        chosen_prompt = prompt_library.get(prompt_type, prompt_library["basic"])

    # Send prompt + inline PNG payload to the model and return its text reply.
    reply = model.generate_content(
        [chosen_prompt, {"mime_type": "image/png", "data": png_data}]
    )
    return reply.text
|