# Image_Analyzer / model_loader.py
import io
import os

import google.generativeai as genai
from PIL import Image
from dotenv import load_dotenv

# Load environment variables and configure the Gemini client once at import time.
# NOTE: the GOOGLE_API_KEY variable name is an assumption; adjust it to match the
# key name used in your .env file.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))


# Convert a PIL Image to raw bytes, since the Gemini API expects image data.
def get_image_bytes(image):
    """Convert a PIL Image to PNG bytes for the Gemini API."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return buffer.getvalue()


# Load the Gemini vision model.
def load_gemini_model():
    # Gemini 1.5 Flash supports multimodal (text + image) inputs (as of April 2025).
    model = genai.GenerativeModel(model_name="gemini-1.5-flash")
    return model


# Generate a basic one-sentence caption with Gemini.
def generate_caption_with_gemini(image, model):
    image_bytes = get_image_bytes(image)
    # Basic captioning prompt
    prompt = "Describe what you see in this image in a single sentence."
    response = model.generate_content(
        [prompt, {"mime_type": "image/png", "data": image_bytes}]
    )
    return response.text


# Generate a detailed description with Gemini using a selectable prompt style.
def generate_detailed_description(image, model, prompt_type, custom_prompt=None):
    image_bytes = get_image_bytes(image)
    # Prompt templates, keyed by prompt type
    prompts = {
        "basic": "Please provide a description of the objects and their relationships in the image. Focus on identifying the most prominent objects and their actions.",
        "chain_of_thought": "Step 1: Look at the image carefully and identify all visible objects. Step 2: Consider what actions are occurring in the scene, if any. Step 3: Conclude with a description of the environment, highlighting any emotions or atmosphere present in the image.",
        "story": "Look at the image carefully. Based on what you see, create an imaginative story that could explain the scene in the picture. Include details that make the story engaging and creative.",
        "emotional": "Analyze the image and describe the emotions conveyed by the scene. Focus on the mood, facial expressions (if any), and the overall atmosphere.",
        "object": "List and describe all objects visible in the image in detail. Mention their characteristics, colors, and any notable features.",
        "context": "Describe the context of the image. What could be the location, time of day, or situation in the scene? Try to infer as much information as possible based on the image.",
        "action": "Look at the image and describe the main action or movement happening. Who or what is performing the action, and what is the outcome?",
    }
    # A custom prompt overrides the presets; unknown types fall back to "basic".
    if prompt_type == "custom" and custom_prompt:
        final_prompt = custom_prompt
    else:
        final_prompt = prompts.get(prompt_type, prompts["basic"])
    # Generate the response from the chosen prompt plus the image bytes
    response = model.generate_content(
        [final_prompt, {"mime_type": "image/png", "data": image_bytes}]
    )
    return response.text
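

# The block below is a minimal usage sketch, not part of the original module:
# it assumes a hypothetical sample image at "sample.jpg" and that a valid API
# key was picked up by the .env loading at the top of this file.
if __name__ == "__main__":
    # Load a test image and the Gemini model, then run both generation helpers.
    test_image = Image.open("sample.jpg")
    gemini_model = load_gemini_model()

    print("Caption:", generate_caption_with_gemini(test_image, gemini_model))
    print(
        "Detailed (chain_of_thought):",
        generate_detailed_description(test_image, gemini_model, "chain_of_thought"),
    )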