AvocadoMuffin committed on
Commit
b51b14d
·
verified ·
1 Parent(s): 354ac73

Update model_loader.py

Browse files
Files changed (1) hide show
  1. model_loader.py +56 -64
model_loader.py CHANGED
@@ -1,65 +1,57 @@
1
- import google.generativeai as genai
2
- from PIL import Image
3
- import io
4
- import base64
5
- import os
6
- from dotenv import load_dotenv
7
-
8
- # Load environment variables
9
- load_dotenv()
10
-
11
- # Fetch API key securely
12
- api_key = os.getenv("GEMINI_API_KEY")
13
-
14
- # Configure Gemini with your API Key
15
- genai.configure(api_key=api_key)
16
-
17
- # Function to get image bytes for Gemini (which requires image data)
18
- def get_image_bytes(image):
19
- """Convert PIL Image to bytes for Gemini API"""
20
- buffer = io.BytesIO()
21
- image.save(buffer, format="PNG")
22
- image_bytes = buffer.getvalue()
23
- return image_bytes
24
-
25
- # Load Gemini Vision Model
26
- def load_gemini_model():
27
- # Using Gemini 1.5 Flash which supports multimodal inputs (as of April 2025)
28
- model = genai.GenerativeModel(model_name="gemini-1.5-flash")
29
- return model
30
-
31
- # Generate basic caption with Gemini
32
- def generate_caption_with_gemini(image, model):
33
- image_bytes = get_image_bytes(image)
34
-
35
- # Basic captioning prompt
36
- prompt = "Describe what you see in this image in a single sentence."
37
-
38
- response = model.generate_content([prompt, {"mime_type": "image/png", "data": image_bytes}])
39
- caption = response.text
40
-
41
- return caption
42
-
43
- # Function to generate detailed description with Gemini
44
- def generate_detailed_description(image, model, prompt_type, custom_prompt=None):
45
- image_bytes = get_image_bytes(image)
46
-
47
- # Define prompts based on type
48
- prompts = {
49
- "basic": "Please provide a description of the objects and their relationships in the image. Focus on identifying the most prominent objects and their actions.",
50
- "chain_of_thought": "Step 1: Look at the image carefully and identify all visible objects. Step 2: Consider what actions are occurring in the scene, if any. Step 3: Conclude with a description of the environment, highlighting any emotions or atmosphere present in the image.",
51
- "story": "Look at the image carefully. Based on what you see, create an imaginative story that could explain the scene in the picture. Include details that make the story engaging and creative.",
52
- "emotional": "Analyze the image and describe the emotions conveyed by the scene. Focus on the mood, facial expressions (if any), and the overall atmosphere.",
53
- "object": "List and describe all objects visible in the image in detail. Mention their characteristics, colors, and any notable features.",
54
- "context": "Describe the context of the image. What could be the location, time of day, or situation in the scene? Try to infer as much information as possible based on the image.",
55
- "action": "Look at the image and describe the main action or movement happening. Who or what is performing the action, and what is the outcome?"
56
- }
57
-
58
- if prompt_type == "custom" and custom_prompt:
59
- final_prompt = custom_prompt
60
- else:
61
- final_prompt = prompts.get(prompt_type, prompts["basic"])
62
-
63
- # Generate response with the image and prompt
64
- response = model.generate_content([final_prompt, {"mime_type": "image/png", "data": image_bytes}])
65
  return response.text
 
1
+ import google.generativeai as genai
2
+ from PIL import Image
3
+ import io
4
+ import base64
5
+ import os
6
+ from dotenv import load_dotenv
7
+
8
+
9
# Helper: Gemini needs raw image data, not a PIL object.
def get_image_bytes(image):
    """Serialize a PIL image to PNG-encoded bytes for the Gemini API."""
    with io.BytesIO() as buf:
        image.save(buf, format="PNG")
        return buf.getvalue()
16
+
17
# Factory for the vision-capable Gemini model.
def load_gemini_model():
    """Return a Gemini 1.5 Flash model handle (multimodal as of April 2025)."""
    return genai.GenerativeModel(model_name="gemini-1.5-flash")
22
+
23
# Single-sentence captioning via Gemini.
def generate_caption_with_gemini(image, model):
    """Ask *model* for a one-sentence caption of the given PIL *image*."""
    image_part = {"mime_type": "image/png", "data": get_image_bytes(image)}
    prompt = "Describe what you see in this image in a single sentence."
    response = model.generate_content([prompt, image_part])
    return response.text
34
+
35
# Detailed, prompt-driven description via Gemini.
def generate_detailed_description(image, model, prompt_type, custom_prompt=None):
    """Generate a detailed Gemini description of *image*.

    prompt_type selects one of the canned prompt styles below; passing
    "custom" together with a non-empty custom_prompt uses the caller's own
    prompt instead. Any unrecognized prompt_type falls back to "basic".
    Returns the model's text response.
    """
    # Canned prompt styles keyed by prompt_type.
    prompts = {
        "basic": "Please provide a description of the objects and their relationships in the image. Focus on identifying the most prominent objects and their actions.",
        "chain_of_thought": "Step 1: Look at the image carefully and identify all visible objects. Step 2: Consider what actions are occurring in the scene, if any. Step 3: Conclude with a description of the environment, highlighting any emotions or atmosphere present in the image.",
        "story": "Look at the image carefully. Based on what you see, create an imaginative story that could explain the scene in the picture. Include details that make the story engaging and creative.",
        "emotional": "Analyze the image and describe the emotions conveyed by the scene. Focus on the mood, facial expressions (if any), and the overall atmosphere.",
        "object": "List and describe all objects visible in the image in detail. Mention their characteristics, colors, and any notable features.",
        "context": "Describe the context of the image. What could be the location, time of day, or situation in the scene? Try to infer as much information as possible based on the image.",
        "action": "Look at the image and describe the main action or movement happening. Who or what is performing the action, and what is the outcome?"
    }

    # A caller-supplied prompt wins only when prompt_type is "custom" AND
    # custom_prompt is non-empty; otherwise fall back to the canned set.
    if prompt_type == "custom" and custom_prompt:
        chosen_prompt = custom_prompt
    else:
        chosen_prompt = prompts.get(prompt_type, prompts["basic"])

    image_part = {"mime_type": "image/png", "data": get_image_bytes(image)}
    response = model.generate_content([chosen_prompt, image_part])
    return response.text