Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -83,34 +83,46 @@ def predict_answer(video, image, question):
|
|
| 83 |
return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
| 84 |
|
| 85 |
elif video:
|
| 86 |
-
# Process as a video
|
| 87 |
frames = video_to_frames(video)
|
| 88 |
-
answers = []
|
| 89 |
-
for frame in frames:
|
| 90 |
-
image = extract_frames(frame)
|
| 91 |
-
image_tensor = model.image_preprocess([image])
|
| 92 |
-
|
| 93 |
-
# Generate the answer
|
| 94 |
output_ids = model.generate(
|
| 95 |
input_ids,
|
| 96 |
max_new_tokens=25,
|
| 97 |
images=image_tensor,
|
| 98 |
use_cache=True)[0]
|
| 99 |
|
| 100 |
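-
answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
| 101 |
-
answers.append(answer)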
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
# Modify this logic based on your specific needs
|
| 104 |
-
most_common_answer = Counter(answers).most_common(1)[0][0]
|
| 105 |
|
| 106 |
-
# Safely evaluate the most common answer assuming it's a string representation of a Python literal
|
| 107 |
-
try:
|
| 108 |
-
evaluated_answer = ast.literal_eval(most_common_answer)
|
| 109 |
-
except (ValueError, SyntaxError):
|
| 110 |
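-
# Handle malformed answer string
|
| 111 |
-
evaluated_answer = f"Error evaluating answer: {most_common_answer}"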
|
| 112 |
|
| 113 |
-
return evaluated_answer
|
| 114 |
|
| 115 |
# return ast.literal_eval(answers[0])
|
| 116 |
|
|
|
|
| 83 |
return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
| 84 |
|
| 85 |
elif video:
|
|
|
|
| 86 |
frames = video_to_frames(video)
|
| 87 |
+
image = extract_frames(frames[2])
|
| 88 |
+
image_tensor = model.image_preprocess([image])
|
| 89 |
+
# Generate the answer
|
|
|
|
|
|
|
|
|
|
| 90 |
output_ids = model.generate(
|
| 91 |
input_ids,
|
| 92 |
max_new_tokens=25,
|
| 93 |
images=image_tensor,
|
| 94 |
use_cache=True)[0]
|
| 95 |
|
| 96 |
+
return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
| 97 |
+
|
| 98 |
+
# # Process as a video
|
| 99 |
+
# frames = video_to_frames(video)
|
| 100 |
+
# answers = []
|
| 101 |
+
# for frame in frames:
|
| 102 |
+
# image = extract_frames(frame)
|
| 103 |
+
# image_tensor = model.image_preprocess([image])
|
| 104 |
+
|
| 105 |
+
# # Generate the answer
|
| 106 |
+
# output_ids = model.generate(
|
| 107 |
+
# input_ids,
|
| 108 |
+
# max_new_tokens=25,
|
| 109 |
+
# images=image_tensor,
|
| 110 |
+
# use_cache=True)[0]
|
| 111 |
+
|
| 112 |
+
# answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
| 113 |
+
# answers.append(answer)
|
| 114 |
|
| 115 |
+
# # Modify this logic based on your specific needs
|
| 116 |
+
# most_common_answer = Counter(answers).most_common(1)[0][0]
|
| 117 |
|
| 118 |
+
# # Safely evaluate the most common answer assuming it's a string representation of a Python literal
|
| 119 |
+
# try:
|
| 120 |
+
# evaluated_answer = ast.literal_eval(most_common_answer)
|
| 121 |
+
# except (ValueError, SyntaxError):
|
| 122 |
+
# # Handle malformed answer string
|
| 123 |
+
# evaluated_answer = f"Error evaluating answer: {most_common_answer}"
|
| 124 |
|
| 125 |
+
# return evaluated_answer
|
| 126 |
|
| 127 |
# return ast.literal_eval(answers[0])
|
| 128 |
|