Spaces:

ManishThota
/

Build

Paused

App Files Community

ManishThota committed on Mar 10, 2024

Commit

bda5bd0

verified ·

1 Parent(s): 69344b8

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -18

app.py CHANGED Viewed

@@ -83,34 +83,46 @@ def predict_answer(video, image, question):
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
     elif video:
-        # Process as a video
         frames = video_to_frames(video)
-        answers = []
-        for frame in frames:
-            image = extract_frames(frame)
-            image_tensor = model.image_preprocess([image])
-            # Generate the answer
             output_ids = model.generate(
                 input_ids,
                 max_new_tokens=25,
                 images=image_tensor,
                 use_cache=True)[0]
-            answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-            answers.append(answer)
-        # Modify this logic based on your specific needs
-        most_common_answer = Counter(answers).most_common(1)[0][0]
-        # Safely evaluate the most common answer assuming it's a string representation of a Python literal
-        try:
-            evaluated_answer = ast.literal_eval(most_common_answer)
-        except (ValueError, SyntaxError):
-            # Handle malformed answer string
-            evaluated_answer = f"Error evaluating answer: {most_common_answer}"
-        return evaluated_answer
     #     return ast.literal_eval(answers[0])

         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
     elif video:
         frames = video_to_frames(video)
+        image = extract_frames(frames[2])
+        image_tensor = model.image_preprocess([image])
+        # Generate the answer
             output_ids = model.generate(
                 input_ids,
                 max_new_tokens=25,
                 images=image_tensor,
                 use_cache=True)[0]
+        return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        # # Process as a video
+        # frames = video_to_frames(video)
+        # answers = []
+        # for frame in frames:
+        #     image = extract_frames(frame)
+        #     image_tensor = model.image_preprocess([image])
+        #     # Generate the answer
+        #     output_ids = model.generate(
+        #         input_ids,
+        #         max_new_tokens=25,
+        #         images=image_tensor,
+        #         use_cache=True)[0]
+        #     answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        #     answers.append(answer)
+        # # Modify this logic based on your specific needs
+        # most_common_answer = Counter(answers).most_common(1)[0][0]
+        # # Safely evaluate the most common answer assuming it's a string representation of a Python literal
+        # try:
+        #     evaluated_answer = ast.literal_eval(most_common_answer)
+        # except (ValueError, SyntaxError):
+        #     # Handle malformed answer string
+        #     evaluated_answer = f"Error evaluating answer: {most_common_answer}"
+        # return evaluated_answer
     #     return ast.literal_eval(answers[0])