import cv2
import os
import gradio as gr
import requests
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import uuid
# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
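
# Note: all three checkpoints are downloaded from the Hugging Face Hub on the first run
# and are kept in memory at the same time, so startup time and memory use are non-trivial.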

# Frame Extraction and Captioning Logic
def process_video(video_path):
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    success = True
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    while success:
        success, frame = vidObj.read()
        if not success:
            break

        # Caption every 20th frame only, to keep processing time manageable
        if count % 20 == 0:
            # OpenCV returns frames in BGR order; convert to RGB before handing them to the models
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap (conditioned on the prompt "a picture of ")
            inputs2 = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs2, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions
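
# Illustrative shape of the returned dict (the captions below are made up for the example):
# {"Model 1": ["a man riding a horse", ...],
#  "Model 2": ["a picture of a man riding a horse in a field", ...],
#  "Model 3": ["a man riding a horse through a field", ...]}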

# Gradio Interface
def generate_captions(video):
    captions = process_video(video)

    # Format the captions as Markdown: one section per model, one bullet per sampled frame
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    return result

with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")

    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
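    # Note: generate_captions returns Markdown-formatted text, which a Textbox shows
    # verbatim; swapping this component for gr.Markdown would render the headings and
    # bullets instead (an optional tweak, not part of the original app).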
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
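
# Rough local usage (assumes the dependencies are installed, e.g.
# `pip install gradio transformers torch opencv-python pillow`):
# run this file with Python and open the local URL that Gradio prints.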