```python
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch


class IdentificationModel:
    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def identify_objects(self, image_path, text_descriptions):
        # Load the image
        image = Image.open(image_path)

        # Prepare inputs (tokenized text + preprocessed image)
        inputs = self.processor(
            text=text_descriptions, images=image,
            return_tensors="pt", padding=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Run inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        # logits_per_image holds the image-text similarity scores;
        # softmax converts them to probabilities over the descriptions
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)

        # Find the description with the maximum probability
        max_prob, max_idx = torch.max(probs[0], dim=0)

        # Return the highest-probability detection
        return [{
            'description': text_descriptions[int(max_idx)],
            'probability': float(max_prob),
        }]
```
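Because CLIP scores each text description against the image, `identify_objects` effectively performs zero-shot classification over whatever labels you pass in. A minimal usage sketch follows; the image path and candidate descriptions are placeholders, not part of the original:

```python
# Usage sketch: "example.jpg" and the label list are assumptions,
# substitute your own image path and candidate descriptions.
model = IdentificationModel()

result = model.identify_objects(
    "example.jpg",
    ["a photo of a cat", "a photo of a dog", "a photo of a car"],
)
print(result)
# Prints a one-element list: [{'description': <best match>, 'probability': <float>}]
```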