Update README.md
README.md
CHANGED
@@ -249,209 +249,27 @@ Based on AltCLIP, we have also developed the AltDiffusion model, visualized as f
## 模型推理 Inference

```python
import torch
from PIL import Image
from flagai.auto_model.auto_loader import AutoLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A single line of code automatically downloads the weights to './checkpoints/clip-xlmr-large' and loads the CLIP model weights
# Modelhub address: Modelhub(https://model.baai.ac.cn/models)
loader = AutoLoader(
    task_name="txt_img_matching",
    model_dir="./checkpoints",
    model_name="AltCLIP-XLMR-L"
)
# Get the loaded model
model = loader.get_model()
# Get the tokenizer
tokenizer = loader.get_tokenizer()
# Get the transform used to preprocess images
transform = loader.get_transform()

model.eval()
model.to(device)

# Inference: match the image against the candidate texts
image = Image.open("./dog.jpeg")
image = transform(image)
image = torch.tensor(image["pixel_values"]).to(device)
text = tokenizer(["a rat", "a dog", "a cat"])["input_ids"]

text = torch.tensor(text).to(device)

with torch.no_grad():
    image_features = model.get_image_features(image)
    text_features = model.get_text_features(text)
    text_probs = (image_features @ text_features.T).softmax(dim=-1)

print(text_probs.cpu().numpy()[0].tolist())
```

## CLIP微调 Finetuning

Fine-tuning uses the cifar10 dataset, with FlagAI's Trainer to quickly start the training process.

```python
# Copyright © 2022 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
import torch
from flagai.auto_model.auto_loader import AutoLoader
import os
from flagai.trainer import Trainer
from torchvision.datasets import (
    CIFAR10
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset_root = "./clip_benchmark_datasets"
dataset_name = "cifar10"

batch_size = 4
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

auto_loader = AutoLoader(
    task_name="txt_img_matching",
    model_dir="./checkpoints/",
    model_name="AltCLIP-XLMR-L"  # Load the checkpoints from Modelhub(model.baai.ac.cn/models)
)

model = auto_loader.get_model()
model.to(device)
model.eval()
tokenizer = auto_loader.get_tokenizer()
transform = auto_loader.get_transform()

trainer = Trainer(env_type="pytorch",
                  pytorch_device=device,
                  experiment_name="clip_finetuning",
                  batch_size=4,
                  lr=1e-4,
                  epochs=10,
                  log_interval=10)

dataset = CIFAR10(root=os.path.join(dataset_root, dataset_name),
                  transform=transform,
                  download=True)

def cifar10_collate_fn(batch):
    # image shape is (batch, 3, 224, 224)
    images = torch.tensor([b[0]["pixel_values"][0] for b in batch])
    # text_id shape is (batch, n); map the CIFAR10 label index to its class name
    input_ids = torch.tensor([tokenizer(f"a photo of a {classes[b[1]]}", padding=True, truncation=True, max_length=77)["input_ids"] for b in batch])

    return {
        "pixel_values": images,
        "input_ids": input_ids
    }

if __name__ == "__main__":
    trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn)
```

We provide validation scripts that can be run directly on the cifar10 dataset.

```python
# Copyright © 2022 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
import torch
from flagai.auto_model.auto_loader import AutoLoader
from metrics import zeroshot_classification
import json
import os
from torchvision.datasets import CIFAR10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
maxlen = 256

dataset_root = "./clip_benchmark_datasets"
dataset_name = "cifar10"

auto_loader = AutoLoader(
    task_name="txt_img_matching",
    model_dir="./checkpoints/",
    model_name="AltCLIP-XLMR-L"
)

model = auto_loader.get_model()
model.to(device)
model.eval()
tokenizer = auto_loader.get_tokenizer()
transform = auto_loader.get_transform()

dataset = CIFAR10(root=os.path.join(dataset_root, dataset_name),
                  transform=transform,
                  download=True)
batch_size = 128
num_workers = 4

template = {"cifar10": [
    "a photo of a {c}.",
    "a blurry photo of a {c}.",
    "a black and white photo of a {c}.",
    "a low contrast photo of a {c}.",
    "a high contrast photo of a {c}.",
    "a bad photo of a {c}.",
    "a good photo of a {c}.",
    "a photo of a small {c}.",
    "a photo of a big {c}.",
    "a photo of the {c}.",
    "a blurry photo of the {c}.",
    "a black and white photo of the {c}.",
    "a low contrast photo of the {c}.",
    "a high contrast photo of the {c}.",
    "a bad photo of the {c}.",
    "a good photo of the {c}.",
    "a photo of the small {c}.",
    "a photo of the big {c}."
],
}

def evaluate():
    if dataset:
        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
        )
        classnames = dataset.classes if hasattr(dataset, "classes") else None

        zeroshot_templates = template["cifar10"]
        metrics = zeroshot_classification.evaluate(
            model,
            dataloader,
            tokenizer,
            classnames,
            zeroshot_templates,
            device=device,
            amp=True,
        )

        dump = {
            "dataset": dataset_name,
            "metrics": metrics
        }

        print(dump)
        with open("./result.txt", "w") as f:
            json.dump(dump, f)
        return metrics

if __name__ == "__main__":
    evaluate()
```

## 模型推理 Inference

Please download the code from [FlagAI AltCLIP](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP)

```python
from PIL import Image
import requests

# transformers version >= 4.21.0
from modeling_altclip import AltCLIP
from processing_altclip import AltCLIPProcessor

# our repo is currently private, so we need `use_auth_token=True`
model = AltCLIP.from_pretrained("BAAI/AltCLIP")
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")
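# A hedged sketch (not in the original snippet): while the repo is private, the auth flag
# mentioned above can be passed straight to `from_pretrained`, e.g.:
#   model = AltCLIP.from_pretrained("BAAI/AltCLIP", use_auth_token=True)
#   processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP", use_auth_token=True)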

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```
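
For a quick sanity check, the probabilities can be matched back to the candidate captions. This is a minimal sketch that only assumes the `probs` tensor and the two candidate captions produced by the snippet above:

```python
# Pair each candidate caption with its predicted probability (illustrative sketch)
captions = ["a photo of a cat", "a photo of a dog"]
for caption, p in zip(captions, probs[0].tolist()):
    print(f"{caption}: {p:.4f}")
```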