Commit

Pnp vqa (#27)
Release of PnP-VQA (Anthony TMH et al., EMNLP 2022 Findings).

* add cache to gitignore

* pnp-vqa-okvqa

* rename readme

* add extra config and misc change

* edit model path

* add gqa dataset

* add gqa inference for generate

* pnp-vqa misc change

* compute gradcam in batch

* allow straightforward model.predict_answers() (see the usage sketch after this list)

* Created using Colaboratory

* pnp_vqa

* add test and update colab

* add vqav2 test config and sh

* log vqa result

* fix dict key

* reduce memory by offload model for pnpvqa3b

* misc

* update readme
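
Taken together, the "allow straightforward model.predict_answers()" and config changes let PnP-VQA run as a single call through the matching → captioning → question-answering pipeline. Below is a rough usage sketch against the standard LAVIS loading API; the keyword arguments to predict_answers (num_captions, num_patches) and the three return values are assumptions, not taken from this commit.

```
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load PnP-VQA; model_type is assumed to map to the new configs ("base", "large", "3b").
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="pnp_vqa", model_type="base", is_eval=True, device=device
)

raw_image = Image.open("demo.jpg").convert("RGB")  # hypothetical local image
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
question = txt_processors["eval"]("What is the person holding?")

# One call runs image-question matching (Grad-CAM), caption generation, and
# FiD question answering. Argument names/values here are assumed defaults.
answers, captions, gradcams = model.predict_answers(
    samples={"image": image, "text_input": [question]},
    num_captions=50,
    num_patches=20,
)
print(answers[0])
```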
anthonytmh authored Oct 25, 2022
1 parent 09e636c commit c2cfb00
Showing 59 changed files with 2,729 additions and 35 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -152,3 +152,5 @@ debug*/
*.dat
*.tsv
*.gz

cache/
2 changes: 1 addition & 1 deletion app/image_text_match.py
@@ -74,7 +74,7 @@ def app():
qry_tok = tokenizer(qry, return_tensors="pt").to(device)
gradcam, output = compute_gradcam(model, img, qry, qry_tok, block_num=layer_num)

-avg_gradcam = getAttMap(norm_img, gradcam[1], blur=True)
+avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)

col2.image(avg_gradcam, use_column_width=True, clamp=True)
# output = model(img, question)
4 changes: 2 additions & 2 deletions app/text_localization.py
@@ -73,15 +73,15 @@ def app():

gradcam, _ = compute_gradcam(model, img, qry, qry_tok, block_num=layer_num)

-avg_gradcam = getAttMap(norm_img, gradcam[1], blur=True)
+avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)
col2.image(avg_gradcam, use_column_width=True, clamp=True)

num_cols = 4.0
num_tokens = len(qry_tok.input_ids[0]) - 2

num_rows = int(math.ceil(num_tokens / num_cols))

-gradcam_iter = iter(gradcam[2:-1])
+gradcam_iter = iter(gradcam[0][2:-1])
token_id_iter = iter(qry_tok.input_ids[0][1:-1])

for _ in range(num_rows):
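
The recurring index change from gradcam[1] to gradcam[0][1] (and gradcam[2:-1] to gradcam[0][2:-1]) in both apps follows from the "compute gradcam in batch" change above: compute_gradcam now appears to return one list of attention maps per image in the batch. A small sketch of the assumed access pattern, reusing the variables from the hunk above:

```
# Assumed return shape after the batched Grad-CAM change (inferred from the diffs
# above, not verified against the released compute_gradcam implementation):
#   gradcam[i]       -> attention maps for the i-th image in the batch
#   gradcam[i][1]    -> map averaged over the question tokens for image i
#   gradcam[i][2:-1] -> one map per question token, excluding special tokens
gradcam, output = compute_gradcam(model, img, qry, qry_tok, block_num=layer_num)

avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)  # single image -> batch index 0
per_token_maps = [getAttMap(norm_img, g, blur=True) for g in gradcam[0][2:-1]]
```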
32 changes: 32 additions & 0 deletions dataset_card/gqa.md
@@ -0,0 +1,32 @@
![From https://arxiv.org/abs/1902.09506.pdf.](imgs/gqa.png)

# GQA Dataset

## Description
(from https://cs.stanford.edu/people/dorarad/gqa/about.html)

GQA is a VQA dataset over real-world images that requires visual, spatial, and compositional reasoning.
It consists of 22M questions and 110K images.

## Task
(from https://arxiv.org/abs/1902.09506)

Given an image and a question, the model is required to output a correct answer.
GQA questions require spatial understanding, multiple reasoning skills, and multi-step inference.

## Metrics

The metrics are accuracy, consistency, validity, and plausibility. The most commonly reported metric is accuracy (a minimal scoring sketch follows the references below).

## Leaderboard

TBD

## Auto-Downloading

```
cd lavis/datasets/download_scripts && python download_gqa.py
```

## References
"GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering", Drew A. Hudson, Christopher D. Manning
Binary file added dataset_card/imgs/gqa.png
9 changes: 4 additions & 5 deletions examples/blip_text_localization.ipynb
@@ -150,7 +150,7 @@
"metadata": {},
"outputs": [],
"source": [
"avg_gradcam = getAttMap(norm_img, gradcam[1], blur=True)"
"avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)"
]
},
{
@@ -214,7 +214,7 @@
"num_image = len(txt_tokens.input_ids[0]) - 2\n",
"fig, ax = plt.subplots(num_image, 1, figsize=(15, 5 * num_image))\n",
"\n",
"gradcam_iter = iter(gradcam[2:-1])\n",
"gradcam_iter = iter(gradcam[0][2:-1])\n",
"token_id_iter = iter(txt_tokens.input_ids[0][1:-1])\n",
"\n",
"for i, (gradcam, token_id) in enumerate(zip(gradcam_iter, token_id_iter)):\n",
@@ -230,7 +230,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.10 ('base')",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -244,9 +244,8 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
5 changes: 3 additions & 2 deletions lavis/common/vqa_tools/vqa_eval.py
@@ -16,15 +16,16 @@


class VQAEval:
-def __init__(self, vqa, vqaRes, n=2):
+def __init__(self, vqa=None, vqaRes=None, n=2):
self.n = n
self.accuracy = {}
self.evalQA = {}
self.evalQuesType = {}
self.evalAnsType = {}
self.vqa = vqa
self.vqaRes = vqaRes
-self.params = {"question_id": vqa.getQuesIds()}
+if vqa is not None:
+    self.params = {"question_id": vqa.getQuesIds()}
self.contractions = {
"aint": "ain't",
"arent": "aren't",
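
Making vqa and vqaRes optional means VQAEval can now be constructed without full dataset/result objects, e.g. just to reuse its answer-normalization machinery when logging VQA results (the "log vqa result" change above). A rough sketch, assuming the processPunctuation and processDigitArticle helpers from the original VQA evaluation tool are still present in this class:

```
from lavis.common.vqa_tools.vqa_eval import VQAEval

# No vqa/vqaRes objects needed any more; self.params is simply left unset.
evaluator = VQAEval(n=2)

# Hypothetical use of the standard normalization helpers on a raw prediction.
raw_answer = "Two dogs."
normalized = evaluator.processDigitArticle(evaluator.processPunctuation(raw_answer.lower()))
print(normalized)  # expected to resemble "2 dogs", depending on the helper implementations
```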
30 changes: 30 additions & 0 deletions lavis/configs/datasets/gqa/balanced_testdev.yaml
@@ -0,0 +1,30 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - gqa/annotations/train_balanced_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
          storage:
            - gqa/annotations/testdev_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - gqa/annotations/test_balanced_questions.json
      images:
        storage: gqa/images/
30 changes: 30 additions & 0 deletions lavis/configs/datasets/gqa/balanced_val.yaml
@@ -0,0 +1,30 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - gqa/annotations/train_balanced_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json
          storage:
            - gqa/annotations/val_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - gqa/annotations/test_balanced_questions.json
      images:
        storage: gqa/images/
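
These dataset configs are what the GQA builder reads at load time. A quick sanity check is to load the dataset directly; this sketch assumes the standard LAVIS load_dataset helper covers the new gqa entry, and the per-sample field names are assumptions:

```
from lavis.datasets.builders import load_dataset

# Uses the annotation/storage paths declared in the gqa configs
# (download them first with lavis/datasets/download_scripts/download_gqa.py).
gqa = load_dataset("gqa")

print(gqa.keys())            # e.g. dict_keys(['train', 'val', 'test'])
sample = gqa["val"][0]
print(sample["text_input"])  # the question (field name assumed)
print(sample["answer"])      # the ground-truth answer (field name assumed)
```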
2 changes: 1 addition & 1 deletion lavis/configs/datasets/gqa/defaults.yaml
@@ -33,4 +33,4 @@ datasets:
- aokvqa/annotations/aokvqa_v1p0_test.json
- aokvqa/annotations/large_vocab_train_lavis.json
images:
-storage: coco/images/
+storage: gqa/images/
59 changes: 59 additions & 0 deletions lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml
@@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: pnp_vqa

  image_question_matching_model:
    arch: blip_image_text_matching
    load_finetuned: True

    finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth"

    # vit encoder
    vit_type: "large"
    vit_grad_ckpt: False
    vit_ckpt_layer: 0

    image_size: 384

    # bert config
    med_config_path: "configs/models/med_large_config.json"

    embed_dim: 256

  image_captioning_model:
    arch: blip_caption
    load_finetuned: True

    finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth"

    vit_type: "large"
    vit_grad_ckpt: True
    vit_ckpt_layer: 5

    image_size: 384

    # bert config
    med_config_path: "configs/models/med_large_config.json"

    # generation configs
    prompt: "a picture of "

  question_answering_model:
    arch: pnp_unifiedqav2_fid

    pretrained: "allenai/unifiedqa-v2-t5-3b-1363200"

    t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json"

preprocess:
  vis_processor:
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    eval:
      name: "blip_caption"
59 changes: 59 additions & 0 deletions lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml
@@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: pnp_vqa

  image_question_matching_model:
    arch: blip_image_text_matching
    load_finetuned: True

    finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth"

    # vit encoder
    vit_type: "large"
    vit_grad_ckpt: False
    vit_ckpt_layer: 0

    image_size: 384

    # bert config
    med_config_path: "configs/models/med_large_config.json"

    embed_dim: 256

  image_captioning_model:
    arch: blip_caption
    load_finetuned: True

    finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth"

    vit_type: "large"
    vit_grad_ckpt: True
    vit_ckpt_layer: 5

    image_size: 384

    # bert config
    med_config_path: "configs/models/med_large_config.json"

    # generation configs
    prompt: "a picture of "

  question_answering_model:
    arch: pnp_unifiedqav2_fid

    pretrained: "allenai/unifiedqa-v2-t5-base-1363200"

    t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json"

preprocess:
  vis_processor:
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    eval:
      name: "blip_caption"
59 changes: 59 additions & 0 deletions lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml
@@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: pnp_vqa

  image_question_matching_model:
    arch: blip_image_text_matching
    load_finetuned: True

    finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth"

    # vit encoder
    vit_type: "large"
    vit_grad_ckpt: False
    vit_ckpt_layer: 0

    image_size: 384

    # bert config
    med_config_path: "configs/models/med_large_config.json"

    embed_dim: 256

  image_captioning_model:
    arch: blip_caption
    load_finetuned: True

    finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth"

    vit_type: "large"
    vit_grad_ckpt: True
    vit_ckpt_layer: 5

    image_size: 384

    # bert config
    med_config_path: "configs/models/med_large_config.json"

    # generation configs
    prompt: "a picture of "

  question_answering_model:
    arch: pnp_unifiedqav2_fid

    pretrained: "allenai/unifiedqa-v2-t5-large-1363200"

    t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json"

preprocess:
  vis_processor:
    eval:
      name: "blip_image_eval"
      image_size: 384
  text_processor:
    eval:
      name: "blip_caption"