add inference scripts for internlm-xcomposer
teowu committed Nov 9, 2023
1 parent 8f5824b commit 0d8ebfd
Showing 4 changed files with 322 additions and 3 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -294,7 +294,7 @@ print(f'Bot: {response}')
<summary>Multi-choice question (MCQ) in Q-Bench.</summary>

```shell
python eval_scripts/mplug_owl_2/eval_qbench_mcq.py
python eval_scripts/internlm_xcomposer_vl/eval_qbench_mcq.py
```

</details>
@@ -306,13 +306,13 @@ python eval_scripts/mplug_owl_2/eval_qbench_mcq.py
<strong>Image Quality Assessment:</strong>

```shell
python eval_scripts/mplug_owl_2/eval_image_quality.py
python eval_scripts/internlm_xcomposer_vl/eval_image_quality.py
```

<strong>Video Quality Assessment:</strong>

```shell
python eval_scripts/mplug_owl_2/eval_video_quality.py
python eval_scripts/internlm_xcomposer_vl/eval_video_quality.py
```

</details>
87 changes: 87 additions & 0 deletions eval_scripts/internlm_xcomposer_vl/eval_image_quality.py
@@ -0,0 +1,87 @@
from transformers import AutoModel, AutoTokenizer
import torch

import random
from copy import deepcopy
from PIL import Image
import json
from tqdm import tqdm

torch.set_grad_enabled(False)


import os
os.makedirs("results/mix-internlm_xcomposer_vl/",exist_ok=True)
torch.manual_seed(1234)

# init model and tokenizer
model = AutoModel.from_pretrained('DLight1551/internlm-xcomposer-vl-7b-qinstruct-full', trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained('DLight1551/internlm-xcomposer-vl-7b-qinstruct-full', trust_remote_code=True)
model.tokenizer = tokenizer

image_paths = [
"../datasets/AGIQA-3K/database/",
"../datasets/1024x768/",
"../datasets/SPAQ/",
"../datasets/FLIVE_Database/database/",
"../datasets/LIVEC/Images/",
"../datasets/CGIQA-6K/database/",
"../datasets/kadid10k/images/",
]

json_prefix = "../datasets/json/"
jsons = [
json_prefix + "agi.json",
json_prefix + "koniq.json",
json_prefix + "spaq.json",
json_prefix + "flive.json",
json_prefix + "livec.json",
json_prefix + "cgi.json",
json_prefix + "kadid.json",
]

def get_logits(model, text, image_path):
    image = Image.open(image_path).convert("RGB")
    with torch.cuda.amp.autocast():
        image = model.vis_processor(image).unsqueeze(0).to(model.device)
        img_embeds = model.encode_img(image)
        prompt_segs = text.split('<ImageHere>')
        prompt_seg_tokens = [
            model.tokenizer(seg,
                            return_tensors='pt',
                            add_special_tokens=i == 0).
            to(model.internlm_model.model.embed_tokens.weight.device).input_ids
            for i, seg in enumerate(prompt_segs)
        ]
        prompt_seg_embs = [
            model.internlm_model.model.embed_tokens(seg)
            for seg in prompt_seg_tokens
        ]
        prompt_seg_embs = [prompt_seg_embs[0], img_embeds, prompt_seg_embs[1]]

        prompt_embs = torch.cat(prompt_seg_embs, dim=1)

        return model.internlm_model(
            inputs_embeds=prompt_embs).logits[:, -1]

for image_path, input_json in zip(image_paths, jsons):
    with open(input_json) as f:
        iqa_data = json.load(f)

    for i, llddata in enumerate(tqdm(iqa_data, desc=image_path)):
        message = "Rate the quality of the image."
        # get_logits expects exactly one <ImageHere> placeholder, so wrap the
        # instruction in the chat template used by the MCQ script, ending with
        # the Q-Bench-style "The quality of the image is" probing anchor.
        prompt = (" <|User|>:<ImageHere>" + message
                  + "<TOKENS_UNUSED_0>" + " <|Bot|>: The quality of the image is")

        llddata["logit_good"] = 0.
        llddata["logit_poor"] = 0.

        images = [image_path + llddata["img_path"]]
        for image in images:
            # 1st dialogue turn
            output_logits = get_logits(model, prompt, image)
            probs, inds = output_logits.sort(dim=-1, descending=True)
            # 18682 / 5527 are the tokenizer ids for "good" / "poor"
            lgood, lpoor = output_logits[0, 18682].item(), output_logits[0, 5527].item()
            llddata["logit_good"] += lgood
            llddata["logit_poor"] += lpoor

        with open(f"results/mix-internlm_xcomposer_vl/{input_json.split('/')[-1]}", "a") as wf:
            json.dump(llddata, wf)
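
Because the loop opens each result file in append mode and calls `json.dump` with no separator, the output is a stream of back-to-back JSON objects rather than a single JSON array. A minimal reader for that format (illustrative only; `read_records` is not part of this commit):

```python
import json

def read_records(path):
    # The script above appends one json.dump() per sample with no separator,
    # so the file is a stream of concatenated JSON objects; raw_decode walks
    # through them one by one.
    decoder = json.JSONDecoder()
    with open(path) as f:
        buf = f.read()
    records, pos = [], 0
    while pos < len(buf):
        obj, end = decoder.raw_decode(buf, pos)
        records.append(obj)
        pos = end
    return records

# e.g. read_records("results/mix-internlm_xcomposer_vl/koniq.json")
```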
143 changes: 143 additions & 0 deletions eval_scripts/internlm_xcomposer_vl/eval_qbench_mcq.py
@@ -0,0 +1,143 @@
from transformers import AutoModel, AutoTokenizer
from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM

import random
from copy import deepcopy
import io
import os
import base64
import torch
import numpy as np
from transformers import StoppingCriteria, StoppingCriteriaList
import os
from typing import Optional
import xlsxwriter
import pandas as pd
from PIL import Image
import pandas as pd
from torch.utils.data import Dataset
import torchvision

torch.set_grad_enabled(False)

torch.manual_seed(1234)

model = AutoModel.from_pretrained('DLight1551/internlm-xcomposer-vl-7b-qinstruct-full', trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained('DLight1551/internlm-xcomposer-vl-7b-qinstruct-full', trust_remote_code=True)
model.tokenizer = tokenizer

def generate_answer(model, text, image_path):
    image = Image.open(image_path).convert("RGB")
    with torch.cuda.amp.autocast():
        image = model.vis_processor(image).unsqueeze(0).to(model.device)
        img_embeds = model.encode_img(image)
        prompt_segs = text.split('<ImageHere>')
        prompt_seg_tokens = [
            model.tokenizer(seg,
                            return_tensors='pt',
                            add_special_tokens=i == 0).
            to(model.internlm_model.model.embed_tokens.weight.device).input_ids
            for i, seg in enumerate(prompt_segs)
        ]
        prompt_seg_embs = [
            model.internlm_model.model.embed_tokens(seg)
            for seg in prompt_seg_tokens
        ]
        prompt_seg_embs = [prompt_seg_embs[0], img_embeds, prompt_seg_embs[1]]

        prompt_embs = torch.cat(prompt_seg_embs, dim=1)

        outputs = model.internlm_model.generate(
            inputs_embeds=prompt_embs,
            max_new_tokens=5,
            num_beams=5,
            do_sample=False,
            min_length=1,
            top_p=0.9,
            repetition_penalty=1.5,
            length_penalty=1.0,
            temperature=1.0,
            stopping_criteria=stopping_criteria,
        )
    #print (outputs)
    output_token = outputs[0]
    if output_token[0] == 0:
        output_token = output_token[1:]
    if output_token[0] == 1:
        output_token = output_token[1:]
    output_text = model.tokenizer.decode(output_token,
                                         add_special_tokens=False)

    output_text = output_text.split(model.eoa)[0]
    output_text = output_text.split('<|Bot|>')[-1].strip()
    return output_text

class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = stops

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True

        return False

stop_words_ids = [
torch.tensor([103027]).cuda(), ### end of human
torch.tensor([103028]).cuda(), ### end of bot
]
stopping_criteria = StoppingCriteriaList(
[StoppingCriteriaSub(stops=stop_words_ids)])

## define split/language here ##
lang = "en" # en | zh
split = "test" # dev | test (not supported for you)
## define split/language here ##

import json
from tqdm import tqdm
if lang == "en":
with open(f"llvisionqa_{split}.json") as f:
llvqa_data = json.load(f)
elif lang == "zh":
zh_split = "验证集" if split == "dev" else "测试集"
with open(f"质衡-问答-{zh_split}.json") as f:
llvqa_data = json.load(f)
else:
raise NotImplementedError("Q-Bench does not support languages other than English (en) and Chinese (zh) yet. Contact us (https://github.com/Q-Future/Q-Bench/) to convert Q-Bench into more languages.")


correct = np.zeros((3,4))
all_ = np.zeros((3,4))
answers = {}
for llddata in tqdm(llvqa_data):
    t, c = llddata["type"], llddata["concern"]

    options_prompt = ''
    correct_choice = None  # only known for the dev split
    for choice, ans in zip(["A.", "B.", "C.", "D."], llddata["candidates"]):
        options_prompt += f"{choice} {ans}\n"
        if "correct_ans" in llddata and ans == llddata["correct_ans"]:
            correct_choice = choice[0]

    img_prompt = ' <|User|>:<ImageHere>'
    txt_prompt = 'Please answer this question by choosing the correct choice.'
    context = 'N/A'
    mid_prompt = 'Context: ' + context + '\nQuestion: ' + llddata["question"] + '\nOptions: ' + options_prompt
    ans_prompt = ' <|Bot|>: Answer: The answer is'
    text = img_prompt + txt_prompt + mid_prompt + '<TOKENS_UNUSED_0>' + ans_prompt
    print(text)

    img_path = "../datasets/LLVQA/images/" + llddata["img_path"]
    # 1st dialogue turn
    response = generate_answer(model, text, img_path)
    all_[t][c] += 1
    if response[0] not in ['A', 'B', 'C', 'D']:
        print("[Response]: {}, [Correct Ans]: {}".format(response, correct_choice))
    if split == 'dev' and response[0] == correct_choice:
        correct[t][c] += 1

print(correct.sum(1) / all_.sum(1))
print(correct.sum(0) / all_.sum(0))
print("Final Correctness:", correct.sum() / all_.sum())
89 changes: 89 additions & 0 deletions eval_scripts/internlm_xcomposer_vl/eval_video_quality.py
@@ -0,0 +1,89 @@
from transformers import AutoModel, AutoTokenizer
import torch

import random
from copy import deepcopy
import json
from tqdm import tqdm
from decord import VideoReader
from PIL import Image

torch.set_grad_enabled(False)


import os
os.makedirs("results/mix-internlm_xcomposer_vl/",exist_ok=True)
torch.manual_seed(1234)

# init model and tokenizer
model = AutoModel.from_pretrained('DLight1551/internlm-xcomposer-vl-7b-qinstruct-full', trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained('DLight1551/internlm-xcomposer-vl-7b-qinstruct-full', trust_remote_code=True)
model.tokenizer = tokenizer

print(tokenizer(["good"]), tokenizer.decode(5527))

image_paths = [
"../datasets/KoNViD_1k_videos/",
]

json_prefix = "../datasets/json/"
jsons = [
json_prefix + "konvid.json",
]

def load_video(video_file):
    vr = VideoReader(video_file)

    # Get video frame rate
    fps = vr.get_avg_fps()

    # Calculate frame indices for 1fps
    frame_indices = [int(fps * i) for i in range(int(len(vr) / fps))]

    return [Image.fromarray(vr[index].asnumpy()) for index in frame_indices]

def get_logits(model, text, image):
    with torch.cuda.amp.autocast():
        image = model.vis_processor(image).unsqueeze(0).to(model.device)
        img_embeds = model.encode_img(image)
        prompt_segs = text.split('<ImageHere>')
        prompt_seg_tokens = [
            model.tokenizer(seg,
                            return_tensors='pt',
                            add_special_tokens=i == 0).
            to(model.internlm_model.model.embed_tokens.weight.device).input_ids
            for i, seg in enumerate(prompt_segs)
        ]
        prompt_seg_embs = [
            model.internlm_model.model.embed_tokens(seg)
            for seg in prompt_seg_tokens
        ]
        prompt_seg_embs = [prompt_seg_embs[0], img_embeds, prompt_seg_embs[1]]

        prompt_embs = torch.cat(prompt_seg_embs, dim=1)

        return model.internlm_model(
            inputs_embeds=prompt_embs).logits[:, -1]


for image_path, input_json in zip(image_paths, jsons):
    with open(input_json) as f:
        iqa_data = json.load(f)

    for i, llddata in enumerate(tqdm(iqa_data, desc=image_path)):
        message = "Rate the quality of the image."
        # As in the image script, get_logits expects one <ImageHere> placeholder,
        # so wrap the per-frame instruction in the chat template with the
        # Q-Bench-style "The quality of the image is" probing anchor.
        prompt = (" <|User|>:<ImageHere>" + message
                  + "<TOKENS_UNUSED_0>" + " <|Bot|>: The quality of the image is")

        images = load_video(image_path + llddata["img_path"])
        llddata["logit_good"] = 0
        llddata["logit_poor"] = 0

        for image in images:
            # accumulate the "good" (18682) / "poor" (5527) logits over the 1 fps frames
            output_logits = get_logits(model, prompt, image)
            probs, inds = output_logits.sort(dim=-1, descending=True)
            lgood, lpoor = output_logits[0, 18682].item(), output_logits[0, 5527].item()
            llddata["logit_good"] += lgood
            llddata["logit_poor"] += lpoor

        with open(f"results/mix-internlm_xcomposer_vl/{input_json.split('/')[-1]}", "a") as wf:
            json.dump(llddata, wf)
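
The loop above sums the two logits over every sampled frame, so turning a record into a per-video score needs the frame count back. A minimal post-processing sketch (an assumed convention mirroring Q-Bench-style binary-softmax scoring, not something this commit includes):

```python
import math

def video_quality_score(record, num_frames):
    # Average the summed logits over the sampled frames, then take a binary
    # softmax over "good" vs. "poor" to get a scalar quality score.
    g = record["logit_good"] / num_frames
    p = record["logit_poor"] / num_frames
    return math.exp(g) / (math.exp(g) + math.exp(p))

# Hypothetical record from an 8-frame clip, for illustration only.
print(video_quality_score({"logit_good": 12.0, "logit_poor": 4.0}, num_frames=8))  # ≈ 0.73
```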
