Visual-Agent · xjtupanda · Jun 23, 2025
diff --git a/eval/eval_hrbench.py b/eval/eval_hrbench.py
@@ -13,6 +13,8 @@
 import io
 from openai import OpenAI
 import requests
+import copy
+import pandas as pd
 
 
 parser = argparse.ArgumentParser()

diff --git a/eval/eval_vstar.py b/eval/eval_vstar.py
@@ -13,7 +13,7 @@
 import io
 from openai import OpenAI
 import requests
-
+from random import shuffle
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--model_name', type=str, default='qwen', help='Model name for result save')
@@ -127,7 +127,9 @@ def process(img_arg):
         anno = json.load(f)
     question = anno['question']
     options = anno['options']
-
+    correct_answer = anno['options'][0]
+    shuffle(options)
+
     option_str = "\n"
     for i in range(len(options)):
         option_str += abc_map[i + 1] + '. ' + options[i] + '\n'
@@ -270,7 +272,8 @@ def process(img_arg):
     save_info = {}
     save_info['image'] = img
     save_info['question'] = question
-    save_info['answer'] = anno['options'][0]
+    save_info['answer'] = correct_answer
+    save_info['answer_choice'] = chr(ord('A') + options.index(correct_answer))
     save_info['pred_ans'] = output_text
     save_info['pred_output'] = print_messages
     save_info['status'] = status

diff --git a/eval/judge_result.py b/eval/judge_result.py
@@ -134,23 +134,24 @@ def process(line):
     line = line.strip()
     data = json.loads(line)
     question = data['question']
+    choice = data['answer_choice']
     answer = data['answer']
     pred_ans = data['pred_ans']
     pred_output = data['pred_output']
-    answer = 'A. ' + answer
+    answer = f"{choice}. {answer}"
 
     if '\\boxed' in pred_ans:
         pred_ans = pred_ans.split('\\boxed{')[1].split('}')[0]
 
     # rule base check
     acc_reward = 0.0
     if len(pred_ans)==1:
-        if pred_ans == 'A':
+        if pred_ans == choice: #'A':
             acc_reward = 1.0
         else:
             acc_reward = 0.0
     elif len(pred_ans) == 2 and '.' in pred_ans:
-        if 'A' in pred_ans:
+        if choice in pred_ans: #'A' in pred_ans:
             acc_reward = 1.0
         else:
             acc_reward = 0.0

diff --git a/eval/judge_result_hrbench.py b/eval/judge_result_hrbench.py
@@ -36,7 +36,7 @@
 else:
     eval_model_name = args.eval_model_name
 
-hrbench_path = args.vstar_bench_path
+hrbench_path = args.hrbench_path
 result_root_path = args.save_path
 result_root_path = os.path.join(result_root_path, args.model_name)
 
@@ -129,6 +129,7 @@ def process(line):
     data = json.loads(line)
     question = data['question']
     answer = data['answer']
+    answer_str = data['answer_str']
     pred_ans = data['pred_ans']
     pred_output = data['pred_output']
     category = data['category']
@@ -148,7 +149,9 @@ def process(line):
             acc_reward = 1.0
         else:
             acc_reward = 0.0
-    elif answer in pred_ans:
+    # elif answer in pred_ans:
+    #     acc_reward = 1.0
+    elif answer_str in pred_ans:
         acc_reward = 1.0
     else:
         full_prompt = get_prompt(pred_ans, answer, question)