diff --git a/llava/eval/eval_gpt_review.py b/llava/eval/eval_gpt_review.py
index 8af4559c6..1060c3d7d 100644
--- a/llava/eval/eval_gpt_review.py
+++ b/llava/eval/eval_gpt_review.py
@@ -27,9 +27,9 @@ def get_eval(content: str, max_tokens: int):
             )
             break
         except openai.error.RateLimitError:
-            pass
+            print('RateLimitError: Retrying after sleeping')
         except Exception as e:
-            print(e)
+            print(f'Exception occurred: {e}')
         time.sleep(NUM_SECONDS_TO_SLEEP)
 
     print('success!')
@@ -64,50 +64,51 @@ def parse_score(review):
 
     ray.init()
 
-    f_q = open(os.path.expanduser(args.question))
-    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
-    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
-    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
+    with open(os.path.expanduser(args.question)) as f_q, \
+         open(os.path.expanduser(args.answer_list[0])) as f_ans1, \
+         open(os.path.expanduser(args.answer_list[1])) as f_ans2, \
+         open(os.path.expanduser(args.rule), 'r') as f_rule, \
+         open(f'{args.output}', 'w') as review_file:
 
-    review_file = open(f'{args.output}', 'w')
+        rule_dict = json.load(f_rule)
 
-    js_list = []
-    handles = []
-    idx = 0
-    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
-        # if idx == 1:
-        #     break
+        js_list = []
+        handles = []
+        idx = 0
+        for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
+            # if idx == 1:
+            #     break
 
-        ques = json.loads(ques_js)
-        ans1 = json.loads(ans1_js)
-        ans2 = json.loads(ans2_js)
+            ques = json.loads(ques_js)
+            ans1 = json.loads(ans1_js)
+            ans2 = json.loads(ans2_js)
 
-        category = json.loads(ques_js)['category']
-        if category in rule_dict:
-            rule = rule_dict[category]
-        else:
-            rule = rule_dict['default']
-        prompt = rule['prompt']
-        role = rule['role']
-        content = (f'[Question]\n{ques["text"]}\n\n'
-                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
-                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
-                   f'[System]\n{prompt}\n\n')
-        js_list.append({
-            'id': idx+1,
-            'question_id': ques['question_id'],
-            'answer1_id': ans1['answer_id'],
-            'answer2_id': ans2['answer_id'],
-            'category': category})
-        idx += 1
-        handles.append(get_eval.remote(content, args.max_tokens))
-        # To avoid the rate limit set by OpenAI
-        time.sleep(NUM_SECONDS_TO_SLEEP)
+            category = json.loads(ques_js)['category']
+            if category in rule_dict:
+                rule = rule_dict[category]
+            else:
+                rule = rule_dict['default']
+            prompt = rule['prompt']
+            role = rule['role']
+            content = (f'[Question]\n{ques["text"]}\n\n'
+                       f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
+                       f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
+                       f'[System]\n{prompt}\n\n')
+            js_list.append({
+                'id': idx+1,
+                'question_id': ques['question_id'],
+                'answer1_id': ans1['answer_id'],
+                'answer2_id': ans2['answer_id'],
+                'category': category})
+            idx += 1
+            handles.append(get_eval.remote(content, args.max_tokens))
+            # To avoid the rate limit set by OpenAI
+            time.sleep(NUM_SECONDS_TO_SLEEP)
+
+        reviews = ray.get(handles)
+        for idx, review in enumerate(reviews):
+            scores = parse_score(review)
+            js_list[idx]['content'] = review
+            js_list[idx]['tuple'] = scores
+            review_file.write(json.dumps(js_list[idx]) + '\n')
 
-    reviews = ray.get(handles)
-    for idx, review in enumerate(reviews):
-        scores = parse_score(review)
-        js_list[idx]['content'] = review
-        js_list[idx]['tuple'] = scores
-        review_file.write(json.dumps(js_list[idx]) + '\n')
-    review_file.close()
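
Note on the first hunk: it only shows the except clauses, so for context the surrounding retry loop in get_eval after this change looks roughly like the sketch below. The model name and message payload are assumptions (they sit above the hunk and are not shown here); the except/sleep structure matches the diff. Printing on RateLimitError instead of pass makes silent rate-limit stalls visible in the logs.

    import time
    import openai

    NUM_SECONDS_TO_SLEEP = 3

    def get_eval(content: str, max_tokens: int):
        # Retry until the OpenAI call succeeds, sleeping between attempts.
        while True:
            try:
                response = openai.ChatCompletion.create(  # openai<1.0-style API, as implied by openai.error
                    model='gpt-4',  # assumed; defined above the hunk
                    messages=[{'role': 'user', 'content': content}],  # assumed payload
                    temperature=0.2,
                    max_tokens=max_tokens,
                )
                break
            except openai.error.RateLimitError:
                print('RateLimitError: Retrying after sleeping')
            except Exception as e:
                print(f'Exception occurred: {e}')
            time.sleep(NUM_SECONDS_TO_SLEEP)

        print('success!')
        return response['choices'][0]['message']['content']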