#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://help.aliyun.com/zh/model-studio/qwen-api-reference
https://help.aliyun.com/zh/model-studio/models
https://help.aliyun.com/zh/model-studio/models?spm=a2c4g.11186623.0.i4#d4ccf72f23jh9
https://help.aliyun.com/zh/model-studio/text-generation?spm=a2c4g.11186623.0.0.6b772e068nnT1J#24e54b27d4agt
Deep-Thinking
https://help.aliyun.com/zh/model-studio/deep-thinking?spm=a2c4g.11186623.0.0.56076f58IJd4mP
"""
import argparse
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no extra install needed

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))

from openai import OpenAI

from project_settings import environment, project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # default="qwen3-max-2025-09-23",
        # default="qwen3-max-preview",
        default="qwen-plus-2025-12-01",
        # default="qwen-turbo-2025-07-15",
        # default="qwen-flash-2025-07-28",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_name",
        default="agent-nxcloud-zh-375-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    parser.add_argument(
        "--service",
        default="aliyun_api_key_bj",
        # default="aliyun_api_key_sgp",
        type=str
    )
    parser.add_argument(
        "--create_time_str",
        default="null",
        # default="20251209_104855",
        type=str
    )
    parser.add_argument(
        "--interval",
        default=1,
        type=int
    )
    args = parser.parse_args()
    return args


def conversation_to_str(conversation: list):
    """Flatten a list of {"role": ..., "content": ...} turns into "role: content" lines."""
    conversation_str = ""
    for turn in conversation:
        role = turn["role"]
        content = turn["content"]
        row_ = f"{role}: {content}\n"
        conversation_str += row_
    return conversation_str
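

# Expected shape of one eval-dataset JSONL row, inferred from the fields read in
# main() below (an assumption, not a published spec):
# {
#     "idx": 0,
#     "system_prompt": "...",
#     "conversation": [{"role": "user", "content": "..."}, ...],
#     "examples": [{"conversation": [...], "outputs": {"output": "A", "explanation": "..."}}, ...],
#     "choices": [{"condition": "...", "choice_letter": "A"}, ...],
#     "response": "A"
# }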
def main():
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"aliyun_nxcloud_choice2/aliyun/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)
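    # Example resulting path (illustrative, combining the argparse defaults above
    # with a sample timestamp):
    # data/eval_data/aliyun_nxcloud_choice2/aliyun/qwen-plus-2025-12-01/shenzhen_sase/aliyun_api_key_bj/20251209_104855/agent-nxcloud-zh-375-choice.jsonl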
    api_key = environment.get(args.service, dtype=str)
    if args.service == "aliyun_api_key_bj":
        base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    elif args.service == "aliyun_api_key_sgp":
        base_url = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    else:
        raise AssertionError(f"invalid service: {args.service}")

    client = OpenAI(
        base_url=base_url,
        # DashScope API key, loaded above from the project environment settings.
        api_key=api_key
    )
    total = 0
    total_correct = 0

    # Resume support: reload finished indices and the running totals from any
    # existing output file, so an interrupted run can be continued.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")
    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, \
            open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            system_prompt = row["system_prompt"]
            conversation = row["conversation"]
            examples = row["examples"]
            choices = row["choices"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            # conversation
            conversation_str = conversation_to_str(conversation)

            examples_str = ""
            for example in examples:
                conversation_ = example["conversation"]
                outputs = example["outputs"]
                output = outputs["output"]
                explanation = outputs["explanation"]

                examples_str += conversation_to_str(conversation_)
                output_json = {"Explanation": explanation, "output": output}
                output_json_str = json.dumps(output_json, ensure_ascii=False)
                examples_str += f"\nOutput: {output_json_str}\n"
            # print(examples_str)

            choices_str = ""
            for choice in choices:
                condition = choice["condition"]
                choice_letter = choice["choice_letter"]
                row_ = f"{condition}, output: {choice_letter}\n"
                choices_str += row_
            # choices_str += "\nRemember to output ONLY the corresponding letter.\nYour output is:"
            choices_str += "\nPlease use only 10-15 words to explain.\nOutput:"

            # prompt = f"{system_prompt}\n\n**Output**\n{choices_}\n**Examples**\n{examples_}"
            prompt1 = f"{system_prompt}\n\n**Examples**\n{examples_str}"
            prompt2 = f"**Conversation**\n{conversation_str}\n\n**Output**\n{choices_str}"
            # print(prompt1)
            # print(prompt2)
            messages = [
                {"role": "system", "content": prompt1},
                {"role": "user", "content": prompt2},
            ]
            # print(f"messages: {json.dumps(messages, ensure_ascii=False, indent=4)}")
            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")

                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=messages,
                    # enable_thinking is not a standard OpenAI parameter,
                    # so it has to be passed via extra_body.
                    extra_body={"enable_thinking": False},
                    stream=False,
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
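            # The model is expected to reply with a bare JSON object such as
            # {"Explanation": "...", "output": "A"} (an assumption based on the
            # few-shot examples above); any other format fails the parse below
            # and the sample is scored as incorrect.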
# print(f"completion: {completion}")
prediction_str = completion.choices[0].message.content
rid = completion.id
try:
prediction_ = json.loads(prediction_str)
prediction = prediction_["output"]
except Exception as error:
prediction_ = None
prediction = None
correct = 1 if prediction == response else 0
total += 1
total_correct += correct
score = total_correct / total
            row_ = {
                "idx": idx,
                "rid": rid,
                "messages": messages,
                "response": response,
                "prediction": prediction,
                "prediction_": prediction_,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            fout.flush()
    return


if __name__ == "__main__":
    main()
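

# Usage sketch (hypothetical invocation; the API key is resolved through
# project_settings.environment, not read from the shell directly):
#   python3 aliyun_nxcloud_choice2.py \
#       --model_name qwen-plus-2025-12-01 \
#       --eval_dataset_name agent-nxcloud-zh-375-choice.jsonl \
#       --service aliyun_api_key_bj \
#       --interval 1
# Pass the --create_time_str of a previous run (e.g. 20251209_104855) to resume it.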