#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://help.aliyun.com/zh/model-studio/qwen-api-reference
https://help.aliyun.com/zh/model-studio/models
https://help.aliyun.com/zh/model-studio/models?spm=a2c4g.11186623.0.i4#d4ccf72f23jh9
https://help.aliyun.com/zh/model-studio/text-generation?spm=a2c4g.11186623.0.0.6b772e068nnT1J#24e54b27d4agt
Deep-Thinking
https://help.aliyun.com/zh/model-studio/deep-thinking?spm=a2c4g.11186623.0.0.56076f58IJd4mP
"""
import argparse
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no extra install needed

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))

from openai import OpenAI

from project_settings import environment, project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # default="qwen3-max-2025-09-23",
        # default="qwen3-max-preview",
        default="qwen-plus-2025-12-01",
        # default="qwen-turbo-2025-07-15",
        # default="qwen-flash-2025-07-28",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_name",
        default="agent-nxcloud-zh-375-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    parser.add_argument(
        "--service",
        default="aliyun_api_key_bj",
        # default="aliyun_api_key_sgp",
        type=str
    )
    parser.add_argument(
        "--create_time_str",
        default="null",
        # default="20251209_104855",
        type=str
    )
    parser.add_argument(
        "--interval",
        default=1,
        type=int
    )
    args = parser.parse_args()
    return args


def conversation_to_str(conversation: list):
    """Flatten a list of {"role": ..., "content": ...} turns into "role: content" lines."""
    conversation_str = ""
    for turn in conversation:
        role = turn["role"]
        content = turn["content"]
        row_ = f"{role}: {content}\n"
        conversation_str += row_
    return conversation_str
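

# Expected shape of one eval-dataset JSONL row, inferred from the fields read in
# main() below (an assumption, not a published spec):
# {
#     "idx": 0,
#     "system_prompt": "...",
#     "conversation": [{"role": "user", "content": "..."}, ...],
#     "examples": [{"conversation": [...], "outputs": {"output": "A", "explanation": "..."}}, ...],
#     "choices": [{"condition": "...", "choice_letter": "A"}, ...],
#     "response": "A"
# }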
def main():
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"aliyun_nxcloud_choice2/aliyun/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)
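    # Example resulting path (illustrative, combining the argparse defaults above
    # with a sample timestamp):
    # data/eval_data/aliyun_nxcloud_choice2/aliyun/qwen-plus-2025-12-01/shenzhen_sase/aliyun_api_key_bj/20251209_104855/agent-nxcloud-zh-375-choice.jsonl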
    api_key = environment.get(args.service, dtype=str)
    if args.service == "aliyun_api_key_bj":
        base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    elif args.service == "aliyun_api_key_sgp":
        base_url = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
    else:
        raise AssertionError(f"invalid service: {args.service}")

    client = OpenAI(
        base_url=base_url,
        # DashScope API key, loaded above from the project environment settings.
        api_key=api_key
    )
    total = 0
    total_correct = 0

    # Resume support: reload finished indices and the running totals from any
    # existing output file, so an interrupted run can be continued.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")
    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, \
            open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            system_prompt = row["system_prompt"]
            conversation = row["conversation"]
            examples = row["examples"]
            choices = row["choices"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            # conversation
            conversation_str = conversation_to_str(conversation)

            examples_str = ""
            for example in examples:
                conversation_ = example["conversation"]
                outputs = example["outputs"]
                output = outputs["output"]
                explanation = outputs["explanation"]

                examples_str += conversation_to_str(conversation_)
                output_json = {"Explanation": explanation, "output": output}
                output_json_str = json.dumps(output_json, ensure_ascii=False)
                examples_str += f"\nOutput: {output_json_str}\n"
            # print(examples_str)

            choices_str = ""
            for choice in choices:
                condition = choice["condition"]
                choice_letter = choice["choice_letter"]
                row_ = f"{condition}, output: {choice_letter}\n"
                choices_str += row_
            # choices_str += "\nRemember to output ONLY the corresponding letter.\nYour output is:"
            choices_str += "\nPlease use only 10-15 words to explain.\nOutput:"

            # prompt = f"{system_prompt}\n\n**Output**\n{choices_}\n**Examples**\n{examples_}"
            prompt1 = f"{system_prompt}\n\n**Examples**\n{examples_str}"
            prompt2 = f"**Conversation**\n{conversation_str}\n\n**Output**\n{choices_str}"
            # print(prompt1)
            # print(prompt2)
            messages = [
                {"role": "system", "content": prompt1},
                {"role": "user", "content": prompt2},
            ]
            # print(f"messages: {json.dumps(messages, ensure_ascii=False, indent=4)}")
            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")

                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=messages,
                    # enable_thinking is not a standard OpenAI parameter,
                    # so it has to be passed via extra_body.
                    extra_body={"enable_thinking": False},
                    stream=False,
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
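            # The model is expected to reply with a bare JSON object such as
            # {"Explanation": "...", "output": "A"} (an assumption based on the
            # few-shot examples above); any other format fails the parse below
            # and the sample is scored as incorrect.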
# print(f"completion: {completion}")
prediction_str = completion.choices[0].message.content
rid = completion.id
try:
prediction_ = json.loads(prediction_str)
prediction = prediction_["output"]
except Exception as error:
prediction_ = None
prediction = None
correct = 1 if prediction == response else 0
total += 1
total_correct += correct
score = total_correct / total
            row_ = {
                "idx": idx,
                "rid": rid,
                "messages": messages,
                "response": response,
                "prediction": prediction,
                "prediction_": prediction_,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            fout.flush()
    return


if __name__ == "__main__":
    main()
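

# Usage sketch (hypothetical invocation; the API key is resolved through
# project_settings.environment, not read from the shell directly):
#   python3 aliyun_nxcloud_choice2.py \
#       --model_name qwen-plus-2025-12-01 \
#       --eval_dataset_name agent-nxcloud-zh-375-choice.jsonl \
#       --service aliyun_api_key_bj \
#       --interval 1
# Pass the --create_time_str of a previous run (e.g. 20251209_104855) to resume it.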