from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch
### Uncomment if you want to use Ascend NPUs
# import torch_npu
# from torch_npu.contrib import transfer_to_npu
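# (importing torch_npu and transfer_to_npu transparently redirects the torch.cuda calls below to Ascend NPU devices)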
# prepare models and processors
model = AutoModel.from_pretrained(
    "Emova-ollm/emova-qwen-2-5-7b-hf",
    torch_dtype=torch.bfloat16,
    attn_implementation='flash_attention_2', # OR 'sdpa' for NPUs
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
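# Note: 'flash_attention_2' requires the flash-attn package and a compatible GPU; use attn_implementation='sdpa' otherwise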
processor = AutoProcessor.from_pretrained("Emova-ollm/emova-qwen-2-5-7b-hf", trust_remote_code=True)
# only necessary for spoken dialogue
# Note: to run inference with speech inputs/outputs, **emova_speech_tokenizer** is still a required dependency (https://huggingface.co/Emova-ollm/emova_speech_tokenizer_hf#installation).
speech_tokenizer = AutoModel.from_pretrained("Emova-ollm/emova_speech_tokenizer_hf", torch_dtype=torch.float32, trust_remote_code=True).eval().cuda()
processor.set_speech_tokenizer(speech_tokenizer)
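# (the speech tokenizer converts input audio into discrete speech units and decodes generated speech units back to waveform)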
# # Example 1: image-text
# inputs = dict(
#     text=[
#         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
#         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What's shown in this image?"}]},
#         {"role": "assistant", "content": [{"type": "text", "text": "This image shows a red stop sign."}]},
#         {"role": "user", "content": [{"type": "text", "text": "Describe the image in more details."}]},
#     ],
#     images=Image.open('path/to/image')
# )
# Example 2: text-audio
inputs = dict(
    text=[
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": "Please synthesize the speech corresponding to the following text.\nhe hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce"}]},
    ]
)
# # Example 3: image-text-audio
# inputs = dict(
#     text=[{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}],
#     images=Image.open('path/to/image'),
#     audios='path/to/audio'
# )
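# (enable exactly one of the three example input dicts above; Example 2 (text-audio) is active here)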
# run processors
inputs = processor(**inputs, return_tensors="pt")
inputs = inputs.to(model.device)
# prepare generation arguments
gen_kwargs = {"max_new_tokens": 4096, "do_sample": False} # extend with extra sampling parameters (e.g., temperature, top_p) if needed
speech_kwargs = {"speaker": "female", "output_wav_prefix": "output"}
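# (speech_kwargs are passed through to processor.batch_decode below: 'speaker' selects the synthesized voice, 'output_wav_prefix' the prefix of the saved wav files)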
# run generation
# for speech outputs, we will return the saved wav paths (c.f., output_wav_prefix)
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(processor.batch_decode(outputs, skip_special_tokens=True, **speech_kwargs))
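# For the text-audio example above, the printed result should be the path(s) of the saved wav file(s)
# named with output_wav_prefix (the exact file naming is up to the processor implementation).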