from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch

### Uncomment if you want to use Ascend NPUs
# import torch_npu
# from torch_npu.contrib import transfer_to_npu

# prepare models and processors
model = AutoModel.from_pretrained(
    "Emova-ollm/emova-qwen-2-5-7b-hf",
    torch_dtype=torch.bfloat16,
    attn_implementation='flash_attention_2',  # OR 'sdpa' for NPUs
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
processor = AutoProcessor.from_pretrained("Emova-ollm/emova-qwen-2-5-7b-hf", trust_remote_code=True)

# only necessary for spoken dialogue
# Note: to run inference with speech inputs/outputs, **emova_speech_tokenizer** is still a required dependency (https://huggingface.co/Emova-ollm/emova_speech_tokenizer_hf#installation).
speech_tokenizer = AutoModel.from_pretrained("Emova-ollm/emova_speech_tokenizer_hf", torch_dtype=torch.float32, trust_remote_code=True).eval().cuda()
processor.set_speech_tokenizer(speech_tokenizer)

# # Example 1: image-text
# inputs = dict(
#     text=[
#         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
#         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What's shown in this image?"}]},
#         {"role": "assistant", "content": [{"type": "text", "text": "This image shows a red stop sign."}]},
#         {"role": "user", "content": [{"type": "text", "text": "Describe the image in more detail."}]},
#     ],
#     images=Image.open('path/to/image')
# )

# Example 2: text-audio
inputs = dict(
    text=[
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": "Please synthesize the speech corresponding to the following text.\nhe hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce"}]},
    ]
)

# # Example 3: image-text-audio
# inputs = dict(
#     text=[{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}],
#     images=Image.open('path/to/image'),
#     audios='path/to/audio'
# )

# run processors
inputs = processor(**inputs, return_tensors="pt")
inputs = inputs.to(model.device)

# prepare generation arguments
gen_kwargs = {"max_new_tokens": 4096, "do_sample": False}
speech_kwargs = {"speaker": "female", "output_wav_prefix": "output"}  # add if necessary (only needed for speech outputs)

# run generation
# for speech outputs, we will return the saved wav paths (cf. output_wav_prefix)
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(processor.batch_decode(outputs, skip_special_tokens=True, **speech_kwargs))
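
# --- Optional follow-up: inspect the generated speech ---
# A minimal sketch, not part of the official example. As noted above, for spoken
# responses batch_decode returns the saved wav paths (cf. output_wav_prefix); this
# sketch assumes those returned entries are local .wav file paths, and treats
# anything else as a plain text response. It only uses Python's standard `os` and
# `wave` modules. In practice you would capture the return value of the
# batch_decode call above instead of re-decoding as done here.
import os
import wave

decoded = processor.batch_decode(outputs, skip_special_tokens=True, **speech_kwargs)
for item in decoded:
    if isinstance(item, str) and item.endswith(".wav") and os.path.exists(item):
        # report basic metadata of the synthesized speech file
        with wave.open(item, "rb") as wav:
            duration = wav.getnframes() / wav.getframerate()
            print(f"{item}: {wav.getnchannels()} channel(s), {wav.getframerate()} Hz, {duration:.2f}s")
    else:
        # text-only response
        print(item)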