from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch

### Uncomment if you want to use Ascend NPUs
# import torch_npu
# from torch_npu.contrib import transfer_to_npu

# prepare models and processors
model = AutoModel.from_pretrained(
    "Emova-ollm/emova-qwen-2-5-7b-hf",
    torch_dtype=torch.bfloat16,
    attn_implementation='flash_attention_2',  # OR 'sdpa' for NPUs
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
processor = AutoProcessor.from_pretrained("Emova-ollm/emova-qwen-2-5-7b-hf", trust_remote_code=True)

# only necessary for spoken dialogue
# Note: to run inference with speech inputs/outputs, **emova_speech_tokenizer** is still a required dependency (https://huggingface.co/Emova-ollm/emova_speech_tokenizer_hf#installation).
speech_tokenizer = AutoModel.from_pretrained("Emova-ollm/emova_speech_tokenizer_hf", torch_dtype=torch.float32, trust_remote_code=True).eval().cuda()
processor.set_speech_tokenizer(speech_tokenizer)

# # Example 1: image-text
# inputs = dict(
#     text=[
#         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
#         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What's shown in this image?"}]},
#         {"role": "assistant", "content": [{"type": "text", "text": "This image shows a red stop sign."}]},
#         {"role": "user", "content": [{"type": "text", "text": "Describe the image in more detail."}]},
#     ],
#     images=Image.open('path/to/image')
# )

# Example 2: text-audio
inputs = dict(
    text=[
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": "Please synthesize the speech corresponding to the following text.\nhe hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce"}]},
    ]
)

# # Example 3: image-text-audio
# inputs = dict(
#     text=[{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}],
#     images=Image.open('path/to/image'),
#     audios='path/to/audio'
# )

# run processors
inputs = processor(**inputs, return_tensors="pt")
inputs = inputs.to(model.device)

# prepare generation arguments
gen_kwargs = {"max_new_tokens": 4096, "do_sample": False}
speech_kwargs = {"speaker": "female", "output_wav_prefix": "output"}  # add if necessary (only needed for speech outputs)

# run generation
# for speech outputs, we will return the saved wav paths (cf. output_wav_prefix)
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(processor.batch_decode(outputs, skip_special_tokens=True, **speech_kwargs))
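
# --- Optional follow-up: inspect the generated speech ---
# A minimal sketch, not part of the official example. As noted above, for spoken
# responses batch_decode returns the saved wav paths (cf. output_wav_prefix); this
# sketch assumes those returned entries are local .wav file paths, and treats
# anything else as a plain text response. It only uses Python's standard `os` and
# `wave` modules. In practice you would capture the return value of the
# batch_decode call above instead of re-decoding as done here.
import os
import wave

decoded = processor.batch_decode(outputs, skip_special_tokens=True, **speech_kwargs)
for item in decoded:
    if isinstance(item, str) and item.endswith(".wav") and os.path.exists(item):
        # report basic metadata of the synthesized speech file
        with wave.open(item, "rb") as wav:
            duration = wav.getnframes() / wav.getframerate()
            print(f"{item}: {wav.getnchannels()} channel(s), {wav.getframerate()} Hz, {duration:.2f}s")
    else:
        # text-only response
        print(item)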