```python
from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch

### Uncomment if you want to use Ascend NPUs
# import torch_npu
# from torch_npu.contrib import transfer_to_npu

# Prepare models and processors
model = AutoModel.from_pretrained(
    "Emova-ollm/emova-qwen-2-5-7b-hf",
    torch_dtype=torch.bfloat16,
    attn_implementation='flash_attention_2',  # OR 'sdpa' for NPUs
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
processor = AutoProcessor.from_pretrained("Emova-ollm/emova-qwen-2-5-7b-hf", trust_remote_code=True)

# Only necessary for spoken dialogue.
# Note: to run inference with speech inputs/outputs, **emova_speech_tokenizer** is still a necessary dependency
# (https://huggingface.co/Emova-ollm/emova_speech_tokenizer_hf#installation).
speech_tokenizer = AutoModel.from_pretrained("Emova-ollm/emova_speech_tokenizer_hf", torch_dtype=torch.float32, trust_remote_code=True).eval().cuda()
processor.set_speech_tokenizer(speech_tokenizer)
# Example 1: image-text
# inputs = dict(
#     text=[
#         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
#         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What's shown in this image?"}]},
#         {"role": "assistant", "content": [{"type": "text", "text": "This image shows a red stop sign."}]},
#         {"role": "user", "content": [{"type": "text", "text": "Describe the image in more detail."}]},
#     ],
#     images=Image.open('path/to/image')
# )

# Example 2: text-audio
inputs = dict(
    text=[
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": "Please synthesize the speech corresponding to the following text.\nhe hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce"}]},
    ]
)

# Example 3: image-text-audio
# inputs = dict(
#     text=[{"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}],
#     images=Image.open('path/to/image'),
#     audios='path/to/audio'
# )
# Run processors
inputs = processor(**inputs, return_tensors="pt")
inputs = inputs.to(model.device)

# Prepare generation arguments
gen_kwargs = {"max_new_tokens": 4096, "do_sample": False}  # add more if necessary
speech_kwargs = {"speaker": "female", "output_wav_prefix": "output"}

# Run generation
# For speech outputs, the saved wav paths are returned (cf. output_wav_prefix)
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(processor.batch_decode(outputs, skip_special_tokens=True, **speech_kwargs))
```
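
For speech requests, the strings returned by `processor.batch_decode` are paths to the saved wav files (named from `output_wav_prefix`); for text requests they are ordinary response strings. Below is a minimal sketch for inspecting such a result. It repeats the `batch_decode` call for self-containment, and `torchaudio` plus the `.wav` suffix check are our own assumptions, not part of the EMOVA API.

```python
import torchaudio  # assumption: any audio I/O library (e.g. soundfile) works equally well

decoded = processor.batch_decode(outputs, skip_special_tokens=True, **speech_kwargs)
for item in decoded:
    if isinstance(item, str) and item.endswith(".wav"):
        # Speech output: item is the path of a wav file saved under output_wav_prefix.
        waveform, sample_rate = torchaudio.load(item)
        print(f"{item}: {waveform.shape[-1] / sample_rate:.2f}s at {sample_rate} Hz")
    else:
        # Text output: item is the generated response string.
        print(item)
```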