Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import transformers | |
| # import torch | |
| import json | |
| import os | |
| # from transformers import AutoTokenizer, TextStreamer , pipeline | |
| # model_id = "WizardLM/WizardMath-7B-V1.1" | |
| # # Configuration | |
| # runtimeFlag = "cuda:0" #Run on GPU (you can't run GPTQ on cpu) | |
| # cache_dir = None # by default, don't set a cache directory. This is automatically updated if you connect Google Drive. | |
| # scaling_factor = 1.0 # allows for a max sequence length of 16384*6 = 98304! Unfortunately, requires Colab Pro and a V100 or A100 to have sufficient RAM. | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig | |
model_id = "WizardLM/WizardMath-7B-V1.1"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


@st.cache_resource
def _load_tokenizer_and_model(name):
    """Load the tokenizer and the 4-bit quantized causal LM for *name*.

    Streamlit re-executes this script on every widget interaction. Without
    caching, the model would be re-instantiated on each rerun; with
    ``st.cache_resource`` the loaded objects are created once per server
    process and reused afterwards.

    Args:
        name: Hugging Face Hub model identifier to load.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is loaded with the
        module-level ``bnb_config`` and ``device_map="auto"``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        name,
        quantization_config=bnb_config,
        device_map="auto",
    )
    return loaded_tokenizer, loaded_model


# Keep the original module-level names so the rest of the script is unchanged.
tokenizer, model_4bit = _load_tokenizer_and_model(model_id)
# Prompt input and answer generation.
# Example prompt:
#   "Sum of two numbers is 20 and difference is 4. What are the numbers?"
#
# Only one text area is created: two widgets of the same type with the same
# label and no explicit ``key`` make Streamlit raise a duplicate-widget error.
text = st.text_area("Enter question")

# Skip generation for empty or whitespace-only input.
if text and text.strip():
    # Move the inputs to the device the model was actually placed on
    # (it is loaded with device_map="auto") instead of assuming "cuda:0".
    inputs = tokenizer(text, return_tensors="pt").to(model_4bit.device)
    outputs = model_4bit.generate(**inputs, max_new_tokens=512)
    st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))