Spaces:

SagarKeshave
/

math_app

Sleeping

math_app / app.py

Update app.py

6d6c51d verified almost 2 years ago

1.54 kB

	import streamlit as st


	import transformers
	# import torch
	import json
	import os
	# from transformers import AutoTokenizer, TextStreamer , pipeline


	# model_id = "WizardLM/WizardMath-7B-V1.1"


	# # Configuration
	# runtimeFlag = "cuda:0" #Run on GPU (you can't run GPTQ on cpu)
	# cache_dir = None # by default, don't set a cache directory. This is automatically updated if you connect Google Drive.
	# scaling_factor = 1.0 # allows for a max sequence length of 16384*6 = 98304! Unfortunately, requires Colab Pro and a V100 or A100 to have sufficient RAM.

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig



	# Hugging Face Hub id of the checkpoint served by this app.
	model_id = "WizardLM/WizardMath-7B-V1.1"

	# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
	bnb_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_compute_dtype=torch.bfloat16,
	)


	@st.cache_resource
	def _load_tokenizer_and_model(checkpoint):
	    """Load the tokenizer and the 4-bit quantized causal LM for *checkpoint*.

	    Streamlit re-executes this script on every widget interaction. Caching
	    the loaded objects with ``st.cache_resource`` keeps the checkpoint from
	    being re-read and re-quantized each time the user submits a question.

	    Args:
	        checkpoint: Hugging Face Hub model id (or local path) to load.

	    Returns:
	        A ``(tokenizer, model)`` tuple. The model is loaded with the
	        module-level ``bnb_config`` and ``device_map="auto"``.
	    """
	    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    loaded_model = AutoModelForCausalLM.from_pretrained(
	        checkpoint,
	        quantization_config=bnb_config,
	        device_map="auto",
	    )
	    return loaded_tokenizer, loaded_model


	# Same module-level names as before, so the rest of the script is unaffected.
	tokenizer, model_4bit = _load_tokenizer_and_model(model_id)

	# Example input:
	#   "Sum of two numbers is 20 and difference is 4. What are the numbers?"
	#
	# Only one text area is created. The script previously created two widgets
	# with the same label and no `key`, which Streamlit rejects as duplicate
	# widgets; the first one's value was never used anyway.
	text = st.text_area("Enter question")

	# Run generation only once the user has typed something other than whitespace.
	if text and text.strip():
	    # The model is loaded with device_map="auto", so send the inputs to
	    # wherever the model actually lives instead of assuming "cuda:0".
	    inputs = tokenizer(text, return_tensors="pt").to(model_4bit.device)

	    # Cap the answer length at 512 newly generated tokens.
	    outputs = model_4bit.generate(**inputs, max_new_tokens=512)

	    # Decode the first (only) sequence and show it without special tokens.
	    st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))