# import required libraries
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer
import textwrap
import chromadb
import streamlit as st
import uuid
import Utilities as ut
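# Utilities.get_tokens() is assumed to return a dict of configuration values
# (e.g. parsed from a YAML file). The keys this script relies on are:
#
#   BART_model:                         summarization model id, e.g. facebook/bart-large-cnn
#   embedding_model:                    SentenceTransformer model id
#   dataset_chroma_db:                  path to the persistent Chroma database
#   dataset_chroma_db_collection_name:  Chroma collection to write into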
def text_summarizer(text):
    initdict = ut.get_tokens()
    BART_Model_Name = initdict["BART_model"]  # e.g. "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(BART_Model_Name)
    tokenizer = BartTokenizer.from_pretrained(BART_Model_Name)
    # Tokenize the input, truncating to BART's 1024-token limit. (No task
    # prefix is needed: "summarize: " is a T5 convention, not a BART one.)
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    # Beam-search a summary of 50-150 tokens
    summary_ids = model.generate(inputs, max_length=150, min_length=50,
                                 length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Wrap the summary to 80 characters per line for display
    formatted_summary = "\n".join(textwrap.wrap(summary, width=80))
    return formatted_summary
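# Usage sketch (hypothetical input):
#   summary = text_summarizer(long_abstract)
#   print(summary)  # beam-search summary, wrapped at 80 characters per line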
def load_patentBIGdata():
    initdict = ut.get_tokens()
    embedding_model_id = initdict["embedding_model"]
    chromadbpath = initdict["dataset_chroma_db"]
    chromadbcollname = initdict["dataset_chroma_db_collection_name"]
    embedding_model = SentenceTransformer(embedding_model_id)
    chroma_client = chromadb.PersistentClient(path=chromadbpath)
    collection = chroma_client.get_or_create_collection(name=chromadbcollname)
    # Load 1% of the validation split of the BIG Patent dataset,
    # category "a" (Human Necessities)
    ds = load_dataset("big_patent", "a", split="validation[:1%]", trust_remote_code=True)
    for record in ds.take(10):
        # record["description"] (the full patent text) is available but unused here
        abstract = record["abstract"]
        # Summarize the abstract (capped at 150 tokens by text_summarizer)
        abstract = text_summarizer(abstract)
        textembeddings = embedding_model.encode(abstract).tolist()
        # Use the first 8 characters of a random UUID as the document id
        genguid = str(uuid.uuid4())
        uniqueid = genguid[:8]
        # Store the summarized abstract and its embedding for each of the
        # first 10 records in the Chroma collection
        collection.add(
            documents=[abstract],
            embeddings=[textembeddings],
            ids=[uniqueid],
        )
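# Sanity-check sketch (assumes the same config keys as above): reopen the
# persisted collection and confirm the ingested document count.
#   client = chromadb.PersistentClient(path=ut.get_tokens()["dataset_chroma_db"])
#   coll = client.get_collection(ut.get_tokens()["dataset_chroma_db_collection_name"])
#   print(coll.count())  # expect 10 documents after one run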
st.title("Patent Ingestion - BIG Patent")

# Main form: the submit button triggers the ingestion
with st.form("chat_form"):
    submit_button = st.form_submit_button("Upload BIG Patent data...")
    if submit_button:
        load_patentBIGdata()
        response = "BIG Patent dataset was successfully loaded"
        st.write(response)
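# This is a Streamlit app; launch it with (substitute the actual filename):
#   streamlit run <this_script>.py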