# Space status: Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from umap import UMAP | |
| from hdbscan import HDBSCAN | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from bertopic import BERTopic | |
| from bertopic.representation import MaximalMarginalRelevance | |
| from bertopic.vectorizers import ClassTfidfTransformer | |
# Extra, project-specific stop words appended to NLTK's English list.
custom_stopwords = ["made", "sure"]

# Probe for the stop-word corpus; the lookup raises LookupError when the
# corpus is not available locally, in which case it is downloaded once.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Final stop-word list: NLTK's English words followed by the custom ones.
stop_words = [*stopwords.words('english'), *custom_stopwords]
# Custom BERTopic pipeline
def generate_topics(file):
    """Fit a BERTopic model on an uploaded CSV and return its document map.

    Args:
        file: The value delivered by the ``gr.File`` input: either a path
            string or an object whose ``name`` attribute holds the path of
            the uploaded CSV. The CSV must contain a ``text`` column.

    Returns:
        The figure produced by ``BERTopic.visualize_documents`` for the
        uploaded documents.

    Raises:
        gr.Error: If no file was uploaded, the CSV has no ``text`` column,
            or that column contains no usable rows.
    """
    if file is None:
        raise gr.Error("Veuillez téléverser un fichier CSV contenant une colonne 'text'.")

    # Accept both a plain path and a file-like wrapper exposing `.name`.
    csv_path = getattr(file, "name", file)
    docs = pd.read_csv(csv_path)
    if "text" not in docs.columns:
        raise gr.Error("Le fichier CSV doit contenir une colonne 'text'.")

    # Empty cells are read as NaN floats, which cannot be embedded or
    # tokenized; drop them and force everything else to str.
    texts = docs["text"].dropna().astype(str).tolist()
    if not texts:
        raise gr.Error("La colonne 'text' ne contient aucun document.")

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    umap_model = UMAP(
        n_neighbors=20,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=60,
        min_samples=1,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    vectorizer_model = CountVectorizer(
        stop_words=stop_words,
        min_df=1,
        ngram_range=(1, 3),
    )
    ctfidf_model = ClassTfidfTransformer()
    representation_model = MaximalMarginalRelevance(diversity=0.7)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        verbose=True,
    )

    # Reuse the embeddings computed above instead of encoding the corpus
    # a second time inside fit_transform.
    topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)

    # reduce_outliers only *returns* the new assignments; they must be fed
    # back through update_topics to affect the model and the plot. Skip the
    # step when there is nothing to reassign (no outliers) or nowhere to
    # reassign to (every document is an outlier).
    if -1 in topics and len(set(topics)) > 1:
        new_topics = topic_model.reduce_outliers(texts, topics)
        # Pass the custom components again so update_topics does not fall
        # back to its default vectorizer / c-TF-IDF / representation.
        topic_model.update_topics(
            texts,
            topics=new_topics,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
        )

    # Passing the precomputed embeddings avoids a third encoding pass.
    fig = topic_model.visualize_documents(texts, embeddings=embeddings)
    return fig
# Gradio interface: one CSV upload as input, one plot as output.
_csv_upload = gr.File(label="Upload bbc-text.csv")
_topic_plot = gr.Plot(label="Topic Map")

demo = gr.Interface(
    generate_topics,
    _csv_upload,
    _topic_plot,
    title="Topic Modeling avec BERTopic",
    description="Téléversez un fichier CSV avec une colonne 'text' pour générer une visualisation thématique interactive.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()