import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import faiss
from wordcloud import WordCloud
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords as nltk_stopwords
import nltk
import re
import tempfile
import chardet
import csv

nltk.download("stopwords", quiet=True)
fr_stopwords = set(nltk_stopwords.words("french"))

# all-MiniLM-L12-v2 produces 384-dimensional sentence embeddings
model = SentenceTransformer("all-MiniLM-L12-v2")
def clean_text(text, lang):
    """Lowercase, tokenize, and drop stopwords and very short tokens."""
    words = re.findall(r"\b\w+\b", text.lower())
    stops = fr_stopwords if lang == "fr" else ENGLISH_STOP_WORDS if lang == "en" else set()
    return " ".join(w for w in words if w not in stops and len(w) > 2)
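# Illustrative only: with the French stopword list loaded, short function
# words are dropped and tokens of three or more characters are kept, e.g.
#   clean_text("Le chat est sur le toit", "fr")  ->  "chat toit"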
def detect_encoding(file_path):
    """Guess the file encoding from the first 10 kB of raw bytes."""
    with open(file_path, "rb") as f:
        rawdata = f.read(10000)
    # chardet can return None (e.g. for empty input); fall back to UTF-8
    return chardet.detect(rawdata)["encoding"] or "utf-8"
def detect_separator(file_path, encoding):
    """Sniff the CSV delimiter from a sample of the file."""
    with open(file_path, "r", encoding=encoding) as f:
        sample = f.read(2048)
    try:
        dialect = csv.Sniffer().sniff(sample)
        return dialect.delimiter
    except Exception:
        return ","  # fallback
def semantic_search(file, text_column, query, threshold, top_k):
    # Load the CSV with the detected encoding/delimiter; skip malformed rows
    try:
        encoding = detect_encoding(file.name)
        sep = detect_separator(file.name, encoding)
        df = pd.read_csv(file.name, sep=sep, encoding=encoding, quotechar='"',
                         on_bad_lines="skip", engine="python")
    except Exception as e:
        return f"Error: {e}", None, None, None, None

    if text_column not in df.columns:
        return (f"Column '{text_column}' not found. Available columns: {list(df.columns)}",
                None, None, None, None)

    texts = df[text_column].fillna("").astype(str).tolist()

    # With L2-normalized embeddings, inner product equals cosine similarity,
    # so an IndexFlatIP search returns cosine scores sorted in descending order.
    embeddings = model.encode(texts, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype("float32"))
    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, indices = index.search(query_vec, len(texts))

    sims = scores[0]
    matches = sims >= threshold
    percent = 100 * np.sum(matches) / len(sims)

    top_k = int(top_k)  # Gradio sliders may deliver floats
    top_indices = indices[0][:top_k]
    top_scores = sims[:top_k]  # already sorted, aligned with top_indices
    top_texts = [texts[i] for i in top_indices]
    df_result = pd.DataFrame({"Similarity": top_scores, "Text": top_texts})

    # Histogram of all similarity scores, with the threshold marked
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(sims, bins=30, ax=ax, kde=True)
    ax.axvline(threshold, color="red", linestyle="--", label=f"Threshold = {threshold}")
    ax.set_title("Similarity distribution")
    ax.set_xlabel("Similarity score")
    ax.set_ylabel("Number of texts")
    ax.legend()
    hist_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    fig.savefig(hist_path, bbox_inches="tight")
    plt.close(fig)

    # Word cloud of the top matches; language guessed from a small sample
    try:
        lang = detect(" ".join(top_texts[:3]))
    except Exception:
        lang = "en"
    cleaned = [clean_text(t, lang) for t in top_texts]
    wc = WordCloud(width=800, height=400, background_color="white").generate(" ".join(cleaned))
    wc_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    wc.to_file(wc_path)

    csv_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
    df_result.to_csv(csv_path, index=False, encoding="utf-8")

    return (f"{percent:.2f}% of texts judged relevant (sim ≥ {threshold})",
            df_result, hist_path, wc_path, csv_path)
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Semantic Search with Visualization and CSV Export")
    with gr.Row():
        file_input = gr.File(label="📁 CSV file", file_types=[".csv"])
        load_columns_btn = gr.Button("🪄 Load columns")
    column_selector = gr.Dropdown(label="🧾 Select the text column", choices=[], interactive=True)
    query_input = gr.Textbox(label="🔎 Query (e.g. racist remarks)", value="racist remarks")
    threshold_input = gr.Slider(0.0, 1.0, value=0.35, label="Similarity threshold")
    topk_input = gr.Slider(1, 100, value=20, step=1, label="Number of results shown")
    search_btn = gr.Button("⚙️ Run search")
    result_text = gr.Textbox(label="📊 Summary", lines=1)
    result_table = gr.Dataframe(label="📋 Closest texts", wrap=True)
    result_plot = gr.Image(label="📈 Similarity histogram")
    result_wc = gr.Image(label="☁️ Word cloud")
    result_csv = gr.File(label="⬇️ Download results as CSV")

    def load_columns(file):
        # Populate the dropdown from the uploaded file's header row
        if file is None:
            return gr.update(choices=[])
        try:
            encoding = detect_encoding(file.name)
            sep = detect_separator(file.name, encoding)
            df = pd.read_csv(file.name, encoding=encoding, sep=sep,
                             engine="python", on_bad_lines="skip")
            return gr.update(choices=sorted(df.columns.tolist()))
        except Exception as e:
            return gr.update(choices=[f"Error: {e}"])

    load_columns_btn.click(fn=load_columns, inputs=[file_input], outputs=[column_selector])
    search_btn.click(
        fn=semantic_search,
        inputs=[file_input, column_selector, query_input, threshold_input, topk_input],
        outputs=[result_text, result_table, result_plot, result_wc, result_csv],
    )
if __name__ == "__main__":
    demo.launch()
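# Note: when testing locally, demo.launch(share=True) (a standard Gradio
# option) additionally exposes a temporary public link.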