import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import faiss
from wordcloud import WordCloud
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords as nltk_stopwords
import nltk
import re
import tempfile
import chardet
import csv

nltk.download("stopwords")
fr_stopwords = set(nltk_stopwords.words("french"))

model = SentenceTransformer("all-MiniLM-L12-v2")


def clean_text(text, lang):
    """Lowercase, tokenize, and drop stopwords (French or English) and short tokens."""
    words = re.findall(r"\b\w+\b", text.lower())
    stops = fr_stopwords if lang == "fr" else ENGLISH_STOP_WORDS if lang == "en" else set()
    return " ".join(w for w in words if w not in stops and len(w) > 2)


def detect_encoding(file_path):
    """Guess the file encoding from the first 10 kB using chardet."""
    with open(file_path, "rb") as f:
        rawdata = f.read(10000)
    return chardet.detect(rawdata)["encoding"]


def detect_separator(file_path, encoding):
    """Sniff the CSV delimiter from a sample; fall back to a comma."""
    with open(file_path, "r", encoding=encoding) as f:
        sample = f.read(2048)
    sniffer = csv.Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        return dialect.delimiter
    except Exception:
        return ","  # fallback


def semantic_search(file, text_column, query, threshold, top_k):
    """Embed every row of the chosen column, rank rows by cosine similarity to the
    query, and return a summary, a result table, two plots, and a CSV export."""
    try:
        encoding = detect_encoding(file.name)
        sep = detect_separator(file.name, encoding)
        df = pd.read_csv(file.name, sep=sep, encoding=encoding, quotechar='"',
                         on_bad_lines="skip", engine="python")
    except Exception as e:
        return f"Error: {e}", None, None, None, None

    if text_column not in df.columns:
        return (f"Column '{text_column}' not found. Available columns: {list(df.columns)}",
                None, None, None, None)

    texts = df[text_column].fillna("").astype(str).tolist()
    embeddings = model.encode(texts, normalize_embeddings=True)

    # With L2-normalized embeddings, inner product equals cosine similarity.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype("float32"))

    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, indices = index.search(query_vec, len(texts))
    sims = scores[0]  # FAISS returns results sorted in descending score order

    matches = sims >= threshold
    percent = 100 * np.sum(matches) / len(sims)

    top_k = int(top_k)  # the Gradio slider may deliver a float
    top_indices = indices[0][:top_k]
    top_scores = sims[:top_k]
    top_texts = [texts[i] for i in top_indices]

    df_result = pd.DataFrame({
        "Similarity": top_scores,
        "Text": top_texts,
    })

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(sims, bins=30, ax=ax, kde=True)
    ax.axvline(threshold, color="red", linestyle="--", label=f"Threshold = {threshold}")
    ax.set_title("Similarity distribution")
    ax.set_xlabel("Similarity score")
    ax.set_ylabel("Number of texts")
    ax.legend()
    hist_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    fig.savefig(hist_path, bbox_inches="tight")
    plt.close(fig)

    # Detect the language of the top hits to pick the right stopword list.
    try:
        lang = detect(" ".join(top_texts[:3]))
    except Exception:
        lang = "en"
    cleaned = [clean_text(t, lang) for t in top_texts]
    wc_text = " ".join(cleaned)
    wc = WordCloud(width=800, height=400, background_color="white").generate(wc_text)
    wc_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    wc.to_file(wc_path)

    csv_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
    df_result.to_csv(csv_path, index=False, encoding="utf-8")

    return (f"{percent:.2f}% of texts are judged relevant (sim ≥ {threshold})",
            df_result, hist_path, wc_path, csv_path)
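
# A minimal sketch of the retrieval step in isolation, assuming the same model
# loaded above. `_cosine_search_sketch` and `_demo_corpus` are illustrative
# names, not part of the app; the function is never called by the UI. It shows
# the property semantic_search relies on: IndexFlatIP over L2-normalized
# vectors ranks results by cosine similarity.
def _cosine_search_sketch():
    _demo_corpus = ["the cat sat on the mat", "stock markets fell sharply"]
    vecs = model.encode(_demo_corpus, normalize_embeddings=True).astype("float32")
    idx = faiss.IndexFlatIP(vecs.shape[1])  # inner-product index
    idx.add(vecs)
    q = model.encode(["a kitten on a rug"], normalize_embeddings=True).astype("float32")
    scores, ids = idx.search(q, 2)
    # scores[0] holds cosine similarities in [-1, 1], sorted descending;
    # ids[0] holds the matching corpus row indices.
    return scores[0], ids[0]
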
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Semantic Search with Visualization and CSV Export")
    with gr.Row():
        file_input = gr.File(label="📁 CSV file", file_types=[".csv"])
        load_columns_btn = gr.Button("🪄 Load columns")
    column_selector = gr.Dropdown(label="🧾 Select the text column", choices=[], interactive=True)
    query_input = gr.Textbox(label="🔎 Query (e.g. racist remarks)", value="racist remarks")
    threshold_input = gr.Slider(0.0, 1.0, value=0.35, label="Similarity threshold")
    topk_input = gr.Slider(1, 100, value=20, step=1, label="Number of results shown")
    search_btn = gr.Button("⚙️ Run search")

    result_text = gr.Textbox(label="📊 Summary", lines=1)
    result_table = gr.Dataframe(label="📋 Closest texts", wrap=True)
    result_plot = gr.Image(label="📈 Similarity histogram")
    result_wc = gr.Image(label="☁️ Word cloud")
    result_csv = gr.File(label="⬇️ Download results as CSV")

    def load_columns(file):
        """Read the uploaded CSV and populate the column dropdown."""
        try:
            encoding = detect_encoding(file.name)
            sep = detect_separator(file.name, encoding)
            df = pd.read_csv(file.name, encoding=encoding, sep=sep,
                             engine="python", on_bad_lines="skip")
            return gr.update(choices=sorted(df.columns.tolist()))
        except Exception as e:
            return gr.update(choices=[f"Error: {e}"])

    load_columns_btn.click(fn=load_columns, inputs=[file_input], outputs=[column_selector])
    search_btn.click(
        fn=semantic_search,
        inputs=[file_input, column_selector, query_input, threshold_input, topk_input],
        outputs=[result_text, result_table, result_plot, result_wc, result_csv],
    )

if __name__ == "__main__":
    demo.launch()
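
# Usage note: demo.launch() serves the app locally by default. Passing
# share=True would request a temporary public Gradio link, and
# server_name="0.0.0.0" exposes it on the local network; both are standard
# gr.Blocks.launch() options if you need to share the tool.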