import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import faiss
from wordcloud import WordCloud
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords as nltk_stopwords
import nltk
import re
import tempfile
import os
import chardet
import csv

# One-time setup: fetch the NLTK French stopword list and load the sentence-embedding model.
nltk.download("stopwords", quiet=True)
fr_stopwords = set(nltk_stopwords.words("french"))
model = SentenceTransformer("all-MiniLM-L12-v2")

def clean_text(text, lang):
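    """Lowercase, tokenize on word boundaries, and drop stopwords and tokens of two characters or fewer."""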
    words = re.findall(r"\b\w+\b", text.lower())
    stops = fr_stopwords if lang == "fr" else ENGLISH_STOP_WORDS if lang == "en" else set()
    return " ".join([w for w in words if w not in stops and len(w) > 2])

def detect_encoding(file_path):
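    """Sniff the file's character encoding with chardet, sampling the first 10,000 bytes."""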
    with open(file_path, "rb") as f:
        rawdata = f.read(10000)
    return chardet.detect(rawdata)["encoding"]

def detect_separator(file_path, encoding):
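    """Guess the CSV delimiter with csv.Sniffer; fall back to a comma if sniffing fails."""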
    with open(file_path, "r", encoding=encoding) as f:
        sample = f.read(2048)
        sniffer = csv.Sniffer()
        try:
            dialect = sniffer.sniff(sample)
            return dialect.delimiter
        except Exception:
            return ","  # fallback

def semantic_search(file, text_column, query, threshold, top_k):
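    """Embed all rows, rank them against the query, and return a summary, result table, plots, and a CSV export."""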
    try:
        encoding = detect_encoding(file.name)
        sep = detect_separator(file.name, encoding)
        df = pd.read_csv(file.name, sep=sep, encoding=encoding, quotechar='"', on_bad_lines="skip", engine="python")
    except Exception as e:
        return f"Erreur : {e}", None, None, None, None

    if text_column not in df.columns:
        return f"Colonne '{text_column}' introuvable. Colonnes disponibles : {list(df.columns)}", None, None, None, None

    texts = df[text_column].fillna("").astype(str).tolist()
    if not any(texts):
        return "The selected column contains no text.", None, None, None, None
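    # Embed every row; with normalize_embeddings=True, inner product equals cosine similarity,
    # so the exact IndexFlatIP performs cosine-similarity search.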
    embeddings = model.encode(texts, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype("float32"))

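    # Score the query against every row so the full similarity distribution can be plotted.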
    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, indices = index.search(query_vec, len(texts))
    sims = scores[0]
    matches = sims >= threshold
    percent = 100 * np.sum(matches) / len(sims)

    top_k = int(top_k)  # slider values may arrive as floats; slice bounds must be ints
    top_indices = indices[0][:top_k]
    top_scores = sims[:top_k]  # faiss returns scores already sorted in descending order
    top_texts = [texts[i] for i in top_indices]

    df_result = pd.DataFrame({
        "Similarity": top_scores,
        "Text": top_texts
    })

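    # Histogram of all similarity scores, with the chosen threshold marked.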
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(sims, bins=30, ax=ax, kde=True)
    ax.axvline(threshold, color="red", linestyle="--", label=f"Threshold = {threshold}")
    ax.set_title("Similarity score distribution")
    ax.set_xlabel("Similarity score")
    ax.set_ylabel("Number of texts")
    ax.legend()
    hist_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    fig.savefig(hist_path, bbox_inches="tight")
    plt.close(fig)

    # Detect the language of the top results so the matching stopword list is used.
    try:
        lang = detect(" ".join(top_texts[:3]))
    except Exception:
        lang = "en"
    cleaned = [clean_text(t, lang) for t in top_texts]
    wc_text = " ".join(cleaned)
    wc = WordCloud(width=800, height=400, background_color="white").generate(wc_text)
    wc_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    wc.to_file(wc_path)

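    # Write the top-k results to a temporary CSV for the download widget.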
    csv_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
    df_result.to_csv(csv_path, index=False, encoding="utf-8")

    return f"{percent:.2f}% des textes sont jugés pertinents (sim ≥ {threshold})", df_result, hist_path, wc_path, csv_path

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Semantic Search with Visualization and CSV Export")

    with gr.Row():
        file_input = gr.File(label="📁 CSV file", file_types=[".csv"])
        load_columns_btn = gr.Button("🪄 Load columns")

    column_selector = gr.Dropdown(label="🧾 Select the text column", choices=[], interactive=True)

    query_input = gr.Textbox(label="🔎 Query (e.g. racist remarks)", value="racist remarks")
    threshold_input = gr.Slider(0.0, 1.0, value=0.35, step=0.01, label="Similarity threshold")
    topk_input = gr.Slider(1, 100, value=20, step=1, label="Number of results shown")

    search_btn = gr.Button("⚙️ Run search")

    result_text = gr.Textbox(label="📊 Summary", lines=1)
    result_table = gr.Dataframe(label="📋 Closest texts", wrap=True)
    result_plot = gr.Image(label="📈 Similarity histogram")
    result_wc = gr.Image(label="☁️ Word cloud")
    result_csv = gr.File(label="⬇️ Download results as CSV")

    def load_columns(file):
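        """Read the CSV with the detected encoding and separator, then populate the column dropdown."""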
        try:
            encoding = detect_encoding(file.name)
            sep = detect_separator(file.name, encoding)
            df = pd.read_csv(file.name, encoding=encoding, sep=sep, engine="python", on_bad_lines="skip")
            return gr.update(choices=sorted(df.columns.tolist()))
        except Exception as e:
            return gr.update(choices=[f"Erreur : {e}"])

    load_columns_btn.click(fn=load_columns, inputs=[file_input], outputs=[column_selector])

    search_btn.click(
        fn=semantic_search,
        inputs=[file_input, column_selector, query_input, threshold_input, topk_input],
        outputs=[result_text, result_table, result_plot, result_wc, result_csv]
    )

if __name__ == "__main__":
    demo.launch()