import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import faiss
from wordcloud import WordCloud
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords as nltk_stopwords
import nltk
import re
import tempfile
import os
import chardet
import csv

# One-time setup: fetch the NLTK French stopword list and load the sentence-embedding model.
nltk.download("stopwords", quiet=True)
fr_stopwords = set(nltk_stopwords.words("french"))
model = SentenceTransformer("all-MiniLM-L12-v2")

def clean_text(text, lang):
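    """Lowercase, tokenize on word boundaries, and drop stopwords and tokens of two characters or fewer."""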
    words = re.findall(r"\b\w+\b", text.lower())
    stops = fr_stopwords if lang == "fr" else ENGLISH_STOP_WORDS if lang == "en" else set()
    return " ".join([w for w in words if w not in stops and len(w) > 2])

def detect_encoding(file_path):
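    """Sniff the file's character encoding with chardet, sampling the first 10,000 bytes."""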
    with open(file_path, "rb") as f:
        rawdata = f.read(10000)
    return chardet.detect(rawdata)["encoding"]

def detect_separator(file_path, encoding):
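    """Guess the CSV delimiter with csv.Sniffer; fall back to a comma if sniffing fails."""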
    with open(file_path, "r", encoding=encoding) as f:
        sample = f.read(2048)
        sniffer = csv.Sniffer()
        try:
            dialect = sniffer.sniff(sample)
            return dialect.delimiter
        except Exception:
            return ","  # fallback

def semantic_search(file, text_column, query, threshold, top_k):
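    """Embed all rows, rank them against the query, and return a summary, result table, plots, and a CSV export."""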
    try:
        encoding = detect_encoding(file.name)
        sep = detect_separator(file.name, encoding)
        df = pd.read_csv(file.name, sep=sep, encoding=encoding, quotechar='"', on_bad_lines="skip", engine="python")
    except Exception as e:
        return f"Erreur : {e}", None, None, None, None

    if text_column not in df.columns:
        return f"Colonne '{text_column}' introuvable. Colonnes disponibles : {list(df.columns)}", None, None, None, None

    texts = df[text_column].fillna("").astype(str).tolist()
    if not any(texts):
        return "The selected column contains no text.", None, None, None, None
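    # Embed every row; with normalize_embeddings=True, inner product equals cosine similarity,
    # so the exact IndexFlatIP performs cosine-similarity search.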
    embeddings = model.encode(texts, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype("float32"))

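    # Score the query against every row so the full similarity distribution can be plotted.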
    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, indices = index.search(query_vec, len(texts))
    sims = scores[0]
    matches = sims >= threshold
    percent = 100 * np.sum(matches) / len(sims)

    top_k = int(top_k)  # slider values may arrive as floats; slice bounds must be ints
    top_indices = indices[0][:top_k]
    top_scores = sims[:top_k]  # faiss returns scores already sorted in descending order
    top_texts = [texts[i] for i in top_indices]

    df_result = pd.DataFrame({
        "Similarity": top_scores,
        "Text": top_texts
    })

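    # Histogram of all similarity scores, with the chosen threshold marked.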
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(sims, bins=30, ax=ax, kde=True)
    ax.axvline(threshold, color="red", linestyle="--", label=f"Threshold = {threshold}")
    ax.set_title("Similarity score distribution")
    ax.set_xlabel("Similarity score")
    ax.set_ylabel("Number of texts")
    ax.legend()
    hist_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    fig.savefig(hist_path, bbox_inches="tight")
    plt.close(fig)

    # Detect the language of the top results so the matching stopword list is used.
    try:
        lang = detect(" ".join(top_texts[:3]))
    except Exception:
        lang = "en"
    cleaned = [clean_text(t, lang) for t in top_texts]
    wc_text = " ".join(cleaned)
    wc = WordCloud(width=800, height=400, background_color="white").generate(wc_text)
    wc_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    wc.to_file(wc_path)

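    # Write the top-k results to a temporary CSV for the download widget.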
    csv_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
    df_result.to_csv(csv_path, index=False, encoding="utf-8")

    return f"{percent:.2f}% des textes sont jugés pertinents (sim ≥ {threshold})", df_result, hist_path, wc_path, csv_path

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Semantic Search with Visualization and CSV Export")

    with gr.Row():
        file_input = gr.File(label="📁 CSV file", file_types=[".csv"])
        load_columns_btn = gr.Button("🪄 Load columns")

    column_selector = gr.Dropdown(label="🧾 Select the text column", choices=[], interactive=True)

    query_input = gr.Textbox(label="🔎 Query (e.g. racist remarks)", value="racist remarks")
    threshold_input = gr.Slider(0.0, 1.0, value=0.35, step=0.01, label="Similarity threshold")
    topk_input = gr.Slider(1, 100, value=20, step=1, label="Number of results shown")

    search_btn = gr.Button("⚙️ Run search")

    result_text = gr.Textbox(label="📊 Summary", lines=1)
    result_table = gr.Dataframe(label="📋 Closest texts", wrap=True)
    result_plot = gr.Image(label="📈 Similarity histogram")
    result_wc = gr.Image(label="☁️ Word cloud")
    result_csv = gr.File(label="⬇️ Download results as CSV")

    def load_columns(file):
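        """Read the CSV with the detected encoding and separator, then populate the column dropdown."""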
        try:
            encoding = detect_encoding(file.name)
            sep = detect_separator(file.name, encoding)
            df = pd.read_csv(file.name, encoding=encoding, sep=sep, engine="python", on_bad_lines="skip")
            return gr.update(choices=sorted(df.columns.tolist()))
        except Exception as e:
            return gr.update(choices=[f"Erreur : {e}"])

    load_columns_btn.click(fn=load_columns, inputs=[file_input], outputs=[column_selector])

    search_btn.click(
        fn=semantic_search,
        inputs=[file_input, column_selector, query_input, threshold_input, topk_input],
        outputs=[result_text, result_table, result_plot, result_wc, result_csv]
    )

if __name__ == "__main__":
    demo.launch()