import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import faiss
from wordcloud import WordCloud
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords as nltk_stopwords
import nltk
import re
import tempfile
import os
import chardet
import csv
# Fetch the French stopword list once; ENGLISH_STOP_WORDS covers English.
nltk.download("stopwords", quiet=True)
fr_stopwords = set(nltk_stopwords.words("french"))

# Sentence encoder used for both the corpus and the query embeddings.
model = SentenceTransformer("all-MiniLM-L12-v2")
def clean_text(text, lang):
    """Lowercase, tokenize, and drop stopwords and words of <= 2 characters."""
    words = re.findall(r"\b\w+\b", text.lower())
    stops = fr_stopwords if lang == "fr" else ENGLISH_STOP_WORDS if lang == "en" else set()
    return " ".join([w for w in words if w not in stops and len(w) > 2])
def detect_encoding(file_path):
    """Guess the file encoding from the first 10 kB; default to UTF-8."""
    with open(file_path, "rb") as f:
        rawdata = f.read(10000)
    # chardet may return {"encoding": None} on ambiguous data, hence the fallback.
    return chardet.detect(rawdata)["encoding"] or "utf-8"
def detect_separator(file_path, encoding):
    """Sniff the CSV delimiter from a small sample; fall back to a comma."""
    with open(file_path, "r", encoding=encoding) as f:
        sample = f.read(2048)
    sniffer = csv.Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        return dialect.delimiter
    except Exception:
        return ","  # csv.Sniffer raises csv.Error when the sample is ambiguous
def semantic_search(file, text_column, query, threshold, top_k):
    try:
        encoding = detect_encoding(file.name)
        sep = detect_separator(file.name, encoding)
        df = pd.read_csv(file.name, sep=sep, encoding=encoding, quotechar='"',
                         on_bad_lines="skip", engine="python")
    except Exception as e:
        return f"Erreur : {e}", None, None, None, None
    if text_column not in df.columns:
        return f"Colonne '{text_column}' introuvable. Colonnes disponibles : {list(df.columns)}", None, None, None, None
    texts = df[text_column].fillna("").astype(str).tolist()
    # normalize_embeddings=True yields unit vectors, so the inner product
    # computed by IndexFlatIP is exactly the cosine similarity.
    embeddings = model.encode(texts, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype("float32"))
    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
    # Rank the whole corpus so the full score distribution can be plotted below.
    scores, indices = index.search(query_vec, len(texts))
    sims = scores[0]
    matches = sims >= threshold
    percent = 100 * np.sum(matches) / len(sims)
    top_k = int(top_k)  # Gradio sliders can return floats; slicing needs an int
    top_indices = indices[0][:top_k]
    top_scores = sims[:top_k]  # FAISS returns scores sorted in descending order
    top_texts = [texts[i] for i in top_indices]
    df_result = pd.DataFrame({
        "Similarité": top_scores,
        "Texte": top_texts
    })
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(sims, bins=30, ax=ax, kde=True)
    ax.axvline(threshold, color="red", linestyle="--", label=f"Seuil = {threshold}")
    ax.set_title("Distribution des similarités")
    ax.set_xlabel("Score de similarité")
    ax.set_ylabel("Nombre de textes")
    ax.legend()
    hist_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    fig.savefig(hist_path, bbox_inches="tight")
    plt.close(fig)
    try:
        # Guess the language from the best matches to pick the stopword list.
        lang = detect(" ".join(top_texts[:3]))
    except Exception:
        lang = "en"
    cleaned = [clean_text(t, lang) for t in top_texts]
    wc_text = " ".join(cleaned) or "vide"  # WordCloud raises on empty input
    wc = WordCloud(width=800, height=400, background_color="white").generate(wc_text)
    wc_path = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
    wc.to_file(wc_path)
    csv_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
    df_result.to_csv(csv_path, index=False, encoding="utf-8")
    return f"{percent:.2f}% des textes sont jugés pertinents (sim ≥ {threshold})", df_result, hist_path, wc_path, csv_path
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Recherche Sémantique avec Visualisation et Export CSV")
    with gr.Row():
        file_input = gr.File(label="📁 Fichier CSV", file_types=[".csv"])
        load_columns_btn = gr.Button("🪄 Charger les colonnes")
    column_selector = gr.Dropdown(label="🧾 Sélectionne la colonne de texte", choices=[], interactive=True)
    query_input = gr.Textbox(label="🔎 Requête (ex : propos racistes)", value="propos racistes")
    threshold_input = gr.Slider(0.0, 1.0, value=0.35, label="Seuil de similarité")
    topk_input = gr.Slider(1, 100, value=20, step=1, label="Nombre de résultats affichés")
    search_btn = gr.Button("⚙️ Lancer la recherche")
    result_text = gr.Textbox(label="📊 Résumé", lines=1)
    result_table = gr.Dataframe(label="📋 Textes les plus proches", wrap=True)
    result_plot = gr.Image(label="📈 Histogramme des similarités")
    result_wc = gr.Image(label="☁️ Nuage de mots")
    result_csv = gr.File(label="⬇️ Télécharger résultats CSV")
    def load_columns(file):
        try:
            encoding = detect_encoding(file.name)
            sep = detect_separator(file.name, encoding)
            df = pd.read_csv(file.name, encoding=encoding, sep=sep, engine="python", on_bad_lines="skip")
            return gr.update(choices=sorted(df.columns.tolist()))
        except Exception as e:
            return gr.update(choices=[f"Erreur : {e}"])
    load_columns_btn.click(fn=load_columns, inputs=[file_input], outputs=[column_selector])
    search_btn.click(
        fn=semantic_search,
        inputs=[file_input, column_selector, query_input, threshold_input, topk_input],
        outputs=[result_text, result_table, result_plot, result_wc, result_csv],
    )
if __name__ == "__main__":
    demo.launch()
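# Run locally with `python app.py` (assuming this file is app.py); on Hugging
# Face Spaces the app is started automatically. Pass share=True to
# demo.launch() for a temporary public link.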