galihboy's picture
Upload 2 files
6b644a1 verified
raw
history blame
12.8 kB
import gradio as gr
from sentence_transformers import SentenceTransformer
import json
import numpy as np
import os
import httpx
# ==================== CONFIGURATION ====================
# Model identifier on the Hugging Face Hub (original note: it is downloaded
# automatically from the Hub on first use).
HF_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Local snapshot directory for development (optional; ignored when it does not exist).
LOCAL_MODEL_PATH = r"E:\huggingface_models\hub\models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2\snapshots"
# Supabase configuration, read from environment variables for security.
# On a HF Space: Settings > Repository secrets.
# Locally: set the environment variables; both fall back to "" when unset.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")
def get_model_path(local_path=None, hub_name=None):
    """Return the model location appropriate for the current environment.

    Args:
        local_path: Directory that holds local model snapshots. Defaults to
            the module-level ``LOCAL_MODEL_PATH``.
        hub_name: Value returned when no local snapshot is usable. Defaults
            to the module-level ``HF_MODEL_NAME``.

    Returns:
        The path of the most recently modified snapshot sub-directory of
        ``local_path`` when one exists, otherwise ``hub_name``.
    """
    if local_path is None:
        local_path = LOCAL_MODEL_PATH
    if hub_name is None:
        hub_name = HF_MODEL_NAME

    try:
        entries = os.listdir(local_path)
    except OSError:
        # Directory is missing, is not a directory, or cannot be read:
        # fall back to the Hub identifier (e.g. when deployed on a Space).
        return hub_name

    # Only directories can be snapshots; ignore stray files.
    snapshot_dirs = [
        path
        for path in (os.path.join(local_path, entry) for entry in entries)
        if os.path.isdir(path)
    ]
    if not snapshot_dirs:
        return hub_name

    # os.listdir() returns entries in arbitrary order, so pick the newest
    # snapshot explicitly (path is the tie-breaker to stay deterministic).
    return max(snapshot_dirs, key=lambda path: (os.path.getmtime(path), path))
# Load the model once at startup; get_model_path() decides between a local
# snapshot directory and the Hugging Face Hub identifier.
print("Loading model...")
model_path = get_model_path()
print(f"Using model from: {model_path}")
model = SentenceTransformer(model_path)
print("Model loaded successfully!")
def get_embedding(text: str):
    """Encode a single text into an embedding vector.

    Returns ``{"embedding": [...]}`` on success, or ``{"error": <message>}``
    when the text is empty / whitespace-only or when encoding raises.
    """
    is_blank = not text or not text.strip()
    if is_blank:
        return {"error": "Text tidak boleh kosong"}

    try:
        vector = model.encode(text.strip())
        result = {"embedding": vector.tolist()}
    except Exception as exc:
        # API boundary: report the failure to the caller instead of raising.
        result = {"error": str(exc)}
    return result
def get_embeddings_batch(texts_json: str):
    """Encode several texts, supplied as a JSON array of strings.

    Args:
        texts_json: JSON text such as ``["teks 1", "teks 2"]``.

    Returns:
        ``{"embeddings": [[...], ...]}`` with one vector per non-empty text
        (empty, whitespace-only and null entries are skipped, so the output
        can be shorter than the input), or ``{"error": <message>}`` when the
        input is not valid JSON, is not an array, contains a non-string
        element, holds no usable text, or encoding raises.
    """
    try:
        texts = json.loads(texts_json)
        if not isinstance(texts, list):
            return {"error": "Input harus JSON array"}
        if not texts:
            return {"error": "Array tidak boleh kosong"}

        cleaned = []
        for item in texts:
            if not item:
                # Falsy entries (null, "", 0) are skipped.
                continue
            if not isinstance(item, str):
                # Report bad element types explicitly rather than leaking an
                # AttributeError message from calling .strip() on them.
                return {"error": "Setiap elemen array harus berupa string"}
            stripped = item.strip()
            if stripped:
                cleaned.append(stripped)
        if not cleaned:
            return {"error": "Semua text kosong"}

        embeddings = model.encode(cleaned)
        return {"embeddings": embeddings.tolist()}
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string input such as None.
        return {"error": "Invalid JSON format. Gunakan format: [\"teks 1\", \"teks 2\"]"}
    except Exception as e:
        # API boundary: report the failure to the caller instead of raising.
        return {"error": str(e)}
def calculate_similarity(text1: str, text2: str):
    """Compute the cosine similarity between the embeddings of two texts.

    Returns ``{"similarity": <float>, "percentage": "<xx.xx>%"}`` on success,
    or ``{"error": <message>}`` when either text is empty / whitespace-only
    or when encoding raises.
    """
    if not (text1 and text1.strip()):
        return {"error": "Text 1 tidak boleh kosong"}
    if not (text2 and text2.strip()):
        return {"error": "Text 2 tidak boleh kosong"}

    try:
        vectors = model.encode([text1.strip(), text2.strip()])
        first, second = vectors[0], vectors[1]
        # Cosine similarity: dot product divided by the product of the norms.
        norm_product = np.linalg.norm(first) * np.linalg.norm(second)
        score = np.dot(first, second) / norm_product
        return {
            "similarity": float(score),
            "percentage": f"{score * 100:.2f}%",
        }
    except Exception as exc:
        # API boundary: report the failure to the caller instead of raising.
        return {"error": str(exc)}
# ==================== SUPABASE PROXY FUNCTIONS ====================
def get_supabase_headers():
    """Build the HTTP headers sent with each Supabase REST request."""
    api_key = SUPABASE_KEY
    headers = {"apikey": api_key}
    headers["Authorization"] = f"Bearer {api_key}"
    headers["Content-Type"] = "application/json"
    headers["Prefer"] = "return=representation"
    return headers
def db_get_all_embeddings():
    """Fetch the cached embedding rows from the ``proposal_embeddings`` table.

    Returns:
        ``{"data": <rows>, "count": <number of rows>}`` on HTTP 200,
        ``{"error": ..., "detail": <response body>}`` on any other status,
        or ``{"error": <message>}`` when Supabase is not configured or the
        request raises.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=nim,content_hash,embedding_combined,embedding_judul,embedding_deskripsi,embedding_problem,embedding_metode,nama,judul"
        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, headers=get_supabase_headers())

        if response.status_code == 200:
            # Decode the (potentially large) body once and reuse the result.
            rows = response.json()
            return {"data": rows, "count": len(rows)}
        return {"error": f"Supabase error: {response.status_code}", "detail": response.text}
    except Exception as e:
        # API boundary: report the failure to the caller instead of raising.
        return {"error": str(e)}
def db_get_embedding(nim: str, content_hash: str):
    """Fetch the cached embedding row matching a NIM and content hash.

    Args:
        nim: Value matched against the ``nim`` column.
        content_hash: Value matched against the ``content_hash`` column.

    Returns:
        ``{"data": <first matching row or None>, "found": <bool>}`` on
        HTTP 200, ``{"error": ..., "detail": <response body>}`` on any other
        status, or ``{"error": <message>}`` when Supabase is not configured
        or the request raises.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        # Both values come from user input. Passing them through `params`
        # lets httpx URL-encode them, so characters such as "&", "#" or
        # spaces cannot add or alter query parameters.
        params = {
            "nim": f"eq.{nim}",
            "content_hash": f"eq.{content_hash}",
            "select": "*",
        }
        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, params=params, headers=get_supabase_headers())

        if response.status_code == 200:
            data = response.json()
            return {"data": data[0] if data else None, "found": len(data) > 0}
        return {"error": f"Supabase error: {response.status_code}", "detail": response.text}
    except Exception as e:
        # API boundary: report the failure to the caller instead of raising.
        return {"error": str(e)}
def db_save_embedding(data_json: str):
    """Save an embedding row to Supabase (upsert).

    Args:
        data_json: JSON object text. ``nim``, ``content_hash`` and
            ``embedding_combined`` are required; ``embedding_judul``,
            ``embedding_deskripsi``, ``embedding_problem``,
            ``embedding_metode``, ``nama`` and ``judul`` are optional and
            sent as null when absent.

    Returns:
        ``{"success": True, "data": <response JSON>}`` on HTTP 200/201,
        ``{"error": ..., "detail": <response body>}`` on any other status,
        or ``{"error": <message>}`` for configuration, validation, JSON or
        request failures.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        data = json.loads(data_json)
        # Valid JSON that is not an object (list, number, ...) has no .get();
        # reject it explicitly instead of leaking an AttributeError message.
        if not isinstance(data, dict):
            return {"error": "JSON payload must be an object"}

        # Validate required fields
        if not data.get("nim") or not data.get("content_hash"):
            return {"error": "nim and content_hash are required"}
        if not data.get("embedding_combined"):
            return {"error": "embedding_combined is required"}

        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        headers = get_supabase_headers()
        # Ask for merge-on-conflict (upsert) and the stored row in the reply.
        headers["Prefer"] = "resolution=merge-duplicates,return=representation"

        # Forward only the known columns; unknown keys in the input are dropped.
        payload = {
            "nim": data["nim"],
            "content_hash": data["content_hash"],
            "embedding_combined": data["embedding_combined"],
            "embedding_judul": data.get("embedding_judul"),
            "embedding_deskripsi": data.get("embedding_deskripsi"),
            "embedding_problem": data.get("embedding_problem"),
            "embedding_metode": data.get("embedding_metode"),
            "nama": data.get("nama"),
            "judul": data.get("judul"),
        }

        with httpx.Client(timeout=30.0) as client:
            response = client.post(url, headers=headers, json=payload)

        if response.status_code in (200, 201):
            return {"success": True, "data": response.json()}
        return {"error": f"Supabase error: {response.status_code}", "detail": response.text}
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string input such as None.
        return {"error": "Invalid JSON format"}
    except Exception as e:
        # API boundary: report the failure to the caller instead of raising.
        return {"error": str(e)}
def db_check_connection():
    """Probe Supabase with a one-row query and report whether it answered 200.

    Returns a dict with ``connected``, plus ``status_code`` and a
    ``supabase_url`` preview (cut to 30 characters) when the request
    completed, or ``error`` when Supabase is not configured or the request
    raised.
    """
    if not (SUPABASE_URL and SUPABASE_KEY):
        return {"connected": False, "error": "Supabase URL or KEY not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=id&limit=1"
        with httpx.Client(timeout=10.0) as client:
            response = client.get(url, headers=get_supabase_headers())

        # Show only a prefix of the configured URL in the response.
        if len(SUPABASE_URL) > 30:
            url_preview = SUPABASE_URL[:30] + "..."
        else:
            url_preview = SUPABASE_URL

        status = response.status_code
        return {
            "connected": status == 200,
            "status_code": status,
            "supabase_url": url_preview,
        }
    except Exception as exc:
        return {"connected": False, "error": str(exc)}
# Gradio interface: one tab per embedding operation plus a Supabase tab.
# Every button passes its inputs to one of the functions defined above and
# shows the returned dict in a JSON component.
with gr.Blocks(title="Semantic Embedding API") as demo:
    # Page header
    gr.Markdown("# πŸ”€ Semantic Embedding API")
    gr.Markdown("API untuk menghasilkan text embedding menggunakan `paraphrase-multilingual-MiniLM-L12-v2`")
    gr.Markdown("**Model**: Multilingual, mendukung 50+ bahasa termasuk Bahasa Indonesia")

    # Tab: embed a single text via get_embedding
    with gr.Tab("πŸ”’ Single Embedding"):
        gr.Markdown("Generate embedding vector untuk satu teks")
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Masukkan teks untuk di-embed...",
            lines=2
        )
        single_output = gr.JSON(label="Embedding Result")
        single_btn = gr.Button("Generate Embedding", variant="primary")
        single_btn.click(fn=get_embedding, inputs=text_input, outputs=single_output)

    # Tab: embed a JSON array of texts via get_embeddings_batch
    with gr.Tab("πŸ“¦ Batch Embedding"):
        gr.Markdown("Generate embeddings untuk multiple teks sekaligus")
        batch_input = gr.Textbox(
            label="JSON Array of Texts",
            placeholder='["teks pertama", "teks kedua", "teks ketiga"]',
            lines=4
        )
        batch_output = gr.JSON(label="Embeddings Result")
        batch_btn = gr.Button("Generate Embeddings", variant="primary")
        batch_btn.click(fn=get_embeddings_batch, inputs=batch_input, outputs=batch_output)

    # Tab: compare two texts via calculate_similarity
    with gr.Tab("πŸ“Š Similarity Check"):
        gr.Markdown("Hitung kemiripan semantik antara dua teks")
        with gr.Row():
            sim_text1 = gr.Textbox(label="Text 1", placeholder="Teks pertama...", lines=2)
            sim_text2 = gr.Textbox(label="Text 2", placeholder="Teks kedua...", lines=2)
        sim_output = gr.JSON(label="Similarity Result")
        sim_btn = gr.Button("Calculate Similarity", variant="primary")
        sim_btn.click(fn=calculate_similarity, inputs=[sim_text1, sim_text2], outputs=sim_output)

    # Tab: Supabase proxy operations (the db_* functions)
    with gr.Tab("πŸ’Ύ Database (Supabase)"):
        gr.Markdown("### Supabase Cache Operations")
        gr.Markdown("Proxy untuk akses Supabase (API key aman di server)")

        # Connection test via db_check_connection
        # NOTE(review): the original indentation was not available; confirm
        # whether the status output belongs inside this row next to the button.
        with gr.Row():
            db_check_btn = gr.Button("πŸ”Œ Check Connection", variant="secondary")
        db_check_output = gr.JSON(label="Connection Status")
        db_check_btn.click(fn=db_check_connection, outputs=db_check_output)

        gr.Markdown("---")
        # List all cached rows via db_get_all_embeddings
        gr.Markdown("#### Get All Cached Embeddings")
        db_all_btn = gr.Button("πŸ“₯ Get All Embeddings", variant="primary")
        db_all_output = gr.JSON(label="All Embeddings")
        db_all_btn.click(fn=db_get_all_embeddings, outputs=db_all_output)

        gr.Markdown("---")
        # Look up one row by NIM + content hash via db_get_embedding
        gr.Markdown("#### Get Single Embedding by NIM")
        with gr.Row():
            db_nim_input = gr.Textbox(label="NIM", placeholder="10121xxx")
            db_hash_input = gr.Textbox(label="Content Hash", placeholder="abc123...")
        db_get_btn = gr.Button("πŸ” Get Embedding", variant="primary")
        db_get_output = gr.JSON(label="Embedding Result")
        db_get_btn.click(fn=db_get_embedding, inputs=[db_nim_input, db_hash_input], outputs=db_get_output)

        gr.Markdown("---")
        # Save one row (JSON object text) via db_save_embedding
        gr.Markdown("#### Save Embedding")
        db_save_input = gr.Textbox(
            label="Embedding Data (JSON)",
            placeholder='{"nim": "123", "content_hash": "abc", "embedding_combined": [...], ...}',
            lines=4
        )
        db_save_btn = gr.Button("πŸ’Ύ Save Embedding", variant="primary")
        db_save_output = gr.JSON(label="Save Result")
        db_save_btn.click(fn=db_save_embedding, inputs=db_save_input, outputs=db_save_output)

    # Collapsed developer notes: endpoint names and a sample JavaScript call
    with gr.Accordion("πŸ“‘ API Usage (untuk Developer)", open=False):
        gr.Markdown("""
### Endpoints
#### Embedding
- `get_embedding` - Single text embedding
- `get_embeddings_batch` - Batch text embeddings
- `calculate_similarity` - Compare two texts
#### Database (Supabase Proxy)
- `db_check_connection` - Test Supabase connection
- `db_get_all_embeddings` - Get all cached embeddings
- `db_get_embedding` - Get embedding by NIM + hash
- `db_save_embedding` - Save embedding to cache
### Example API Call
```javascript
// Get all cached embeddings
const response = await fetch("YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ data: [] })
});
const result = await response.json();
const eventId = result.event_id;
// Get result
const dataResponse = await fetch(`YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings/${eventId}`);
```
""")

    # Footer
    gr.Markdown("---")
    gr.Markdown("*Dibuat untuk Monitoring Proposal Skripsi KK E - UNIKOM*")
# Start the Gradio app with default launch settings (original note: launched
# with the API enabled — presumably Gradio's default behaviour; confirm).
demo.launch()