"""Gradio app exposing a multilingual sentence-embedding API and a Supabase proxy.

The app loads a SentenceTransformer model at import time and publishes three
embedding endpoints (single, batch, similarity) plus four database endpoints
that forward requests to a Supabase REST API so the API key stays server-side.
"""

import gradio as gr
from sentence_transformers import SentenceTransformer
import json
import numpy as np
import os
import httpx

# ==================== CONFIGURATION ====================
# Model - downloaded automatically from the HF Hub on first use
HF_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Local path for development (optional, ignored when it does not exist)
LOCAL_MODEL_PATH = r"E:\huggingface_models\hub\models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2\snapshots"

# Supabase configuration (read from environment variables for safety)
# On an HF Space: Settings > Repository secrets
# Locally: set the environment variables; empty defaults disable the DB proxy
SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")


def get_model_path():
    """Return the model location appropriate for the current environment.

    Returns:
        The most recently modified snapshot directory under
        ``LOCAL_MODEL_PATH`` when one exists, otherwise ``HF_MODEL_NAME`` so
        the model is fetched from the Hugging Face Hub.
    """
    if os.path.isdir(LOCAL_MODEL_PATH):
        # os.listdir() order is arbitrary and may include stray files, so
        # keep only directories and pick the newest one explicitly.
        candidates = [
            os.path.join(LOCAL_MODEL_PATH, entry)
            for entry in os.listdir(LOCAL_MODEL_PATH)
        ]
        snapshot_dirs = [path for path in candidates if os.path.isdir(path)]
        if snapshot_dirs:
            return max(snapshot_dirs, key=os.path.getmtime)

    # Fall back to the HF Hub (used when deployed on a Space)
    return HF_MODEL_NAME


# Load the model once at startup
print("Loading model...")
model_path = get_model_path()
print(f"Using model from: {model_path}")
model = SentenceTransformer(model_path)
print("Model loaded successfully!")


def get_embedding(text: str) -> dict:
    """Generate the embedding for a single text.

    Args:
        text: The text to embed; surrounding whitespace is stripped.

    Returns:
        ``{"embedding": [...]}`` on success, or ``{"error": message}`` when
        the text is empty or encoding fails.
    """
    if not text or not text.strip():
        return {"error": "Text tidak boleh kosong"}

    try:
        embedding = model.encode(text.strip())
        return {"embedding": embedding.tolist()}
    except Exception as e:
        # API boundary: report the failure to the caller instead of raising.
        return {"error": str(e)}


def get_embeddings_batch(texts_json: str) -> dict:
    """Generate embeddings for several texts given as a JSON array.

    Args:
        texts_json: JSON-encoded array of strings. Null/empty elements and
            whitespace-only strings are skipped.

    Returns:
        ``{"embeddings": [[...], ...]}`` for the remaining texts, or
        ``{"error": message}`` when the input is invalid or encoding fails.
    """
    try:
        texts = json.loads(texts_json)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a missing (None) input from the API.
        return {"error": "Invalid JSON format. Gunakan format: [\"teks 1\", \"teks 2\"]"}

    if not isinstance(texts, list):
        return {"error": "Input harus JSON array"}
    if not texts:
        return {"error": "Array tidak boleh kosong"}

    cleaned = []
    for item in texts:
        if not item:
            # Null / empty values are silently dropped.
            continue
        if not isinstance(item, str):
            # Reject numbers, objects, nested arrays with a clear message
            # rather than failing on .strip() further down.
            return {"error": "Setiap elemen array harus berupa string"}
        stripped = item.strip()
        if stripped:
            cleaned.append(stripped)

    if not cleaned:
        return {"error": "Semua text kosong"}

    try:
        embeddings = model.encode(cleaned)
        return {"embeddings": embeddings.tolist()}
    except Exception as e:
        return {"error": str(e)}


def calculate_similarity(text1: str, text2: str) -> dict:
    """Compute the cosine similarity between two texts.

    Args:
        text1: First text; must be non-empty after stripping.
        text2: Second text; must be non-empty after stripping.

    Returns:
        ``{"similarity": float, "percentage": "xx.xx%"}`` on success, or
        ``{"error": message}`` on invalid input or failure.
    """
    if not text1 or not text1.strip():
        return {"error": "Text 1 tidak boleh kosong"}
    if not text2 or not text2.strip():
        return {"error": "Text 2 tidak boleh kosong"}

    try:
        embeddings = model.encode([text1.strip(), text2.strip()])
        norm_product = np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
        if norm_product == 0:
            # A zero vector has no direction; avoid NaN, which is not valid JSON.
            similarity = 0.0
        else:
            similarity = float(np.dot(embeddings[0], embeddings[1]) / norm_product)
        return {
            "similarity": similarity,
            "percentage": f"{similarity * 100:.2f}%"
        }
    except Exception as e:
        return {"error": str(e)}


# ==================== SUPABASE PROXY FUNCTIONS ====================

def get_supabase_headers() -> dict:
    """Build the HTTP headers used for every Supabase REST call."""
    return {
        "apikey": SUPABASE_KEY,
        "Authorization": f"Bearer {SUPABASE_KEY}",
        "Content-Type": "application/json",
        "Prefer": "return=representation"
    }


def db_get_all_embeddings() -> dict:
    """Fetch every cached embedding row from Supabase.

    Returns:
        ``{"data": rows, "count": n}`` on HTTP 200, otherwise
        ``{"error": ..., "detail": ...}``; ``{"error": ...}`` when Supabase
        is not configured or the request raises.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=nim,content_hash,embedding_combined,embedding_judul,embedding_deskripsi,embedding_problem,embedding_metode,nama,judul"

        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, headers=get_supabase_headers())

        if response.status_code == 200:
            # Parse the (potentially large) body only once.
            rows = response.json()
            return {"data": rows, "count": len(rows)}
        return {"error": f"Supabase error: {response.status_code}", "detail": response.text}
    except Exception as e:
        return {"error": str(e)}


def db_get_embedding(nim: str, content_hash: str) -> dict:
    """Fetch the cached embedding for a given NIM and content hash.

    Args:
        nim: Value matched against the ``nim`` column.
        content_hash: Value matched against the ``content_hash`` column.

    Returns:
        ``{"data": row_or_None, "found": bool}`` on HTTP 200, otherwise an
        ``{"error": ...}`` dictionary.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        # Pass user-supplied values as query params so they are URL-encoded;
        # interpolating them into the URL would let a caller inject extra
        # query parameters (e.g. "x&select=...").
        query = {
            "nim": f"eq.{nim}",
            "content_hash": f"eq.{content_hash}",
            "select": "*",
        }

        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, params=query, headers=get_supabase_headers())

        if response.status_code == 200:
            rows = response.json()
            return {"data": rows[0] if rows else None, "found": len(rows) > 0}
        return {"error": f"Supabase error: {response.status_code}", "detail": response.text}
    except Exception as e:
        return {"error": str(e)}


def db_save_embedding(data_json: str) -> dict:
    """Save (upsert) an embedding record to Supabase.

    Args:
        data_json: JSON object with required keys ``nim``, ``content_hash``
            and ``embedding_combined``; the other embedding fields, ``nama``
            and ``judul`` are optional.

    Returns:
        ``{"success": True, "data": ...}`` on HTTP 200/201, otherwise an
        ``{"error": ...}`` dictionary.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        data = json.loads(data_json)
    except (json.JSONDecodeError, TypeError):
        return {"error": "Invalid JSON format"}

    if not isinstance(data, dict):
        # Arrays / scalars have no .get(); fail with a clear message.
        return {"error": "JSON payload must be an object"}

    # Validate required fields
    if not data.get("nim") or not data.get("content_hash"):
        return {"error": "nim and content_hash are required"}
    if not data.get("embedding_combined"):
        return {"error": "embedding_combined is required"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        headers = get_supabase_headers()
        # Ask PostgREST to merge on conflict instead of failing (upsert).
        headers["Prefer"] = "resolution=merge-duplicates,return=representation"

        payload = {
            "nim": data["nim"],
            "content_hash": data["content_hash"],
            "embedding_combined": data["embedding_combined"],
            "embedding_judul": data.get("embedding_judul"),
            "embedding_deskripsi": data.get("embedding_deskripsi"),
            "embedding_problem": data.get("embedding_problem"),
            "embedding_metode": data.get("embedding_metode"),
            "nama": data.get("nama"),
            "judul": data.get("judul")
        }

        with httpx.Client(timeout=30.0) as client:
            response = client.post(url, headers=headers, json=payload)

        if response.status_code in [200, 201]:
            return {"success": True, "data": response.json()}
        return {"error": f"Supabase error: {response.status_code}", "detail": response.text}
    except Exception as e:
        return {"error": str(e)}


def db_check_connection() -> dict:
    """Test the connection to Supabase with a minimal one-row query.

    Returns:
        A dictionary with ``connected`` and, when the request completed,
        ``status_code`` and a truncated ``supabase_url``; on failure it
        carries ``error`` instead.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"connected": False, "error": "Supabase URL or KEY not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=id&limit=1"

        with httpx.Client(timeout=10.0) as client:
            response = client.get(url, headers=get_supabase_headers())

        # Only show the first 30 characters of the URL in the response.
        shown_url = SUPABASE_URL[:30] + "..." if len(SUPABASE_URL) > 30 else SUPABASE_URL
        return {
            "connected": response.status_code == 200,
            "status_code": response.status_code,
            "supabase_url": shown_url
        }
    except Exception as e:
        return {"connected": False, "error": str(e)}


# Gradio Interface
with gr.Blocks(title="Semantic Embedding API") as demo:
    gr.Markdown("# 🔤 Semantic Embedding API")
    gr.Markdown("API untuk menghasilkan text embedding menggunakan `paraphrase-multilingual-MiniLM-L12-v2`")
    gr.Markdown("**Model**: Multilingual, mendukung 50+ bahasa termasuk Bahasa Indonesia")

    with gr.Tab("🔢 Single Embedding"):
        gr.Markdown("Generate embedding vector untuk satu teks")
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Masukkan teks untuk di-embed...",
            lines=2
        )
        single_output = gr.JSON(label="Embedding Result")
        single_btn = gr.Button("Generate Embedding", variant="primary")
        single_btn.click(fn=get_embedding, inputs=text_input, outputs=single_output)

    with gr.Tab("📦 Batch Embedding"):
        gr.Markdown("Generate embeddings untuk multiple teks sekaligus")
        batch_input = gr.Textbox(
            label="JSON Array of Texts",
            placeholder='["teks pertama", "teks kedua", "teks ketiga"]',
            lines=4
        )
        batch_output = gr.JSON(label="Embeddings Result")
        batch_btn = gr.Button("Generate Embeddings", variant="primary")
        batch_btn.click(fn=get_embeddings_batch, inputs=batch_input, outputs=batch_output)

    with gr.Tab("📊 Similarity Check"):
        gr.Markdown("Hitung kemiripan semantik antara dua teks")
        with gr.Row():
            sim_text1 = gr.Textbox(label="Text 1", placeholder="Teks pertama...", lines=2)
            sim_text2 = gr.Textbox(label="Text 2", placeholder="Teks kedua...", lines=2)
        sim_output = gr.JSON(label="Similarity Result")
        sim_btn = gr.Button("Calculate Similarity", variant="primary")
        sim_btn.click(fn=calculate_similarity, inputs=[sim_text1, sim_text2], outputs=sim_output)

    with gr.Tab("💾 Database (Supabase)"):
        gr.Markdown("### Supabase Cache Operations")
        gr.Markdown("Proxy untuk akses Supabase (API key aman di server)")

        with gr.Row():
            db_check_btn = gr.Button("🔌 Check Connection", variant="secondary")
            db_check_output = gr.JSON(label="Connection Status")
        db_check_btn.click(fn=db_check_connection, outputs=db_check_output)

        gr.Markdown("---")
        gr.Markdown("#### Get All Cached Embeddings")
        db_all_btn = gr.Button("📥 Get All Embeddings", variant="primary")
        db_all_output = gr.JSON(label="All Embeddings")
        db_all_btn.click(fn=db_get_all_embeddings, outputs=db_all_output)

        gr.Markdown("---")
        gr.Markdown("#### Get Single Embedding by NIM")
        with gr.Row():
            db_nim_input = gr.Textbox(label="NIM", placeholder="10121xxx")
            db_hash_input = gr.Textbox(label="Content Hash", placeholder="abc123...")
        db_get_btn = gr.Button("🔍 Get Embedding", variant="primary")
        db_get_output = gr.JSON(label="Embedding Result")
        db_get_btn.click(fn=db_get_embedding, inputs=[db_nim_input, db_hash_input], outputs=db_get_output)

        gr.Markdown("---")
        gr.Markdown("#### Save Embedding")
        db_save_input = gr.Textbox(
            label="Embedding Data (JSON)",
            placeholder='{"nim": "123", "content_hash": "abc", "embedding_combined": [...], ...}',
            lines=4
        )
        db_save_btn = gr.Button("💾 Save Embedding", variant="primary")
        db_save_output = gr.JSON(label="Save Result")
        db_save_btn.click(fn=db_save_embedding, inputs=db_save_input, outputs=db_save_output)

    with gr.Accordion("📡 API Usage (untuk Developer)", open=False):
        gr.Markdown("""
        ### Endpoints

        #### Embedding
        - `get_embedding` - Single text embedding
        - `get_embeddings_batch` - Batch text embeddings
        - `calculate_similarity` - Compare two texts

        #### Database (Supabase Proxy)
        - `db_check_connection` - Test Supabase connection
        - `db_get_all_embeddings` - Get all cached embeddings
        - `db_get_embedding` - Get embedding by NIM + hash
        - `db_save_embedding` - Save embedding to cache

        ### Example API Call
        ```javascript
        // Get all cached embeddings
        const response = await fetch("YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ data: [] })
        });
        const result = await response.json();
        const eventId = result.event_id;

        // Get result
        const dataResponse = await fetch(`YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings/${eventId}`);
        ```
        """)

    gr.Markdown("---")
    gr.Markdown("*Dibuat untuk Monitoring Proposal Skripsi KK E - UNIKOM*")

# Launch with the API enabled
demo.launch()