Spaces:
Running
Running
| import gradio as gr | |
| from sentence_transformers import SentenceTransformer | |
| import json | |
| import numpy as np | |
| import os | |
| import httpx | |
# ==================== CONFIGURATION ====================
# Model identifier; downloaded automatically from the HF Hub on first use.
HF_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Local snapshot directory used during development (optional; ignored when it
# does not exist). The LOCAL_MODEL_PATH environment variable overrides the
# default so the location is not tied to a single developer machine.
LOCAL_MODEL_PATH = os.environ.get(
    "LOCAL_MODEL_PATH",
    r"E:\huggingface_models\hub\models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2\snapshots",
)

# Supabase configuration, read from environment variables so that secrets stay
# out of the source.
#   - On a HF Space: Settings > Repository secrets
#   - Locally: export the variables (an empty default disables the DB proxy)
# Surrounding whitespace and a trailing "/" are removed because the URL is
# later joined as f"{SUPABASE_URL}/rest/v1/..." and the key is sent in headers.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "").strip().rstrip("/")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "").strip()
def get_model_path(local_path=None, fallback=None):
    """Return where the model should be loaded from.

    Looks for snapshot sub-directories under ``local_path`` and returns the
    most recently modified one. When the directory is missing, unreadable,
    not a directory, or contains no sub-directories, ``fallback`` is returned
    instead (intended for deployments that download from the HF Hub).

    Args:
        local_path: Directory that holds snapshot folders. Defaults to the
            module-level ``LOCAL_MODEL_PATH``.
        fallback: Value returned when no local snapshot is usable. Defaults
            to the module-level ``HF_MODEL_NAME``.

    Returns:
        Path of the newest snapshot directory, or ``fallback``.
    """
    if local_path is None:
        local_path = LOCAL_MODEL_PATH
    if fallback is None:
        fallback = HF_MODEL_NAME

    try:
        entries = os.listdir(local_path)
    except OSError:
        # Covers a missing path, a plain file, and permission problems.
        return fallback

    # os.listdir() order is arbitrary and may include stray files, so keep
    # directories only and choose by modification time.
    snapshots = [
        path
        for path in (os.path.join(local_path, name) for name in entries)
        if os.path.isdir(path)
    ]
    if not snapshots:
        return fallback
    return max(snapshots, key=os.path.getmtime)
# Load the embedding model once, at import time, so every request handler
# below shares the same instance.
print("Loading model...")
model_path = get_model_path()
print(f"Using model from: {model_path}")
# model_path comes from get_model_path(): a local snapshot or a Hub model name.
model = SentenceTransformer(model_path)
print("Model loaded successfully!")
def get_embedding(text: str):
    """Encode a single text into an embedding vector.

    Args:
        text: The text to embed; surrounding whitespace is ignored.

    Returns:
        ``{"embedding": [...]}`` on success, or ``{"error": "..."}`` when the
        text is empty/blank or the encoder raises.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        return {"error": "Text tidak boleh kosong"}

    try:
        vector = model.encode(cleaned)
        result = {"embedding": vector.tolist()}
    except Exception as exc:
        # Report the failure to the caller as data rather than raising.
        result = {"error": str(exc)}
    return result
def get_embeddings_batch(texts_json: str):
    """Encode several texts supplied as a JSON array of strings.

    Blank strings and ``null`` entries are dropped before encoding, so the
    returned list can be shorter than the input and positions may shift.

    Args:
        texts_json: JSON text such as ``["teks 1", "teks 2"]``.

    Returns:
        ``{"embeddings": [[...], ...]}`` on success, or ``{"error": "..."}``
        when the input is not valid JSON, is not an array, is empty, contains
        non-string elements, contains only blank texts, or encoding raises.
    """
    try:
        texts = json.loads(texts_json)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a non-string argument such as None.
        return {"error": "Invalid JSON format. Gunakan format: [\"teks 1\", \"teks 2\"]"}

    if not isinstance(texts, list):
        return {"error": "Input harus JSON array"}
    if not texts:
        return {"error": "Array tidak boleh kosong"}

    # Reject numbers, objects, etc. explicitly instead of leaking an
    # AttributeError message from ``.strip()``.
    if any(item is not None and not isinstance(item, str) for item in texts):
        return {"error": "Semua elemen array harus berupa string"}

    cleaned = [item.strip() for item in texts if item and item.strip()]
    if not cleaned:
        return {"error": "Semua text kosong"}

    try:
        embeddings = model.encode(cleaned)
        return {"embeddings": embeddings.tolist()}
    except Exception as e:
        return {"error": str(e)}
def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors (0.0 if either norm is 0)."""
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Avoid a 0/0 division, which would yield NaN (not valid JSON).
        return 0.0
    return np.dot(vec_a, vec_b) / denominator


def calculate_similarity(text1: str, text2: str):
    """Compute the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text; must not be empty or blank.
        text2: Second text; must not be empty or blank.

    Returns:
        ``{"similarity": <float>, "percentage": "<xx.xx>%"}`` on success, or
        ``{"error": "..."}`` when a text is empty/blank or encoding raises.
    """
    if not text1 or not text1.strip():
        return {"error": "Text 1 tidak boleh kosong"}
    if not text2 or not text2.strip():
        return {"error": "Text 2 tidak boleh kosong"}

    try:
        # Encode both texts in one call so they share a single batch.
        embeddings = model.encode([text1.strip(), text2.strip()])
        similarity = _cosine_similarity(embeddings[0], embeddings[1])
        return {
            "similarity": float(similarity),
            "percentage": f"{similarity * 100:.2f}%",
        }
    except Exception as e:
        return {"error": str(e)}
# ==================== SUPABASE PROXY FUNCTIONS ====================
def get_supabase_headers():
    """Build the HTTP headers sent with each Supabase REST request.

    Returns:
        A new dict carrying the API key (as ``apikey`` and as a bearer
        token), a JSON content type, and ``Prefer: return=representation``.
    """
    headers = {"apikey": SUPABASE_KEY}
    headers["Authorization"] = f"Bearer {SUPABASE_KEY}"
    headers["Content-Type"] = "application/json"
    headers["Prefer"] = "return=representation"
    return headers
def db_get_all_embeddings():
    """Fetch the cached embedding rows from the Supabase REST endpoint.

    Returns:
        ``{"data": [...], "count": <int>}`` on HTTP 200;
        ``{"error": "...", "detail": "..."}`` on any other status;
        ``{"error": "..."}`` when Supabase is not configured or the request
        raises.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=nim,content_hash,embedding_combined,embedding_judul,embedding_deskripsi,embedding_problem,embedding_metode,nama,judul"
        # NOTE(review): the server may cap the number of rows per request; if
        # so this returns only the first page - confirm against the project
        # settings.
        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, headers=get_supabase_headers())
            if response.status_code != 200:
                return {
                    "error": f"Supabase error: {response.status_code}",
                    "detail": response.text,
                }
            # Decode the (potentially large) body once and reuse it.
            rows = response.json()
            return {"data": rows, "count": len(rows)}
    except Exception as e:
        return {"error": str(e)}
def db_get_embedding(nim: str, content_hash: str):
    """Fetch the cached embedding row for one NIM and content hash.

    Args:
        nim: Value matched against the ``nim`` column.
        content_hash: Value matched against the ``content_hash`` column.

    Returns:
        ``{"data": <row or None>, "found": <bool>}`` on HTTP 200;
        ``{"error": "...", "detail": "..."}`` on any other status;
        ``{"error": "..."}`` when Supabase is not configured or the request
        raises.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        # Pass filters as query parameters so the HTTP client URL-encodes the
        # user-supplied values; interpolating them into the URL would let
        # characters such as "&" or "#" inject or truncate query parameters.
        params = {
            "nim": f"eq.{nim}",
            "content_hash": f"eq.{content_hash}",
            "select": "*",
        }
        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, params=params, headers=get_supabase_headers())
            if response.status_code == 200:
                data = response.json()
                return {"data": data[0] if data else None, "found": len(data) > 0}
            return {
                "error": f"Supabase error: {response.status_code}",
                "detail": response.text,
            }
    except Exception as e:
        return {"error": str(e)}
def db_save_embedding(data_json: str):
    """Save one embedding record to Supabase (upsert).

    Args:
        data_json: JSON object text. ``nim``, ``content_hash`` and
            ``embedding_combined`` are required; ``embedding_judul``,
            ``embedding_deskripsi``, ``embedding_problem``,
            ``embedding_metode``, ``nama`` and ``judul`` are optional and
            sent as ``null`` when absent.

    Returns:
        ``{"success": True, "data": ...}`` on HTTP 200/201;
        ``{"error": "...", "detail": "..."}`` on any other status;
        ``{"error": "..."}`` for configuration, validation, or request
        failures.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        data = json.loads(data_json)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a non-string argument such as None.
        return {"error": "Invalid JSON format"}

    # Valid JSON that is not an object (list, string, number) has no fields.
    if not isinstance(data, dict):
        return {"error": "JSON payload must be an object"}

    # Validate required fields
    if not data.get("nim") or not data.get("content_hash"):
        return {"error": "nim and content_hash are required"}
    if not data.get("embedding_combined"):
        return {"error": "embedding_combined is required"}

    optional_fields = (
        "embedding_judul",
        "embedding_deskripsi",
        "embedding_problem",
        "embedding_metode",
        "nama",
        "judul",
    )
    payload = {
        "nim": data["nim"],
        "content_hash": data["content_hash"],
        "embedding_combined": data["embedding_combined"],
    }
    payload.update((field, data.get(field)) for field in optional_fields)

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        headers = get_supabase_headers()
        # Ask the server to merge into an existing row instead of failing on
        # a duplicate key.
        # NOTE(review): which columns count as "duplicate" depends on the
        # table's key / on_conflict setup - confirm against the schema.
        headers["Prefer"] = "resolution=merge-duplicates,return=representation"
        with httpx.Client(timeout=30.0) as client:
            response = client.post(url, headers=headers, json=payload)
            if response.status_code in [200, 201]:
                return {"success": True, "data": response.json()}
            return {
                "error": f"Supabase error: {response.status_code}",
                "detail": response.text,
            }
    except Exception as e:
        return {"error": str(e)}
def db_check_connection():
    """Probe the Supabase REST endpoint and report whether it answers.

    Returns:
        ``{"connected": <bool>, "status_code": <int>, "supabase_url": <str>}``
        after a request completes (the URL is cut to 30 characters plus
        ``...`` when longer), or ``{"connected": False, "error": "..."}``
        when Supabase is not configured or the request raises.
    """
    if not (SUPABASE_URL and SUPABASE_KEY):
        return {"connected": False, "error": "Supabase URL or KEY not configured"}

    try:
        probe_url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=id&limit=1"
        with httpx.Client(timeout=10.0) as client:
            response = client.get(probe_url, headers=get_supabase_headers())
            status = response.status_code

            # Only a prefix of the configured URL is echoed back.
            if len(SUPABASE_URL) > 30:
                shown_url = SUPABASE_URL[:30] + "..."
            else:
                shown_url = SUPABASE_URL

            return {
                "connected": status == 200,
                "status_code": status,
                "supabase_url": shown_url,
            }
    except Exception as exc:
        return {"connected": False, "error": str(exc)}
# Gradio interface: one tab per embedding operation, one tab for the Supabase
# proxy, and a collapsible developer reference. Components render in the order
# they are created, so statement order below is significant.
# NOTE(review): the leading symbols in several labels look like mis-decoded
# emoji; they are kept exactly as found - confirm against the original file.
with gr.Blocks(title="Semantic Embedding API") as demo:
    gr.Markdown("# π€ Semantic Embedding API")
    gr.Markdown("API untuk menghasilkan text embedding menggunakan `paraphrase-multilingual-MiniLM-L12-v2`")
    gr.Markdown("**Model**: Multilingual, mendukung 50+ bahasa termasuk Bahasa Indonesia")

    # --- Single text -> one embedding vector ---
    with gr.Tab("π’ Single Embedding"):
        gr.Markdown("Generate embedding vector untuk satu teks")
        text_input = gr.Textbox(label="Input Text", placeholder="Masukkan teks untuk di-embed...", lines=2)
        single_output = gr.JSON(label="Embedding Result")
        single_btn = gr.Button("Generate Embedding", variant="primary")
        single_btn.click(
            fn=get_embedding,
            inputs=text_input,
            outputs=single_output,
        )

    # --- JSON array of texts -> list of embedding vectors ---
    with gr.Tab("π¦ Batch Embedding"):
        gr.Markdown("Generate embeddings untuk multiple teks sekaligus")
        batch_input = gr.Textbox(label="JSON Array of Texts", placeholder='["teks pertama", "teks kedua", "teks ketiga"]', lines=4)
        batch_output = gr.JSON(label="Embeddings Result")
        batch_btn = gr.Button("Generate Embeddings", variant="primary")
        batch_btn.click(
            fn=get_embeddings_batch,
            inputs=batch_input,
            outputs=batch_output,
        )

    # --- Two texts -> cosine similarity ---
    with gr.Tab("π Similarity Check"):
        gr.Markdown("Hitung kemiripan semantik antara dua teks")
        with gr.Row():
            sim_text1 = gr.Textbox(
                label="Text 1",
                placeholder="Teks pertama...",
                lines=2,
            )
            sim_text2 = gr.Textbox(
                label="Text 2",
                placeholder="Teks kedua...",
                lines=2,
            )
        sim_output = gr.JSON(label="Similarity Result")
        sim_btn = gr.Button("Calculate Similarity", variant="primary")
        sim_btn.click(
            fn=calculate_similarity,
            inputs=[sim_text1, sim_text2],
            outputs=sim_output,
        )

    # --- Supabase proxy operations ---
    with gr.Tab("πΎ Database (Supabase)"):
        gr.Markdown("### Supabase Cache Operations")
        gr.Markdown("Proxy untuk akses Supabase (API key aman di server)")

        # Connection check
        with gr.Row():
            db_check_btn = gr.Button("π Check Connection", variant="secondary")
            db_check_output = gr.JSON(label="Connection Status")
        db_check_btn.click(fn=db_check_connection, outputs=db_check_output)

        # Fetch every cached row
        gr.Markdown("---")
        gr.Markdown("#### Get All Cached Embeddings")
        db_all_btn = gr.Button("π₯ Get All Embeddings", variant="primary")
        db_all_output = gr.JSON(label="All Embeddings")
        db_all_btn.click(fn=db_get_all_embeddings, outputs=db_all_output)

        # Fetch one row by NIM + content hash
        gr.Markdown("---")
        gr.Markdown("#### Get Single Embedding by NIM")
        with gr.Row():
            db_nim_input = gr.Textbox(label="NIM", placeholder="10121xxx")
            db_hash_input = gr.Textbox(label="Content Hash", placeholder="abc123...")
        db_get_btn = gr.Button("π Get Embedding", variant="primary")
        db_get_output = gr.JSON(label="Embedding Result")
        db_get_btn.click(
            fn=db_get_embedding,
            inputs=[db_nim_input, db_hash_input],
            outputs=db_get_output,
        )

        # Save (upsert) one row
        gr.Markdown("---")
        gr.Markdown("#### Save Embedding")
        db_save_input = gr.Textbox(label="Embedding Data (JSON)", placeholder='{"nim": "123", "content_hash": "abc", "embedding_combined": [...], ...}', lines=4)
        db_save_btn = gr.Button("πΎ Save Embedding", variant="primary")
        db_save_output = gr.JSON(label="Save Result")
        db_save_btn.click(
            fn=db_save_embedding,
            inputs=db_save_input,
            outputs=db_save_output,
        )

    # --- Developer reference (collapsed by default) ---
    with gr.Accordion("π‘ API Usage (untuk Developer)", open=False):
        gr.Markdown("""
### Endpoints
#### Embedding
- `get_embedding` - Single text embedding
- `get_embeddings_batch` - Batch text embeddings
- `calculate_similarity` - Compare two texts
#### Database (Supabase Proxy)
- `db_check_connection` - Test Supabase connection
- `db_get_all_embeddings` - Get all cached embeddings
- `db_get_embedding` - Get embedding by NIM + hash
- `db_save_embedding` - Save embedding to cache
### Example API Call
```javascript
// Get all cached embeddings
const response = await fetch("YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ data: [] })
});
const result = await response.json();
const eventId = result.event_id;
// Get result
const dataResponse = await fetch(`YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings/${eventId}`);
```
""")

    # Footer
    gr.Markdown("---")
    gr.Markdown("*Dibuat untuk Monitoring Proposal Skripsi KK E - UNIKOM*")

# Start the app with default launch settings.
demo.launch()