galihboy committed on
Commit
6b644a1
·
verified ·
1 Parent(s): b229af5

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +325 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from sentence_transformers import SentenceTransformer
3
+ import json
4
+ import numpy as np
5
+ import os
6
+ import httpx
7
+
8
# ==================== CONFIGURATION ====================

# Model - auto-downloaded from the HF Hub on first run.
HF_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Local path for development (optional; ignored when it does not exist).
LOCAL_MODEL_PATH = r"E:\huggingface_models\hub\models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2\snapshots"

# Supabase configuration (read from environment variables for security).
# On an HF Space: Settings > Repository secrets.
# Locally: set the environment variables, or rely on the defaults for testing.
# Both values fall back to "" when the variable is unset.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "")
21
+
22
def get_model_path(local_path: "str | None" = None, hub_name: "str | None" = None) -> str:
    """Return the location the embedding model should be loaded from.

    A local snapshot directory is preferred when one exists; otherwise the
    Hugging Face Hub model name is returned.

    Args:
        local_path: Directory that contains one sub-directory per snapshot.
            Defaults to the module-level ``LOCAL_MODEL_PATH``.
        hub_name: Hub model identifier used as the fallback. Defaults to the
            module-level ``HF_MODEL_NAME``.

    Returns:
        The path of the most recently modified snapshot directory under
        ``local_path``, or ``hub_name`` when ``local_path`` is not a
        directory or contains no sub-directories.
    """
    if local_path is None:
        local_path = LOCAL_MODEL_PATH
    if hub_name is None:
        hub_name = HF_MODEL_NAME

    # isdir (not exists) so a stray file at this path falls back to the Hub
    # instead of making the directory listing raise.
    if os.path.isdir(local_path):
        # Only directories can be snapshots; ignore stray files.
        snapshot_dirs = [
            entry.path for entry in os.scandir(local_path) if entry.is_dir()
        ]
        if snapshot_dirs:
            # Listing order is arbitrary, so pick the newest explicitly;
            # the path itself breaks mtime ties deterministically.
            return max(snapshot_dirs, key=lambda p: (os.path.getmtime(p), p))

    # Fallback to the HF Hub (used when deployed on a Space).
    return hub_name
32
+
33
# Load the model once at startup; this runs when the module is executed.
print("Loading model...")
model_path = get_model_path()
print(f"Using model from: {model_path}")
model = SentenceTransformer(model_path)
print("Model loaded successfully!")
39
+
40
+
41
def get_embedding(text: str):
    """Generate the embedding for a single text.

    Args:
        text: The text to embed. Surrounding whitespace is stripped.

    Returns:
        ``{"embedding": [...]}`` on success, or ``{"error": "<message>"}``
        when the input is empty, is not a string, or encoding fails.
    """
    if not text:
        return {"error": "Text tidak boleh kosong"}
    # Guard before calling .strip(): a non-string value would otherwise
    # raise outside the try block and surface as an unhandled error.
    if not isinstance(text, str):
        return {"error": "Text harus berupa string"}

    cleaned = text.strip()
    if not cleaned:
        return {"error": "Text tidak boleh kosong"}

    try:
        embedding = model.encode(cleaned)
        return {"embedding": embedding.tolist()}
    except Exception as e:
        # API boundary: report the failure as a JSON payload, not a crash.
        return {"error": str(e)}
51
+
52
+
53
def get_embeddings_batch(texts_json: str):
    """Generate embeddings for multiple texts supplied as a JSON array.

    Args:
        texts_json: JSON-encoded array of strings, e.g. ``["a", "b"]``.

    Returns:
        ``{"embeddings": [[...], ...]}`` on success, or
        ``{"error": "<message>"}`` when the input is not valid JSON, is not
        a non-empty array of strings, or encoding fails.

    Note:
        Empty and whitespace-only entries are dropped before encoding, so
        the returned list can be shorter than the input array and positions
        may not line up with the original indices.
    """
    try:
        texts = json.loads(texts_json)
        if not isinstance(texts, list):
            return {"error": "Input harus JSON array"}

        if not texts:
            return {"error": "Array tidak boleh kosong"}

        # Reject truthy non-strings up front so callers get a clear message
        # instead of an internal AttributeError text. Falsy values (null, 0,
        # "") are still silently filtered below, as before.
        if any(t and not isinstance(t, str) for t in texts):
            return {"error": "Semua elemen array harus berupa string"}

        # Filter empty strings
        cleaned = [t.strip() for t in texts if t and t.strip()]

        if not cleaned:
            return {"error": "Semua text kosong"}

        embeddings = model.encode(cleaned)
        return {"embeddings": embeddings.tolist()}
    except json.JSONDecodeError:
        return {"error": "Invalid JSON format. Gunakan format: [\"teks 1\", \"teks 2\"]"}
    except Exception as e:
        # API boundary: report the failure as a JSON payload, not a crash.
        return {"error": str(e)}
75
+
76
+
77
def calculate_similarity(text1: str, text2: str):
    """Compute the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text. Surrounding whitespace is stripped.
        text2: Second text. Surrounding whitespace is stripped.

    Returns:
        ``{"similarity": <float>, "percentage": "<xx.xx>%"}`` on success, or
        ``{"error": "<message>"}`` when an input is empty or not a string,
        when either embedding has zero norm, or when encoding fails.
    """
    # Validate text1 completely before text2 so the first problem reported
    # is always the one for the earlier argument.
    for label, value in (("Text 1", text1), ("Text 2", text2)):
        if not value:
            return {"error": f"{label} tidak boleh kosong"}
        if not isinstance(value, str):
            return {"error": f"{label} harus berupa string"}
        if not value.strip():
            return {"error": f"{label} tidak boleh kosong"}

    try:
        embeddings = model.encode([text1.strip(), text2.strip()])

        # Cosine similarity
        denominator = np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
        if denominator == 0:
            # Dividing by zero would yield NaN, which is not a meaningful
            # score and is not representable in strict JSON.
            return {"error": "Similarity tidak terdefinisi untuk embedding nol"}
        similarity = np.dot(embeddings[0], embeddings[1]) / denominator

        return {
            "similarity": float(similarity),
            "percentage": f"{similarity * 100:.2f}%"
        }
    except Exception as e:
        # API boundary: report the failure as a JSON payload, not a crash.
        return {"error": str(e)}
98
+
99
+
100
+ # ==================== SUPABASE PROXY FUNCTIONS ====================
101
+
102
def get_supabase_headers():
    """Build the HTTP headers sent with Supabase REST API calls.

    Returns:
        A new dict carrying the API key (both as ``apikey`` and as a bearer
        token), a JSON content type, and ``Prefer: return=representation``.
    """
    headers = {"apikey": SUPABASE_KEY}
    headers["Authorization"] = f"Bearer {SUPABASE_KEY}"
    headers["Content-Type"] = "application/json"
    headers["Prefer"] = "return=representation"
    return headers
110
+
111
+
112
def db_get_all_embeddings():
    """Fetch the cached embeddings from the Supabase ``proposal_embeddings`` table.

    Returns:
        ``{"data": [...], "count": <int>}`` on HTTP 200;
        ``{"error": ..., "detail": ...}`` on any other status; or
        ``{"error": "<message>"}`` when Supabase is not configured or the
        request fails.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=nim,content_hash,embedding_combined,embedding_judul,embedding_deskripsi,embedding_problem,embedding_metode,nama,judul"

        # NOTE(review): a single GET is issued with no pagination; if the
        # server caps rows per request this may return only part of the
        # table - confirm against the project's API settings.
        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, headers=get_supabase_headers())

        if response.status_code != 200:
            return {"error": f"Supabase error: {response.status_code}", "detail": response.text}

        # Decode the body once and reuse it for both the data and the count.
        rows = response.json()
        return {"data": rows, "count": len(rows)}
    except Exception as e:
        # API boundary: report the failure as a JSON payload, not a crash.
        return {"error": str(e)}
129
+
130
+
131
def db_get_embedding(nim: str, content_hash: str):
    """Fetch the cached embedding row for a given NIM and content hash.

    Args:
        nim: Value matched against the ``nim`` column.
        content_hash: Value matched against the ``content_hash`` column.

    Returns:
        ``{"data": <row or None>, "found": <bool>}`` on HTTP 200;
        ``{"error": ..., "detail": ...}`` on any other status; or
        ``{"error": "<message>"}`` when Supabase is not configured or the
        request fails.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    try:
        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        # Pass the filters as query parameters so the HTTP client
        # percent-encodes them; interpolating raw user input into the URL
        # would let a value containing "&" or "=" inject extra filters.
        params = {
            "nim": f"eq.{nim}",
            "content_hash": f"eq.{content_hash}",
            "select": "*",
        }

        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, headers=get_supabase_headers(), params=params)

        if response.status_code != 200:
            return {"error": f"Supabase error: {response.status_code}", "detail": response.text}

        data = response.json()
        return {"data": data[0] if data else None, "found": len(data) > 0}
    except Exception as e:
        # API boundary: report the failure as a JSON payload, not a crash.
        return {"error": str(e)}
149
+
150
+
151
def db_save_embedding(data_json: str):
    """Save an embedding row to Supabase (upsert).

    Args:
        data_json: JSON-encoded object. ``nim``, ``content_hash`` and
            ``embedding_combined`` are required; ``embedding_judul``,
            ``embedding_deskripsi``, ``embedding_problem``,
            ``embedding_metode``, ``nama`` and ``judul`` are optional and
            sent as ``null`` when absent.

    Returns:
        ``{"success": True, "data": ...}`` on HTTP 200/201;
        ``{"error": ..., "detail": ...}`` on any other status; or
        ``{"error": "<message>"}`` for invalid input, missing configuration
        or a failed request.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        return {"error": "Supabase not configured"}

    optional_fields = (
        "embedding_judul",
        "embedding_deskripsi",
        "embedding_problem",
        "embedding_metode",
        "nama",
        "judul",
    )

    try:
        data = json.loads(data_json)

        # Valid JSON can still be a list, number or string; those have no
        # .get(), so reject them with a clear message.
        if not isinstance(data, dict):
            return {"error": "JSON input must be an object"}

        # Validate required fields
        if not data.get("nim") or not data.get("content_hash"):
            return {"error": "nim and content_hash are required"}

        if not data.get("embedding_combined"):
            return {"error": "embedding_combined is required"}

        url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings"
        # Override Prefer so a duplicate row is merged instead of rejected.
        headers = get_supabase_headers()
        headers["Prefer"] = "resolution=merge-duplicates,return=representation"

        # Forward only the known columns; unknown keys in the input are
        # dropped.
        payload = {
            "nim": data["nim"],
            "content_hash": data["content_hash"],
            "embedding_combined": data["embedding_combined"],
        }
        payload.update({field: data.get(field) for field in optional_fields})

        with httpx.Client(timeout=30.0) as client:
            response = client.post(url, headers=headers, json=payload)

        if response.status_code in (200, 201):
            return {"success": True, "data": response.json()}
        return {"error": f"Supabase error: {response.status_code}", "detail": response.text}
    except json.JSONDecodeError:
        return {"error": "Invalid JSON format"}
    except Exception as e:
        # API boundary: report the failure as a JSON payload, not a crash.
        return {"error": str(e)}
193
+
194
+
195
def db_check_connection():
    """Test the connection to Supabase with a one-row query.

    Returns:
        ``{"connected": <bool>, "status_code": <int>, "supabase_url": <str>}``
        when a response was received (the URL is cut to 30 characters plus
        ``"..."`` when longer), or ``{"connected": False, "error": ...}``
        when Supabase is not configured or the request fails.
    """
    if not (SUPABASE_URL and SUPABASE_KEY):
        return {"connected": False, "error": "Supabase URL or KEY not configured"}

    probe_url = f"{SUPABASE_URL}/rest/v1/proposal_embeddings?select=id&limit=1"

    try:
        with httpx.Client(timeout=10.0) as client:
            status = client.get(probe_url, headers=get_supabase_headers()).status_code
    except Exception as e:
        return {"connected": False, "error": str(e)}

    # Show only a prefix of long URLs in the status payload.
    if len(SUPABASE_URL) > 30:
        shown_url = SUPABASE_URL[:30] + "..."
    else:
        shown_url = SUPABASE_URL

    return {
        "connected": status == 200,
        "status_code": status,
        "supabase_url": shown_url,
    }
213
+
214
+
215
# Gradio interface: one tab per embedding operation, a tab for the Supabase
# proxy operations, and a collapsed accordion with API usage notes.
# NOTE(review): the emoji prefixes in the labels below look mis-encoded
# (mojibake) in this copy of the file - confirm the file is saved as UTF-8.
with gr.Blocks(title="Semantic Embedding API") as demo:
    gr.Markdown("# πŸ”€ Semantic Embedding API")
    gr.Markdown("API untuk menghasilkan text embedding menggunakan `paraphrase-multilingual-MiniLM-L12-v2`")
    gr.Markdown("**Model**: Multilingual, mendukung 50+ bahasa termasuk Bahasa Indonesia")

    # Tab: embed a single text via get_embedding.
    with gr.Tab("πŸ”’ Single Embedding"):
        gr.Markdown("Generate embedding vector untuk satu teks")
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Masukkan teks untuk di-embed...",
            lines=2
        )
        single_output = gr.JSON(label="Embedding Result")
        single_btn = gr.Button("Generate Embedding", variant="primary")
        single_btn.click(fn=get_embedding, inputs=text_input, outputs=single_output)

    # Tab: embed a JSON array of texts via get_embeddings_batch.
    with gr.Tab("πŸ“¦ Batch Embedding"):
        gr.Markdown("Generate embeddings untuk multiple teks sekaligus")
        batch_input = gr.Textbox(
            label="JSON Array of Texts",
            placeholder='["teks pertama", "teks kedua", "teks ketiga"]',
            lines=4
        )
        batch_output = gr.JSON(label="Embeddings Result")
        batch_btn = gr.Button("Generate Embeddings", variant="primary")
        batch_btn.click(fn=get_embeddings_batch, inputs=batch_input, outputs=batch_output)

    # Tab: compare two texts via calculate_similarity.
    with gr.Tab("πŸ“Š Similarity Check"):
        gr.Markdown("Hitung kemiripan semantik antara dua teks")
        with gr.Row():
            sim_text1 = gr.Textbox(label="Text 1", placeholder="Teks pertama...", lines=2)
            sim_text2 = gr.Textbox(label="Text 2", placeholder="Teks kedua...", lines=2)
        sim_output = gr.JSON(label="Similarity Result")
        sim_btn = gr.Button("Calculate Similarity", variant="primary")
        sim_btn.click(fn=calculate_similarity, inputs=[sim_text1, sim_text2], outputs=sim_output)

    # Tab: Supabase proxy operations (the db_* functions).
    with gr.Tab("πŸ’Ύ Database (Supabase)"):
        gr.Markdown("### Supabase Cache Operations")
        gr.Markdown("Proxy untuk akses Supabase (API key aman di server)")

        with gr.Row():
            db_check_btn = gr.Button("πŸ”Œ Check Connection", variant="secondary")
            db_check_output = gr.JSON(label="Connection Status")
        db_check_btn.click(fn=db_check_connection, outputs=db_check_output)

        gr.Markdown("---")

        gr.Markdown("#### Get All Cached Embeddings")
        db_all_btn = gr.Button("πŸ“₯ Get All Embeddings", variant="primary")
        db_all_output = gr.JSON(label="All Embeddings")
        db_all_btn.click(fn=db_get_all_embeddings, outputs=db_all_output)

        gr.Markdown("---")

        gr.Markdown("#### Get Single Embedding by NIM")
        with gr.Row():
            db_nim_input = gr.Textbox(label="NIM", placeholder="10121xxx")
            db_hash_input = gr.Textbox(label="Content Hash", placeholder="abc123...")
        db_get_btn = gr.Button("πŸ” Get Embedding", variant="primary")
        db_get_output = gr.JSON(label="Embedding Result")
        db_get_btn.click(fn=db_get_embedding, inputs=[db_nim_input, db_hash_input], outputs=db_get_output)

        gr.Markdown("---")

        gr.Markdown("#### Save Embedding")
        db_save_input = gr.Textbox(
            label="Embedding Data (JSON)",
            placeholder='{"nim": "123", "content_hash": "abc", "embedding_combined": [...], ...}',
            lines=4
        )
        db_save_btn = gr.Button("πŸ’Ύ Save Embedding", variant="primary")
        db_save_output = gr.JSON(label="Save Result")
        db_save_btn.click(fn=db_save_embedding, inputs=db_save_input, outputs=db_save_output)

    # Developer notes, collapsed by default (open=False).
    # NOTE(review): the example below uses the /gradio_api/call/ URL prefix;
    # confirm it matches the Gradio version that is actually deployed.
    with gr.Accordion("πŸ“‘ API Usage (untuk Developer)", open=False):
        gr.Markdown("""
        ### Endpoints

        #### Embedding
        - `get_embedding` - Single text embedding
        - `get_embeddings_batch` - Batch text embeddings
        - `calculate_similarity` - Compare two texts

        #### Database (Supabase Proxy)
        - `db_check_connection` - Test Supabase connection
        - `db_get_all_embeddings` - Get all cached embeddings
        - `db_get_embedding` - Get embedding by NIM + hash
        - `db_save_embedding` - Save embedding to cache

        ### Example API Call
        ```javascript
        // Get all cached embeddings
        const response = await fetch("YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ data: [] })
        });
        const result = await response.json();
        const eventId = result.event_id;

        // Get result
        const dataResponse = await fetch(`YOUR_SPACE_URL/gradio_api/call/db_get_all_embeddings/${eventId}`);
        ```
        """)

    gr.Markdown("---")
    gr.Markdown("*Dibuat untuk Monitoring Proposal Skripsi KK E - UNIKOM*")

# Launch with the API enabled.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ sentence-transformers>=2.2.0
3
+ torch
4
+ numpy
5
+ httpx>=0.24.0