Update search_utils.py

search_utils.py  CHANGED  (+17 -17)
@@ -230,6 +230,7 @@ class SemanticSearch:
         self.metadata_mgr = MetadataManager()
         self.shard_sizes = []
         self.cumulative_offsets = None
+        self.total_vectors = 0
         self.logger = logging.getLogger("SemanticSearch")
         self.logger.info("Initializing SemanticSearch")

@@ -270,8 +271,8 @@ class SemanticSearch:
             self.logger.info(f"Loaded index {shard_path.name} with {size} vectors")
         except Exception as e:
             self.logger.error(f"Error loading index {shard_path}: {str(e)}")
-        total_vectors = sum(self.shard_sizes)
-        self.logger.info(f"Total loaded vectors: {total_vectors} across {len(self.index_shards)} shards")
+        self.total_vectors = sum(self.shard_sizes)
+        self.logger.info(f"Total loaded vectors: {self.total_vectors} across {len(self.index_shards)} shards")
         self.cumulative_offsets = np.cumsum([0] + self.shard_sizes)

     def _load_single_index(self, shard_path):
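For context on the offsets computed above: np.cumsum([0] + self.shard_sizes) gives the starting global index of every shard, which makes it cheap to translate a global FAISS result index back into a (shard, local index) pair. The helper below is only an illustrative sketch of that lookup, not code from this commit; locate_in_shard and the example shard sizes are assumptions.

import numpy as np

def locate_in_shard(global_idx, cumulative_offsets):
    """Map a global vector index to (shard_idx, local_idx) using cumulative offsets.

    cumulative_offsets is np.cumsum([0] + shard_sizes), so cumulative_offsets[i]
    is the first global index belonging to shard i. (Hypothetical helper.)
    """
    # side="right" places an index equal to a shard boundary into the shard that starts there
    shard_idx = int(np.searchsorted(cumulative_offsets, global_idx, side="right")) - 1
    local_idx = int(global_idx - cumulative_offsets[shard_idx])
    return shard_idx, local_idx

# Example: three shards holding 100, 250, and 50 vectors
shard_sizes = [100, 250, 50]
offsets = np.cumsum([0] + shard_sizes)   # array([  0, 100, 350, 400])
print(locate_in_shard(99, offsets))      # (0, 99)
print(locate_in_shard(100, offsets))     # (1, 0)
print(locate_in_shard(372, offsets))     # (2, 22)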
@@ -348,52 +349,51 @@ class SemanticSearch:
         except Exception as e:
             self.logger.error(f"Search failed in shard {shard_idx}: {str(e)}")
             return None
-
+
     def _process_results(self, distances, global_indices, top_k):
         """Process raw search results: retrieve metadata, calculate similarity, and deduplicate."""
         process_start = time.time()
         if global_indices.size == 0 or distances.size == 0:
             self.logger.warning("No search results to process")
             return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
+
         try:
             self.logger.info(f"Retrieving metadata for {len(global_indices)} indices")
             metadata_start = time.time()
             results = self.metadata_mgr.get_metadata(global_indices)
             self.logger.info(f"Metadata retrieved in {time.time() - metadata_start:.2f}s, got {len(results)} records")
-
+
             if len(results) == 0:
                 self.logger.warning("No metadata found for indices")
                 return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
+
             if len(results) != len(distances):
                 self.logger.warning(f"Mismatch between distances ({len(distances)}) and results ({len(results)})")
                 if len(results) < len(distances):
                     distances = distances[:len(results)]
                 else:
                     distances = np.pad(distances, (0, len(results) - len(distances)), 'constant', constant_values=1.0)
-
+
             self.logger.debug("Calculating similarity scores")
             results['similarity'] = 1 - (distances / 2)
-
-
-                              f"max={results['similarity'].max():.3f}, " +
-                              f"mean={results['similarity'].mean():.3f}")
+
+            # Ensure all required columns
             results['source'] = results["source"]
-
-            # Ensure we have all required columns
+
             required_columns = ["title", "summary", "authors", "source", "similarity"]
             for col in required_columns:
                 if col not in results.columns:
                     results[col] = None  # Fill missing columns with None
-
+
             pre_dedup = len(results)
-            results = results.drop_duplicates(subset=["title","authors", "source"]).sort_values("similarity", ascending=False).head(top_k)
-
+            results = results.drop_duplicates(subset=["title", "authors", "source"]).sort_values("similarity", ascending=False).head(top_k)
+
             post_dedup = len(results)
             if pre_dedup > post_dedup:
                 self.logger.info(f"Removed {pre_dedup - post_dedup} duplicate results")
+
             self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
-
-            return results.reset_index(drop=True)
+            return results[required_columns].reset_index(drop=True)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
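The similarity column above is computed as 1 - (distances / 2). That mapping is what you get if the embeddings are L2-normalized and the index reports squared Euclidean distances (as a FAISS IndexFlatL2 does): for unit vectors, ||a - b||^2 = 2 - 2*cos(a, b), so cos(a, b) = 1 - d/2. Whether this Space's index is built that way is an assumption; the snippet below is only a sanity check of the identity, not code from the repository.

import numpy as np

rng = np.random.default_rng(0)
a, b = rng.normal(size=384), rng.normal(size=384)
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)   # unit-normalize, as assumed for this formula

squared_l2 = np.sum((a - b) ** 2)   # what an L2 index would report as the "distance"
cosine = float(a @ b)

# 1 - d/2 recovers cosine similarity for unit-length vectors
print(np.isclose(1 - squared_l2 / 2, cosine))   # True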
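One behavioral change worth noting: the method now returns results[required_columns], so callers get exactly the five expected columns in a fixed order, after de-duplication on (title, authors, source) and truncation to top_k. Below is a toy pandas run-through of that tail end of _process_results; the data is made up purely for illustration and assumes nothing beyond what the diff shows.

import pandas as pd

required_columns = ["title", "summary", "authors", "source", "similarity"]

results = pd.DataFrame({
    "title":      ["Paper A", "Paper A", "Paper B"],
    "summary":    ["...",     "...",     "..."],
    "authors":    ["Doe",     "Doe",     "Roe"],
    "source":     ["arxiv",   "arxiv",   "arxiv"],
    "similarity": [0.91,      0.88,      0.75],
    "extra":      [1, 2, 3],   # anything not in required_columns is dropped on return
})

top_k = 5
deduped = (results.drop_duplicates(subset=["title", "authors", "source"])
                  .sort_values("similarity", ascending=False)
                  .head(top_k))

# The duplicate "Paper A" row with the lower score is removed,
# and only the five required columns survive in the returned frame.
print(deduped[required_columns].reset_index(drop=True))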