Spaces:
Sleeping
Sleeping
Update search_utils.py
Browse files - search_utils.py +4 -2
search_utils.py
CHANGED
|
@@ -38,17 +38,19 @@ class MetadataManager:
|
|
| 38 |
try:
|
| 39 |
# Load the parquet file
|
| 40 |
self.df = pd.read_parquet(self.metadata_path)
|
|
|
|
| 41 |
# Clean and format the data
|
| 42 |
self.df['source'] = self.df['source'].apply(
|
| 43 |
lambda x: [
|
| 44 |
url.strip()
|
| 45 |
for url in str(x).split(';')
|
| 46 |
-
if url.strip()
|
| 47 |
]
|
| 48 |
)
|
| 49 |
-
# Convert list of sources to a single string
|
| 50 |
self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
|
| 51 |
|
|
|
|
| 52 |
self.total_docs = len(self.df)
|
| 53 |
|
| 54 |
|
|
|
|
| 38 |
try:
|
| 39 |
# Load the parquet file
|
| 40 |
self.df = pd.read_parquet(self.metadata_path)
|
| 41 |
+
|
| 42 |
# Clean and format the data
|
| 43 |
self.df['source'] = self.df['source'].apply(
|
| 44 |
lambda x: [
|
| 45 |
url.strip()
|
| 46 |
for url in str(x).split(';')
|
| 47 |
+
if url.strip() and url.startswith('http')
|
| 48 |
]
|
| 49 |
)
|
| 50 |
+
# Convert list of sources to a single string, separated by ' | '
|
| 51 |
self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
|
| 52 |
|
| 53 |
+
|
| 54 |
self.total_docs = len(self.df)
|
| 55 |
|
| 56 |
|