Quintino Fernandes
committed on
Commit · a86dbdc
1 Parent(s): ba99a45

Similarity matrix fix, another entity fix, more print stuff and a bit of model fine tune

Files changed:
- database/query.py +2 -5
- database/query_processor.py +8 -2
- models/summarization.py +2 -2
database/query.py CHANGED

@@ -19,7 +19,7 @@ class DatabaseService:
         start_date: Optional[datetime] = None,
         end_date: Optional[datetime] = None,
         topic: Optional[str] = None,
-        entities: Optional[List[str]] = None,
+        entities: Optional[List[tuple[str,str]]] = None,
         limit: int = 10
     ) -> List[Dict[str, any]]:
         try:
@@ -31,9 +31,6 @@ class DatabaseService:
                 dbname=self.DB_NAME
             ) as conn:
                 with conn.cursor() as cursor:
-                    # Enable unaccent extension if not already enabled
-                    cursor.execute("CREATE EXTENSION IF NOT EXISTS unaccent;")
-
                     # Base query with date range and topic filters
                     base_query = sql.SQL('''
                         WITH filtered_articles AS (
@@ -124,7 +121,7 @@ class DatabaseService:

            # Fallback: Retry with fewer filters if no results
            if not articles:
-                print("No articles found with
+                print("No articles found with entities...")
                fallback_query = sql.SQL('''
                    SELECT
                        content,
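The new parameter type suggests extract_entities now returns (text, label) pairs rather than bare strings. A minimal sketch of what such an extractor could look like, assuming a spaCy-backed nlp_model (the actual implementation is not part of this diff, so the model name and library are assumptions):

# Hypothetical sketch: an extract_entities that yields (text, label) pairs,
# matching the new entities: Optional[List[tuple[str,str]]] signature.
# spaCy and "en_core_web_sm" are assumptions, not from this commit.
from typing import List

import spacy

_nlp = spacy.load("en_core_web_sm")  # assumed model; install via `python -m spacy download en_core_web_sm`

def extract_entities(query: str) -> List[tuple[str, str]]:
    """Return (entity text, entity label) pairs found in the query."""
    doc = _nlp(query)
    return [(ent.text, ent.label_) for ent in doc.ents]

# extract_entities("Protests in Lisbon in March")
# -> e.g. [("Lisbon", "GPE"), ("March", "DATE")]

Carrying the label alongside the text lets the SQL layer filter by entity type rather than matching raw strings only.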
database/query_processor.py CHANGED

@@ -30,6 +30,7 @@ class QueryProcessor:
        # Query processing
        query_embedding = self.embedding_model.encode(query).tolist()
        entities = self.nlp_model.extract_entities(query)
+        print(f"Extracted entities: {entities}")

        # Database search
        articles = await self._execute_semantic_search(
@@ -44,6 +45,7 @@ class QueryProcessor:
            return {"message": "No articles found", "articles": []}

        # Summary generation
+        print("Starting summary generation")
        summary_data = self._generate_summary(articles)
        return {
            "summary": summary_data["summary"],
@@ -113,15 +115,19 @@ class QueryProcessor:
                "summary": "No content available for summarization",
                "key_sentences": []
            }
-
+
+        print("Starting first summary generation")
        embeddings = self.embedding_model.encode(sentences)
-
+        print("Embeddings generated first summary")
+        similarity_matrix = self.embedding_model.similarity(embeddings, embeddings).numpy()
        centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)

        top_indices = np.argsort(-centrality_scores)[:10]
        key_sentences = [sentences[idx].strip() for idx in top_indices]
        combined_text = ' '.join(key_sentences)

+        print(f"First summary done with: {len(key_sentences)} sentences")
+
        return {
            "summary": self.summarization_model.summarize(combined_text),
            "key_sentences": key_sentences
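The "similarity matrix fix" in the commit message is the added line that actually builds the matrix the centrality step consumes. A self-contained sketch of this LexRank-style extractive step, assuming a sentence-transformers model; a plain column-sum stands in below for the repo's degree_centrality_scores helper, which is not shown in the diff:

# Minimal sketch of the extractive step the diff fixes: embed sentences,
# build a cosine-similarity matrix, rank sentences by degree centrality.
# Model name is an assumption; the column-sum replaces degree_centrality_scores.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model

def key_sentences(sentences: list[str], top_k: int = 10) -> list[str]:
    embeddings = model.encode(sentences)
    # model.similarity returns a torch.Tensor; .numpy() yields the ndarray
    # the centrality scoring expects (hence the .numpy() call in the diff).
    similarity_matrix = model.similarity(embeddings, embeddings).numpy()
    # Degree centrality: how strongly each sentence relates to all others.
    centrality_scores = similarity_matrix.sum(axis=0)
    top_indices = np.argsort(-centrality_scores)[:top_k]
    return [sentences[i].strip() for i in top_indices]

Ranking by centrality keeps the sentences most representative of the whole article set, which are then concatenated and passed to the abstractive summarizer.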
models/summarization.py CHANGED

@@ -19,9 +19,9 @@ class SummarizationModel:
            inputs,
            max_length=max_length,
            min_length=min_length,
-            num_beams=
+            num_beams=4,
            no_repeat_ngram_size=3,
-            early_stopping=
+            early_stopping=True,
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
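The "model fine tune" part of the commit pins generation to 4-beam search with early stopping. A sketch of the surrounding generate() call, assuming a BART-style seq2seq summarizer from transformers (the model choice and tokenizer setup are assumptions, not shown in the commit):

# Sketch of the generate() call the diff tunes; "facebook/bart-large-cnn"
# is an assumed checkpoint, not taken from this repository.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def summarize(text: str, max_length: int = 150, min_length: int = 40) -> str:
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        num_beams=4,             # explore 4 beams instead of greedy decoding
        no_repeat_ngram_size=3,  # block repeated trigrams in the output
        early_stopping=True,     # stop once every beam has emitted EOS
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

Beam search with no_repeat_ngram_size=3 trades a slower decode for fewer degenerate repetitions, and early_stopping=True ends the search as soon as all beams are finished rather than exhausting max_length.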