File size: 6,298 Bytes
e884643
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""
Data models for GraphLLM system following the manual specifications
"""
from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Any, Literal
from datetime import datetime
from enum import Enum
import uuid


# Enums
class ChunkType(str, Enum):
    """Types of chunks extracted from PDF"""
    # The `str` mixin makes each member a real string: a member compares
    # equal to its value (ChunkType.CODE == "code") and can be looked up by
    # value (ChunkType("code")).
    PARAGRAPH = "paragraph"
    CODE = "code"
    TABLE = "table"
    IMAGE = "image"
    # NOTE(review): presumably text recovered from inside an image — confirm.
    IMAGE_TEXT = "image_text"


class NodeType(str, Enum):
    """Types of graph nodes"""
    # String-valued enum: members compare equal to their lowercase value and
    # can be constructed from it, e.g. NodeType("person") is NodeType.PERSON.
    CONCEPT = "concept"
    PERSON = "person"
    METHOD = "method"
    TERM = "term"
    CLASS = "class"
    FUNCTION = "function"
    ENTITY = "entity"


class RelationType(str, Enum):
    """Canonical relation types for edges"""
    # Closed vocabulary of edge relations. Each value is the snake_case form
    # of the member name; because of the `str` mixin a member compares equal
    # to that string. Declaration order below is also the iteration order.
    IS_A = "is_a"
    PART_OF = "part_of"
    METHOD_OF = "method_of"
    CAUSES = "causes"
    USES = "uses"
    RELATED_TO = "related_to"
    DEFINED_AS = "defined_as"
    DEPENDS_ON = "depends_on"
    IMPLEMENTS = "implements"
    SIMILAR_TO = "similar_to"
    OBSERVES = "observes"
    MEASURES = "measures"
    PRODUCES = "produces"
    CONTAINS = "contains"
    AFFECTS = "affects"
    ENABLES = "enables"
    REQUIRES = "requires"
    INTERACTS_WITH = "interacts_with"
    ENRICHES = "enriches"
    ENHANCES = "enhances"
    SUPPORTS = "supports"
    DESCRIBES = "describes"
    EXPLAINS = "explains"
    REFERS_TO = "refers_to"
    ASSOCIATED_WITH = "associated_with"


# Core Data Models

class Chunk(BaseModel):
    """Individual chunk of text/content from PDF"""
    # --- identity / provenance -------------------------------------------
    # chunk_id is filled with a random UUID4 string when the caller omits it.
    chunk_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    pdf_id: str
    page_number: int
    # Pair of integer character offsets.
    # NOTE(review): presumably (start, end); base and inclusivity are not
    # defined in this module — confirm with the code that produces chunks.
    char_range: tuple[int, int]

    # --- payload -----------------------------------------------------------
    type: ChunkType
    text: str
    # Optional extras; both default to None.
    table_json: Optional[dict[str, Any]] = None
    image_id: Optional[str] = None

    # --- bookkeeping -------------------------------------------------------
    # A fresh dict is created per instance, so extras are never shared.
    metadata: dict[str, Any] = Field(default_factory=dict)
    # Naive UTC timestamp taken at instantiation.
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12; moving to
    # an aware timestamp would change serialized output — confirm first.
    created_at: datetime = Field(default_factory=datetime.utcnow)


class EmbeddingEntry(BaseModel):
    """Vector embedding for a chunk"""
    # Id of the chunk this vector belongs to (required).
    chunk_id: str
    # The vector as a flat list of floats; no length is enforced here.
    embedding: list[float]
    # Naive UTC timestamp captured when the entry is instantiated.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    # Free-form extras; a new dict is created for every instance.
    metadata: dict[str, Any] = Field(default_factory=dict)


class SupportingChunk(BaseModel):
    """Reference to a chunk supporting a node or edge"""
    # Required: id of the referenced chunk and a numeric score.
    # NOTE(review): the score's scale and direction are not defined in this
    # module — confirm with the code that produces these references.
    chunk_id: str
    score: float
    # Optional display context; both default to None when not provided.
    page_number: Optional[int] = None
    snippet: Optional[str] = None


class GraphNode(BaseModel):
    """Node in the knowledge graph"""
    # Auto-generated random UUID4 string unless supplied by the caller.
    node_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # Required display label and node category.
    label: str
    type: NodeType

    # Collections default to new, empty lists per instance.
    aliases: list[str] = Field(default_factory=list)
    supporting_chunks: list[SupportingChunk] = Field(default_factory=list)

    # Starts at 0.0; nothing in this module computes or bounds it.
    importance_score: float = 0.0

    # Free-form extras and a naive UTC creation timestamp.
    metadata: dict[str, Any] = Field(default_factory=dict)
    created_at: datetime = Field(default_factory=datetime.utcnow)


class GraphEdge(BaseModel):
    """Edge in the knowledge graph"""
    # pydantic v2 configuration. This replaces the nested `class Config`:
    # class-based config is deprecated in pydantic v2 and triggers a
    # deprecation warning when the model class is created.
    # populate_by_name=True keeps both spellings valid when building an edge:
    # the field names (from_node / to_node) and the aliases ("from" / "to").
    # FastAPI automatically serializes enums as their string values in JSON
    model_config = {"populate_by_name": True}

    # Auto-generated random UUID4 string unless supplied by the caller.
    edge_id: str = Field(default_factory=lambda: str(uuid.uuid4()))

    # `from` is a Python keyword and cannot be a field name, so the endpoints
    # live in from_node / to_node and are accepted under the aliases.
    # NOTE(review): presumably these hold GraphNode.node_id values — confirm.
    from_node: str = Field(alias="from")
    to_node: str = Field(alias="to")

    # Required relation (restricted to RelationType) and its confidence.
    # No range is enforced on confidence here.
    relation: RelationType
    confidence: float

    # Evidence list and free-form extras; each defaults to a new empty
    # container per instance.
    supporting_chunks: List[SupportingChunk] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Naive UTC timestamp taken at instantiation.
    created_at: datetime = Field(default_factory=datetime.utcnow)


class Triple(BaseModel):
    """Extracted triple from text"""
    # The three required components, all kept as free-form strings
    # (the predicate is not yet restricted to RelationType).
    subject: str
    predicate: str
    object: str
    # Defaults to 1.0 when the extractor does not supply a value.
    confidence: float = 1.0
    # Optional provenance and rationale; each defaults to None.
    source_chunk_id: Optional[str] = None
    page_number: Optional[int] = None
    justification: Optional[str] = None


class CanonicalTriple(BaseModel):
    """LLM-canonicalized triple"""
    # Every field is required (no defaults), and the relation must be a
    # RelationType member rather than a free-form string.
    subject_label: str
    object_label: str
    relation: RelationType
    confidence: float
    justification: str
    page_number: int


# API Request/Response Models

class UploadResponse(BaseModel):
    """Response from PDF upload"""
    # Required fields. `status` is a free-form string here; the set of
    # allowed values is not defined in this module.
    pdf_id: str
    filename: str
    status: str
    message: str
    # Optional counts; left as None when not supplied.
    num_pages: Optional[int] = None
    num_chunks: Optional[int] = None


class GraphResponse(BaseModel):
    """Response containing graph data"""
    # Both collections are required (they may be empty lists).
    nodes: list[GraphNode]
    edges: list[GraphEdge]
    # Free-form extras; a new empty dict per instance when omitted.
    metadata: dict[str, Any] = Field(default_factory=dict)


class SourceCitation(BaseModel):
    """Source citation with page number and snippet"""
    # Required: where the citation comes from and the quoted text.
    page_number: int
    snippet: str
    chunk_id: str
    # Optional numeric score; None when not supplied.
    score: Optional[float] = None


class NodeDetailResponse(BaseModel):
    """Response for node detail request"""
    # Identity of the node being described (all required).
    node_id: str
    label: str
    type: NodeType

    # Required summary text and the citations backing it.
    summary: str
    sources: list[SourceCitation]

    # Related nodes are untyped dicts; their key schema is not defined in
    # this module. Defaults to a new empty list per instance.
    related_nodes: list[dict[str, Any]] = Field(default_factory=list)
    # Optional full chunks; None when not included.
    raw_chunks: Optional[list[Chunk]] = None


class ChatMessage(BaseModel):
    """Chat message"""
    # Only these three role strings pass validation.
    role: Literal["user", "assistant", "system"]
    content: str
    # Optional citations attached to the message; None when absent.
    sources: Optional[list[SourceCitation]] = None
    # Naive UTC timestamp taken when the message object is created.
    timestamp: datetime = Field(default_factory=datetime.utcnow)


class ChatRequest(BaseModel):
    """Chat request"""
    # Required: the user's question and the PDF it is asked against.
    query: str
    pdf_id: str
    # Optional knobs with defaults: citations on, at most 5 sources.
    # No bounds are enforced on max_sources in this model.
    include_citations: bool = True
    max_sources: int = 5


class ChatResponse(BaseModel):
    """Chat response with answer and citations"""
    # Required answer text and its citations (the list may be empty).
    answer: str
    sources: list[SourceCitation]
    # Optional list of strings; None when not included.
    # NOTE(review): presumably the chunk texts used as context — confirm.
    context_chunks: Optional[list[str]] = None


class PDFMetadata(BaseModel):
    """Metadata for uploaded PDF"""
    # Auto-generated random UUID4 string unless supplied by the caller.
    pdf_id: str = Field(default_factory=lambda: str(uuid.uuid4()))

    # Required facts about the stored file.
    filename: str
    filepath: str
    num_pages: int
    file_size_bytes: int

    # Naive UTC timestamp taken when the record is instantiated.
    upload_timestamp: datetime = Field(default_factory=datetime.utcnow)

    # Free-form status string; only the initial value "pending" is defined
    # in this module.
    processing_status: str = "pending"

    # Counters start at zero.
    num_chunks: int = 0
    num_nodes: int = 0
    num_edges: int = 0

    # Free-form extras; a new empty dict per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)


class IngestionLog(BaseModel):
    """Log entry for ingestion process"""
    # Auto-generated random UUID4 string unless supplied by the caller.
    log_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # PDF this entry refers to (required).
    pdf_id: str
    # Naive UTC timestamp taken when the entry is instantiated.
    timestamp: datetime = Field(default_factory=datetime.utcnow)

    # Required free-form strings; allowed stage/status values are not
    # defined in this module.
    stage: str
    status: str
    message: str

    # Optional structured payload; None when there is nothing extra.
    details: Optional[dict[str, Any]] = None


class AdminStatus(BaseModel):
    """Admin status response"""
    # Aggregate counters; all required integers.
    total_pdfs: int
    total_chunks: int
    total_nodes: int
    total_edges: int
    vector_index_size: int
    # Required list of log entries (may be empty); ordering and length are
    # decided by whoever builds the response.
    recent_logs: list[IngestionLog]