File size: 1,443 Bytes
631eb6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> list:
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk begins with the last ``overlap`` characters of the previous
    one.  Splitting stops as soon as a chunk reaches the end of the text, so
    the final chunk always contributes at least one new character.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        A list of substrings of *text*, in order.  Text that is no longer
        than ``chunk_size`` (including the empty string) is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If *text* has to be split and ``chunk_size`` is not
            positive, or ``overlap`` is not smaller than ``chunk_size``.
    """
    text_length = len(text)

    # Short input needs no splitting; hand it back as the only chunk.
    if text_length <= chunk_size:
        return [text]

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # A non-positive stride would never advance through the text; without
    # this check the loop below could not terminate.
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window touches the end of the text, any further window
        # would only repeat characters that are already in this chunk.
        if end >= text_length:
            break

    return chunks

def chunked_summarize(text: str, summarize_func, max_chunk_size: int = 1500) -> str:
    """Summarize *text*, splitting it into overlapping chunks when it is long.

    Text that fits in ``max_chunk_size`` characters is passed straight to
    ``summarize_func``.  Longer text is split with ``chunk_text``, each chunk
    is summarized on its own, and the partial summaries are joined with
    single spaces.  If the joined result is still longer than
    ``max_chunk_size`` it is summarized once more.

    Args:
        text: The text to summarize.
        summarize_func: Callable taking one string and returning its summary
            as a string.
        max_chunk_size: Maximum number of characters per chunk handed to
            ``summarize_func`` during the per-chunk pass.

    Returns:
        The summary string.

    Raises:
        ValueError: If *text* has to be split and ``max_chunk_size`` is not
            positive.
    """
    if len(text) <= max_chunk_size:
        return summarize_func(text)

    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    # Keep the overlap at no more than half a chunk: a fixed 200-character
    # overlap with a chunk size of 200 or less would leave the chunker with
    # no forward progress.
    overlap = min(200, max_chunk_size // 2)
    text_chunks = chunk_text(text, chunk_size=max_chunk_size, overlap=overlap)
    total = len(text_chunks)
    print(f"Processing {total} chunks...")

    partial_summaries = []
    for index, chunk in enumerate(text_chunks, start=1):
        print(f"Summarizing chunk {index}/{total}...")
        partial_summaries.append(summarize_func(chunk))

    combined_summary_input = " ".join(partial_summaries)

    # Final summarization if combined text is still long.
    # NOTE(review): this pass hands summarize_func an input longer than
    # max_chunk_size; confirm the summarizer tolerates that.
    if len(combined_summary_input) > max_chunk_size:
        print("Final summarization of combined chunks...")
        return summarize_func(combined_summary_input)

    return combined_summary_input