Spaces:
Sleeping
Sleeping
File size: 1,443 Bytes
631eb6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> list:
    """Split *text* into character chunks of at most *chunk_size* that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last *overlap* characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between neighbouring chunks.

    Returns:
        A list of substrings of *text*. Text no longer than *chunk_size* is
        returned unchanged as a single-element list.

    Raises:
        ValueError: If *text* needs splitting and *chunk_size* is not
            positive, or *overlap* is not smaller than *chunk_size* (the
            window could never advance).
    """
    text_length = len(text)

    # Short input needs no splitting. This check stays ahead of the argument
    # validation so calls with short text behave exactly as before.
    if text_length <= chunk_size:
        return [text]

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step would never move the window forward and the
        # loop below would never terminate.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, any further chunk would
        # contain only characters already present in this one.
        if end >= text_length:
            break
    return chunks
def chunked_summarize(
    text: str,
    summarize_func,
    max_chunk_size: int = 1500,
    overlap: int = 200,
) -> str:
    """Summarize *text*, splitting it into chunks when it is too long.

    Text no longer than *max_chunk_size* is passed straight to
    *summarize_func*. Longer text is split with ``chunk_text``, each chunk is
    summarized separately, and the partial summaries are joined with single
    spaces. If the joined result is still longer than *max_chunk_size*, it is
    summarized once more.

    Args:
        text: The text to summarize.
        summarize_func: Callable that takes a string and returns its summary
            as a string.
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between neighbouring chunks.
            When it does not fit inside a chunk it is reduced to a quarter of
            *max_chunk_size* so that chunking always makes progress.

    Returns:
        The summary produced by *summarize_func*, or the space-joined partial
        summaries when they already fit within *max_chunk_size*.

    Raises:
        ValueError: If *text* needs splitting and *max_chunk_size* is not
            positive.
    """
    if len(text) <= max_chunk_size:
        return summarize_func(text)

    if max_chunk_size <= 0:
        raise ValueError(
            f"max_chunk_size must be positive, got {max_chunk_size}"
        )

    # An overlap as large as the chunk itself would keep the chunking window
    # from advancing, so shrink it for small chunk sizes.
    if overlap >= max_chunk_size:
        overlap = max_chunk_size // 4

    text_chunks = chunk_text(text, chunk_size=max_chunk_size, overlap=overlap)
    total = len(text_chunks)
    print(f"Processing {total} chunks...")

    partial_summaries = []
    for index, chunk in enumerate(text_chunks, start=1):
        print(f"Summarizing chunk {index}/{total}...")
        partial_summaries.append(summarize_func(chunk))

    combined_summary_input = " ".join(partial_summaries)

    # Final summarization if the combined text is still long.
    # NOTE: the combined text goes to summarize_func in a single call, even
    # though it is longer than max_chunk_size at this point.
    if len(combined_summary_input) > max_chunk_size:
        print("Final summarization of combined chunks...")
        return summarize_func(combined_summary_input)
    return combined_summary_input