Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

App Files Files Community

Prashasst committed on Mar 7

Commit

8a8a96a

verified ·

1 Parent(s): a6e539a

Create file_processor.py

Browse files

Refactored the code to make it extensible to more file types.

Files changed (1) hide show

file_processor.py +77 -0

file_processor.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import os
+import fitz  # PyMuPDF
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+from abc import ABC, abstractmethod
+#Abstract Base Class (Interface)
+class FileProcessor(ABC):
+    """Common interface shared by every file-type-specific text extractor."""
+    @abstractmethod
+    def extract_text(self, file_path):
+        """Return the text extracted from the file at ``file_path``.
+
+        Concrete subclasses must override this method; the base
+        implementation does nothing and returns ``None``.
+        """
+#PDF Processor (Handles Text + OCR for Scanned PDFs)
+class PDFProcessor(FileProcessor):
+    """Extract text from PDFs, combining the embedded text layer with OCR."""
+    def extract_text(self, pdf_path):
+        """Return the text of every page of the PDF at ``pdf_path``.
+
+        For each page the embedded text layer is read with PyMuPDF. When the
+        page also contains images, that page is rasterized with pdf2image and
+        passed through Tesseract, and the OCR output is appended after the
+        embedded text. Pages are separated by a blank line.
+
+        Returns the string "No text found in PDF." when no page yields text.
+        """
+        page_chunks = []
+        # Context manager guarantees the document handle is released, even
+        # if rasterization or OCR raises part-way through.
+        with fitz.open(pdf_path) as doc:
+            for page_index, page in enumerate(doc):
+                page_text = page.get_text("text").strip()
+                ocr_text = ""
+                # Only pay for rasterization + OCR on pages that embed images.
+                if page.get_images(full=True):
+                    # pdf2image page numbers are 1-based; PyMuPDF's are 0-based.
+                    img_pages = convert_from_path(
+                        pdf_path,
+                        first_page=page_index + 1,
+                        last_page=page_index + 1,
+                    )
+                    ocr_text = "".join(
+                        pytesseract.image_to_string(img).strip() + "\n"
+                        for img in img_pages
+                    )
+                page_chunks.append(f"{page_text}\n{ocr_text}".strip())
+        # Strip BEFORE testing for emptiness: page separators alone are
+        # truthy, which previously made a text-less PDF return "" instead of
+        # the fallback message.
+        text = "\n\n".join(page_chunks).strip()
+        return text if text else "No text found in PDF."
+#Image Processor (OCR)
+class ImageProcessor(FileProcessor):
+    """Extract text from raster images (PNG/JPEG) via Tesseract OCR."""
+    def extract_text(self, image_path):
+        """Return the OCR text of the image at ``image_path``.
+
+        Returns the string "No text found in Image." when OCR produces only
+        whitespace.
+        """
+        # Image.open holds the underlying file open; the context manager
+        # closes it as soon as OCR is done instead of leaking the handle.
+        with Image.open(image_path) as img:
+            text = pytesseract.image_to_string(img).strip()
+        return text if text else "No text found in Image."
+#Factory to Select the Right Processor
+class FileProcessorFactory:
+    """Look up the processor instance responsible for a given file extension."""
+    # Lower-case extension (including the leading dot) -> processor instance.
+    # The instances are created once, when the class body is executed.
+    _processors = {
+        ".pdf": PDFProcessor(),
+        ".png": ImageProcessor(),
+        ".jpg": ImageProcessor(),
+        ".jpeg": ImageProcessor(),
+    }
+    @classmethod
+    def get_processor(cls, file_path):
+        """Return the processor registered for ``file_path``'s extension.
+
+        The extension is matched case-insensitively. Returns ``None`` when
+        no processor is registered for it.
+        """
+        _, extension = os.path.splitext(file_path)
+        return cls._processors.get(extension.lower())
+#Unified File Reading Function
+def read_file(file_path):
+    """Extract text from ``file_path`` using the processor for its extension.
+
+    Returns the processor's extracted text, or an "Unsupported file format"
+    message string when no processor handles the file's extension.
+    """
+    handler = FileProcessorFactory.get_processor(file_path)
+    # Guard clause: unknown extensions are reported as a message string.
+    if not handler:
+        return f"Unsupported file format: {file_path}"
+    return handler.extract_text(file_path)