Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

App Files Community

Prashasst committed on Mar 17

Commit

6977359

verified ·

1 Parent(s): ffd74e7

Update file_processing.py

Browse files

Files changed (1) hide show

file_processing.py +20 -12

file_processing.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import fitz  # PyMuPDF
 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
 from abc import ABC, abstractmethod
@@ -10,7 +11,6 @@ from abc import ABC, abstractmethod
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 #Abstract Base Class (Interface)
 class FileProcessor(ABC):
     """Abstract class for file processing."""
@@ -23,21 +23,26 @@ class FileProcessor(ABC):
 #PDF Processor (Handles Text + OCR for Scanned PDFs)
 class PDFProcessor(FileProcessor):
-    def extract_text(self, pdf_path):
         text = ""
         doc = fitz.open(pdf_path)
         for page_num in range(len(doc)):
             page = doc.load_page(page_num)
             page_text = page.get_text("text").strip()  # Extract text from page
             # Extract Images for OCR
             images = page.get_images(full=True)
             ocr_text = ""
             if images:
                 img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
                 for img in img_pages:
-                    ocr_text += pytesseract.image_to_string(img).strip() + "\n"
             # Combine both text extraction methods
             combined_text = f"{page_text}\n{ocr_text}".strip()
@@ -48,10 +53,14 @@ class PDFProcessor(FileProcessor):
 #Image Processor (OCR)
 class ImageProcessor(FileProcessor):
-    def extract_text(self, image_path):
-        img = Image.open(image_path)
-        text = pytesseract.image_to_string(img).strip()
-        return text if text else "No text found in Image."
 #Factory to Select the Right Processor
@@ -72,11 +81,10 @@ class FileProcessorFactory:
 #Unified File Reading Function
-def read_file(file_path):
     processor = FileProcessorFactory.get_processor(file_path)
     if processor:
-        return processor.extract_text(file_path)
     else:
         return f"Unsupported file format: {file_path}"

 import os
 import fitz  # PyMuPDF
 import pytesseract
+import easyocr
 from pdf2image import convert_from_path
 from PIL import Image
 from abc import ABC, abstractmethod
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 #Abstract Base Class (Interface)
 class FileProcessor(ABC):
     """Abstract class for file processing."""
 #PDF Processor (Handles Text + OCR for Scanned PDFs)
 class PDFProcessor(FileProcessor):
+    def extract_text(self, pdf_path,reader):
         text = ""
         doc = fitz.open(pdf_path)
         for page_num in range(len(doc)):
             page = doc.load_page(page_num)
             page_text = page.get_text("text").strip()  # Extract text from page
+            print(f"page- {page_num} text : {page_text}") #DEBUG
             # Extract Images for OCR
             images = page.get_images(full=True)
             ocr_text = ""
             if images:
                 img_pages = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
                 for img in img_pages:
+                    img = np.array(img)  #easy ocr expects np image
+                    text_ocr = reader.readtext(img, detail=0)  # I have initialized reader globally
+                    if text_ocr:  # Ensure text_ocr is not empty
+                        ocr_text += " ".join(text_ocr).strip() + "\n"
+                    # ocr_text += pytesseract.image_to_string(img).strip() + "\n"
+            print(f"page- {page_num} orc : {ocr_text}") #DEBUG
             # Combine both text extraction methods
             combined_text = f"{page_text}\n{ocr_text}".strip()
 #Image Processor (OCR)
 class ImageProcessor(FileProcessor):
+    # Processor for standalone image files: recognition is delegated to the
+    # `reader` object supplied by the caller.
+    def extract_text(self, image_path,reader):
+        # image_path: passed straight through to reader.readtext().
+        # reader: object exposing readtext(); presumably an easyocr.Reader
+        #   created once by the caller -- TODO confirm at the call site.
+        # Returns the recognized fragments joined by single spaces, or the
+        # literal message "No text found in Image." when nothing is recognized.
+        print("Single Image")
+        # text = pytesseract.image_to_string(img).strip()
+        text = reader.readtext(image_path, detail=0)   # reader is already initialized globally by the caller; result is treated as a sequence of strings
+        # print(text)
+        return " ".join(text) if text else "No text found in Image."
+        # return text if text else "No text found in Image."
 #Factory to Select the Right Processor
 #Unified File Reading Function
+def read_file(file_path,reader):
+    # Extract text from file_path using whichever processor
+    # FileProcessorFactory.get_processor() selects for it.
+    # reader is forwarded unchanged to the processor's extract_text().
+    # Returns the processor's result, or an "Unsupported file format: ..."
+    # message string when the factory returns a falsy value.
     processor = FileProcessorFactory.get_processor(file_path)
     if processor:
+        return processor.extract_text(file_path,reader)
     else:
+        # No processor matched: reported as a string instead of raising.
         return f"Unsupported file format: {file_path}"