Spaces:

Prashasst
/

Medical_Lab_Test_Extraction_Pipeline

Sleeping

App Files Files Community

Prashasst committed on Mar 7

Commit

371da07

verified ·

1 Parent(s): 970417f

Create entity_recognition.py

Browse files

Files changed (1) hide show

entity_recognition.py +278 -0

entity_recognition.py ADDED Viewed

	@@ -0,0 +1,278 @@

+import json
+from config import google_api
+def process_text(extracted_text):
+    """Lab Test and metadata entity recognition using gemini flash"""
+    ''' Return type: JSON '''
+    print("Performing Named Entity Recognition...")
+    client = genai.Client(
+        api_key=google_api,
+    )
+    model = "gemini-2.0-flash"
+    contents = [
+        types.Content(
+            role="user",
+            parts=[
+                types.Part.from_text(text="""The following text is extracted from a medical lab report using OCR.
+There may be errors such as missing decimals, incorrect test names, and incorrect reference ranges.
+Please correct the errors and extract both metadata and structured lab test data.
+ALWAYS MAKE SURE THAT THE VALUE ALIGNS WITH THE REAL RANGE OF THE TEST
+AND CLEARLY IDENTIFY REDS WITH LOW AND HIGH
+Return the output in structured JSON format with all the information in lowercase to standardization.
+And follow the JSON format provided and don't add any additional details in meta data or lab report other than that are specified
+Extracted Text:
+Dr. Onkar Test Sanjeevan Hospital\\n\\nMBBS, MD | Reg No: T123 12/4, Paud Road, Kothrud, Pune - 411023\\nPh: 0202526245, 8983390126, Timing: 09:15 AM -\\n02:30 PM, 05:30 PM - 09:30 PM, APPOINTMENTS\\nONLY | Closed: Monday,Friday\\n\\n \\n\\nPatient UID: 87 Report No: 00018\\n\\nName: AMAR SHAHA (Male} Rey, Date: 09-Jul-20\\n\\nAge 40 years Sample Collected At Hospital Lab\\n\\nAddress: MG Road, PUNE Sample Type/Quantity: Blood\\n\\nRef. By Doctor . Sample Collection D/T: 09-Jul-20, 9.50 AM\\nCr Test Result D/T: 09-Jul-20, 4:53 PM\\n\\n \\n \\n\\nDr. Amit Deshmukh\\n\\n     \\n\\nHEMOGRAM\\n\\nINVESTIGATION RESULT UNIT REF, RANGE\\nHAEMOGLOBIN : 14 gms/dl 12.0 - 17.0\\nRBC COUNT E 44 millfeumm 4.1-5.1\\nHAEMOTOCRIT (PCV) E 30 % 32.0 - 47.0\\nMCV $ 78 fl 760 - 100.0\\nMCH H 3246 Py 260-320\\nMCHC | : 328 n% 315-3465 ,\\nROW ; 13.9 % 11.6-150\\nMPV ; 11.2 fn 68- 12.6\\nWBC COUNT : 4567 /eamm 4000 - 11000\\nDIFFERENTIAL COUNT\\nNEUTROPHILS |» : 56 %y 40-70\\nLYMPHOCYTES ; 20 % 20.0- 45.0\\nEOSINOPHILS . 4 « % 0-6\\nMONOCYTES : 5 %
+Expected JSON format:
+{
+    \"metadata\": {
+        \"patient_name\": \"Prasahsst Pawar\",
+        \"age\": \"20\",
+        \"gender\": \"Male\",
+        \"lab_name\": \"XYZ Diagnostics\",
+        \"report_date\": \"05-03-2025\"
+    },
+    \"lab_tests\": [
+        {
+            \"test_name\": \"hemoglobin\",
+            \"value\": \"14.2\",
+            \"unit\": \"g/dL\",
+            \"reference_range\": \"13.5 - 17.5 g/dL\"
+        },
+        {
+      \"test_name\": \"rbc count\",
+      \"value\": \"5.2\",
+      \"unit\": \"million/cu mm\",
+      \"reference_range\": \"4.1-5.1\"
+    },
+        {
+            \"test_name\": \"glucose\",
+            \"value\": \"65\",
+            \"unit\": \"mg/dL\",
+            \"reference_range\": \"70 - 110 mg/dL\"
+        }
+    ],
+\"reds\":{
+    \"low\":[\"glucose\"],
+    \"high\":[\"rbc count\"]
+}
+}"""),
+            ],
+        ),
+        types.Content(
+            role="model",
+            parts=[
+                types.Part.from_text(text="""{
+  \"lab_tests\": [
+    {
+      \"reference_range\": \"12.0 - 17.0\",
+      \"test_name\": \"haemoglobin\",
+      \"unit\": \"gms/dl\",
+      \"value\": \"14\"
+    },
+    {
+      \"reference_range\": \"4.1-5.1\",
+      \"test_name\": \"rbc count\",
+      \"unit\": \"mill/cu mm\",
+      \"value\": \"4.4\"
+    },
+    {
+      \"reference_range\": \"32.0 - 47.0\",
+      \"test_name\": \"haematocrit (pcv)\",
+      \"unit\": \"%\",
+      \"value\": \"30\"
+    },
+    {
+      \"reference_range\": \"76.0 - 100.0\",
+      \"test_name\": \"mcv\",
+      \"unit\": \"fl\",
+      \"value\": \"78\"
+    },
+    {
+      \"reference_range\": \"26.0-32.0\",
+      \"test_name\": \"mch\",
+      \"unit\": \"pg\",
+      \"value\": \"32.46\"
+    },
+    {
+      \"reference_range\": \"31.5-34.5\",
+      \"test_name\": \"mchc\",
+      \"unit\": \"%\",
+      \"value\": \"32.8\"
+    },
+    {
+      \"reference_range\": \"11.6-15.0\",
+      \"test_name\": \"rdw\",
+      \"unit\": \"%\",
+      \"value\": \"13.9\"
+    },
+    {
+      \"reference_range\": \"6.8- 12.6\",
+      \"test_name\": \"mpv\",
+      \"unit\": \"fn\",
+      \"value\": \"11.2\"
+    },
+    {
+      \"reference_range\": \"4000 - 11000\",
+      \"test_name\": \"wbc count\",
+      \"unit\": \"/cu mm\",
+      \"value\": \"4567\"
+    },
+    {
+      \"reference_range\": \"40-70\",
+      \"test_name\": \"neutrophils\",
+      \"unit\": \"%\",
+      \"value\": \"56\"
+    },
+    {
+      \"reference_range\": \"20.0- 45.0\",
+      \"test_name\": \"lymphocytes\",
+      \"unit\": \"%\",
+      \"value\": \"20\"
+    },
+    {
+      \"reference_range\": \"0-6\",
+      \"test_name\": \"eosinophils\",
+      \"unit\": \"%\",
+      \"value\": \"4\"
+    },
+    {
+      \"reference_range\": \"2-10\",
+      \"test_name\": \"monocytes\",
+      \"unit\": \"%\",
+      \"value\": \"5\"
+    }
+  ],
+  \"metadata\": {
+    \"age\": \"40\",
+    \"gender\": \"male\",
+    \"lab_name\": \"sanjeevan hospital\",
+    \"patient_name\": \"amar shaha\",
+    \"report_date\": \"09-jul-20\"
+  },
+  \"reds\": {
+    \"high\": [
+      \"mch\"
+    ],
+    \"low\": [
+      \"haematocrit (pcv)\"
+    ]
+  }
+}"""),
+            ],
+        ),
+        types.Content(
+            role="user",
+            parts=[
+                types.Part.from_text(text=extracted_text),
+            ],
+        ),
+    ]
+    generate_content_config = types.GenerateContentConfig(
+        temperature=1,
+        top_p=0.95,
+        top_k=40,
+        max_output_tokens=8192,
+        response_mime_type="application/json",
+        response_schema=genai.types.Schema(
+            type = genai.types.Type.OBJECT,
+            enum = [],
+            required = ["metadata", "lab_tests", "reds"],
+            properties = {
+                "metadata": genai.types.Schema(
+                    type = genai.types.Type.OBJECT,
+                    enum = [],
+                    required = ["patient_name", "age", "gender", "lab_name", "report_date"],
+                    properties = {
+                        "patient_name": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "age": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "gender": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "lab_name": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                        "report_date": genai.types.Schema(
+                            type = genai.types.Type.STRING,
+                        ),
+                    },
+                ),
+                "lab_tests": genai.types.Schema(
+                    type = genai.types.Type.ARRAY,
+                    items = genai.types.Schema(
+                        type = genai.types.Type.OBJECT,
+                        enum = [],
+                        required = ["test_name", "value", "unit", "reference_range"],
+                        properties = {
+                            "test_name": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                            "value": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                            "unit": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                            "reference_range": genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                        },
+                    ),
+                ),
+                "reds": genai.types.Schema(
+                    type = genai.types.Type.OBJECT,
+                    enum = [],
+                    required = ["low", "high"],
+                    properties = {
+                        "low": genai.types.Schema(
+                            type = genai.types.Type.ARRAY,
+                            items = genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                        ),
+                        "high": genai.types.Schema(
+                            type = genai.types.Type.ARRAY,
+                            items = genai.types.Schema(
+                                type = genai.types.Type.STRING,
+                            ),
+                        ),
+                    },
+                ),
+            },
+        ),
+        system_instruction=[
+            types.Part.from_text(text="""Always return the output as JSON only"""),
+        ],
+    )
+    # for chunk in client.models.generate_content_stream(
+    #     model=model,
+    #     contents=contents,
+    #     config=generate_content_config,
+    # ):
+    #     print(chunk.text, end="")
+    try:
+        response = client.models.generate_content(
+            model=model, contents=contents, config=generate_content_config
+        )
+        json_response = response.text  # Ensure response is JSON formatted
+        parsed_json = json.loads(json_response)  # Convert JSON string to Python dictionary
+        return parsed_json
+    except json.JSONDecodeError:
+        print("Error: Invalid JSON response from the model.")
+        return None