import os
from abc import ABC
from concurrent.futures import ThreadPoolExecutor, as_completed
+ from pathlib import Path
from typing import Iterator, List, Optional
+ from urllib.parse import urlparse, urlunparse

import polars as pl
+ import requests
from tqdm import tqdm

from ..data import Patient

logger = logging.getLogger(__name__)


+ def is_url(path: str) -> bool:
+     """Return True if the given path string is a URL (has both a scheme and a netloc)."""
+     result = urlparse(path)
+     # Both scheme and netloc must be present for a valid URL
+     return all([result.scheme, result.netloc])
+
+
+ def clean_path(path: str) -> str:
+     """Normalize a path string, handling both URLs and local filesystem paths."""
+     if is_url(path):
+         parsed = urlparse(path)
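+         # os.path.normpath collapses duplicate slashes and ".." segments in the
+         # URL's path component; scheme, host, and query are left unchanged.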
+         cleaned_path = os.path.normpath(parsed.path)
+         # Rebuild the full URL
+         return urlunparse(parsed._replace(path=cleaned_path))
+     else:
+         # It's a local path: resolve and normalize it
+         return str(Path(path).expanduser().resolve())
+
+
+ def path_exists(path: str) -> bool:
+     """
+     Check if a path exists.
+
+     If the path is a URL, send a HEAD request.
+     If the path is a local file, use Path.exists().
+     """
+     if is_url(path):
+         try:
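+             # A HEAD request avoids downloading the file; only a 200 response counts
+             # as existing, so redirects or auth-protected URLs are treated as missing.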
+             response = requests.head(path, timeout=5)
+             return response.status_code == 200
+         except requests.RequestException:
+             return False
+     else:
+         return Path(path).exists()
+
+
+ def scan_csv_gz_or_csv(path: str) -> pl.LazyFrame:
+     """
+     Scan a .csv.gz or .csv file and return a LazyFrame, falling back to the
+     other extension if the given one is not found.
+
+     Args:
+         path (str): URL or local path to a .csv or .csv.gz file.
+
+     Returns:
+         pl.LazyFrame: The LazyFrame for the CSV.gz or CSV file.
+     """
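+     # pl.scan_csv is lazy: it builds a query plan here and does not load the data yet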
+     if path_exists(path):
+         return pl.scan_csv(path, infer_schema=False)
+     # Try the alternative extension
+     if path.endswith(".csv.gz"):
+         alt_path = path[:-3]  # Remove .gz
+     elif path.endswith(".csv"):
+         alt_path = f"{path}.gz"  # Add .gz
+     else:
+         raise FileNotFoundError(f"Path does not have expected extension: {path}")
+     if path_exists(alt_path):
+         logger.info(f"Original path does not exist. Using alternative: {alt_path}")
+         return pl.scan_csv(alt_path, infer_schema=False)
+     raise FileNotFoundError(f"Neither path exists: {path} or {alt_path}")
+
+
class BaseDataset(ABC):
    """Abstract base class for all PyHealth datasets.
@@ -79,15 +143,13 @@ def collected_global_event_df(self) -> pl.DataFrame:
        if self.dev:
            # Limit the number of patients in dev mode
            logger.info("Dev mode enabled: limiting to 1000 patients")
-             limited_patients = (
-                 df.select(pl.col("patient_id"))
-                 .unique()
-                 .limit(1000)
-             )
+             limited_patients = df.select(pl.col("patient_id")).unique().limit(1000)
            df = df.join(limited_patients, on="patient_id", how="inner")

        self._collected_global_event_df = df.collect()
-         logger.info(f"Collected dataframe with shape: {self._collected_global_event_df.shape}")
+         logger.info(
+             f"Collected dataframe with shape: {self._collected_global_event_df.shape}"
+         )

        return self._collected_global_event_df
@@ -118,36 +180,42 @@ def load_table(self, table_name: str) -> pl.LazyFrame:
        table_cfg = self.config.tables[table_name]
        csv_path = f"{self.root}/{table_cfg.file_path}"
-         # TODO: check if it's zipped or not.
-
-         # TODO: make this work for remote files
-         # if not Path(csv_path).exists():
-         #     raise FileNotFoundError(f"CSV not found: {csv_path}")
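+         # clean_path handles both local filesystem paths and URLs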
+         csv_path = clean_path(csv_path)

        logger.info(f"Scanning table: {table_name} from {csv_path}")
-
-         df = pl.scan_csv(csv_path, infer_schema=False)
-
-         # TODO: this is an ad hoc fix for the MIMIC-III dataset
-         df = df.with_columns([pl.col(col).alias(col.lower()) for col in df.collect_schema().names()])
+         df = scan_csv_gz_or_csv(csv_path)
+
+         # Convert column names to lowercase before calling preprocess_func
+         col_names = df.collect_schema().names()
+         if any(col != col.lower() for col in col_names):
+             logger.warning("Some column names were converted to lowercase")
+             df = df.with_columns([pl.col(col).alias(col.lower()) for col in col_names])
+
+         # Check if there is a preprocessing function for this table
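+         # Subclasses may define a preprocess_<table_name> method; if present, it is
+         # applied to the LazyFrame before any joins are performed.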
+         preprocess_func = getattr(self, f"preprocess_{table_name}", None)
+         if preprocess_func is not None:
+             logger.info(
+                 f"Preprocessing table: {table_name} with {preprocess_func.__name__}"
+             )
+             df = preprocess_func(df)

        # Handle joins
        for join_cfg in table_cfg.join:
            other_csv_path = f"{self.root}/{join_cfg.file_path}"
-             # if not Path(other_csv_path).exists():
-             #     raise FileNotFoundError(
-             #         f"Join CSV not found: {other_csv_path}"
-             #     )
-
-             join_df = pl.scan_csv(other_csv_path, infer_schema=False)
-             join_df = join_df.with_columns([pl.col(col).alias(col.lower()) for col in join_df.collect_schema().names()])
+             other_csv_path = clean_path(other_csv_path)
+             logger.info(f"Joining with table: {other_csv_path}")
+             join_df = scan_csv_gz_or_csv(other_csv_path)
+             join_df = join_df.with_columns(
+                 [
+                     pl.col(col).alias(col.lower())
+                     for col in join_df.collect_schema().names()
+                 ]
+             )
            join_key = join_cfg.on
            columns = join_cfg.columns
            how = join_cfg.how

-             df = df.join(
-                 join_df.select([join_key] + columns), on=join_key, how=how
-             )
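+             # Select only the join key plus the configured columns from the joined table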
+             df = df.join(join_df.select([join_key] + columns), on=join_key, how=how)

        patient_id_col = table_cfg.patient_id
        timestamp_col = table_cfg.timestamp
@@ -158,10 +226,9 @@ def load_table(self, table_name: str) -> pl.LazyFrame:
        if timestamp_col:
            if isinstance(timestamp_col, list):
                # Concatenate all timestamp parts in order with no separator
-                 combined_timestamp = (
-                     pl.concat_str([pl.col(col) for col in timestamp_col])
-                     .str.strptime(pl.Datetime, format=timestamp_format, strict=True)
-                 )
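+                 # strict=True: strptime raises if any value fails to parse with timestamp_format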
+                 combined_timestamp = pl.concat_str(
+                     [pl.col(col) for col in timestamp_col]
+                 ).str.strptime(pl.Datetime, format=timestamp_format, strict=True)
                timestamp_expr = combined_timestamp
            else:
                # Single timestamp column
@@ -185,8 +252,7 @@ def load_table(self, table_name: str) -> pl.LazyFrame:

        # Flatten attribute columns with event_type prefix
        attribute_columns = [
-             pl.col(attr).alias(f"{table_name}/{attr}")
-             for attr in attribute_cols
+             pl.col(attr).alias(f"{table_name}/{attr}") for attr in attribute_cols
        ]

        event_frame = df.select(base_columns + attribute_columns)
@@ -225,9 +291,7 @@ def get_patient(self, patient_id: str) -> Patient:
        assert (
            patient_id in self.unique_patient_ids
        ), f"Patient {patient_id} not found in dataset"
-         df = self.collected_global_event_df.filter(
-             pl.col("patient_id") == patient_id
-         )
+         df = self.collected_global_event_df.filter(pl.col("patient_id") == patient_id)
        return Patient(patient_id=patient_id, data_source=df)

    def iter_patients(self, df: Optional[pl.LazyFrame] = None) -> Iterator[Patient]:
@@ -260,11 +324,9 @@ def default_task(self) -> Optional[BaseTask]:
            Optional[BaseTask]: The default task, if any.
        """
        return None
-
+
    def set_task(
-         self,
-         task: Optional[BaseTask] = None,
-         num_workers: Optional[int] = None
+         self, task: Optional[BaseTask] = None, num_workers: Optional[int] = None
    ) -> SampleDataset:
        """Processes the base dataset to generate the task-specific sample dataset.
@@ -283,7 +345,9 @@ def set_task(
            assert self.default_task is not None, "No default tasks found"
            task = self.default_task

-         logger.info(f"Setting task {task.task_name} for {self.dataset_name} base dataset...")
+         logger.info(
+             f"Setting task {task.task_name} for {self.dataset_name} base dataset..."
+         )

        filtered_global_event_df = task.pre_filter(self.collected_global_event_df)
@@ -298,7 +362,7 @@ def set_task(
        if num_workers == 1:
            for patient in tqdm(
                self.iter_patients(filtered_global_event_df),
-                 desc=f"Generating samples for {task.task_name}"
+                 desc=f"Generating samples for {task.task_name}",
            ):
                samples.extend(task(patient))
        else: