From 415ff778f8ebddc15e43d5a6fdd04a8c6ff65408 Mon Sep 17 00:00:00 2001
From: Tanmayi Bondu <tbondu@Tanmayis-MBP.attlocal.net>
Date: Thu, 24 Jul 2025 23:18:09 -0500
Subject: [PATCH 1/2] Add length of stay task and dataset

---
 .../tasks/length_of_stay/fake_los_data.csv    |   6 +
 .../contrib/tasks/length_of_stay/fakedata.py  |  25 ++
 .../tasks/length_of_stay/lengthofstay.yaml    |  11 +
 .../length_of_stay/lengthofstay.yaml.save     |  17 +
 .../tasks/length_of_stay/lengthofstay1.yaml   |  17 +
 .../contrib/tasks/length_of_stay/task.py      | 339 ++++++++++++++++++
 6 files changed, 415 insertions(+)
 create mode 100644 pyhealth/pyhealth/contrib/tasks/length_of_stay/fake_los_data.csv
 create mode 100644 pyhealth/pyhealth/contrib/tasks/length_of_stay/fakedata.py
 create mode 100644 pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml
 create mode 100644 pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml.save
 create mode 100644 pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay1.yaml
 create mode 100644 pyhealth/pyhealth/contrib/tasks/length_of_stay/task.py

diff --git a/pyhealth/pyhealth/contrib/tasks/length_of_stay/fake_los_data.csv b/pyhealth/pyhealth/contrib/tasks/length_of_stay/fake_los_data.csv
new file mode 100644
index 00000000..12a107b7
--- /dev/null
+++ b/pyhealth/pyhealth/contrib/tasks/length_of_stay/fake_los_data.csv
@@ -0,0 +1,6 @@
+patient_id,admission_date,discharge_date
+1,2023-01-01,2023-01-05
+2,2023-02-10,2023-02-15
+3,2023-03-20,2023-03-22
+4,2023-04-01,2023-04-10
+5,2023-05-05,2023-05-07
diff --git a/pyhealth/pyhealth/contrib/tasks/length_of_stay/fakedata.py b/pyhealth/pyhealth/contrib/tasks/length_of_stay/fakedata.py
new file mode 100644
index 00000000..7f19543c
--- /dev/null
+++ b/pyhealth/pyhealth/contrib/tasks/length_of_stay/fakedata.py
@@ -0,0 +1,25 @@
+import csv
+from datetime import datetime, timedelta
+
+def create_fake_los_data(filename="fake_los_data.csv"):
+    """Write a small synthetic admissions CSV for the length-of-stay task.
+
+    The columns (patient_id, admission_date, discharge_date) are the ones
+    declared in lengthofstay.yaml and read by LOSDataset in task.py.
+    """
+    headers = ['patient_id', 'admission_date', 'discharge_date']
+    base_date = datetime(2024, 1, 1)
+    # (patient_id, admission offset in days, length of stay in days)
+    stays = [('1001', 0, 4), ('1002', 1, 2), ('1003', 2, 3)]
+    rows = [
+        [pid, (base_date + timedelta(days=start)).date().isoformat(),
+         (base_date + timedelta(days=start + los)).date().isoformat()]
+        for pid, start, los in stays
+    ]
+    with open(filename, mode='w', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerows([headers, *rows])
+    print(f"Created fake LOS data CSV file: {filename}")
+
+if __name__ == "__main__":
+    create_fake_los_data()
diff --git a/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml b/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml
new file mode 100644
index 00000000..206bbe18
--- /dev/null
+++ b/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml
@@ -0,0 +1,11 @@
+version: "1.4"
+
+tables:
+  admissions:  # table key; must match the `tables` list passed by LOSDataset
+    file_path: "fake_los_data.csv"  # joined onto the dataset root by LOSDataset
+    patient_id: "patient_id"
+    timestamp: "admission_date"
+    attributes:
+      - "patient_id"
+      - "admission_date"
+      - "discharge_date"
\ No newline at end of file
diff --git a/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml.save b/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml.save
new file mode 100644
index 00000000..f79e5431
--- /dev/null
+++ b/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay.yaml.save
@@ -0,0 +1,17 @@
+root: ./  # base path for your CSV files
+
+tables:
+  - fakedata.csv
+
+columns:
+  patient_id: patient_id
+  admission_time: admission_time
+  discharge_time: discharge_time
+  features:
+    - lab_results
+    - diagnoses
+  target:
+    length_of_stay: length_of_stay_days
+
+
+
diff --git a/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay1.yaml b/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay1.yaml
new file mode 100644
index 00000000..f79e5431
--- /dev/null
+++ b/pyhealth/pyhealth/contrib/tasks/length_of_stay/lengthofstay1.yaml
@@ -0,0 +1,17 @@
+root: ./  # base path for your CSV files
+
+tables:
+  - fakedata.csv
+
+columns:
+  patient_id: patient_id
+  admission_time: admission_time
+  discharge_time: discharge_time
+  features:
+    - lab_results
+    - diagnoses
+  target:
+    length_of_stay: length_of_stay_days
+
+
+
diff --git a/pyhealth/pyhealth/contrib/tasks/length_of_stay/task.py b/pyhealth/pyhealth/contrib/tasks/length_of_stay/task.py
new file mode 100644
index 00000000..ca667d56
--- /dev/null
+++ b/pyhealth/pyhealth/contrib/tasks/length_of_stay/task.py
@@ -0,0 +1,339 @@
+"""
+Author: Muni Bondu
+Description: This task predicts hospital length of stay (LOS) using synthetic admissions data.
+This implements a regression task using admission and discharge dates.
+
+Paper (if applicable): N/A """
+
+import os
+import pandas as pd
+from datetime import datetime
+from typing import List, Dict, Any
+
+from pyhealth.datasets import BaseDataset
+from pyhealth.tasks import BaseTask
+
+class LOSDataset(BaseDataset):
+    """Synthetic admissions dataset used for length-of-stay prediction.
+
+    Args:
+        root (str): Directory in which lengthofstay.yaml is looked up.
+        dev (bool): When True, keep at most 5 randomly sampled rows.
+
+    Attributes:
+        patients (List[Dict]): One record per successfully parsed CSV row.
+    """
+
+    def __init__(self, root: str = ".", dev: bool = False):
+        # The YAML config is looked up inside ``root``; the table name
+        # listed here must match the table key used in that YAML file.
+        super().__init__(
+            root=root,
+            tables=["admissions"],
+            config_path=os.path.join(root, "lengthofstay.yaml"),
+        )
+        self.dev = dev
+        self.patients = self.load_patient_data()
+
+    def load_patient_data(self) -> List[Dict]:
+        """Read the admissions CSV and build one record dict per row.
+
+        Rows that cannot be parsed are reported and skipped; if loading
+        fails as a whole, the error is printed and [] is returned.
+        """
+        try:
+            # The CSV location comes from the "admissions" table config.
+            relative_name = self.config.tables["admissions"].file_path
+            csv_path = os.path.join(self.root, relative_name)
+            if not os.path.exists(csv_path):
+                raise FileNotFoundError(f"CSV file not found: {csv_path}")
+
+            print(f"Loading data from: {csv_path}")
+            frame = pd.read_csv(csv_path)
+            # Tolerate stray spaces around the header names.
+            frame.columns = frame.columns.str.strip()
+
+            if self.dev:
+                sample_size = min(5, len(frame))
+                frame = frame.sample(n=sample_size, random_state=42)
+                print(f"Development mode: Using {len(frame)} samples")
+
+            records = []
+            for row_no, rec in frame.iterrows():
+                try:
+                    admitted = pd.to_datetime(rec['admission_date']).date()
+                    discharged = pd.to_datetime(rec['discharge_date']).date()
+
+                    # Whole days between the two dates, floored at zero.
+                    stay_days = (discharged - admitted).days
+                    if stay_days < 0:
+                        print(f"Warning: Negative LOS for patient {rec['patient_id']}, setting to 0")
+                        stay_days = 0
+                    entry = dict(
+                        patient_id=str(rec['patient_id']),
+                        admission_date=admitted,
+                        discharge_date=discharged,
+                        length_of_stay=stay_days,
+                        # Calendar features derived from the admission date
+                        admission_day_of_year=admitted.timetuple().tm_yday,
+                        admission_month=admitted.month,
+                        admission_weekday=admitted.weekday(),
+                    )
+                except (ValueError, KeyError) as e:
+                    print(f"Error processing row {row_no} for patient {rec.get('patient_id', 'unknown')}: {e}")
+                    continue
+                records.append(entry)
+            print(f"Successfully loaded {len(records)} patient records")
+            return records
+        except Exception as e:
+            print(f"Error loading data: {e}")
+            return []
+    def get_patient_data(self) -> List[Dict]:
+        """Expose the loaded patient records (same list as ``self.patients``)."""
+        return self.patients
+
+
+class LOSTask(BaseTask):
+    """Length-of-stay regression task built on top of an ``LOSDataset``."""
+
+    def __init__(self, dataset: LOSDataset):
+        super().__init__()
+        self.task_name = "length_of_stay"
+        self.dataset = dataset
+
+    def __call__(self, patient_id: str) -> Dict[str, Any]:
+        """Return the features, target and metadata of a single patient.
+
+        Args:
+            patient_id: ID of the patient, as stored in the dataset records.
+
+        Returns:
+            Dict with "patient_id", "features", "target" (length of stay
+            in days) and "metadata" (admission and discharge dates).
+
+        Raises:
+            ValueError: If no record carries ``patient_id``.
+        """
+        # First record with a matching ID wins, as in a plain linear scan.
+        record = next(
+            (p for p in self.dataset.patients if p['patient_id'] == patient_id),
+            None,
+        )
+        if record is None:
+            raise ValueError(f"Patient {patient_id} not found in dataset")
+
+        feature_keys = ("admission_day_of_year", "admission_month", "admission_weekday")
+        return {
+            "patient_id": record["patient_id"],
+            "features": {key: record[key] for key in feature_keys},
+            "target": record["length_of_stay"],
+            "metadata": {
+                "admission_date": record["admission_date"],
+                "discharge_date": record["discharge_date"],
+            },
+        }
+
+    def get_targets(self) -> List[int]:
+        """Return the length of stay (in days) of every patient, in dataset order."""
+        return [p['length_of_stay'] for p in self.dataset.patients]
+
+    def get_features(self) -> List[Dict[str, Any]]:
+        """Return one flat feature dict (including ``patient_id``) per patient."""
+        keys = ("patient_id", "admission_day_of_year", "admission_month", "admission_weekday")
+        return [{key: p[key] for key in keys} for p in self.dataset.patients]
+
+    def get_patient_splits(self, train_ratio: float = 0.7, val_ratio: float = 0.15):
+        """Split patient indices into train/validation/test lists.
+
+        The shuffle is deterministic (fixed seed 42) and uses a private
+        generator, so the global ``random`` state is left untouched.
+
+        Args:
+            train_ratio: Fraction of patients assigned to the training set.
+            val_ratio: Fraction of patients assigned to the validation set.
+
+        Returns:
+            Tuple ``(train_idx, val_idx, test_idx)`` of index lists into
+            ``self.dataset.patients``; the test set gets the remainder.
+        """
+        import random
+
+        # A local generator seeded with 42 yields the same permutation as
+        # ``random.seed(42); random.shuffle(...)`` without the global reseed.
+        order = list(range(len(self.dataset.patients)))
+        random.Random(42).shuffle(order)
+
+        n_train = int(len(order) * train_ratio)
+        n_val = int(len(order) * val_ratio)
+        # Whatever is left after train and validation becomes the test set.
+        val_end = n_train + n_val
+        return order[:n_train], order[n_train:val_end], order[val_end:]
+
+
+def create_sample_data():
+    """Write a 12-row sample admissions CSV to ``fake_los_data.csv``."""
+    # (admission_date, discharge_date) per patient; IDs are assigned 1..12.
+    stays = [
+        ("2023-01-01", "2023-01-05"), ("2023-02-10", "2023-02-15"),
+        ("2023-03-20", "2023-03-22"), ("2023-04-01", "2023-04-10"),
+        ("2023-05-05", "2023-05-07"), ("2023-06-15", "2023-06-18"),
+        ("2023-07-20", "2023-07-25"), ("2023-08-10", "2023-08-12"),
+        ("2023-09-05", "2023-09-08"), ("2023-10-12", "2023-10-20"),
+        ("2023-11-01", "2023-11-03"), ("2023-12-15", "2023-12-18"),
+    ]
+    lines = ["patient_id,admission_date,discharge_date"]
+    for number, (admitted, discharged) in enumerate(stays, start=1):
+        lines.append(f"{number},{admitted},{discharged}")
+
+    target = "fake_los_data.csv"
+    with open(target, "w") as out:
+        # Every line, including the last one, is newline-terminated.
+        out.write("".join(f"{line}\n" for line in lines))
+    print(f"Created sample data file: {target}")
+
+
+def run_simple_prediction():
+    """Fit a small random forest on the dataset and print its test errors.
+
+    The optional scikit-learn/numpy imports and all later steps share one
+    ``try`` block, so an ImportError or any other Exception is printed
+    instead of being raised to the caller.
+    """
+    try:
+        from sklearn.ensemble import RandomForestRegressor
+        from sklearn.metrics import mean_squared_error, mean_absolute_error
+        from sklearn.model_selection import train_test_split
+        import numpy as np
+
+        banner = "=" * 50
+        print("\n" + banner)
+        print("RUNNING SIMPLE PREDICTION MODEL")
+        print(banner)
+
+        # Full (non-dev) dataset wrapped in the task
+        los_task = LOSTask(LOSDataset(dev=False))
+        feature_rows = los_task.get_features()
+        los_values = los_task.get_targets()
+        if not (feature_rows and los_values):
+            print("No data available for prediction")
+            return
+
+        # One row per patient, one column per calendar feature
+        columns = ("admission_day_of_year", "admission_month", "admission_weekday")
+        X = np.array([[row[name] for name in columns] for row in feature_rows])
+        y = np.array(los_values)
+
+        # Hold out 30% of the samples for evaluation
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.3, random_state=42
+        )
+
+        forest = RandomForestRegressor(n_estimators=10, random_state=42)
+        forest.fit(X_train, y_train)
+        y_pred = forest.predict(X_test)
+
+        # Compute both metrics before anything is reported
+        mse = mean_squared_error(y_test, y_pred)
+        mae = mean_absolute_error(y_test, y_pred)
+
+        print(f"Training samples: {len(X_train)}")
+        print(f"Test samples: {len(X_test)}")
+        print(f"Mean Squared Error: {mse:.2f}")
+        print(f"Mean Absolute Error: {mae:.2f}")
+        print("Actual vs Predicted (first 5 test samples):")
+        for actual, predicted in zip(y_test[:5], y_pred[:5]):
+            print(f"  Actual: {actual}, Predicted: {predicted:.1f}")
+    except ImportError:
+        print("Scikit-learn not available. Skipping prediction demo.")
+    except Exception as e:
+        print(f"Error in prediction: {e}")
+def test_task_call():
+    """Smoke-test ``LOSTask.__call__`` on the first patient of a dev dataset."""
+    rule = "=" * 50
+    print("\n" + rule)
+    print("TESTING TASK __call__ METHOD")
+    print(rule)
+    try:
+        dev_dataset = LOSDataset(dev=True)
+        los_task = LOSTask(dev_dataset)
+        if not dev_dataset.patients:
+            print("No patients available for testing")
+            return
+
+        # Look the first loaded patient up by ID through the task
+        sample = los_task(dev_dataset.patients[0]['patient_id'])
+        print(f"Patient ID: {sample['patient_id']}")
+        print(f"Features: {sample['features']}")
+        print(f"Target (LOS): {sample['target']} days")
+        print(f"Metadata: {sample['metadata']}")
+    except Exception as e:
+        # Any failure is reported rather than raised
+        print(f"Error testing task call: {e}")
+def main():
+    """Run the end-to-end demo on the files in the current working directory."""
+    print("PyHealth Length of Stay Prediction Task")
+    print("=" * 40)
+
+    csv_name, yaml_name = "fake_los_data.csv", "lengthofstay.yaml"
+    if not os.path.exists(csv_name):
+        print("Creating sample data...")
+        create_sample_data()
+
+    # Without the YAML config the dataset cannot be built, so stop early.
+    if not os.path.exists(yaml_name):
+        print("Error: lengthofstay.yaml not found!")
+        print("Please create the YAML configuration file first.")
+        print("You can use the provided lengthofstay.yaml configuration.")
+        return
+
+    try:
+        print("Initializing dataset...")
+        dev_dataset = LOSDataset(dev=True)
+        if not dev_dataset.patients:
+            print("No patient data loaded. Please check your data file.")
+            return
+
+        print("Creating task...")
+        los_task = LOSTask(dev_dataset)
+
+        print("Getting targets...")
+        los_values = los_task.get_targets()
+        print(f"Length of Stay targets: {los_values}")
+
+        print("Getting features...")
+        feature_rows = los_task.get_features()
+        print("Features:")
+        # Show at most three feature rows, then how many were left out
+        preview, hidden = feature_rows[:3], len(feature_rows) - 3
+        for row in preview:
+            print(f"  {row}")
+        if hidden > 0:
+            print(f"  ... and {hidden} more")
+
+        print("\nDataset Summary:")
+        print("=" * 30)
+        print(f"Number of patients: {len(dev_dataset.patients)}")
+        if los_values:
+            mean_los = sum(los_values) / len(los_values)
+            print(f"Average LOS: {mean_los:.1f} days")
+            print(f"Min LOS: {min(los_values)} days")
+            print(f"Max LOS: {max(los_values)} days")
+
+        # Exercise LOSTask.__call__ and then the optional scikit-learn demo
+        test_task_call()
+        run_simple_prediction()
+
+        print("\n" + "=" * 50)
+        print("SUCCESS! Dataset and task created successfully.")
+        print("You can now use this dataset with PyHealth models.")
+    except Exception as e:
+        print(f"Error: {e}")
+        print("\nTroubleshooting:")
+        print("1. Ensure yaml file exists and has the correct structure")
+        print("2. Ensure csv file exists and is readable")
+        print("3. Check that PyHealth is properly installed")
+        print("4. Verify your Python environment has required dependencies")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 58a3f2e67d1c9bfdbedef6041d70e84088fc580e Mon Sep 17 00:00:00 2001
From: Tanmayi Bondu <tbondu@Tanmayis-MBP.attlocal.net>
Date: Thu, 24 Jul 2025 23:29:35 -0500
Subject: [PATCH 2/2] Add fake LOS data and Polars test script for length of
 stay prediction

---
 pyhealth/fake_los_data.csv |  6 ++++++
 pyhealth/test_polars.py    | 11 +++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 pyhealth/fake_los_data.csv
 create mode 100644 pyhealth/test_polars.py

diff --git a/pyhealth/fake_los_data.csv b/pyhealth/fake_los_data.csv
new file mode 100644
index 00000000..12a107b7
--- /dev/null
+++ b/pyhealth/fake_los_data.csv
@@ -0,0 +1,6 @@
+patient_id,admission_date,discharge_date
+1,2023-01-01,2023-01-05
+2,2023-02-10,2023-02-15
+3,2023-03-20,2023-03-22
+4,2023-04-01,2023-04-10
+5,2023-05-05,2023-05-07
diff --git a/pyhealth/test_polars.py b/pyhealth/test_polars.py
new file mode 100644
index 00000000..044cce34
--- /dev/null
+++ b/pyhealth/test_polars.py
@@ -0,0 +1,11 @@
+import polars as pl
+
+def main():
+    """Build a small two-column Polars DataFrame and print it."""
+    columns = {"a": [1, 2, 3], "b": [4, 5, 6]}
+    frame = pl.DataFrame(columns)
+    print(frame)
+
+
+if __name__ == "__main__":
+    main()