Commit 985e058

fix: first pass of metadata issue fix
1 parent ebaa086

1 file changed: scripts/aggregate_demix_by_week.py (+62 −26)
@@ -8,7 +8,11 @@
     'Midwest': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'],
     'South': ['Delaware', 'Maryland', 'Florida', 'Georgia', 'North Carolina', 'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
     'West': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
-}
+}
+CENSUS_REGIONS = {region: [state.lower() for state in states] for region, states in CENSUS_REGIONS.items()}
+
+def sum_unique(series):
+    return series.unique().sum()
 
 # Create state to region mapping for easier reference
 STATE_TO_REGION = {}
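
The added comprehension lowercases every state name so later lookups can be case-insensitive. A minimal sketch of the pattern, with a trimmed region map and a hypothetical inversion loop standing in for the `STATE_TO_REGION` construction that follows (not shown in this hunk):

```python
# A trimmed-down region map; the real script lists every state.
CENSUS_REGIONS = {
    'West': ['Arizona', 'California'],
    'South': ['Texas', 'Florida'],
}

# Same lowercasing comprehension the commit adds
CENSUS_REGIONS = {region: [state.lower() for state in states]
                  for region, states in CENSUS_REGIONS.items()}

# Hypothetical inversion loop matching the "state to region mapping" comment
STATE_TO_REGION = {}
for region, states in CENSUS_REGIONS.items():
    for state in states:
        STATE_TO_REGION[state] = region

print(STATE_TO_REGION['california'])  # -> 'West'
```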
@@ -44,6 +48,48 @@
 df_agg['collection_date'] = pd.to_datetime(df_agg['collection_date'])
 df_agg['epiweek'] = df_agg['collection_date'].apply(lambda x: Week.fromdate(x))
 
+# Calculate population, sample count, and site count for each region/week
+region_stats = df_agg.groupby(['geo_loc_region', 'epiweek']).agg({
+    'ww_population': 'sum',  # Sum the populations within each state
+    'sra_accession': 'nunique',
+    'collection_site_id': 'nunique'
+}).reset_index()
+
+# Add census region to region_stats
+region_stats['census_region'] = region_stats['geo_loc_region'].map(STATE_TO_REGION)
+
+# Print value counts for states where census_region is null
+print('census_region null', region_stats[region_stats['census_region'].isna()]['geo_loc_region'].value_counts())
+
+# For census regions, aggregate the already-aggregated state data to avoid double counting
+census_stats = region_stats.groupby(['census_region', 'epiweek']).agg({
+    'ww_population': 'sum',
+    'sra_accession': 'sum',  # Sum the unique counts at state level for region totals
+    'collection_site_id': 'sum'
+}).reset_index()
+
+# For national stats, use the census region data
+nation_stats = census_stats.groupby(['epiweek']).agg({
+    'ww_population': 'sum',
+    'sra_accession': 'sum',
+    'collection_site_id': 'sum'
+}).reset_index()
+nation_stats['geo_loc_region'] = 'USA'
+
+# Create dictionaries from the corrected data
+population_dict = {f"{row['geo_loc_region']}_{row['epiweek']}": row['ww_population'] for _, row in region_stats.iterrows()}
+population_dict.update({f"{row['census_region']}_{row['epiweek']}": row['ww_population'] for _, row in census_stats.iterrows()})
+population_dict.update({f"USA_{row['epiweek']}": row['ww_population'] for _, row in nation_stats.iterrows()})
+
+num_samples_dict = {f"{row['geo_loc_region']}_{row['epiweek']}": row['sra_accession'] for _, row in region_stats.iterrows()}
+num_samples_dict.update({f"{row['census_region']}_{row['epiweek']}": row['sra_accession'] for _, row in census_stats.iterrows()})
+num_samples_dict.update({f"USA_{row['epiweek']}": row['sra_accession'] for _, row in nation_stats.iterrows()})
+
+num_sites_dict = {f"{row['geo_loc_region']}_{row['epiweek']}": row['collection_site_id'] for _, row in region_stats.iterrows()}
+num_sites_dict.update({f"{row['census_region']}_{row['epiweek']}": row['collection_site_id'] for _, row in census_stats.iterrows()})
+num_sites_dict.update({f"USA_{row['epiweek']}": row['collection_site_id'] for _, row in nation_stats.iterrows()})
+
+
 # First aggregate by state
 total_lineage_prevalence_state = df_agg.groupby(['epiweek', 'geo_loc_region']).agg({
     'pop_weighted_prevalence': 'sum',
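
The core of the fix: sample and site counts are deduplicated once per state/week, then the per-state unique counts are summed up to regions so repeated samples aren't recounted. A toy illustration of that nunique-then-sum pattern (column names match the script; rows invented):

```python
import pandas as pd

# Invented rows: SRR1 appears twice (two lineages from one sample), so a
# raw row count would tally it twice.
df = pd.DataFrame({
    'geo_loc_region':     ['California', 'California', 'Arizona'],
    'epiweek':            ['202423',     '202423',     '202423'],
    'sra_accession':      ['SRR1',       'SRR1',       'SRR2'],
    'collection_site_id': ['site_a',     'site_a',     'site_b'],
})

# State/week level: nunique deduplicates repeated samples within a state
region_stats = df.groupby(['geo_loc_region', 'epiweek']).agg({
    'sra_accession': 'nunique',
    'collection_site_id': 'nunique',
}).reset_index()

# Region level: summing the per-state unique counts gives 2 samples;
# len(df) == 3 would have double counted SRR1
print(region_stats['sra_accession'].sum())  # -> 2
```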
@@ -68,14 +114,7 @@
 # Aggregate by state
 df_agg_weekly = df_agg.groupby(['epiweek', 'geo_loc_region', 'name']).agg({
     'pop_weighted_prevalence': 'sum',
-    'collection_site_id': 'nunique',
-    'sra_accession': 'nunique',
-    'ww_population': 'mean',
-}).reset_index().rename(columns={
-    'collection_site_id': 'num_sites',
-    'sra_accession': 'num_samples',
-    'ww_population': 'total_population'
-})
+}).reset_index()
 
 df_agg_weekly['id'] = df_agg_weekly['epiweek'].astype(str) + '_' + df_agg_weekly['geo_loc_region']
 df_agg_weekly['total_lineage_prevalence'] = df_agg_weekly['id'].map(total_prev_state_dict)
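
The counting columns leave this per-lineage groupby because grouping by `name` meant each lineage only saw the samples it was detected in; the corrected counts now come from the region/week dicts instead. A small sketch of the difference (invented rows):

```python
import pandas as pd

# Invented rows: lineages A and B both occur in sample SRR1; B also in SRR2
df = pd.DataFrame({
    'epiweek':        ['202423'] * 3,
    'geo_loc_region': ['California'] * 3,
    'name':           ['A', 'B', 'B'],
    'sra_accession':  ['SRR1', 'SRR1', 'SRR2'],
})

# Old behaviour: counting inside the per-lineage groupby credits lineage A
# with only the samples it appeared in
per_lineage = df.groupby(['epiweek', 'geo_loc_region', 'name'])['sra_accession'].nunique()
print(per_lineage['202423', 'California', 'A'])  # -> 1

# Fixed behaviour: one sample count per state/week, shared by all lineages
per_region = df.groupby(['epiweek', 'geo_loc_region'])['sra_accession'].nunique()
print(per_region['202423', 'California'])  # -> 2
```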
@@ -90,14 +129,7 @@
 # Aggregate by epiweek and lineage
 df_region = df_region_data.groupby(['epiweek', 'name']).agg({
     'pop_weighted_prevalence': 'sum',
-    'collection_site_id': 'nunique',
-    'sra_accession': 'nunique',
-    'ww_population': 'mean',
-}).reset_index().rename(columns={
-    'collection_site_id': 'num_sites',
-    'sra_accession': 'num_samples',
-    'ww_population': 'total_population'
-})
+}).reset_index()
 
 # Calculate proper weighted mean prevalence for region
 df_region['geo_loc_region'] = region
@@ -110,31 +142,35 @@
 # Aggregate by nation (USA)
 df_nation = df_agg.groupby(['epiweek', 'name']).agg({
     'pop_weighted_prevalence': 'sum',
-    'collection_site_id': 'nunique',
-    'sra_accession': 'nunique',
-    'ww_population': 'mean',
-}).reset_index().rename(columns={
-    'collection_site_id': 'num_sites',
-    'sra_accession': 'num_samples',
-    'ww_population': 'total_population'
-})
+}).reset_index()
 
 # Calculate proper weighted mean prevalence for USA
 df_nation['geo_loc_region'] = 'USA'
 df_nation['id'] = df_nation['epiweek'].astype(str) + '_USA'
+df_nation['region_id'] = 'USA_' + df_nation['epiweek'].astype(str)
 df_nation['total_lineage_prevalence'] = df_nation['id'].map(total_prev_nation_dict)
 df_nation['mean_lineage_prevalence'] = df_nation['pop_weighted_prevalence'] / df_nation['total_lineage_prevalence']
+df_nation['total_population'] = df_nation['region_id'].map(population_dict)
+df_nation['num_samples'] = df_nation['region_id'].map(num_samples_dict)
+df_nation['num_sites'] = df_nation['region_id'].map(num_sites_dict)
 
 # Combine all census regions with state data and national data
 df_region_combined = pd.concat(df_agg_census)
 df_agg_weekly = pd.concat([df_agg_weekly, df_region_combined, df_nation])
-df_agg_weekly['total_population'] = df_agg_weekly.groupby(['epiweek', 'geo_loc_region'])['total_population'].transform('mean')  # Ensure total population is consistent across lineages in the same region
 
 df_agg_weekly['id'] = df_agg_weekly['epiweek'].astype(str) + '_' + df_agg_weekly['geo_loc_region'] + '_' + df_agg_weekly['name']
+df_agg_weekly['region_id'] = df_agg_weekly['geo_loc_region'] + '_' + df_agg_weekly['epiweek'].astype(str)
 df_agg_weekly['crumbs'] = df_agg_weekly['name'].map(crumbs)
 df_agg_weekly['week_start'] = df_agg_weekly['epiweek'].apply(lambda x: x.startdate()).astype(str)
 df_agg_weekly['week_end'] = df_agg_weekly['epiweek'].apply(lambda x: x.enddate()).astype(str)
 
+df_agg_weekly['total_population'] = df_agg_weekly['region_id'].map(population_dict)
+df_agg_weekly['num_samples'] = df_agg_weekly['region_id'].map(num_samples_dict)
+df_agg_weekly['num_sites'] = df_agg_weekly['region_id'].map(num_sites_dict)
+
+print('california', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'California') & (df_agg_weekly['epiweek'].astype(str) == '202423')]['total_population'].value_counts())
+print('west', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'West') & (df_agg_weekly['epiweek'].astype(str) == '202423')]['total_population'].value_counts())
+
 df_agg_weekly = df_agg_weekly[['id', 'epiweek', 'week_start', 'week_end', 'geo_loc_region', 'total_population', 'num_sites', 'num_samples', 'name', 'mean_lineage_prevalence', 'crumbs']]
 
 # Workaround to save to json
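
The new `region_id` columns turn those precomputed dicts into a join key, so every lineage row in the same region/week receives identical metadata, replacing the old `transform('mean')` smoothing. A minimal sketch of the key-and-map join (values invented):

```python
import pandas as pd

# Invented lookup, keyed the same way the commit builds its dicts:
# f"{geo_loc_region}_{epiweek}"
population_dict = {'California_202423': 1500000, 'USA_202423': 30000000}

df_agg_weekly = pd.DataFrame({
    'geo_loc_region': ['California', 'USA'],
    'epiweek':        ['202423', '202423'],  # already strings; the script casts Week to str
    'name':           ['A', 'B'],
})

# Build the key, then map the dict: both lineage rows in a region/week
# get the same total_population
df_agg_weekly['region_id'] = df_agg_weekly['geo_loc_region'] + '_' + df_agg_weekly['epiweek']
df_agg_weekly['total_population'] = df_agg_weekly['region_id'].map(population_dict)
print(df_agg_weekly[['region_id', 'total_population']])
```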
