    'Midwest': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'],
    'South': ['Delaware', 'Maryland', 'Florida', 'Georgia', 'North Carolina', 'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
    'West': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
-}
+}
+CENSUS_REGIONS = {region: [state.lower() for state in states] for region, states in CENSUS_REGIONS.items()}
+
+def sum_unique(series):
+    return series.unique().sum()
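`sum_unique` sums each distinct value in a series once, which is what you want when the same site's population repeats across sample rows (and, below, across lineage rows). A minimal sketch of the behavior on toy numbers:

```python
import pandas as pd

pops = pd.Series([100_000, 100_000, 250_000])  # two sites, one sampled twice
pops.sum()           # 450_000, double counts the repeated site
pops.unique().sum()  # 350_000, what sum_unique returns
```

Caveat: two distinct sites with identical catchment populations would collapse into one value.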

# Create state to region mapping for easier reference
STATE_TO_REGION = {}
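The loop that populates `STATE_TO_REGION` falls outside this hunk; given the lowercasing of `CENSUS_REGIONS` above, it presumably inverts that dict, along these lines (my reconstruction, not part of the commit):

```python
# Presumed construction: invert CENSUS_REGIONS so each lowercase state name
# maps to its census region
for region, states in CENSUS_REGIONS.items():
    for state in states:
        STATE_TO_REGION[state] = region
```

If so, the keys are lowercase state names, which is why the mapping further down lowercases `geo_loc_region` first.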
df_agg['collection_date'] = pd.to_datetime(df_agg['collection_date'])
df_agg['epiweek'] = df_agg['collection_date'].apply(lambda x: Week.fromdate(x))

+# Calculate population, sample count, and site count for each state/week
+region_stats = df_agg.groupby(['geo_loc_region', 'epiweek']).agg({
+    'ww_population': sum_unique,  # count each site's population once, not once per sample row
+    'sra_accession': 'nunique',
+    'collection_site_id': 'nunique'
+}).reset_index()
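For reference, pandas named aggregation expresses the same thing without the separate `reset_index`; a behaviorally equivalent sketch that keeps the original column names so downstream code is unaffected:

```python
region_stats = df_agg.groupby(['geo_loc_region', 'epiweek'], as_index=False).agg(
    ww_population=('ww_population', sum_unique),
    sra_accession=('sra_accession', 'nunique'),
    collection_site_id=('collection_site_id', 'nunique'),
)
```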
+
+# Add census region to region_stats (CENSUS_REGIONS keys were lowercased above,
+# so normalize case before mapping)
+region_stats['census_region'] = region_stats['geo_loc_region'].str.lower().map(STATE_TO_REGION)
+
+# Print value counts for any states that failed to map to a census region
+print('census_region null', region_stats[region_stats['census_region'].isna()]['geo_loc_region'].value_counts())
+
+# For census regions, aggregate the already-aggregated state data to avoid double counting
+census_stats = region_stats.groupby(['census_region', 'epiweek']).agg({
+    'ww_population': 'sum',
+    'sra_accession': 'sum',  # Sum the unique counts at state level for region totals
+    'collection_site_id': 'sum'
+}).reset_index()
+
+# For national stats, use the census region data
+nation_stats = census_stats.groupby(['epiweek']).agg({
+    'ww_population': 'sum',
+    'sra_accession': 'sum',
+    'collection_site_id': 'sum'
+}).reset_index()
+nation_stats['geo_loc_region'] = 'USA'
+
+# Create lookup dictionaries keyed by "<region>_<epiweek>" from the corrected data
+population_dict = {f"{row['geo_loc_region']}_{row['epiweek']}": row['ww_population'] for _, row in region_stats.iterrows()}
+population_dict.update({f"{row['census_region']}_{row['epiweek']}": row['ww_population'] for _, row in census_stats.iterrows()})
+population_dict.update({f"USA_{row['epiweek']}": row['ww_population'] for _, row in nation_stats.iterrows()})
+
+num_samples_dict = {f"{row['geo_loc_region']}_{row['epiweek']}": row['sra_accession'] for _, row in region_stats.iterrows()}
+num_samples_dict.update({f"{row['census_region']}_{row['epiweek']}": row['sra_accession'] for _, row in census_stats.iterrows()})
+num_samples_dict.update({f"USA_{row['epiweek']}": row['sra_accession'] for _, row in nation_stats.iterrows()})
+
+num_sites_dict = {f"{row['geo_loc_region']}_{row['epiweek']}": row['collection_site_id'] for _, row in region_stats.iterrows()}
+num_sites_dict.update({f"{row['census_region']}_{row['epiweek']}": row['collection_site_id'] for _, row in census_stats.iterrows()})
+num_sites_dict.update({f"USA_{row['epiweek']}": row['collection_site_id'] for _, row in nation_stats.iterrows()})
+
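The iterrows comprehensions work; an equivalent vectorized form would zip a key column with the value column. A sketch under the same frames (the helper name `make_lookup` is mine):

```python
def make_lookup(df, region_col, value_col):
    # Build {"<region>_<epiweek>": value} without iterating rows
    keys = df[region_col].astype(str) + '_' + df['epiweek'].astype(str)
    return dict(zip(keys, df[value_col]))

population_dict = {
    **make_lookup(region_stats, 'geo_loc_region', 'ww_population'),
    **make_lookup(census_stats, 'census_region', 'ww_population'),
    **make_lookup(nation_stats, 'geo_loc_region', 'ww_population'),  # 'USA' rows
}
```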
# First aggregate by state
total_lineage_prevalence_state = df_agg.groupby(['epiweek', 'geo_loc_region']).agg({
    'pop_weighted_prevalence': 'sum',

# Aggregate by state
df_agg_weekly = df_agg.groupby(['epiweek', 'geo_loc_region', 'name']).agg({
    'pop_weighted_prevalence': 'sum',
-    'collection_site_id': 'nunique',
-    'sra_accession': 'nunique',
-    'ww_population': 'mean',
-}).reset_index().rename(columns={
-    'collection_site_id': 'num_sites',
-    'sra_accession': 'num_samples',
-    'ww_population': 'total_population'
-})
+}).reset_index()
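The dropped `nunique` columns counted sites and samples within each (week, state, lineage) slice, whereas the lookups built above count them once per week and state across all lineages. A toy frame shows the difference:

```python
import pandas as pd

toy = pd.DataFrame({
    'sra_accession': ['S1', 'S1', 'S2'],  # sample S1 reports two lineages
    'name': ['BA.2', 'XBB.1', 'BA.2'],
})
toy.groupby('name')['sra_accession'].nunique()  # per-lineage: BA.2 -> 2, XBB.1 -> 1
toy['sra_accession'].nunique()                  # per-week/state total: 2
```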

df_agg_weekly['id'] = df_agg_weekly['epiweek'].astype(str) + '_' + df_agg_weekly['geo_loc_region']
df_agg_weekly['total_lineage_prevalence'] = df_agg_weekly['id'].map(total_prev_state_dict)
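The next hunk runs inside a per-region loop whose header is outside this diff; judging from the names, it presumably filters `df_agg` to one census region at a time and collects the results in `df_agg_census`, roughly (my reconstruction):

```python
df_agg_census = []
for region, states in CENSUS_REGIONS.items():
    df_region_data = df_agg[df_agg['geo_loc_region'].str.lower().isin(states)]
    # ...the aggregation below runs here, then df_agg_census.append(df_region)
```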

# Aggregate by epiweek and lineage
df_region = df_region_data.groupby(['epiweek', 'name']).agg({
    'pop_weighted_prevalence': 'sum',
-    'collection_site_id': 'nunique',
-    'sra_accession': 'nunique',
-    'ww_population': 'mean',
-    }).reset_index().rename(columns={
-    'collection_site_id': 'num_sites',
-    'sra_accession': 'num_samples',
-    'ww_population': 'total_population'
-    })
+    }).reset_index()

# Calculate proper weighted mean prevalence for region
df_region['geo_loc_region'] = region

# Aggregate by nation (USA)
df_nation = df_agg.groupby(['epiweek', 'name']).agg({
    'pop_weighted_prevalence': 'sum',
-    'collection_site_id': 'nunique',
-    'sra_accession': 'nunique',
-    'ww_population': 'mean',
-}).reset_index().rename(columns={
-    'collection_site_id': 'num_sites',
-    'sra_accession': 'num_samples',
-    'ww_population': 'total_population'
-})
+}).reset_index()

# Calculate proper weighted mean prevalence for USA
df_nation['geo_loc_region'] = 'USA'
df_nation['id'] = df_nation['epiweek'].astype(str) + '_USA'
+df_nation['region_id'] = 'USA_' + df_nation['epiweek'].astype(str)
df_nation['total_lineage_prevalence'] = df_nation['id'].map(total_prev_nation_dict)
df_nation['mean_lineage_prevalence'] = df_nation['pop_weighted_prevalence'] / df_nation['total_lineage_prevalence']
+df_nation['total_population'] = df_nation['region_id'].map(population_dict)
+df_nation['num_samples'] = df_nation['region_id'].map(num_samples_dict)
+df_nation['num_sites'] = df_nation['region_id'].map(num_sites_dict)

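The division normalizes each lineage's population-weighted prevalence by the weekly total, so the fractions sum to 1 within a week. A worked toy example (made-up numbers):

```python
import pandas as pd

toy = pd.DataFrame({
    'epiweek': ['2024W23', '2024W23'],
    'name': ['BA.2', 'XBB.1'],
    'pop_weighted_prevalence': [30.0, 10.0],
})
total = toy.groupby('epiweek')['pop_weighted_prevalence'].transform('sum')
toy['mean_lineage_prevalence'] = toy['pop_weighted_prevalence'] / total
# -> 0.75 and 0.25; the two fractions sum to 1 for the week
```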

# Combine all census regions with state data and national data
df_region_combined = pd.concat(df_agg_census)
df_agg_weekly = pd.concat([df_agg_weekly, df_region_combined, df_nation])
-df_agg_weekly['total_population'] = df_agg_weekly.groupby(['epiweek', 'geo_loc_region'])['total_population'].transform('mean') # Ensure total population is consistent across lineages in the same region

df_agg_weekly['id'] = df_agg_weekly['epiweek'].astype(str) + '_' + df_agg_weekly['geo_loc_region'] + '_' + df_agg_weekly['name']
+df_agg_weekly['region_id'] = df_agg_weekly['geo_loc_region'] + '_' + df_agg_weekly['epiweek'].astype(str)
df_agg_weekly['crumbs'] = df_agg_weekly['name'].map(crumbs)
df_agg_weekly['week_start'] = df_agg_weekly['epiweek'].apply(lambda x: x.startdate()).astype(str)
df_agg_weekly['week_end'] = df_agg_weekly['epiweek'].apply(lambda x: x.enddate()).astype(str)

+df_agg_weekly['total_population'] = df_agg_weekly['region_id'].map(population_dict)
+df_agg_weekly['num_samples'] = df_agg_weekly['region_id'].map(num_samples_dict)
+df_agg_weekly['num_sites'] = df_agg_weekly['region_id'].map(num_sites_dict)
+
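Since every row's counts now arrive via these dictionary maps, a cheap guard would surface any region/week key that failed to join (a sketch of mine, not in the commit):

```python
# Hypothetical sanity check: every row should find its lookup entries
missing = df_agg_weekly['total_population'].isna()
if missing.any():
    print('unmatched region/week keys:', df_agg_weekly.loc[missing, 'region_id'].unique())
```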
+# Spot check one week: state vs. census-region totals (compare against a Week
+# object; the epiweek column holds Week objects, not ints)
+print('california', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'California') & (df_agg_weekly['epiweek'] == Week(2024, 23))]['total_population'].value_counts())
+print('west', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'West') & (df_agg_weekly['epiweek'] == Week(2024, 23))]['total_population'].value_counts())
+
df_agg_weekly = df_agg_weekly[['id', 'epiweek', 'week_start', 'week_end', 'geo_loc_region', 'total_population', 'num_sites', 'num_samples', 'name', 'mean_lineage_prevalence', 'crumbs']]

# Workaround to save to json