Skip to content

Commit e0bafc9

Browse files
committed
fix: leave as mean for now
1 parent 985e058 commit e0bafc9

File tree

1 file changed

+9
-10
lines changed

1 file changed

+9
-10
lines changed

scripts/aggregate_demix_by_week.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,8 @@
99
'South': ['Delaware', 'Maryland', 'Florida', 'Georgia', 'North Carolina', 'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
1010
'West': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
1111
}
12-
CENSUS_REGIONS = {region: [state.lower() for state in states] for region, states in CENSUS_REGIONS.items()}
1312

14-
def sum_unique(series):
15-
return series.unique().sum()
13+
1614

1715
# Create state to region mapping for easier reference
1816
STATE_TO_REGION = {}
@@ -49,8 +47,11 @@ def sum_unique(series):
4947
df_agg['epiweek'] = df_agg['collection_date'].apply(lambda x: Week.fromdate(x))
5048

5149
# Calculate population, sample count, and site count for each region/week
50+
def sum_unique(series):
    """Sum the distinct values of *series*, counting each duplicate once."""
    distinct_values = series.unique()
    return distinct_values.sum()
52+
5253
region_stats = df_agg.groupby(['geo_loc_region', 'epiweek']).agg({
53-
'ww_population': 'sum', # Sum the populations within each state
54+
'ww_population': 'mean', # Average the populations within each state
5455
'sra_accession': 'nunique',
5556
'collection_site_id': 'nunique'
5657
}).reset_index()
@@ -59,18 +60,19 @@ def sum_unique(series):
5960
region_stats['census_region'] = region_stats['geo_loc_region'].map(STATE_TO_REGION)
6061

6162
# Print value counts for samples where census_region is null
62-
print('census_region null', df_agg[df_agg['census_region'].isna()]['geo_loc_region'].value_counts())
63+
print('census_region', region_stats['census_region'].value_counts())
64+
region_stats.to_csv('outputs/aggregate/region_stats.csv', index=False)
6365

6466
# For census regions, aggregate the already-aggregated state data to avoid double counting
6567
census_stats = region_stats.groupby(['census_region', 'epiweek']).agg({
66-
'ww_population': 'sum',
68+
'ww_population': 'mean',
6769
'sra_accession': 'sum', # Sum the unique counts at state level for region totals
6870
'collection_site_id': 'sum'
6971
}).reset_index()
7072

7173
# For national stats, use the census region data
7274
nation_stats = census_stats.groupby(['epiweek']).agg({
73-
'ww_population': 'sum',
75+
'ww_population': 'mean',
7476
'sra_accession': 'sum',
7577
'collection_site_id': 'sum'
7678
}).reset_index()
@@ -168,9 +170,6 @@ def sum_unique(series):
168170
df_agg_weekly['num_samples'] = df_agg_weekly['region_id'].map(num_samples_dict)
169171
df_agg_weekly['num_sites'] = df_agg_weekly['region_id'].map(num_sites_dict)
170172

171-
print('california', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'California') & (df_agg_weekly['epiweek'] == 202423)]['total_population'].value_counts())
172-
print('west', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'West') & (df_agg_weekly['epiweek'] == 202423)]['total_population'].value_counts())
173-
174173
df_agg_weekly = df_agg_weekly[['id', 'epiweek', 'week_start', 'week_end', 'geo_loc_region', 'total_population', 'num_sites', 'num_samples', 'name', 'mean_lineage_prevalence', 'crumbs']]
175174

176175
# Workaround to save to json

0 commit comments

Comments
 (0)