    'South': ['Delaware', 'Maryland', 'Florida', 'Georgia', 'North Carolina', 'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
    'West': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
}
- CENSUS_REGIONS = {region: [state.lower() for state in states] for region, states in CENSUS_REGIONS.items()}
-
- def sum_unique(series):
-     return series.unique().sum()
+

# Create state to region mapping for easier reference
STATE_TO_REGION = {}
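The loop that fills this mapping sits in the elided stretch of the file between the two hunks; presumably it inverts CENSUS_REGIONS, roughly as in this sketch (not the file's actual code):

    for region, states in CENSUS_REGIONS.items():
        for state in states:
            STATE_TO_REGION[state] = region   # e.g. 'California' -> 'West'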
@@ -49,8 +47,11 @@ def sum_unique(series):
df_agg['epiweek'] = df_agg['collection_date'].apply(lambda x: Week.fromdate(x))
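Assuming `Week` here is `epiweeks.Week` (the import is outside this hunk), `fromdate` maps a collection date to its CDC/MMWR epidemiological week. A minimal sketch of the behavior being relied on, using an invented date:

    from datetime import date
    from epiweeks import Week

    week = Week.fromdate(date(2024, 6, 5))   # early June 2024 falls in epiweek 23
    print(week.cdcformat())                  # '202423', the YYYYWW form used below
    print(week.startdate(), week.enddate())  # Sunday/Saturday bounds of the week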
# Calculate population, sample count, and site count for each region/week
+ def sum_unique(series):
+     return series.unique().sum()
+
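The relocated `sum_unique` helper sums each distinct value in a column once, which matters when the same per-site population figure repeats across many sample rows; the caveat is that two different sites reporting identical populations would also collapse to one. A small illustration with invented numbers:

    import pandas as pd

    # Three samples from a 100-person site, two from a 50-person site
    s = pd.Series([100, 100, 100, 50, 50])
    print(s.sum())           # 400: a plain sum recounts the repeated populations
    print(s.unique().sum())  # 150: each distinct figure counted once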
region_stats = df_agg.groupby(['geo_loc_region', 'epiweek']).agg({
-     'ww_population': 'sum',  # Sum the populations within each state
+     'ww_population': 'mean',  # Average the populations reported within each state
    'sra_accession': 'nunique',
    'collection_site_id': 'nunique'
}).reset_index()
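The switch from `sum` to `mean` appears to target double counting: several rows from the same site and week carry the same `ww_population`, so summing across rows inflates the state total (`sum_unique` above is a third option that counts each distinct figure once). A sketch of the effect on invented data:

    import pandas as pd

    df = pd.DataFrame({
        'geo_loc_region': ['California'] * 3,
        'epiweek': [202423] * 3,
        'ww_population': [100, 100, 100],  # one site, repeated across 3 samples
    })
    g = df.groupby(['geo_loc_region', 'epiweek'])['ww_population']
    print(g.sum().iloc[0])   # 300, inflated by the repeated rows
    print(g.mean().iloc[0])  # 100.0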
@@ -59,18 +60,19 @@ def sum_unique(series):
region_stats['census_region'] = region_stats['geo_loc_region'].map(STATE_TO_REGION)
- # Print value counts for samples where census_region is null
- print('census_region null', df_agg[df_agg['census_region'].isna()]['geo_loc_region'].value_counts())
+ # Print value counts of census_region assignments
+ print('census_region', region_stats['census_region'].value_counts())
+ region_stats.to_csv('outputs/aggregate/region_stats.csv', index=False)

# For census regions, aggregate the already-aggregated state data to avoid double counting
census_stats = region_stats.groupby(['census_region', 'epiweek']).agg({
-     'ww_population': 'sum',
+     'ww_population': 'mean',
    'sra_accession': 'sum',  # Sum the unique counts at state level for region totals
    'collection_site_id': 'sum'
}).reset_index()

# For national stats, use the census region data
nation_stats = census_stats.groupby(['epiweek']).agg({
-     'ww_population': 'sum',
+     'ww_population': 'mean',
    'sra_accession': 'sum',
    'collection_site_id': 'sum'
}).reset_index()
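Taken together, the three groupbys roll up state → census region → nation. Two hedged caveats on the pattern: `mean` of `ww_population` at the upper levels averages state-level means without weighting by state size, and summing per-state `nunique` counts assumes no accession or site ID appears under two states. A compact sketch of the region-level step on invented state rows:

    import pandas as pd

    state_rows = pd.DataFrame({
        'census_region': ['West', 'West'],
        'epiweek': [202423, 202423],
        'ww_population': [100.0, 300.0],   # state-level means
        'sra_accession': [5, 7],           # per-state unique sample counts
        'collection_site_id': [2, 3],
    })
    out = state_rows.groupby(['census_region', 'epiweek']).agg({
        'ww_population': 'mean',        # (100.0 + 300.0) / 2 = 200.0, unweighted
        'sra_accession': 'sum',         # 12 samples
        'collection_site_id': 'sum',    # 5 sites
    }).reset_index()
    print(out)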
@@ -168,9 +170,6 @@ def sum_unique(series):
df_agg_weekly['num_samples'] = df_agg_weekly['region_id'].map(num_samples_dict)
df_agg_weekly['num_sites'] = df_agg_weekly['region_id'].map(num_sites_dict)

- print('california', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'California') & (df_agg_weekly['epiweek'] == 202423)]['total_population'].value_counts())
- print('west', df_agg_weekly[(df_agg_weekly['geo_loc_region'] == 'West') & (df_agg_weekly['epiweek'] == 202423)]['total_population'].value_counts())
-
df_agg_weekly = df_agg_weekly[['id', 'epiweek', 'week_start', 'week_end', 'geo_loc_region', 'total_population', 'num_sites', 'num_samples', 'name', 'mean_lineage_prevalence', 'crumbs']]
# Workaround to save to JSON