Skip to content

Commit 6f66b36

Browse files
committed
1 parent fbc1760 commit 6f66b36

File tree

10 files changed

+2681
-38
lines changed

10 files changed

+2681
-38
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""Make responsible_committee column JSON in votes table
2+
3+
Revision ID: 8d1995cb0bed
4+
Revises: 848ef24718dd
5+
Create Date: 2025-03-16 17:13:08.000602
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from alembic import op
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "8d1995cb0bed"
14+
down_revision = "848ef24718dd"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade() -> None:
    """Replace the scalar ``responsible_committee`` column with a JSON list column.

    The new ``responsible_committees`` column is created first, existing values
    are copied into it as one-element JSON arrays, and only then is the old
    column dropped, so the migration does not discard stored committees.
    """
    op.add_column("votes", sa.Column("responsible_committees", sa.JSON))

    # NOTE(review): assumes a SQLite database with the JSON functions available
    # and that the old column stores the plain committee code -- confirm before
    # running against a different backend or storage format.
    op.execute(
        "UPDATE votes "
        "SET responsible_committees = json_array(responsible_committee) "
        "WHERE responsible_committee IS NOT NULL"
    )

    op.drop_column("votes", "responsible_committee")
22+
23+
24+
def downgrade() -> None:
    """Restore the scalar ``responsible_committee`` column.

    The old column can hold only one committee, so only the first entry of each
    JSON list is carried back; any additional committees are lost.
    """
    op.add_column("votes", sa.Column("responsible_committee", sa.Unicode))

    # NOTE(review): assumes a SQLite database with the JSON functions available
    # and that the JSON column stores an array of committee codes -- confirm.
    # `json_extract` yields NULL for an empty array, which leaves the restored
    # column unset for votes without a responsible committee.
    op.execute(
        "UPDATE votes "
        "SET responsible_committee = json_extract(responsible_committees, '$[0]') "
        "WHERE responsible_committees IS NOT NULL"
    )

    op.drop_column("votes", "responsible_committees")

backend/howtheyvote/api/serializers.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -258,16 +258,16 @@ class BaseVoteDict(TypedDict):
258258
"""Concepts from the [EuroVoc](https://eur-lex.europa.eu/browse/eurovoc.html) thesaurus
259259
that are related to this vote"""
260260

261-
responsible_committee: CommitteeDict | None
261+
responsible_committees: list[CommitteeDict] | None
262262
"""Committee responsible for the legislative procedure"""
263263

264264

265265
def serialize_base_vote(vote: Vote) -> BaseVoteDict:
266266
geo_areas = [serialize_country(geo_area) for geo_area in vote.geo_areas]
267267
eurovoc_concepts = [serialize_eurovoc_concept(ec) for ec in vote.eurovoc_concepts]
268-
responsible_committee = (
269-
serialize_committee(vote.responsible_committee) if vote.responsible_committee else None
270-
)
268+
responsible_committees = [
269+
serialize_committee(committee) for committee in vote.responsible_committees
270+
]
271271

272272
return {
273273
"id": vote.id,
@@ -278,7 +278,7 @@ def serialize_base_vote(vote: Vote) -> BaseVoteDict:
278278
"is_featured": vote.is_featured,
279279
"geo_areas": geo_areas,
280280
"eurovoc_concepts": eurovoc_concepts,
281-
"responsible_committee": responsible_committee,
281+
"responsible_committees": responsible_committees,
282282
}
283283

284284

backend/howtheyvote/export/__init__.py

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
import tempfile
55
from typing import Any, TypedDict
66

7-
from sqlalchemy import select
7+
from sqlalchemy import func, select
88
from structlog import get_logger
99

1010
from ..db import Session
11-
from ..models import Member, Vote
11+
from ..models import Committee, Member, Vote
1212
from ..vote_stats import count_vote_positions
1313
from .csvw_helpers import Table
1414

@@ -144,9 +144,6 @@ class VoteRow(TypedDict):
144144
procedure_title: str | None
145145
"""Title of the legislative procedure as listed in the Legislative Observatory"""
146146

147-
responsible_committee_code: str | None
148-
"""Committee responsible for the legislative procedure"""
149-
150147
count_for: int
151148
"""Number of MEPs who voted in favor"""
152149

@@ -182,6 +179,29 @@ class MemberVoteRow(TypedDict):
182179
of the vote. This is not necessarily the MEP’s current political group."""
183180

184181

182+
# Row schema of the `committees` export table. `Export` registers this type
# with `code` as the table's primary key.
class CommitteeRow(TypedDict):
    """Each row represents a committee of the European Parliament."""

    # Filled from `Committee.code` when the table is exported.
    code: str
    """Unique identifier of the committee"""

    # Filled from `Committee.label` when the table is exported.
    label: str
    """Label"""

    # Filled from `Committee.abbreviation` when the table is exported.
    abbreviation: str
    """Abbreviation"""
193+
194+
195+
# Row schema of the `responsible_committee_votes` export table. `Export`
# registers this type with the composite primary key
# (`vote_id`, `committee_code`), i.e. one row per vote/committee pair.
class ResponsibleCommitteeVotes(TypedDict):
    """Committee responsible for the legislative procedure a vote is part of."""

    # First half of the composite primary key.
    vote_id: int
    """Vote ID"""

    # Second half of the composite primary key.
    # NOTE(review): presumably matches `CommitteeRow.code` -- confirm.
    committee_code: str
    """Committee code"""
203+
204+
185205
class Export:
186206
def __init__(self, outdir: pathlib.Path):
187207
self.outdir = outdir
@@ -228,6 +248,20 @@ def __init__(self, outdir: pathlib.Path):
228248
primary_key=["member_id", "vote_id"],
229249
)
230250

251+
self.committees = Table(
252+
row_type=CommitteeRow,
253+
outdir=self.outdir,
254+
name="committees",
255+
primary_key=["code"],
256+
)
257+
258+
self.responsible_committee_votes = Table(
259+
row_type=ResponsibleCommitteeVotes,
260+
outdir=self.outdir,
261+
name="responsible_committee_votes",
262+
primary_key=["vote_id", "committee_code"],
263+
)
264+
231265
def run(self) -> None:
232266
self.fetch_members()
233267
self.write_export_timestamp()
@@ -239,10 +273,13 @@ def run(self) -> None:
239273
self.group_memberships,
240274
self.votes,
241275
self.member_votes,
276+
self.committees,
277+
self.responsible_committee_votes,
242278
]
243279
)
244280
self.export_members()
245281
self.export_votes()
282+
self.export_committees()
246283

247284
def fetch_members(self) -> None:
248285
self.members_by_id: dict[int, Member] = {}
@@ -351,10 +388,6 @@ def export_votes(self) -> None:
351388
if idx % 1000 == 0:
352389
log.info("Writing vote", index=idx)
353390

354-
responsible_committee_code = (
355-
vote.responsible_committee.code if vote.responsible_committee else None
356-
)
357-
358391
position_counts = count_vote_positions(vote.member_votes)
359392

360393
votes.write_row(
@@ -368,7 +401,6 @@ def export_votes(self) -> None:
368401
"is_featured": vote.is_featured,
369402
"procedure_reference": vote.procedure_reference,
370403
"procedure_title": vote.procedure_title,
371-
"responsible_committee_code": responsible_committee_code,
372404
"count_for": position_counts["FOR"],
373405
"count_against": position_counts["AGAINST"],
374406
"count_abstention": position_counts["ABSTENTION"],
@@ -392,6 +424,30 @@ def export_votes(self) -> None:
392424
}
393425
)
394426

427+
def export_committees(self) -> None:
    """Write the ``committees`` and ``responsible_committee_votes`` tables.

    ``committees`` receives one row per distinct committee code referenced by
    any vote. ``responsible_committee_votes`` receives one row per
    (vote, committee) pair, so the table listed in the export metadata is
    backed by an actual file.

    Raises:
        ValueError: if a stored committee code does not resolve to a committee.
    """
    log.info("Exporting committees")

    # `json_each` expands the JSON array stored in `responsible_committees`
    # into one row per element, exposed through the `value` column.
    exp = func.json_each(Vote.responsible_committees).table_valued("value")

    with self.committees.open() as committees:
        query = (
            select(func.distinct(exp.c.value)).select_from(Vote, exp).order_by(exp.c.value)
        )

        for committee_code in Session.execute(query).scalars():
            # `Committee.get` is an ordinary method call, so it needs no
            # workaround to keep mypy from reading it as a type expression.
            committee = Committee.get(committee_code)

            if not committee:
                raise ValueError(f"Unknown committee code {committee_code!r}")

            committees.write_row(
                {
                    "code": committee.code,
                    "label": committee.label,
                    "abbreviation": committee.abbreviation,
                }
            )

    with self.responsible_committee_votes.open() as responsible_committee_votes:
        # DISTINCT guards the composite primary key against duplicate codes
        # within a single vote's list.
        query = (
            select(Vote.id, exp.c.value)
            .select_from(Vote, exp)
            .distinct()
            .order_by(Vote.id, exp.c.value)
        )

        for vote_id, committee_code in Session.execute(query):
            responsible_committee_votes.write_row(
                {
                    "vote_id": vote_id,
                    "committee_code": committee_code,
                }
            )
450+
395451

396452
def generate_export(path: pathlib.Path) -> None:
397453
with tempfile.TemporaryDirectory() as outdir:

backend/howtheyvote/models/vote.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class Vote(BaseWithId):
9797
eurovoc_concepts: Mapped[list[EurovocConcept]] = mapped_column(
9898
ListType(EurovocConceptType())
9999
)
100-
responsible_committee: Mapped[Committee] = mapped_column(CommitteeType())
100+
responsible_committees: Mapped[list[Committee]] = mapped_column(ListType(CommitteeType()))
101101
press_release: Mapped[str | None] = mapped_column(sa.Unicode)
102102
issues: Mapped[list[DataIssue]] = mapped_column(ListType(sa.Enum(DataIssue)))
103103

backend/howtheyvote/scrapers/votes.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def _url(self) -> str:
412412
def _extract_data(self, doc: BeautifulSoup) -> Fragment:
413413
title = self._title(doc)
414414
geo_areas = self._geo_areas(doc)
415-
responsible_committee = self._responsible_committee(doc)
415+
responsible_committees = self._responsible_committees(doc)
416416
self._log.info(
417417
"Extracted procedure information",
418418
title=title,
@@ -426,7 +426,7 @@ def _extract_data(self, doc: BeautifulSoup) -> Fragment:
426426
data={
427427
"procedure_title": title,
428428
"geo_areas": geo_areas,
429-
"responsible_committee": responsible_committee,
429+
"responsible_committees": responsible_committees,
430430
},
431431
)
432432

@@ -471,30 +471,30 @@ def _geo_areas(self, doc: BeautifulSoup) -> list[str]:
471471

472472
return geo_areas
473473

474-
def _responsible_committee(self, doc: BeautifulSoup) -> str | None:
474+
def _responsible_committees(self, doc: BeautifulSoup) -> list[str]:
475475
table = doc.select_one(
476-
'#erpl_accordion-committee table:has(th:-soup-contains("Committee responsible"))'
476+
"#erpl_accordion-committee :where("
477+
+ 'table:has(th:-soup-contains("Committee responsible")),'
478+
+ 'table:has(th:-soup-contains("Joint committee responsible"))'
479+
+ ")"
477480
)
478481

479482
if not table:
480483
return None
481484

482-
if len(table.select("tbody tr")) > 1:
483-
# We assume that there is at most one responsible committee
484-
log.warning("More than one responsible committee found")
485+
badges = table.select("tbody tr .erpl_badge-committee")
486+
committees = set()
485487

486-
badge = table.select_one("tbody tr .erpl_badge-committee")
488+
for badge in badges:
489+
text = badge.text.strip()
490+
committee = Committee.get(text)
487491

488-
if not badge:
489-
return None
490-
491-
text = badge.text.strip()
492-
committee = Committee.get(text)
492+
if not committee:
493+
raise ScrapingError(f"Could not find committee {text}")
493494

494-
if not committee:
495-
raise ScrapingError(f"Could not find committee {text}")
495+
committees.add(committee.code)
496496

497-
return committee.code
497+
return committees
498498

499499

500500
class EurlexProcedureScraper(BeautifulSoupScraper):

backend/howtheyvote/store/mappings.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,9 @@ def map_vote(record: CompositeRecord) -> Vote:
6060
member_votes = [deserialize_member_vote(mv) for mv in record.first("member_votes")]
6161
geo_areas = {Country[code] for code in record.chain("geo_areas")}
6262
eurovoc_concepts = {EurovocConcept[id_] for id_ in record.chain("eurovoc_concepts")}
63-
responsible_committee = Committee.get(record.first("responsible_committee"))
63+
responsible_committees = {
64+
Committee[code] for code in record.chain("responsible_committees")
65+
}
6466

6567
press_release = record.first("press_release")
6668
is_featured = record.first("is_featured") or press_release is not None
@@ -82,7 +84,7 @@ def map_vote(record: CompositeRecord) -> Vote:
8284
member_votes=member_votes,
8385
geo_areas=geo_areas,
8486
eurovoc_concepts=eurovoc_concepts,
85-
responsible_committee=responsible_committee,
87+
responsible_committees=responsible_committees,
8688
press_release=press_release,
8789
issues=record.chain("issues"),
8890
)

backend/tests/api/test_votes_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ def test_votes_api_show(records, db_session, api):
536536
],
537537
"geo_areas": [],
538538
"eurovoc_concepts": [],
539-
"responsible_committee": None,
539+
"responsible_committees": [],
540540
"related": [],
541541
"sources": [
542542
{

backend/tests/export/test_init.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from howtheyvote.export import Export
77
from howtheyvote.models import (
8+
Committee,
89
Country,
910
Group,
1011
GroupMembership,
@@ -126,6 +127,7 @@ def test_export_votes(db_session, tmp_path):
126127
position=VotePosition.FOR,
127128
),
128129
],
130+
responsible_committees=[Committee["AFET"]],
129131
)
130132

131133
db_session.add_all([member, vote])
@@ -138,8 +140,8 @@ def test_export_votes(db_session, tmp_path):
138140
votes_meta = tmp_path.joinpath("votes.csv-metadata.json")
139141

140142
expected = (
141-
"id,timestamp,display_title,reference,description,is_main,is_featured,procedure_reference,procedure_title,responsible_committee_code,count_for,count_against,count_abstention,count_did_not_vote\n"
142-
"123456,2024-01-01 00:00:00,Lorem Ipsum,,,False,False,,,,1,0,0,0\n"
143+
"id,timestamp,display_title,reference,description,is_main,is_featured,procedure_reference,procedure_title,count_for,count_against,count_abstention,count_did_not_vote\n"
144+
"123456,2024-01-01 00:00:00,Lorem Ipsum,,,False,False,,,1,0,0,0\n"
143145
)
144146

145147
assert votes_csv.read_text() == expected
@@ -153,6 +155,14 @@ def test_export_votes(db_session, tmp_path):
153155
assert member_votes_csv.read_text() == expected
154156
assert member_votes_meta.is_file()
155157

158+
committees_csv = tmp_path.joinpath("committees.csv")
159+
committees_meta = tmp_path.joinpath("committees.csv-metadata.json")
160+
161+
expected = "code,label,abbreviation\nAFET,Committee on Foreign Affairs,AFET\n"
162+
163+
assert committees_csv.read_text() == expected
164+
assert committees_meta.is_file()
165+
156166

157167
def test_export_votes_country_group(db_session, tmp_path):
158168
member = Member(

0 commit comments

Comments
 (0)