Skip to content

Commit f62c263

Browse files
committed
Dataframes: Demonstrate pandas.read_sql() with both urllib3 vs. psycopg3
1 parent 598c776 commit f62c263

File tree

4 files changed

+116
-1
lines changed

4 files changed

+116
-1
lines changed

by-dataframe/pandas/read_pandas.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
"""
2+
About
3+
=====
4+
5+
Evaluate reading data from CrateDB into pandas dataframes.
6+
7+
Example program to demonstrate reading data in batches from CrateDB into
8+
pandas, using SQLAlchemy, supporting urllib3 vs. psycopg3.
9+
10+
11+
Setup
12+
=====
13+
::
14+
15+
pip install --upgrade click pandas pueblo 'sqlalchemy-cratedb[all]'
16+
17+
18+
Synopsis
19+
========
20+
::
21+
22+
# Run CrateDB.
23+
docker run --rm -it --publish=4200:4200 --publish=5432:5432 crate:latest
24+
25+
# Use CrateDB, either talking HTTP, or PostgreSQL wire protocol.
26+
python read_pandas.py --dburi='crate+urllib3://crate@localhost:4200'
27+
python read_pandas.py --dburi='crate+psycopg://crate@localhost:5432'
28+
29+
# Use bulk size parameter to exercise paging.
30+
python read_pandas.py --bulk-size 50
31+
32+
# Use CrateDB Cloud.
33+
python read_pandas.py --dburi='crate://admin:<PASSWORD>@example.aks1.westeurope.azure.cratedb.net:4200?ssl=true'
34+
35+
36+
Details
37+
=======
38+
To watch the HTTP traffic to your local CrateDB instance, invoke::
39+
40+
sudo ngrep -d lo0 -Wbyline port 4200
41+
42+
"""
43+
import logging
44+
45+
import click
46+
import pandas as pd
47+
import sqlalchemy as sa
48+
from pueblo.util.logging import setup_logging
49+
50+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
51+
52+
53+
# Switch consulted by `tweak_log_levels()`: when true, a level is applied to the "sqlalchemy" logger.
SQLALCHEMY_LOGGING = True
54+
55+
56+
class DatabaseWorkload:
    """
    Read rows from a database into pandas dataframes, chunk by chunk.

    The database is addressed by an SQLAlchemy connection URI.
    """

    # Not used by any method of this class.
    table_name = "testdrive_pandas"

    def __init__(self, dburi: str):
        """
        :param dburi: SQLAlchemy database connection URI.
        """
        self.dburi = dburi

    def get_engine(self, **kwargs):
        """
        Create an SQLAlchemy engine for the configured connection URI.

        :param kwargs: Passed through verbatim to `sqlalchemy.create_engine`.
        :return: The engine object returned by `sqlalchemy.create_engine`.
        """
        return sa.create_engine(self.dburi, **kwargs)

    def process(self, bulk_size: int) -> None:
        """
        Read `sys.summits` into pandas dataframes and print each chunk.

        :param bulk_size: Number of rows per dataframe; handed to
            `pandas.read_sql` as `chunksize`.
        """
        engine = self.get_engine()

        # The connection URI may embed a password, so log a masked rendering
        # instead of the raw string.
        logger.info("Connecting to %s", engine.url.render_as_string(hide_password=True))
        logger.info("Reading data with bulk_size=%s", bulk_size)

        try:
            frames = pd.read_sql(sql="SELECT * FROM sys.summits;", con=engine, chunksize=bulk_size)
            for df in frames:
                print(df)
        finally:
            # Release the engine's pooled connections, also when reading fails.
            engine.dispose()
78+
79+
80+
def tweak_log_levels(level=logging.INFO):
    """
    Apply `level` to the "sqlalchemy" logger when SQLALCHEMY_LOGGING is enabled.

    :param level: Logging level to apply; defaults to `logging.INFO`.
    """
    # Nothing to adjust when SQLAlchemy logging is switched off.
    if not SQLALCHEMY_LOGGING:
        return

    sqlalchemy_logger = logging.getLogger("sqlalchemy")
    sqlalchemy_logger.setLevel(level)
85+
86+
87+
# NOTE: `main` deliberately carries no docstring, because click would render
# it as help text and change the `--help` output.
@click.command()
@click.option(
    "--dburi",
    type=str,
    default="crate://localhost:4200",
    required=False,
    help="SQLAlchemy database connection URI.",
)
@click.option(
    "--bulk-size",
    type=int,
    default=5_000,
    required=False,
    help="Bulk size / chunk size.",
)
@click.help_option()
def main(dburi: str, bulk_size: int):
    # Set up logging before running the workload.
    setup_logging()
    tweak_log_levels()

    # Run the read workload against the selected database.
    workload = DatabaseWorkload(dburi=dburi)
    workload.process(bulk_size)


if __name__ == "__main__":
    main()

by-dataframe/pandas/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ colorlog<7
33
crate>=1.0.0.dev2
44
pandas==2.2.*
55
pueblo>=0.0.10
6-
sqlalchemy-cratedb>=0.40.0
6+
sqlalchemy-cratedb[all] @ git+https://github.com/crate-workbench/sqlalchemy-cratedb@amo/postgresql-async
File renamed without changes.

by-dataframe/pandas/test_read.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import shlex
2+
import subprocess
3+
4+
5+
def run(command: str):
    """
    Execute `command` as a child process, without involving a shell.

    The string is split into arguments with `shlex.split`. A non-zero exit
    status raises `subprocess.CalledProcessError`.
    """
    argv = shlex.split(command)
    subprocess.run(argv, check=True)
7+
8+
9+
def test_read_urllib3():
    """
    Run `read_pandas.py` over HTTP (urllib3 driver) and report the elapsed time.
    """
    # Imported locally so the function does not depend on file-level imports.
    import sys
    import time

    # `run` does not involve a shell, so a leading `time` would have to exist
    # as a standalone executable. Measure in-process instead, and use the
    # interpreter running the tests rather than whatever `python` is on PATH.
    cmd = f"{shlex.quote(sys.executable)} read_pandas.py --dburi=crate+urllib3://crate@localhost:4200"
    started = time.perf_counter()
    run(cmd)
    print(f"read_pandas.py via urllib3: {time.perf_counter() - started:.3f}s")
12+
13+
14+
def test_read_psycopg3():
    """
    Run `read_pandas.py` over the PostgreSQL wire protocol (psycopg driver)
    and report the elapsed time.
    """
    # Imported locally so the function does not depend on file-level imports.
    import sys
    import time

    # `run` does not involve a shell, so a leading `time` would have to exist
    # as a standalone executable. Measure in-process instead, and use the
    # interpreter running the tests rather than whatever `python` is on PATH.
    cmd = f"{shlex.quote(sys.executable)} read_pandas.py --dburi=crate+psycopg://crate@localhost:5432"
    started = time.perf_counter()
    run(cmd)
    print(f"read_pandas.py via psycopg: {time.perf_counter() - started:.3f}s")

0 commit comments

Comments
 (0)