@@ -214,8 +214,9 @@ table_schema = spark.table("test_vehicle_data").schema

print(table_schema)

-dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8)
-            .withSchema(table_schema))
+dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8,
+                             randomSeedMethod="hash_fieldname")
+            .withSchema(table_schema))

dataspec = (dataspec
            .withColumnSpec("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
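Aside for reviewers (not part of the diff): the hunk above only changes how the spec is declared; data is produced when the spec is built. A minimal sketch, assuming the `dataspec` and `test_vehicle_data` names from this hunk:

```python
# Illustrative sketch only -- mirrors the spec declared in the hunk above.
import dbldatagen as dg

table_schema = spark.table("test_vehicle_data").schema

dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8,
                             randomSeedMethod="hash_fieldname")
            .withSchema(table_schema))

# build() turns the declarative spec into an ordinary Spark DataFrame
df = dataspec.build()
df.show(5)
```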
@@ -280,13 +281,13 @@ manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Emba
lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
-                                 partitions=partitions_requested, randomSeedMethod='hash_fieldname',
-                                 verbose=True, debug=True)
+                                 partitions=partitions_requested,
+                                 randomSeedMethod='hash_fieldname')
                .withIdOutput()
                # we'll use hash of the base field to generate the ids to
                # avoid a simple incrementing sequence
                .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
-                           uniqueValues=device_population)
+                           uniqueValues=device_population, omit=True, baseColumnType="hash")

                # note for format strings, we must use "%lx" not "%x" as the
                # underlying value is a long
@@ -297,16 +298,7 @@ testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
                # so lets use the internal device id as the base column for these attribute
                .withColumn("country", StringType(), values=country_codes,
                            weights=country_weights,
-                           baseColumn="internal_device_id", baseColumnType="hash")
-                .withColumn("country2a", LongType(),
-                            expr="((hash(internal_device_id) % 3847) + 3847) % 3847",
                            baseColumn="internal_device_id")
-                .withColumn("country2", IntegerType(),
-                            expr="""floor(cast( (((internal_device_id % 3847) + 3847) % 3847)
-                                    as double) )""",
-                            baseColumn="internal_device_id")
-                .withColumn("country3", StringType(), values=country_codes,
-                            baseColumn="country2")

                .withColumn("manufacturer", StringType(), values=manufacturers,
                            baseColumn="internal_device_id")
@@ -324,6 +316,7 @@ testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
                            values=["activation", "deactivation", "plan change",
                                    "telecoms activity", "internet activity", "device error"],
                            random=True)
+                .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)

                )
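Aside for reviewers (not part of the diff): the `omit=True` added to `internal_device_id` means the column can still act as a `baseColumn` for derived fields such as `device_id` and `country`, but is dropped from the generated output. A sketch of how a reviewer might confirm this, assuming the `testDataSpec` assembled in the hunks above:

```python
# Illustrative sketch only: build the device data set declared above and
# check that the omitted seed column does not appear in the output.
dfDeviceData = testDataSpec.build()

print(dfDeviceData.columns)   # device_id, country, manufacturer, event_type, event_ts, ...
assert "internal_device_id" not in dfDeviceData.columns   # omitted; used only as a base column

dfDeviceData.show(5, truncate=False)
```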
@@ -363,7 +356,7 @@ or billions of rows.
For example, using the same code as before, but with different rows and partitions settings, you can generate a billion
rows of data and write it to a Delta table in under 2 minutes (with a 12 node x 8 core cluster)

-```
+```python
from pyspark.sql.types import LongType, IntegerType, StringType

import dbldatagen as dg
@@ -386,33 +379,23 @@ manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Emba
lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
-                                 partitions=partitions_requested, randomSeedMethod='hash_fieldname',
-                                 verbose=True, debug=True)
+                                 partitions=partitions_requested, randomSeedMethod='hash_fieldname')
                .withIdOutput()
-                # we'll use hash of the base field to generate the ids to avoid a
-                # simple incrementing sequence
+                # we'll use hash of the base field to generate the ids to
+                # avoid a simple incrementing sequence
                .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
-                           unique_values=device_population)
+                           uniqueValues=device_population, omit=True, baseColumnType="hash")

-                # note for format strings, we must use "%lx" not "%x" as the underlying
-                # value is a long
+                # note for format strings, we must use "%lx" not "%x" as the
+                # underlying value is a long
                .withColumn("device_id", StringType(), format="0x%013x",
                            baseColumn="internal_device_id")

                # the device / user attributes will be the same for the same device id
                # so lets use the internal device id as the base column for these attribute
                .withColumn("country", StringType(), values=country_codes,
                            weights=country_weights,
-                           baseColumn="internal_device_id", baseColumnType="hash")
-                .withColumn("country2a", LongType(),
-                            expr="((hash(internal_device_id) % 3847) + 3847) % 3847",
                            baseColumn="internal_device_id")
-                .withColumn("country2", IntegerType(),
-                            expr="""floor(cast( (((internal_device_id % 3847) + 3847) % 3847)
-                                    as double) )""",
-                            baseColumn="internal_device_id")
-                .withColumn("country3", StringType(), values=country_codes,
-                            baseColumn="country2")

                .withColumn("manufacturer", StringType(), values=manufacturers,
                            baseColumn="internal_device_id")
@@ -428,9 +411,9 @@ testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
                            baseColumn=["line", "model_ser"])
                .withColumn("event_type", StringType(),
                            values=["activation", "deactivation", "plan change",
-                                    "telecoms activity",
-                                    "internet activity", "device error"],
+                                    "telecoms activity", "internet activity", "device error"],
                            random=True)
+                .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)

                )
@@ -455,7 +438,7 @@ data_rows = 10000000

spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)

-dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8)
+dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname")
            .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
            .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'], random=True)
            .withColumn("payment_instrument", minValue=1000000, maxValue=10000000, template="dddd dddddd ddddd")
@@ -500,7 +483,7 @@ data_rows = 10000000
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)

dataspec = (
-    dg.DataGenerator(spark, rows=10000000, partitions=8, randomSeedMethod="hash_fieldname", randomSeed=42)
+    dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname", randomSeed=42)
    .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
    .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
                random=True)
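Aside for reviewers (not part of the diff): because this hunk keeps `randomSeedMethod="hash_fieldname"` together with `randomSeed=42`, repeated builds of the same spec should be reproducible. A sketch of a sanity check, reusing the `dataspec` from this hunk:

```python
# Illustrative sketch only: with a fixed seed and seed method, two builds
# of the same spec are expected to produce identical rows.
df_first = dataspec.build()
df_second = dataspec.build()

# exceptAll returns rows present in one DataFrame but not the other;
# empty results in both directions mean the generated outputs match.
assert df_first.exceptAll(df_second).count() == 0
assert df_second.exceptAll(df_first).count() == 0
```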