Skip to content

Commit f22a79e

Browse files
doc updates : updates to github doc index, use only last 4 digits of fake payment instruments (#63)
* updates * use only last 4 digits of fake credit card numbers * use only last 4 digits of fake credit card numbers
1 parent f3167d8 commit f22a79e

File tree

6 files changed

+40
-18
lines changed

6 files changed

+40
-18
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ For example
7676
import dbldatagen as dg
7777
from pyspark.sql.types import IntegerType, FloatType, StringType
7878
79-
df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=cls.row_count,
79+
data_rows=1000 * 1000
80+
df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=data_rows,
8081
partitions=4)
8182
.withIdOutput()
8283
.withColumn("r", FloatType(), expr="floor(rand() * 350) * (86400 + 3600)",

docs/USING_THE_APIS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,8 @@ Note that for Sphinx source documents (.rst docs), correct rendering is only pro
1515
- [Using data ranges](source/DATARANGES.md)
1616
- [Using data distributions](source/DISTRIBUTIONS.md)
1717
- [Generating text data](source/textdata.rst)
18+
- [Repeatable data generation](source/repeatable_data_generation.rst)
19+
- [Generating CDC data](source/generating_cdc_data.rst)
20+
- [Multi-table data generation](source/multi_table_data.rst)
1821
- [Troubleshooting](source/troubleshooting.rst)
1922
- [License](source/license.rst)

docs/source/APIDOCS.md

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -426,8 +426,8 @@ dfTestData.write.format("delta").mode("overwrite").save(
426426
## Using SQL in data generation
427427
Any column specification can use arbitrary SQL expressions during data generation via the `expr` parameter.
428428

429-
The following example shows generation of synthetic names, email addresses, payment instruments and
430-
use of a SQL expression to compute MD5 hashes of synthetic credit card numbers:
429+
The following example shows generation of synthetic names, email addresses and
430+
use of a SQL expression to compute MD5 hashes of hypothetical synthetic credit card numbers:
431431

432432
```python
433433
import dbldatagen as dg
@@ -440,8 +440,14 @@ spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
440440

441441
dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname")
442442
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
443-
.withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'], random=True)
444-
.withColumn("payment_instrument", minValue=1000000, maxValue=10000000, template="dddd dddddd ddddd")
443+
.withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
444+
random=True)
445+
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
446+
baseColumn="name",
447+
baseColumnType="hash", omit=True)
448+
.withColumn("payment_instrument",
449+
expr="format_number(int_payment_instrument, '**** ****** *####')",
450+
baseColumn="int_payment_instrument")
445451
.withColumn("email", template=r'\\w.\\w@\\w.com')
446452
.withColumn("md5_payment_instrument",
447453
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
@@ -483,12 +489,17 @@ data_rows = 10000000
483489
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
484490

485491
dataspec = (
486-
dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname", randomSeed=42)
492+
dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname",
493+
randomSeed=42)
487494
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
488495
.withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
489496
random=True)
490-
.withColumn("payment_instrument", minValue=1000000, maxValue=10000000,
491-
template="dddd dddddd ddddd")
497+
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
498+
baseColumn="name",
499+
baseColumnType="hash", omit=True)
500+
.withColumn("payment_instrument",
501+
expr="format_number(int_payment_instrument, '**** ****** *####')",
502+
baseColumn="int_payment_instrument")
492503
.withColumn("email", template=r'\\w.\\w@\\w.com')
493504
.withColumn("md5_payment_instrument",
494505
expr="md5(concat(payment_instrument_type, ':', payment_instrument))",

docs/source/extending_text_generation.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ extended syntax.
4444
fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
4545
randomSeedMethod="hash_fieldname")
4646
.withColumn("name", percentNulls=0.1, text=FakerText("name") )
47-
.withColumn("payment_instrument", text=FakerText("credit_card_number" ))
47+
.withColumn("address", text=FakerText("address" ))
4848
.withColumn("email", text=FakerText("ascii_company_email") )
4949
.withColumn("ip_address", text=FakerText("ipv4_private" ))
5050
.withColumn("faker_text", text=FakerText("sentence", ext_word_list=my_word_list) )
@@ -163,16 +163,16 @@ IP addresses and credit card numbers.
163163
164164
ip_address_generator = (lambda context, v : context.faker.ipv4_private())
165165
name_generator = (lambda context, v : context.faker.name())
166-
cc_generator = (lambda context, v : context.faker.credit_card_number())
166+
address_generator = (lambda context, v : context.faker.address())
167167
email_generator = (lambda context, v : context.faker.ascii_company_email())
168168
169169
fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
170170
randomSeedMethod="hash_fieldname")
171171
.withColumn("name",
172172
percentNulls=0.1,
173173
text=PyfuncText(name_generator , initFn=initFaker))
174-
.withColumn("payment_instrument",
175-
text=PyfuncText(cc_generator, initFn=initFaker))
174+
.withColumn("address",
175+
text=PyfuncText(address_generator, initFn=initFaker))
176176
.withColumn("email",
177177
text=PyfuncText(email_generator, initFn=initFaker))
178178
.withColumn("ip_address",

docs/source/generating_cdc_data.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,10 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh
5252
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
5353
.withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
5454
.withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', 'American Express', 'discover', 'branded visa', 'branded mastercard'], random=True, distribution="normal")
55-
.withColumn("int_payment_instrument", "long", minValue=100000000000000, maxValue=999999999999999, baseColumn="customer_id", baseColumnType="hash", omit=True)
56-
.withColumn("payment_instrument", expr="format_number(int_payment_instrument, '#### ###### #####')", baseColumn="int_payment_instrument")
55+
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id",
56+
baseColumnType="hash", omit=True)
57+
.withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
58+
baseColumn="int_payment_instrument")
5759
.withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
5860
.withColumn("email2", template=r'\\w.\\w@\\w.com')
5961
.withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')

tutorial/3-ChangeDataCapture-example.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,17 @@
4444
uniqueCustomers = 10 * 1000000
4545

4646
dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
47-
.withColumn("customer_id","long", uniqueValues=uniqueCustomers)
47+
.withColumn("customer_id", "long", uniqueValues=uniqueCustomers)
4848
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
4949
.withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
50-
.withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', 'American Express', 'discover', 'branded visa', 'branded mastercard'], random=True, distribution="normal")
51-
.withColumn("int_payment_instrument", "long", minValue=100000000000000, maxValue=999999999999999, baseColumn="customer_id", baseColumnType="hash", omit=True)
52-
.withColumn("payment_instrument", expr="format_number(int_payment_instrument, '#### ###### #####')", baseColumn="int_payment_instrument")
50+
.withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
51+
'American Express', 'discover', 'branded visa',
52+
'branded mastercard'],
53+
random=True, distribution="normal")
54+
.withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id",
55+
baseColumnType="hash", omit=True)
56+
.withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
57+
baseColumn="int_payment_instrument")
5358
.withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
5459
.withColumn("email2", template=r'\\w.\\w@\\w.com')
5560
.withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')

0 commit comments

Comments
 (0)