doc updates: update GitHub doc index; use only the last 4 digits of fake payment instruments (#63)

ronanstokes-db · web-flow · commit f22a79e16084 · 2021-08-04T08:48:49.000-07:00
* updates

* use only last 4 digits of fake credit card numbers

* use only last 4 digits of fake credit card numbers
diff --git a/README.md b/README.md
@@ -76,7 +76,8 @@ For example
 import dbldatagen as dg
 from pyspark.sql.types import IntegerType, FloatType, StringType
 
-df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=cls.row_count,
+data_rows=1000 * 1000
+df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=data_rows,
                                                   partitions=4)
                             .withIdOutput()
                             .withColumn("r", FloatType(), expr="floor(rand() * 350) * (86400 + 3600)",
diff --git a/docs/USING_THE_APIS.md b/docs/USING_THE_APIS.md
@@ -15,5 +15,8 @@ Note that for Sphinx source documents (.rst docs), correct rendering is only pro
 - [Using data ranges](source/DATARANGES.md)
 - [Using data distributions](source/DISTRIBUTIONS.md)
 - [Generating text data](source/textdata.rst)
+- [Repeatable data generation](source/repeatable_data_generation.rst)
+- [Generating CDC data](source/generating_cdc_data.rst)
+- [Multi-table data generation](source/multi_table_data.rst)
 - [Troubleshooting](source/troubleshooting.rst)
 - [License](source/license.rst)
diff --git a/docs/source/APIDOCS.md b/docs/source/APIDOCS.md
@@ -426,8 +426,8 @@ dfTestData.write.format("delta").mode("overwrite").save(
 ## Using SQL in data generation
 Any column specification can use arbitrary SQL expressions during data generation via the `expr` parameter.
 
-The following example shows generation of synthetic names, email addresses, payment instruments and 
-use of a SQL expression to compute MD5 hashes of synthetic credit card numbers:
+The following example shows generation of synthetic names, email addresses and 
+use of a SQL expression to compute MD5 hashes of hypothetical synthetic credit card numbers:
 
 ```python
 import dbldatagen as dg
@@ -440,8 +440,14 @@ spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
 
 dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname")
                 .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') 
-                .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'], random=True)             
-                .withColumn("payment_instrument",  minValue=1000000, maxValue=10000000, template="dddd dddddd ddddd") 
+                .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'], 
+                            random=True)             
+                .withColumn("int_payment_instrument", "int",  minValue=0000, maxValue=9999,  
+                            baseColumn="name",
+                            baseColumnType="hash", omit=True)
+                .withColumn("payment_instrument", 
+                             expr="format_number(int_payment_instrument, '**** ****** *####')",
+                             baseColumn="int_payment_instrument")
                 .withColumn("email", template=r'\\w.\\w@\\w.com')       
                 .withColumn("md5_payment_instrument", 
                             expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
@@ -483,12 +489,17 @@ data_rows = 10000000
 spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
 
 dataspec = (
-    dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname", randomSeed=42)
+    dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname", 
+                     randomSeed=42)
     .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
     .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
                 random=True)
-    .withColumn("payment_instrument", minValue=1000000, maxValue=10000000,
-                template="dddd dddddd ddddd")
+    .withColumn("int_payment_instrument", "int",  minValue=0000, maxValue=9999,  
+                baseColumn="name",
+                baseColumnType="hash", omit=True)
+    .withColumn("payment_instrument", 
+                expr="format_number(int_payment_instrument, '**** ****** *####')",
+                baseColumn="int_payment_instrument")
     .withColumn("email", template=r'\\w.\\w@\\w.com')
     .withColumn("md5_payment_instrument",
                 expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
diff --git a/docs/source/extending_text_generation.rst b/docs/source/extending_text_generation.rst
@@ -44,7 +44,7 @@ extended syntax.
    fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
                     randomSeedMethod="hash_fieldname")
                .withColumn("name", percentNulls=0.1, text=FakerText("name") )
-               .withColumn("payment_instrument", text=FakerText("credit_card_number" ))
+               .withColumn("address", text=FakerText("address" ))
                .withColumn("email", text=FakerText("ascii_company_email") )
                .withColumn("ip_address", text=FakerText("ipv4_private" ))
                .withColumn("faker_text", text=FakerText("sentence", ext_word_list=my_word_list) )
@@ -163,16 +163,16 @@ IP addresses and credit card numbers.
 
    ip_address_generator = (lambda context, v : context.faker.ipv4_private())
    name_generator = (lambda context, v : context.faker.name())
-   cc_generator = (lambda context, v : context.faker.credit_card_number())
+   address_generator = (lambda context, v : context.faker.address())
    email_generator = (lambda context, v : context.faker.ascii_company_email())
 
    fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
                     randomSeedMethod="hash_fieldname")
                .withColumn("name",
                            percentNulls=0.1,
                            text=PyfuncText(name_generator , initFn=initFaker))
-               .withColumn("payment_instrument",
-                           text=PyfuncText(cc_generator, initFn=initFaker))
+               .withColumn("address",
+                           text=PyfuncText(address_generator, initFn=initFaker))
                .withColumn("email",
                            text=PyfuncText(email_generator, initFn=initFaker))
                .withColumn("ip_address",
diff --git a/docs/source/generating_cdc_data.rst b/docs/source/generating_cdc_data.rst
@@ -52,8 +52,10 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh
                .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
                .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
                .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', 'American Express', 'discover', 'branded visa', 'branded mastercard'], random=True, distribution="normal")
-               .withColumn("int_payment_instrument", "long",  minValue=100000000000000, maxValue=999999999999999,  baseColumn="customer_id", baseColumnType="hash", omit=True)
-               .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '#### ###### #####')", baseColumn="int_payment_instrument")
+               .withColumn("int_payment_instrument", "int",  minValue=0000, maxValue=9999,  baseColumn="customer_id",
+                           baseColumnType="hash", omit=True)
+               .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
+                           baseColumn="int_payment_instrument")
                .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
                .withColumn("email2", template=r'\\w.\\w@\\w.com')
                .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
diff --git a/tutorial/3-ChangeDataCapture-example.py b/tutorial/3-ChangeDataCapture-example.py
@@ -44,12 +44,17 @@
 uniqueCustomers = 10 * 1000000
 
 dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
-            .withColumn("customer_id","long", uniqueValues=uniqueCustomers)
+            .withColumn("customer_id", "long", uniqueValues=uniqueCustomers)
             .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
             .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
-            .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', 'American Express', 'discover', 'branded visa', 'branded mastercard'], random=True, distribution="normal")
-            .withColumn("int_payment_instrument", "long",  minValue=100000000000000, maxValue=999999999999999,  baseColumn="customer_id", baseColumnType="hash", omit=True)
-            .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '#### ###### #####')", baseColumn="int_payment_instrument")
+            .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
+                                                           'American Express', 'discover', 'branded visa',
+                                                           'branded mastercard'],
+                        random=True, distribution="normal")
+            .withColumn("int_payment_instrument", "int",  minValue=0000, maxValue=9999,  baseColumn="customer_id",
+                        baseColumnType="hash", omit=True)
+            .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
+                        baseColumn="int_payment_instrument")
             .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
             .withColumn("email2", template=r'\\w.\\w@\\w.com')
             .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')