
Commit 3873f87

Feature prerelease docs and tutorials (#57)
* tutorial changes
* doc and tutorial changes
* doc and tutorial updates
1 parent a3d4f3e commit 3873f87

13 files changed: +1047 / -386 lines

docs/source/APIDOCS.md

Lines changed: 18 additions & 35 deletions
@@ -214,8 +214,9 @@ table_schema = spark.table("test_vehicle_data").schema
 
 print(table_schema)
 
-dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8)
-            .withSchema(table_schema))
+dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8,
+                             randomSeedMethod="hash_fieldname")
+            .withSchema(table_schema))
 
 dataspec = (dataspec
             .withColumnSpec("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
@@ -280,13 +281,13 @@ manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Emba
 lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']
 
 testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
-                                 partitions=partitions_requested, randomSeedMethod='hash_fieldname',
-                                 verbose=True, debug=True)
+                                 partitions=partitions_requested,
+                                 randomSeedMethod='hash_fieldname')
                 .withIdOutput()
                 # we'll use hash of the base field to generate the ids to
                 # avoid a simple incrementing sequence
                 .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
-                            uniqueValues=device_population)
+                            uniqueValues=device_population, omit=True, baseColumnType="hash")
 
                 # note for format strings, we must use "%lx" not "%x" as the
                 # underlying value is a long
@@ -297,16 +298,7 @@ testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
                 # so lets use the internal device id as the base column for these attribute
                 .withColumn("country", StringType(), values=country_codes,
                             weights=country_weights,
-                            baseColumn="internal_device_id", baseColumnType="hash")
-                .withColumn("country2a", LongType(),
-                            expr="((hash(internal_device_id) % 3847) + 3847) % 3847",
                             baseColumn="internal_device_id")
-                .withColumn("country2", IntegerType(),
-                            expr="""floor(cast( (((internal_device_id % 3847) + 3847) % 3847)
-                                   as double) )""",
-                            baseColumn="internal_device_id")
-                .withColumn("country3", StringType(), values=country_codes,
-                            baseColumn="country2")
                 .withColumn("manufacturer", StringType(), values=manufacturers,
                             baseColumn="internal_device_id")
 
@@ -324,6 +316,7 @@ testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
                             values=["activation", "deactivation", "plan change",
                                     "telecoms activity", "internet activity", "device error"],
                             random=True)
+                .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)
 
                 )
 
@@ -363,7 +356,7 @@ or billions of rows.
 For example, using the same code as before, but with different rows and partitions settings, you can generate a billion
 rows of data and write it to a Delta table in under 2 minutes (with a 12 node x 8 core cluster)
 
-```
+```python
 from pyspark.sql.types import LongType, IntegerType, StringType
 
 import dbldatagen as dg
@@ -386,33 +379,23 @@ manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Emba
 lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']
 
 testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
-                                 partitions=partitions_requested, randomSeedMethod='hash_fieldname',
-                                 verbose=True, debug=True)
+                                 partitions=partitions_requested, randomSeedMethod='hash_fieldname')
                 .withIdOutput()
-                # we'll use hash of the base field to generate the ids to avoid a
-                # simple incrementing sequence
+                # we'll use hash of the base field to generate the ids to
+                # avoid a simple incrementing sequence
                 .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
-                            unique_values=device_population)
+                            uniqueValues=device_population, omit=True, baseColumnType="hash")
 
-                # note for format strings, we must use "%lx" not "%x" as the underlying
-                # value is a long
+                # note for format strings, we must use "%lx" not "%x" as the
+                # underlying value is a long
                 .withColumn("device_id", StringType(), format="0x%013x",
                             baseColumn="internal_device_id")
 
                 # the device / user attributes will be the same for the same device id
                 # so lets use the internal device id as the base column for these attribute
                 .withColumn("country", StringType(), values=country_codes,
                             weights=country_weights,
-                            baseColumn="internal_device_id", baseColumnType="hash")
-                .withColumn("country2a", LongType(),
-                            expr="((hash(internal_device_id) % 3847) + 3847) % 3847",
                             baseColumn="internal_device_id")
-                .withColumn("country2", IntegerType(),
-                            expr="""floor(cast( (((internal_device_id % 3847) + 3847) % 3847)
-                                   as double) )""",
-                            baseColumn="internal_device_id")
-                .withColumn("country3", StringType(), values=country_codes,
-                            baseColumn="country2")
                 .withColumn("manufacturer", StringType(), values=manufacturers,
                             baseColumn="internal_device_id")
 
@@ -428,9 +411,9 @@ testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
                             baseColumn=["line", "model_ser"])
                 .withColumn("event_type", StringType(),
                             values=["activation", "deactivation", "plan change",
-                                    "telecoms activity",
-                                    "internet activity", "device error"],
+                                    "telecoms activity", "internet activity", "device error"],
                             random=True)
+                .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", interval="1 minute", random=True)
 
                 )
 
@@ -455,7 +438,7 @@ data_rows = 10000000
 
 spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
 
-dataspec = (dg.DataGenerator(spark, rows=10000000, partitions=8)
+dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname")
             .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
             .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'], random=True)
             .withColumn("payment_instrument", minValue=1000000, maxValue=10000000, template="dddd dddddd ddddd")
@@ -500,7 +483,7 @@ data_rows = 10000000
 spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
 
 dataspec = (
-    dg.DataGenerator(spark, rows=10000000, partitions=8, randomSeedMethod="hash_fieldname", randomSeed=42)
+    dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname", randomSeed=42)
     .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
     .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
                 random=True)
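
To see the net effect of these APIDOCS changes in one place, here is a minimal, self-contained sketch of the device-data example as it reads after the change. It is an illustration rather than the file's exact contents: the row counts and the country_codes / country_weights lists are assumed placeholder values, and spark is assumed to be an existing SparkSession on a cluster with dbldatagen installed.

```python
# Hedged sketch (not part of the commit): trimmed-down device-data spec after this change.
from pyspark.sql.types import LongType, StringType

import dbldatagen as dg

data_rows = 1000 * 1000            # assumed row count for illustration
device_population = 100 * 1000     # assumed number of distinct devices
country_codes = ["US", "GB", "DE", "FR", "IN"]   # assumed sample values
country_weights = [5, 2, 2, 1, 3]                # assumed sample weights

testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=data_rows,
                                 partitions=8, randomSeedMethod="hash_fieldname")
                .withIdOutput()
                # internal_device_id is omitted from the output; it only seeds the other columns
                .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                            uniqueValues=device_population, omit=True, baseColumnType="hash")
                .withColumn("device_id", StringType(), format="0x%013x",
                            baseColumn="internal_device_id")
                .withColumn("country", StringType(), values=country_codes,
                            weights=country_weights, baseColumn="internal_device_id")
                .withColumn("event_type", StringType(),
                            values=["activation", "deactivation", "plan change",
                                    "telecoms activity", "internet activity", "device error"],
                            random=True)
                # the new event timestamp column introduced by this change
                .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00",
                            end="2020-12-31 23:59:00", interval="1 minute", random=True)
                )

dfTestData = testDataSpec.build()
dfTestData.show(5)
```

The points carried by the diff are the `randomSeedMethod="hash_fieldname"` seeding, the omitted hash-based `internal_device_id`, and the new `event_ts` timestamp column.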

docs/source/extending_text_generation.rst

Lines changed: 9 additions & 6 deletions
@@ -41,7 +41,8 @@ extended syntax.
                  'sesame','Jelly','beans',
                  'pie','bar','Ice','oat' ]
 
-   fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
+   fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
+                                     randomSeedMethod="hash_fieldname")
                     .withColumn("name", percentNulls=0.1, text=FakerText("name") )
                     .withColumn("payment_instrument", text=FakerText("credit_card_number" ))
                     .withColumn("email", text=FakerText("ascii_company_email") )
@@ -71,7 +72,7 @@ For more information, see :data:`~dbldatagen.text_generator_plugins.PyfuncText`
 
 .. note::
 
-   The perform of text generation using external libraries or Python functions is substantially slower than the base
+   The performance of text generation using external libraries or Python functions may be substantially slower than the base
    text generation capabilities. However it should be sufficient for generation of tables of up to
    100 million rows on a medium sized cluster.
 
@@ -98,9 +99,10 @@ The following code shows use of a custom Python function to generate text:
    # the data generation function
    text_generator = (lambda context, value: context.prefix + str(value))
 
-   pluginDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
-                     .withColumn("text", text=PyfuncText(text_generator, initFn=initPluginContext))
-                     )
+   pluginDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
+                                      randomSeedMethod="hash_fieldname")
+                     .withColumn("text", text=PyfuncText(text_generator, initFn=initPluginContext))
+                     )
    dfPlugin = pluginDataspec.build()
   dfPlugin.show()
 
@@ -164,7 +166,8 @@ IP addresses and credit card numbers.
    cc_generator = (lambda context, v : context.faker.credit_card_number())
    email_generator = (lambda context, v : context.faker.ascii_company_email())
 
-   fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
+   fakerDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
+                                     randomSeedMethod="hash_fieldname")
                     .withColumn("name",
                                 percentNulls=0.1,
                                 text=PyfuncText(name_generator , initFn=initFaker))
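
As a point of reference for the plugin changes above, the following is a hedged, self-contained version of the PyfuncText example. The body of initPluginContext is an assumption (the excerpt only shows it being passed as initFn), and spark is assumed to be an existing SparkSession.

```python
# Hedged sketch (not part of the commit): custom Python text generation via PyfuncText.
import dbldatagen as dg
from dbldatagen.text_generator_plugins import PyfuncText

data_rows = 100 * 1000
partitions_requested = 8

def initPluginContext(context):
    # attach any state the per-row lambda needs; a fixed prefix here (assumed body)
    context.prefix = "text-"

# the per-row generation function receives the shared context and the base value
text_generator = (lambda context, value: context.prefix + str(value))

pluginDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
                                   randomSeedMethod="hash_fieldname")
                  .withColumn("text", text=PyfuncText(text_generator, initFn=initPluginContext))
                  )

dfPlugin = pluginDataspec.build()
dfPlugin.show()
```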

docs/source/generating_cdc_data.rst

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
.. Test Data Generator documentation master file, created by
   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Generating Change Data Capture data
===================================

This section explores some of the features for generating CDC-style data - that is, the ability to
generate a base data set and then apply changes such as updates to existing rows and
new rows that will be inserted into the existing data.

See the section on repeatable data generation for the concepts that underpin the data generation.

Overview
--------
We'll generate a customer table, and write out the data.

Then we generate changes for the table and show merging them in.

To start, we'll specify some locations for our data:

.. code-block:: python

   BASE_PATH = '/tmp/dbldatagen/cdc/'
   dbutils.fs.mkdirs(BASE_PATH)

   customers1_location = BASE_PATH + "customers1"

Let's generate 10 million customer-style records.

We'll add a timestamp for when the row was generated and a memo field to mark what operation added it.

.. code-block:: python

   import dbldatagen as dg
   import pyspark.sql.functions as F

   spark.catalog.clearCache()
   shuffle_partitions_requested = 8
   partitions_requested = 32
   data_rows = 10 * 1000 * 1000

   spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
   spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
   spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000)

   uniqueCustomers = 10 * 1000000

   dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
               .withColumn("customer_id", "long", uniqueValues=uniqueCustomers)
               .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
               .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
               .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', 'American Express', 'discover', 'branded visa', 'branded mastercard'], random=True, distribution="normal")
               .withColumn("int_payment_instrument", "long", minValue=100000000000000, maxValue=999999999999999, baseColumn="customer_id", baseColumnType="hash", omit=True)
               .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '#### ###### #####')", baseColumn="int_payment_instrument")
               .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
               .withColumn("email2", template=r'\\w.\\w@\\w.com')
               .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
               .withColumn("md5_payment_instrument",
                           expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
                           baseColumn=['payment_instrument_type', 'payment_instrument'])
               .withColumn("customer_notes", text=dg.ILText(words=(1,8)))
               .withColumn("created_ts", "timestamp", expr="now()")
               .withColumn("modified_ts", "timestamp", expr="now()")
               .withColumn("memo", expr="'original data'")
               )
   df1 = dataspec.build()

   # write table

   df1.write.format("delta").save(customers1_location)

Creating a table definition
^^^^^^^^^^^^^^^^^^^^^^^^^^^

We can use the features of the data generator to script SQL definitions for table creation and merge
statements.

Let's create a table definition around our data. As we generate a SQL statement with an explicit location,
the table is implicitly ``external`` and will not overwrite our data.

.. code-block:: python

   customers1_location = BASE_PATH + "customers1"
   tableDefn = dataspec.scriptTable(name="customers1", location=customers1_location)

   spark.sql(tableDefn)

Now let's explore the table layout:

.. code-block:: sql

   %sql
   -- lets check our table

   select * from customers1

Creating Changes
^^^^^^^^^^^^^^^^

Let's generate some changes.

Here we want to generate a set of new rows, which we guarantee to be new by using customer ids greater than the maximum
existing customer id.

We will also generate a set of updates by sampling from the existing data and adding some modifications.

.. code-block:: python

   import dbldatagen as dg
   import pyspark.sql.functions as F

   start_of_new_ids = df1.select(F.max('customer_id')+1).collect()[0][0]

   print(start_of_new_ids)

   df1_inserts = (dataspec.clone()
                  .option("startingId", start_of_new_ids)
                  .withRowCount(10 * 1000)
                  .build()
                  .withColumn("memo", F.lit("insert"))
                  .withColumn("customer_id", F.expr(f"customer_id + {start_of_new_ids}"))
                  )

   df1_updates = (df1.sample(False, 0.1)
                  .limit(50 * 1000)
                  .withColumn("alias", F.lit('modified alias'))
                  .withColumn("modified_ts", F.expr('current_timestamp()'))
                  .withColumn("memo", F.lit("update")))

   df_changes = df1_inserts.union(df1_updates)

   display(df_changes)

Merging in the changes
^^^^^^^^^^^^^^^^^^^^^^

We can script the merge statement in the data generator.

The ``updateColumns`` argument specifies which columns should be updated.
The corresponding ``updateColumnExprs`` argument provides SQL expressions as overrides for the
columns being updated. These do not have to be provided - in which case the
values of the columns from the source table will be used.

.. code-block:: python

   df_changes.dropDuplicates(["customer_id"]).createOrReplaceTempView("customers1_changes")
   sqlStmt = dataspec.scriptMerge(tgtName="customers1", srcName="customers1_changes",
                                  joinExpr="src.customer_id=tgt.customer_id",
                                  updateColumns=["alias", "memo", "modified_ts"],
                                  updateColumnExprs=[("memo", "'updated on merge'"),
                                                     ("modified_ts", "now()")
                                                    ])

   print(sqlStmt)

   spark.sql(sqlStmt)

That's all that's required to perform merges with the data generation framework.
Note that these merge script statements can also be used as part of a streaming merge implementation.
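
The closing note mentions streaming merges; a rough sketch of how the scripted merge might be wired into a Structured Streaming foreachBatch handler follows. The streaming source df_stream_changes and the checkpoint path are assumptions, and the snippet reuses dataspec and BASE_PATH from the page above (Spark 3.1+ is assumed for DataFrame.sparkSession).

```python
# Hedged sketch (not part of the commit): applying the scripted merge per micro-batch.
def merge_changes(batch_df, batch_id):
    # register the micro-batch and reuse the scripted merge statement against it
    batch_df.dropDuplicates(["customer_id"]).createOrReplaceTempView("customers1_changes")
    sqlStmt = dataspec.scriptMerge(tgtName="customers1", srcName="customers1_changes",
                                   joinExpr="src.customer_id=tgt.customer_id",
                                   updateColumns=["alias", "memo", "modified_ts"],
                                   updateColumnExprs=[("memo", "'updated on merge'"),
                                                      ("modified_ts", "now()")])
    batch_df.sparkSession.sql(sqlStmt)

# df_stream_changes is an assumed streaming DataFrame with the customers1_changes schema
(df_stream_changes.writeStream
    .foreachBatch(merge_changes)
    .option("checkpointLocation", BASE_PATH + "checkpoints/customers1_merge")
    .start())
```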

docs/source/index.rst

Lines changed: 3 additions & 1 deletion
@@ -27,8 +27,10 @@ to Scala or R based Spark applications also.
    Using data distributions <DISTRIBUTIONS>
    Options for column specification <options_and_features>
    Generating repeatable data <repeatable_data_generation>
-   Extending text generation <extending_text_generation>
    Using streaming data <using_streaming_data>
+   Generating Change Data Capture (CDC) data <generating_cdc_data>
+   Multi table data <multi_table_data>
+   Extending text generation <extending_text_generation>
    Troubleshooting data generation <troubleshooting>
 
 .. toctree::

docs/source/multi_table_data.rst

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
.. Test Data Generator documentation master file, created by
   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Multi table data and change data capture
========================================

See the section on repeatable data generation for the concepts that underpin the data generation.

One common scenario is the need to generate multiple tables
with consistent primary and foreign keys to model join or merge scenarios.

By generating tables with repeatable data, we can generate multiple versions of the same data for different tables and
ensure that we have referential integrity across the tables.

Examples to be added.
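
The page above defers concrete examples, but as a rough illustration of the idea: reusing the same key column specification in two generators keeps the foreign-key domain of one table aligned with the primary-key domain of the other. In the sketch below, table names, row counts, and the join check are assumptions; spark is an existing SparkSession, and it is assumed that an identical long column spec with uniqueValues=N yields the same value domain in both generators.

```python
# Hedged sketch (not part of the commit): consistent keys across two generated tables.
import dbldatagen as dg

unique_customers = 100 * 1000

customer_spec = (dg.DataGenerator(spark, rows=unique_customers, partitions=8,
                                  randomSeedMethod="hash_fieldname")
                 .withColumn("customer_id", "long", uniqueValues=unique_customers)
                 .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
                 )

order_spec = (dg.DataGenerator(spark, rows=1000 * 1000, partitions=8,
                               randomSeedMethod="hash_fieldname")
              .withColumn("order_id", "long", uniqueValues=1000 * 1000)
              # same spec as the customer table's key, so the foreign-key domain matches
              .withColumn("customer_id", "long", uniqueValues=unique_customers)
              .withColumn("amount_cents", "long", minValue=100, maxValue=100000, random=True)
              )

df_customers = customer_spec.build()
df_orders = order_spec.build()

# every order should join to exactly one customer
print(df_orders.join(df_customers, "customer_id").count())
```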
