Generating JSON and structured column data
==========================================

This section explores generating JSON and structured column data. By structured columns,
we mean columns that are some combination of `struct`, `array` and `map` of other types.

Generating JSON data
--------------------

There are two main methods for generating JSON data:

- Generate a dataframe and save it in JSON format, which writes the full data set as JSON
- Generate JSON valued fields using SQL functions such as `named_struct` and `to_json`

Writing a dataframe as JSON data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The following example illustrates the basic technique for generating JSON data from a dataframe.

.. code-block:: python

    from pyspark.sql.types import LongType, IntegerType, StringType

    import dbldatagen as dg

    # number of distinct devices to generate ids for (illustrative value)
    device_population = 100000

    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use a hash of the base field to generate the ids,
                    # to avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # format the long internal device id as a hexadecimal device id string
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")
                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id")

                    # use omit=True if you don't want a column to appear in the final output
                    # but only want to use it in the generation of other columns
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash")
                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00",
                                end="2020-12-31 23:59:00",
                                interval="1 minute", random=True)
                    )

    dfTestData = testDataSpec.build()

    dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData1")

In the most basic form, you can simply save the dataframe to storage in JSON format.

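To sanity check the output, the saved data can be read back with Spark's JSON reader. The
following is a minimal sketch, assuming the `/tmp/jsonData1` path used above:

.. code-block:: python

    # read the generated JSON data back and inspect its schema and a few rows
    dfJson = spark.read.format("json").load("/tmp/jsonData1")
    dfJson.printSchema()
    dfJson.show(5, truncate=False)
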
Use of nested structures in data generation specifications
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When we save a dataframe containing complex column types such as `map`, `struct` and `array`,
these will be converted to equivalent constructs in JSON.

So how do we go about creating these?

We can use a struct valued column to hold the nested structure data and write the results out as JSON.

Struct, array and map valued columns can be created by adding a column of the appropriate type and using
the `expr` attribute to assemble the complex column.

Note that in the current release, the `expr` attribute will override other column data generation rules.

.. code-block:: python

    from pyspark.sql.types import LongType, IntegerType, StringType, TimestampType, \
        StructType, StructField

    import dbldatagen as dg

    # number of distinct devices to generate ids for (illustrative value)
    device_population = 100000

    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use a hash of the base field to generate the ids,
                    # to avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # format the long internal device id as a hexadecimal device id string
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")

                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id", omit=True)
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash", omit=True)
                    # assemble the omitted columns into a struct valued column
                    .withColumn("manufacturer_info",
                                StructType([StructField('line', StringType()),
                                            StructField('manufacturer', StringType())]),
                                expr="named_struct('line', line, 'manufacturer', manufacturer)",
                                baseColumn=['manufacturer', 'line'])

                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True, omit=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00",
                                end="2020-12-31 23:59:00",
                                interval="1 minute", random=True, omit=True)

                    # assemble the omitted event columns into a struct valued column
                    .withColumn("event_info",
                                StructType([StructField('event_type', StringType()),
                                            StructField('event_ts', TimestampType())]),
                                expr="named_struct('event_type', event_type, 'event_ts', event_ts)",
                                baseColumn=['event_type', 'event_ts'])
                    )

    dfTestData = testDataSpec.build()
    dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData2")

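The same approach extends to `array` and `map` valued columns. The following is a minimal
sketch of that pattern; the spec name, column names and SQL expressions here are illustrative
assumptions, not part of the example above:

.. code-block:: python

    from pyspark.sql.types import StringType, ArrayType, MapType

    import dbldatagen as dg

    # hypothetical spec illustrating array and map valued columns built via `expr`
    arrayMapSpec = (dg.DataGenerator(spark, name="array_map_example", rows=1000,
                                     partitions=4)
                    .withColumn("line", StringType(), values=['delta', 'xyzzy', 'lakehouse'],
                                random=True)
                    .withColumn("manufacturer", StringType(),
                                values=['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd'],
                                random=True)
                    # array valued column assembled with the SQL `array` function
                    .withColumn("tags", ArrayType(StringType()),
                                expr="array(line, manufacturer)",
                                baseColumn=['line', 'manufacturer'])
                    # map valued column assembled with the SQL `map` function
                    .withColumn("attributes", MapType(StringType(), StringType()),
                                expr="map('line', line, 'manufacturer', manufacturer)",
                                baseColumn=['line', 'manufacturer'])
                    )

    dfArrayMap = arrayMapSpec.build()
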
Generating JSON valued fields
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

JSON valued fields can be generated as fields of `string` type and assembled using a combination of Spark SQL
functions such as `named_struct` and `to_json`.

.. code-block:: python

    from pyspark.sql.types import LongType, IntegerType, StringType

    import dbldatagen as dg

    # number of distinct devices to generate ids for (illustrative value)
    device_population = 100000

    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use a hash of the base field to generate the ids,
                    # to avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # format the long internal device id as a hexadecimal device id string
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")

                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id", omit=True)
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash", omit=True)
                    # assemble the omitted columns into a JSON valued string column
                    .withColumn("manufacturer_info", "string",
                                expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
                                baseColumn=['manufacturer', 'line'])

                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True, omit=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00",
                                end="2020-12-31 23:59:00",
                                interval="1 minute", random=True, omit=True)

                    # assemble the omitted event columns into a JSON valued string column
                    .withColumn("event_info", "string",
                                expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
                                baseColumn=['event_type', 'event_ts'])
                    )

    dfTestData = testDataSpec.build()

    # dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData2")
    display(dfTestData)
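
The generated JSON string fields can also be parsed back into structured columns when needed.
Here is a minimal sketch, assuming the `dfTestData` dataframe built above, using Spark's
`from_json` function:

.. code-block:: python

    from pyspark.sql.functions import col, from_json
    from pyspark.sql.types import StringType, StructField, StructType, TimestampType

    # schema matching the JSON assembled into the event_info column above
    event_schema = StructType([StructField('event_type', StringType()),
                               StructField('event_ts', TimestampType())])

    # parse the JSON string column back into a struct valued column
    dfParsed = dfTestData.withColumn("event_info_struct",
                                     from_json(col("event_info"), event_schema))
    dfParsed.select("event_info", "event_info_struct.event_type").show(5, truncate=False)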