Commit 04c8bd0

Feature complex columns (#150)
* wip
* switched type definition parser to pyparsing-based implementation
* changes to support initialization of map, struct and array columns
* additional tests for reported issue
* added dependency specifiers on pyparsing
* updated tests
* updated tests and invalid type detection
* updated tests and invalid type detection
* corrected addParseAction lambda signature for consistency
1 parent 41f41bb commit 04c8bd0

File tree

10 files changed: +659 -131 lines changed

CHANGELOG.md

Lines changed: 10 additions & 1 deletion
@@ -6,10 +6,19 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 ### Unreleased
 
 #### Changed
-* Refactoring of template text generation for better performance
+* Refactoring of template text generation for better performance via vectorized implementation
+* Additional migration of tests to use of `pytest`
+
+#### Fixed
+* added type parsing support for binary and constructs such as `nvarchar(10)`
+* Fixed error occurring when schema contains map, array or struct.
 
 #### Added
 * Ability to change name of seed column to custom name (defaults to `id`)
+* Added type parsing support for structs, maps and arrays and combinations of the above
+
+#### Notes
+* column definitions for map, struct or array must use `expr` attribute to initialize field. Defaults to `NULL`
 
 ### Version 0.3.0
 
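As the new Notes entry states, a map, struct or array column is only populated when an `expr` is supplied; otherwise it defaults to `NULL`. A minimal usage sketch, assuming an active `spark` session and illustrative generator/column names (the `expr` option and the string form of the column type are the ones documented in the `withColumn` docstring changes later in this commit):

import dbldatagen as dg

# Complex columns default to NULL unless `expr` supplies a value.
spec = (dg.DataGenerator(spark, name="complex_types_example", rows=100)
        .withColumn("tags", "array<string>",
                    expr="array('a', 'b', cast(id as string))")
        .withColumn("counters", "map<string,int>",
                    expr="map('mod2', int(id % 2), 'mod5', int(id % 5))")
        .withColumn("address", "struct<city:string,zip:int>",
                    expr="named_struct('city', 'Anytown', 'zip', int(10000 + id))"))

df = spec.build()
df.show(truncate=False)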

Pipfile

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ numpy = "1.22.0"
 pyspark = "3.1.3"
 pyarrow = "1.0.1"
 pandas = "1.1.3"
+pyparsing = ">=2.4.7,<3.0.9"
 
 sphinx = ">=2.0.0,<3.1.0"
 nbsphinx = "*"
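The pinned pyparsing dependency backs the commit's switch to a pyparsing-based type-definition parser. The grammar below is only a rough, hypothetical sketch of the recursive style of parser this enables; it is not the grammar dbldatagen actually uses:

import pyparsing as pp

# Forward declaration so nested types such as array<map<string,int>> can refer back to the grammar
type_expr = pp.Forward()

ident = pp.Word(pp.alphas, pp.alphanums + "_")
LT, GT, COMMA = map(pp.Suppress, "<>,")

array_type = pp.Group(pp.CaselessKeyword("array") + LT + type_expr + GT)
map_type = pp.Group(pp.CaselessKeyword("map") + LT + type_expr + COMMA + type_expr + GT)
struct_field = pp.Group(ident + pp.Suppress(":") + type_expr)
struct_type = pp.Group(pp.CaselessKeyword("struct") + LT + pp.delimitedList(struct_field) + GT)

# Simple types, optionally parameterized, e.g. decimal(10,3) or nvarchar(10)
simple_type = ident + pp.Optional(pp.Suppress("(") + pp.delimitedList(pp.Word(pp.nums)) + pp.Suppress(")"))

type_expr <<= array_type | map_type | struct_type | simple_type

print(type_expr.parseString("map<string, array<int>>").asList())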

dbldatagen/column_generation_spec.py

Lines changed: 8 additions & 2 deletions
@@ -13,7 +13,7 @@
 from pyspark.sql.functions import lit, concat, rand, round as sql_round, array, expr, when, udf, \
     format_string
 from pyspark.sql.types import FloatType, IntegerType, StringType, DoubleType, BooleanType, \
-    TimestampType, DataType, DateType
+    TimestampType, DataType, DateType, ArrayType, MapType, StructType
 
 from .column_spec_options import ColumnSpecOptions
 from .datagen_constants import RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, RANDOM_SEED_RANDOM, DEFAULT_SEED_COLUMN
@@ -970,6 +970,9 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
         new_def = None
 
         # generate expression
+        if type(self.datatype) in [ArrayType, MapType, StructType] and self.expr is None:
+            self.logger.warning("Array, Map or Struct type column with no SQL `expr` will result in NULL value")
+            self.executionHistory.append(".. WARNING: Array, Map or Struct type column with no SQL `expr` ")
 
         # handle weighted values for weighted value columns
         # a weighted values column will use a base value denoted by `self._weightedBaseColumn`
@@ -988,7 +991,8 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
         # rs: initialize the begin, end and interval if not initialized for date computations
         # defaults are start of day, now, and 1 minute respectively
 
-        self._computeImpliedRangeIfNeeded(self.datatype)
+        if not type(self.datatype) in [ArrayType, MapType, StructType]:
+            self._computeImpliedRangeIfNeeded(self.datatype)
 
         # TODO: add full support for date value generation
         if self.expr is not None:
@@ -998,6 +1002,8 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
             # record execution history
             self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
            self.executionHistory.append(f".. casting to `{self.datatype}`")
+        elif type(self.datatype) in [ArrayType, MapType, StructType]:
+            new_def = expr("NULL")
         elif self._dataRange is not None and self._dataRange.isFullyPopulated():
             self.executionHistory.append(f".. computing ranged value: {self._dataRange}")
             new_def = self._computeRangedColumn(base_column=self.baseColumn, datarange=self._dataRange,
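The net effect of the branches above: a complex-typed column with no `expr` is generated as `expr("NULL")` (after a logged warning), while an explicit SQL expression builds the actual value. A standalone PySpark sketch of both behaviours, outside of dbldatagen and with illustrative names:

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()
df = spark.range(3)

df = (df
      # roughly what the fallback path produces: a NULL value of the complex type
      .withColumn("empty_struct", expr("CAST(NULL AS struct<a:int,b:string>)"))
      # what an explicit SQL expression can produce instead
      .withColumn("tags", expr("array(id, id * 10)"))
      .withColumn("attrs", expr("map('id', id, 'double_id', id * 2)")))

df.show(truncate=False)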

dbldatagen/data_generator.py

Lines changed: 23 additions & 2 deletions
@@ -39,6 +39,11 @@ class DataGenerator:
     :param batchSize: = UDF batch number of rows to pass via Apache Arrow to Pandas UDFs
     :param debug: = if set to True, output debug level of information
     :param seedColumnName: = if set, this should be the name of the `seed` or logical `id` column. Defaults to `id`
+
+    By default the seed column is named `id`. If you need to use this column name in your generated data,
+    it is recommended that you use a different name for the seed column - for example `_id`.
+
+    This may be specified by setting the `seedColumnName` attribute to `_id`
     """
 
     # class vars
@@ -699,12 +704,28 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
                    baseColumn=None, nullable=True,
                    omit=False, implicit=False, noWarn=False,
                    **kwargs):
-        """ add a new column for specification
+        """ add a new column to the synthetic data generation specification
+
+        :param colName: Name of column to add. If this conflicts with the underlying seed column (`id`), it is
+                        recommended that the seed column name is customized during the construction of the data
+                        generator spec.
+        :param colType: Data type for column. This may be specified as either a type from one of the possible
+                        pyspark.sql.types (e.g. `StringType`, `DecimalType(10,3)` etc) or as a string containing a Spark
+                        SQL type definition (i.e `String`, `array<Integer>`, `map<String, Float>`)
+        :param omit: if True, the column will be omitted from the final set of columns in the generated data.
+                     Used to create columns that are used by other columns as intermediate results.
+                     Defaults to False
+
+        :param expr: Specifies SQL expression used to create column value. If specified, overrides the default rules
+                     for creating column value. Defaults to None
+
+        :param baseColumn: String or list of columns to control order of generation of columns. If not specified,
+                           column is dependent on base seed column (which defaults to `id`)
 
        :returns: modified in-place instance of test data generator allowing for chaining of calls
                  following Builder pattern
 
-        You may also add a variety of options to further control the test data generation process.
+        You may also add a variety of additional options to further control the test data generation process.
         For full list of options, see :doc:`/reference/api/dbldatagen.column_spec_options`.
 
         """
