Commit 04c8bd0

Feature complex columns (#150)
* wip
* switched type definition parser to pyparsing-based implementation
* changes to support initialization of map, struct and array columns
* additional tests for reported issue
* added dependency specifiers on pyparsing
* updated tests
* updated tests and invalid type detection
* updated tests and invalid type detection
* corrected addParseAction lambda signature for consistency
1 parent 41f41bb commit 04c8bd0

File tree

10 files changed: +659 -131 lines changed

CHANGELOG.md

Lines changed: 10 additions & 1 deletion
@@ -6,10 +6,19 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 ### Unreleased
 
 #### Changed
-* Refactoring of template text generation for better performance
+* Refactoring of template text generation for better performance via vectorized implementation
+* Additional migration of tests to use of `pytest`
+
+#### Fixed
+* added type parsing support for binary and constructs such as `nvarchar(10)`
+* Fixed error occurring when schema contains map, array or struct.
 
 #### Added
 * Ability to change name of seed column to custom name (defaults to `id`)
+* Added type parsing support for structs, maps and arrays and combinations of the above
+
+#### Notes
+* column definitions for map, struct or array must use `expr` attribute to initialize field. Defaults to `NULL`
 
 ### Version 0.3.0
 
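As the new Notes entry states, a map, struct or array column is only populated when an `expr` is supplied; otherwise it defaults to `NULL`. A minimal usage sketch, assuming an active `spark` session and illustrative generator/column names (the `expr` option and the string form of the column type are the ones documented in the `withColumn` docstring changes later in this commit):

import dbldatagen as dg

# Complex columns default to NULL unless `expr` supplies a value.
spec = (dg.DataGenerator(spark, name="complex_types_example", rows=100)
        .withColumn("tags", "array<string>",
                    expr="array('a', 'b', cast(id as string))")
        .withColumn("counters", "map<string,int>",
                    expr="map('mod2', int(id % 2), 'mod5', int(id % 5))")
        .withColumn("address", "struct<city:string,zip:int>",
                    expr="named_struct('city', 'Anytown', 'zip', int(10000 + id))"))

df = spec.build()
df.show(truncate=False)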

Pipfile

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ numpy = "1.22.0"
 pyspark = "3.1.3"
 pyarrow = "1.0.1"
 pandas = "1.1.3"
+pyparsing = ">=2.4.7,<3.0.9"
 
 sphinx = ">=2.0.0,<3.1.0"
 nbsphinx = "*"
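The pinned pyparsing dependency backs the commit's switch to a pyparsing-based type-definition parser. The grammar below is only a rough, hypothetical sketch of the recursive style of parser this enables; it is not the grammar dbldatagen actually uses:

import pyparsing as pp

# Forward declaration so nested types such as array<map<string,int>> can refer back to the grammar
type_expr = pp.Forward()

ident = pp.Word(pp.alphas, pp.alphanums + "_")
LT, GT, COMMA = map(pp.Suppress, "<>,")

array_type = pp.Group(pp.CaselessKeyword("array") + LT + type_expr + GT)
map_type = pp.Group(pp.CaselessKeyword("map") + LT + type_expr + COMMA + type_expr + GT)
struct_field = pp.Group(ident + pp.Suppress(":") + type_expr)
struct_type = pp.Group(pp.CaselessKeyword("struct") + LT + pp.delimitedList(struct_field) + GT)

# Simple types, optionally parameterized, e.g. decimal(10,3) or nvarchar(10)
simple_type = ident + pp.Optional(pp.Suppress("(") + pp.delimitedList(pp.Word(pp.nums)) + pp.Suppress(")"))

type_expr <<= array_type | map_type | struct_type | simple_type

print(type_expr.parseString("map<string, array<int>>").asList())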

dbldatagen/column_generation_spec.py

Lines changed: 8 additions & 2 deletions
@@ -13,7 +13,7 @@
 from pyspark.sql.functions import lit, concat, rand, round as sql_round, array, expr, when, udf, \
     format_string
 from pyspark.sql.types import FloatType, IntegerType, StringType, DoubleType, BooleanType, \
-    TimestampType, DataType, DateType
+    TimestampType, DataType, DateType, ArrayType, MapType, StructType
 
 from .column_spec_options import ColumnSpecOptions
 from .datagen_constants import RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, RANDOM_SEED_RANDOM, DEFAULT_SEED_COLUMN
@@ -970,6 +970,9 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
         new_def = None
 
         # generate expression
+        if type(self.datatype) in [ArrayType, MapType, StructType] and self.expr is None:
+            self.logger.warning("Array, Map or Struct type column with no SQL `expr` will result in NULL value")
+            self.executionHistory.append(".. WARNING: Array, Map or Struct type column with no SQL `expr` ")
 
         # handle weighted values for weighted value columns
         # a weighted values column will use a base value denoted by `self._weightedBaseColumn`
@@ -988,7 +991,8 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
         # rs: initialize the begin, end and interval if not initialized for date computations
         # defaults are start of day, now, and 1 minute respectively
 
-        self._computeImpliedRangeIfNeeded(self.datatype)
+        if not type(self.datatype) in [ArrayType, MapType, StructType]:
+            self._computeImpliedRangeIfNeeded(self.datatype)
 
         # TODO: add full support for date value generation
         if self.expr is not None:
@@ -998,6 +1002,8 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
             # record execution history
             self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
            self.executionHistory.append(f".. casting to `{self.datatype}`")
+        elif type(self.datatype) in [ArrayType, MapType, StructType]:
+            new_def = expr("NULL")
         elif self._dataRange is not None and self._dataRange.isFullyPopulated():
             self.executionHistory.append(f".. computing ranged value: {self._dataRange}")
             new_def = self._computeRangedColumn(base_column=self.baseColumn, datarange=self._dataRange,
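The net effect of the branches above: a complex-typed column with no `expr` is generated as `expr("NULL")` (after a logged warning), while an explicit SQL expression builds the actual value. A standalone PySpark sketch of both behaviours, outside of dbldatagen and with illustrative names:

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()
df = spark.range(3)

df = (df
      # roughly what the fallback path produces: a NULL value of the complex type
      .withColumn("empty_struct", expr("CAST(NULL AS struct<a:int,b:string>)"))
      # what an explicit SQL expression can produce instead
      .withColumn("tags", expr("array(id, id * 10)"))
      .withColumn("attrs", expr("map('id', id, 'double_id', id * 2)")))

df.show(truncate=False)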

dbldatagen/data_generator.py

Lines changed: 23 additions & 2 deletions
@@ -39,6 +39,11 @@ class DataGenerator:
     :param batchSize: = UDF batch number of rows to pass via Apache Arrow to Pandas UDFs
     :param debug: = if set to True, output debug level of information
     :param seedColumnName: = if set, this should be the name of the `seed` or logical `id` column. Defaults to `id`
+
+    By default the seed column is named `id`. If you need to use this column name in your generated data,
+    it is recommended that you use a different name for the seed column - for example `_id`.
+
+    This may be specified by setting the `seedColumnName` attribute to `_id`
     """
 
     # class vars
@@ -699,12 +704,28 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
                    baseColumn=None, nullable=True,
                    omit=False, implicit=False, noWarn=False,
                    **kwargs):
-        """ add a new column for specification
+        """ add a new column to the synthetic data generation specification
+
+        :param colName: Name of column to add. If this conflicts with the underlying seed column (`id`), it is
+                        recommended that the seed column name is customized during the construction of the data
+                        generator spec.
+        :param colType: Data type for column. This may be specified as either a type from one of the possible
+                        pyspark.sql.types (e.g. `StringType`, `DecimalType(10,3)` etc) or as a string containing a Spark
+                        SQL type definition (i.e `String`, `array<Integer>`, `map<String, Float>`)
+        :param omit: if True, the column will be omitted from the final set of columns in the generated data.
+                     Used to create columns that are used by other columns as intermediate results.
+                     Defaults to False
+
+        :param expr: Specifies SQL expression used to create column value. If specified, overrides the default rules
+                     for creating column value. Defaults to None
+
+        :param baseColumn: String or list of columns to control order of generation of columns. If not specified,
+                           column is dependent on base seed column (which defaults to `id`)
 
        :returns: modified in-place instance of test data generator allowing for chaining of calls
                  following Builder pattern
 
-        You may also add a variety of options to further control the test data generation process.
+        You may also add a variety of additional options to further control the test data generation process.
         For full list of options, see :doc:`/reference/api/dbldatagen.column_spec_options`.
 
         """
