Skip to content

Commit d90ce87

Browse files
made 'hash_fieldname' default for random seed generation for better value spread (#58)
* made 'hash_fieldname' default for random seed generation for better spread of values
1 parent 3873f87 commit d90ce87

File tree

3 files changed

+32
-18
lines changed

3 files changed

+32
-18
lines changed

dbldatagen/data_generator.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919
_OLD_MIN_OPTION = 'min'
2020
_OLD_MAX_OPTION = 'max'
2121

22-
NO_SEED_SUPPLIED = -2
23-
2422

2523
class DataGenerator:
2624
""" Main Class for test data set generation
@@ -53,7 +51,7 @@ class DataGenerator:
5351
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.NOTSET)
5452

5553
def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
56-
rows=1000000, startingId=0, randomSeed=NO_SEED_SUPPLIED, partitions=None, verbose=False,
54+
rows=1000000, startingId=0, randomSeed=None, partitions=None, verbose=False,
5755
batchSize=None, debug=False, **kwargs):
5856
""" Constructor for data generator object """
5957

@@ -93,11 +91,20 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
9391
self.logger.warning("option 'generateWithSelects' switch is deprecated - selects will always be used")
9492

9593
self._seedMethod = randomSeedMethod
96-
self._instanceRandomSeed = randomSeed if randomSeed != NO_SEED_SUPPLIED else self._randomSeed
9794

98-
# if a valid random seed was supplied but no seed method was applied, make the seed method "fixed"
99-
if (randomSeed is not None and randomSeed != RANDOM_SEED_RANDOM) and randomSeedMethod is None:
100-
self._seedMethod = "fixed"
95+
if randomSeed is None:
96+
self._instanceRandomSeed = self._randomSeed
97+
98+
if randomSeedMethod is None:
99+
self._seedMethod = RANDOM_SEED_HASH_FIELD_NAME
100+
else:
101+
self._seedMethod = randomSeedMethod
102+
else:
103+
self._instanceRandomSeed = randomSeed
104+
105+
# if a valid random seed was supplied but no seed method was applied, make the seed method "fixed"
106+
if randomSeedMethod is None:
107+
self._seedMethod = "fixed"
101108

102109
if self._seedMethod not in [None, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME]:
103110
msg = f"""seedMethod should be None, '{RANDOM_SEED_FIXED}' or '{RANDOM_SEED_HASH_FIELD_NAME}' """
@@ -700,6 +707,12 @@ def _generateColumnDefinition(self, colName, colType=None, baseColumn=None,
700707
effective_random_seed = new_props["randomSeed"]
701708
new_props.pop("randomSeed")
702709
new_props["random"] = True
710+
711+
# if random seed has override but randomSeedMethod does not
712+
# set it to fixed
713+
if "randomSeedMethod" not in new_props:
714+
new_props["randomSeedMethod"] = RANDOM_SEED_FIXED
715+
703716
elif "random" in new_props and new_props["random"]:
704717
effective_random_seed = self._instanceRandomSeed
705718
else:

docs/source/repeatable_data_generation.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ column specification is marked as ``random``.
6969
If a random seed is provided, either as an argument to the DataGenerator instance specification,
7070
or as option on the column specification, the random seed will be applied to fields when random data generation is used.
7171

72-
By default, a default random seed is used unless a specific random seed is provided.
72+
If no random seed is provided, a default random seed is used and the randomSeedMethod defaults to 'hash_fieldname'.
73+
If a specific random seed is provided without a randomSeedMethod, the randomSeedMethod is set to 'fixed'.
7374

7475
Use of Hashed field names
7576
^^^^^^^^^^^^^^^^^^^^^^^^^

tests/test_repeatable_data.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -296,21 +296,21 @@ def test_random_seed_flow(self):
296296
self.assertEqual(pluginDataspec.randomSeed, dg.DEFAULT_RANDOM_SEED)
297297

298298
code1Spec = pluginDataspec.getColumnSpec("code1")
299-
self.assertEqual(code1Spec.randomSeed, dg.DEFAULT_RANDOM_SEED)
299+
self.assertIsNotNone(code1Spec.randomSeed)
300300
code2Spec = pluginDataspec.getColumnSpec("code2")
301-
self.assertEqual(code2Spec.randomSeed, 2021)
301+
self.assertEqual(code2Spec.randomSeed, 2021, "code2")
302302

303303
textSpec = pluginDataspec.getColumnSpec("text")
304-
self.assertEqual(textSpec.randomSeed, dg.DEFAULT_RANDOM_SEED)
304+
self.assertIsNotNone(textSpec.randomSeed)
305305

306306
textSpec2 = pluginDataspec.getColumnSpec("text2")
307-
self.assertEqual(textSpec2.textGenerator.randomSeed, dg.DEFAULT_RANDOM_SEED)
307+
self.assertIsNotNone(textSpec2.textGenerator.randomSeed)
308308

309309
ilTextSpec = pluginDataspec.getColumnSpec("paras")
310-
self.assertEqual(ilTextSpec.randomSeed, dg.DEFAULT_RANDOM_SEED)
310+
self.assertIsNotNone(ilTextSpec.randomSeed)
311311

312312
ilTextSpec2 = pluginDataspec.getColumnSpec("paras2")
313-
self.assertEqual(ilTextSpec2.textGenerator.randomSeed, dg.DEFAULT_RANDOM_SEED)
313+
self.assertIsNotNone(ilTextSpec2.textGenerator.randomSeed)
314314

315315
self.assertEqual(textSpec.randomSeed, textSpec.textGenerator.randomSeed)
316316
self.assertEqual(textSpec2.randomSeed, textSpec2.textGenerator.randomSeed)
@@ -335,16 +335,16 @@ def test_random_seed_flow2(self):
335335
self.assertEqual(pluginDataspec.randomSeed, effective_random_seed)
336336

337337
code1Spec = pluginDataspec.getColumnSpec("code1")
338-
self.assertEqual(code1Spec.randomSeed, effective_random_seed, "code1")
338+
self.assertIsNotNone(code1Spec.randomSeed, "code1")
339339

340340
textSpec = pluginDataspec.getColumnSpec("text")
341-
self.assertEqual(textSpec.randomSeed, effective_random_seed, "text")
341+
self.assertIsNotNone(textSpec.randomSeed, "text")
342342

343343
code2Spec = pluginDataspec.getColumnSpec("code2")
344-
self.assertEqual(code1Spec.randomSeed, effective_random_seed, "code2")
344+
self.assertIsNotNone(code1Spec.randomSeed, "code2")
345345

346346
text2Spec = pluginDataspec.getColumnSpec("text2")
347-
self.assertEqual(text2Spec.randomSeed, effective_random_seed, "text2")
347+
self.assertIsNotNone(text2Spec.randomSeed, "text2")
348348

349349
self.assertEqual(textSpec.randomSeed, textSpec.textGenerator.randomSeed)
350350
self.assertEqual(text2Spec.randomSeed, text2Spec.textGenerator.randomSeed)

0 commit comments

Comments
 (0)