Skip to content

Commit 117e0d3

Browse files
Feature text generation plugins (#51)
* squashed commits and rebased with master * used address instead of cc * added FakerTextFactory with predefined Faker initialization * simplified syntax * simplified syntax * added fakerText() for default factory * updates to docs * updates to docs,etc
1 parent f22a79e commit 117e0d3

File tree

8 files changed

+740
-35
lines changed

8 files changed

+740
-35
lines changed

dbldatagen/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,12 @@
3737
from .function_builder import ColumnGeneratorBuilder
3838
from .spark_singleton import SparkSingleton
3939
from .text_generators import TemplateGenerator, ILText, TextGenerator
40-
40+
from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
4141

4242
__all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
4343
"column_generation_spec", "utils", "function_builder",
44-
"spark_singleton", "text_generators", "datarange", "datagen_constants"
44+
"spark_singleton", "text_generators", "datarange", "datagen_constants",
45+
"text_generator_plugins"
4546
]
4647

4748

dbldatagen/text_generator_plugins.py

Lines changed: 380 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,380 @@
1+
# See the License for the specific language governing permissions and
2+
# limitations under the License.
3+
#
4+
5+
"""
6+
This file defines the text generator plugin class `PyfuncText`
7+
"""
8+
9+
import importlib
10+
import logging
11+
12+
from .text_generators import TextGenerator
13+
from .utils import DataGenError
14+
15+
16+
class PyfuncText(TextGenerator):
    """ Text generator that supports generating text from arbitrary Python function

    :param fn: function to call to generate text.
    :param init: function to call to initialize context
    :param initPerBatch: if init per batch is set to True, initialization of context is performed on every Pandas udf
                         call. Default is False.
    :param name: String representing name of text generator when converted to string via ``str``
    :param rootProperty: optional name of a context property. If set, the value of that property is passed
                         to ``fn`` in place of the context and column value.

    The two functions define the plugin model

    The first function, ``fn`` is called whenever text should be generated for a single column of a single row

    It is called with the signature ``fn(context, value)`` unless a root property is set, in which case the
    signature is ``fn(rootProperty)`` with rootProperty having the value of the root property of the context.

    Context is the stored context containing instances of random number generators, 3rd party
    client library objects etc.

    The ``init`` function is called as ``init(context)`` to initialize the function call context. The plugin code
    can store arbitrary properties in the context following normal Python object rules.

    The context is created with the property `textGenerator` set before ``init`` is called. It is a reference
    to the enclosing text generator.

    .. note::
       There are no expectations of repeatability of data generation when using external code
       or external libraries to generate text.

       However, custom code can call the base class method to get a Numpy random
       number generator instance. This will have been seeded using the ``dbldatagen``
       random number seed if one was specified, so random numbers generated from this will be repeatable.

       The custom code may call the property ``randomSeed`` on the text generator object to get the random seed
       which may be used to seed library specific initialization.

       This random seed property may have the values ``None`` or ``-1`` which should be treated as meaning
       do not use a random seed.

       The code does not guarantee thread or cross process safety. If a new instance of the random number
       generator is needed, you may call the base class method with the argument `forceNewInstance` set to True.
    """

    class _FnCallContext:
        """ inner class to support storage of context between calls

        Plugin code stores instances of random number generators, clients for services etc here
        during execution of the ``init`` function.

        :param txtGen: - reference to outer PyfuncText object
        """

        def __init__(self, txtGen):
            self.textGenerator = txtGen

    def __init__(self, fn, init=None, initPerBatch=False, name=None, rootProperty=None):
        super().__init__()
        # both conditions must hold: the previous `or` accepted any non-None value, callable or not
        assert fn is not None and callable(fn), "Function must be provided wiith signature fn(context, oldValue)"
        assert init is None or callable(init), "Init function must be a callable function or lambda if passed"

        # if root property is provided, root property will be passed to generate text function
        self._rootProperty = rootProperty

        self._pyFn = fn  # generate text function
        self._initFn = init  # context initialization function
        self._context = None  # context used to hold library root object and other properties

        # if init per batch is True, initialization of context will be per UDF call
        assert initPerBatch in [True, False], "initPerBatch must evaluate to boolean True or False"
        self._initPerBatch = initPerBatch

        self._name = name if name is not None else "PyfuncText"

    def __str__(self):
        """ Get string representation of object

        ``name`` property is used to provide user friendly name for text generator
        """
        return f"{self._name}({repr(self._pyFn)}, init={self._initFn})"

    def _getContext(self, forceNewInstance=False):
        """ Get the context for plugin function calls

        :param forceNewInstance: if True, a new context is created and initialized on every call and is
                                 not cached on the text generator
        :return: existing or newly created context.
        """
        if forceNewInstance or self._context is None:
            context = PyfuncText._FnCallContext(self)

            # init context using context creator if any provided
            if self._initFn is not None:
                self._initFn(context)

            # a forced instance is handed straight back and never replaces the cached one
            if forceNewInstance:
                return context

            self._context = context

        return self._context

    def pandasGenerateText(self, v):
        """ Called to generate text via Pandas UDF mechanism

        :param v: base value of column as Pandas Series
        :return: result of applying the text generation function to each element of ``v``
        """
        # save object properties in local vars to avoid overhead of object dereferences
        # on every call
        context = self._getContext(self._initPerBatch)
        evalFn = self._pyFn
        rootProperty = getattr(context, self._rootProperty) if self._rootProperty is not None else None

        # select the per-element function once, rather than testing on every element
        if rootProperty is not None:
            def _generate(_originalValue):
                # root property mode: the column value is not passed to the plugin function
                return evalFn(rootProperty)
        else:
            def _generate(originalValue):
                return evalFn(context, originalValue)

        return v.apply(_generate)
142+
143+
144+
class PyfuncTextFactory:
    """PyfuncTextFactory applies syntactic wrapping around creation of PyfuncText objects

    :param name: name of generated object (when converted to string via ``str``)

    It allows the use of the following constructs:

    .. code-block:: python

       # initialization (for Faker for example)

       # setup use of Faker
       def initFaker(ctx):
           ctx.faker = Faker(locale="en_US")
           ctx.faker.add_provider(internet)

       FakerText = (PyfuncTextFactory(name="FakerText")
                    .withInit(initFaker)        # determines how context should be initialized
                    .withRootProperty("faker")  # determines what context property is passed to fn
                    )

       # later use ...
       .withColumn("fake_name", text=FakerText("name") )
       .withColumn("fake_sentence", text=FakerText("sentence", ext_word_list=my_word_list) )

       # translates to generation of lambda function with keyword arguments
       # or without as needed
       .withColumn("fake_name",
                   text=PyfuncText( (lambda faker: faker.name( )),
                                    init=initFaker,
                                    rootProperty="faker",
                                    name="FakerText"))
       .withColumn("fake_sentence",
                   text=PyfuncText( (lambda faker:
                                        faker.sentence( **{ "ext_word_list" : my_word_list} )),
                                    init=initFaker,
                                    rootProperty="faker",
                                    name="FakerText"))

    """

    def __init__(self, name=None):
        """
        :param name: name given to generated text generators. Defaults to ``PyfuncText``
        """
        self._initFn = None
        self._rootProperty = None
        self._name = "PyfuncText" if name is None else name
        self._initPerBatch = False

    def withInit(self, fn):
        """ Specifies context initialization function

        :param fn: function pointer or lambda function for initialization
                   signature should ``initFunction(context)``
        :return: this factory, to allow call chaining

        .. note::
           This variation initializes the context once per worker process per text generator
           instance.
        """
        self._initFn = fn
        return self

    def withInitPerBatch(self, fn):
        """ Specifies context initialization function

        :param fn: function pointer or lambda function for initialization
                   signature should ``initFunction(context)``
        :return: this factory, to allow call chaining

        .. note::
           This variation initializes the context once per internal pandas UDF call.
           The UDF call will be called once per 10,000 rows if system is configured using defaults.
           Setting the pandas batch size as an argument to the DataSpec creation will change the default
           batch size.
        """
        self._initPerBatch = True
        return self.withInit(fn)

    def withRootProperty(self, prop):
        """ If called, specifies the property of the context to be passed to the text generation function.
        If not called, the context object itself will be passed to the text generation function.

        :param prop: name of the context property
        :return: this factory, to allow call chaining
        """
        self._rootProperty = prop
        return self

    def __call__(self, evalFn, *args, isProperty=False, **kwargs):
        """ Internal function call mechanism that implements the syntax expansion

        :param evalFn: text generation function or lambda, or the string name of a method or property
                       of the root property object
        :param args: optional args to be passed by position (used only when ``evalFn`` is a string)
        :param kwargs: optional keyword args following Python keyword passing mechanism
                       (used only when ``evalFn`` is a string)
        :param isProperty: if true, interpret evalFn as string name of property, not a function or method
        :return: ``PyfuncText`` text generator configured from this factory's settings
        """
        assert evalFn is not None and (isinstance(evalFn, str) or callable(evalFn)), "Function must be provided"

        if isinstance(evalFn, str):
            assert self._rootProperty is not None and len(self._rootProperty.strip()) > 0, \
                "string named functions can only be used on text generators with root property"
            fnName = evalFn

            if isProperty:
                # property access takes no arguments, so supplying any is a caller error
                assert not args and not kwargs, "isProperty cannot be true if using arguments"
                evalFn = (lambda root: getattr(root, fnName))
            else:
                # empty ``args`` / ``kwargs`` unpack to a plain no-argument call, so one lambda
                # covers the positional, keyword, mixed and no-argument cases
                evalFn = (lambda root: getattr(root, fnName)(*args, **kwargs))

        # returns the actual PyfuncText text generator object.
        # Note all syntax expansion is performed once only.
        # ``initPerBatch`` must be forwarded, otherwise ``withInitPerBatch`` has no effect.
        return PyfuncText(evalFn, init=self._initFn, initPerBatch=self._initPerBatch,
                          name=self._name, rootProperty=self._rootProperty)
266+
267+
268+
class FakerTextFactory(PyfuncTextFactory):
    """ Factory object for Faker text generator flavored ``PyfuncText`` objects

    :param locale: locale string or list of locale strings. If None, the root object is constructed
                   without a locale argument
    :param providers: list of providers to add to the root object
    :param name: name of generated objects. Defaults to ``FakerText``
    :param lib: library import name of Faker library. If none passed, uses ``faker``
    :param rootClass: name of root object class. If none passed, uses ``Faker``

    .. note::
       Both the library name and root object class can be overridden - this is primarily for internal
       testing purposes.
    """

    _FAKER_LIB = "faker"

    # lazily created shared factory returned by ``_getDefaultFactory``
    _defaultFakerTextFactory = None

    def __init__(self, locale=None, providers=None, name="FakerText", lib=None,
                 rootClass=None):

        super().__init__(name)

        # set up the logger
        self._logger = logging.getLogger("FakerTextFactory")
        self._logger.setLevel(logging.WARNING)

        # setup Faker library to use
        if lib is None:
            lib = self._FAKER_LIB

        # allow overriding the root object class for test purposes
        if rootClass is None:
            self._rootObjectClass = "Faker"
        else:
            self._rootObjectClass = rootClass

        # load the library
        fakerModule = self._loadLibrary(lib)

        # make the initialization function
        initFn = self._mkInitFn(fakerModule, locale, providers)

        self.withInit(initFn)
        self.withRootProperty("faker")

    @classmethod
    def _getDefaultFactory(cls, lib=None, rootClass=None):
        """Class method to get default faker text factory

        Not intended for general use

        :param lib: library import name, used only when the default factory is first created
        :param rootClass: root object class name, used only when the default factory is first created
        :return: the shared default ``FakerTextFactory`` instance

        Once the default factory exists it is returned as is; ``lib`` and ``rootClass`` passed on
        later calls are ignored.
        """
        if cls._defaultFakerTextFactory is None:
            cls._defaultFakerTextFactory = FakerTextFactory(lib=lib, rootClass=rootClass)
        return cls._defaultFakerTextFactory

    def _mkInitFn(self, libModule, locale, providers):
        """ Make Faker initialization function

        :param libModule: loaded library module that provides the root object class
        :param locale: locale string or list of locale strings
        :param providers: providers to load
        :return: function with signature ``initFn(context)`` that stores the root object as ``context.faker``
        """
        assert libModule is not None, "must have a valid loaded Faker library module"

        fakerClass = getattr(libModule, self._rootObjectClass)

        # define the initialization function for Faker
        def fakerInitFn(ctx):
            if locale is not None:
                ctx.faker = fakerClass(locale=locale)
            else:
                ctx.faker = fakerClass()

            if providers is not None:
                for provider in providers:
                    ctx.faker.add_provider(provider)

        return fakerInitFn

    def _loadLibrary(self, lib):
        """ Load faker library if not already loaded

        :param lib: library name of Faker library
        :return: the loaded module, or None if ``lib`` is None
        :raises DataGenError: if loading the library raises a ``RuntimeError``
        """
        # NOTE(review): only RuntimeError is wrapped. A missing library raises ImportError, which
        # propagates to the caller unchanged - confirm whether it should also be wrapped.
        try:
            if lib is not None:
                assert type(lib) is str and len(lib.strip()), f"Library ``{lib}`` must be a valid library name"

                # the module is cached in this module's globals under the library name
                if lib in globals():
                    return globals()[lib]

                fakerModule = importlib.import_module(lib)
                globals()[lib] = fakerModule
                return fakerModule
        except RuntimeError as err:
            # chain explicitly so the original failure is recorded as the cause
            raise DataGenError("Could not load or initialize Faker library", err) from err

        return None
365+
366+
367+
368+
369+
370+
def fakerText(mname, *args, _lib=None, _rootClass=None, **kwargs):
    """Build a Faker based text generator from the shared default ``FakerTextFactory``

    :param mname: passed to the default factory as the text generation function or its string name
    :param args: positional arguments passed through to the default factory
    :param _lib: library import name passed to ``FakerTextFactory._getDefaultFactory``
    :param _rootClass: root object class name passed to ``FakerTextFactory._getDefaultFactory``
    :param kwargs: keyword arguments passed through to the default factory
    :returns: instance of PyfuncText for use with Faker

    ``fakerText("sentence")`` is same as ``FakerTextFactory()("sentence")``
    """
    return FakerTextFactory._getDefaultFactory(lib=_lib, rootClass=_rootClass)(mname, *args, **kwargs)

0 commit comments

Comments
 (0)