|
| 1 | +# See the License for the specific language governing permissions and |
| 2 | +# limitations under the License. |
| 3 | +# |
| 4 | + |
| 5 | +""" |
| 6 | +This file defines the text generator plugin class `PyfuncText` |
| 7 | +""" |
| 8 | + |
| 9 | +import importlib |
| 10 | +import logging |
| 11 | + |
| 12 | +from .text_generators import TextGenerator |
| 13 | +from .utils import DataGenError |
| 14 | + |
| 15 | + |
class PyfuncText(TextGenerator):
    """ Text generator that supports generating text from arbitrary Python function

    :param fn: function to call to generate text.
    :param init: function to call to initialize context
    :param initPerBatch: if init per batch is set to True, initialization of context is performed on every Pandas udf
                         call. Default is False.
    :param name: String representing name of text generator when converted to string via ``str``.
                 Defaults to ``PyfuncText``
    :param rootProperty: optional name of a context attribute. If set, the value of that attribute
                         is passed to ``fn`` instead of the context itself.

    The two functions define the plugin model

    The first function, ``fn`` is called whenever text should be generated for a single column of a single row

    It is called with the signature ``fn(context, value)`` unless a root property is set, in which case the
    signature is ``fn(rootProperty)`` with rootProperty having the value of the root property of the context.

    Context is the stored context containing instances of random number generators, 3rd party
    client library objects etc.

    The ``init`` function is called to initialize the function call context. The plugin code can store arbitrary
    properties in the context following normal Python object rules.

    The context is initialized with the property `textGenerator` prior to being initialized which is a reference to
    the enclosing text generator.

    .. note::
       There are no expectations of repeatability of data generation when using external code
       or external libraries to generate text.

       However, custom code can call the base class method to get a Numpy random
       number generator instance. This will have been seeded using the ``dbldatagen``
       random number seed if one was specified, so random numbers generated from this will be repeatable.

       The custom code may call the property ``randomSeed`` on the text generator object to get the random seed
       which may be used to seed library specific initialization.

       This random seed property may have the values ``None`` or ``-1`` which should be treated as meaning don't
       use a random seed.

       The code does not guarantee thread or cross process safety. If a new instance of the random number
       generator is needed, you may call the base class method with the argument `forceNewInstance` set to True.
    """

    class _FnCallContext:
        """ inner class to support storage of context between calls

        Plugin code stores instances of random number generators, clients for services etc. on this object
        during execution of the ``init`` function.

        :param txtGen: reference to the outer ``PyfuncText`` object
        """

        def __init__(self, txtGen):
            self.textGenerator = txtGen

    def __init__(self, fn, init=None, initPerBatch=False, name=None, rootProperty=None):
        super().__init__()
        # both conditions must hold: previously ``or`` let any non-None, non-callable value through
        assert fn is not None and callable(fn), "Function must be provided wiith signature fn(context, oldValue)"
        assert init is None or callable(init), "Init function must be a callable function or lambda if passed"

        # if root property is provided, root property will be passed to generate text function
        self._rootProperty = rootProperty

        self._pyFn = fn  # generate text function
        self._initFn = init  # context initialization function
        self._context = None  # context used to hold library root object and other properties

        # if init per batch is True, initialization of context will be per UDF call
        assert initPerBatch in [True, False], "initPerBatch must evaluate to boolean True or False"
        self._initPerBatch = initPerBatch

        self._name = name if name is not None else "PyfuncText"

    def __str__(self):
        """ Get string representation of object

        The ``name`` property is used to provide a user friendly name for the text generator
        """
        return f"{self._name}({repr(self._pyFn)}, init={self._initFn})"

    def _getContext(self, forceNewInstance=False):
        """ Get the context for plugin function calls

        :param forceNewInstance: if True, a new context is created and initialized on this call
                                 and is not cached on the text generator
        :return: existing or newly created context.
        """
        # fast path - reuse the cached context
        if self._context is not None and not forceNewInstance:
            return self._context

        context = PyfuncText._FnCallContext(self)

        # init context using context creator if any provided
        if self._initFn is not None:
            self._initFn(context)

        # save context for later use unless forceNewInstance is set
        if not forceNewInstance:
            self._context = context
        return context

    def pandasGenerateText(self, v):
        """ Called to generate text via Pandas UDF mechanism

        :param v: base value of column as Pandas Series
        :return: result of applying the text generation function to each element of ``v``
        """
        # save object properties in local vars to avoid overhead of object dereferences
        # on every call
        context = self._getContext(self._initPerBatch)
        evalFn = self._pyFn
        rootProperty = getattr(context, self._rootProperty) if self._rootProperty is not None else None

        # define functions to call with context and with root property
        def _valueFromFn(originalValue):
            return evalFn(context, originalValue)

        def _valueFromFnWithRoot(originalValue):
            # the original column value is deliberately not passed when a root property is in use
            return evalFn(rootProperty)

        perValueFn = _valueFromFnWithRoot if rootProperty is not None else _valueFromFn
        return v.apply(perValueFn, args=None)
| 142 | + |
| 143 | + |
class PyfuncTextFactory:
    """PyfuncTextFactory applies syntactic wrapping around creation of PyfuncText objects

    :param name: name of generated object (when converted to string via ``str``)

    It allows the use of the following constructs:

    .. code-block:: python

       # initialization (for Faker for example)

       # setup use of Faker
       def initFaker(ctx):
           ctx.faker = Faker(locale="en_US")
           ctx.faker.add_provider(internet)

       FakerText = (PyfuncTextFactory(name="FakerText")
                    .withInit(initFaker)        # determines how context should be initialized
                    .withRootProperty("faker")  # determines what context property is passed to fn
                    )

       # later use ...
       .withColumn("fake_name", text=FakerText("name"))
       .withColumn("fake_sentence", text=FakerText("sentence", ext_word_list=my_word_list))

       # translates to generation of lambda function with keyword arguments
       # or without as needed
       .withColumn("fake_name",
                   text=PyfuncText((lambda faker: faker.name()),
                                   init=initFaker,
                                   rootProperty="faker",
                                   name="FakerText"))
       .withColumn("fake_sentence",
                   text=PyfuncText((lambda faker:
                                    faker.sentence(**{"ext_word_list": my_word_list})),
                                   init=initFaker,
                                   rootProperty="faker",
                                   name="FakerText"))

    """

    def __init__(self, name=None):
        """
        :param name: name given to generated text generators. Defaults to ``PyfuncText``
        """
        self._initFn = None
        self._rootProperty = None
        self._name = "PyfuncText" if name is None else name
        self._initPerBatch = False

    def withInit(self, fn):
        """ Specifies context initialization function

        :param fn: function pointer or lambda function for initialization
                   signature should be ``initFunction(context)``
        :return: this factory, to allow call chaining

        .. note::
           This variation initializes the context once per worker process per text generator
           instance.
        """
        self._initFn = fn
        return self

    def withInitPerBatch(self, fn):
        """ Specifies context initialization function

        :param fn: function pointer or lambda function for initialization
                   signature should be ``initFunction(context)``
        :return: this factory, to allow call chaining

        .. note::
           This variation initializes the context once per internal pandas UDF call.
           The UDF call will be called once per 10,000 rows if system is configured using defaults.
           Setting the pandas batch size as an argument to the DataSpec creation will change the default
           batch size.
        """
        self._initPerBatch = True
        return self.withInit(fn)

    def withRootProperty(self, prop):
        """ If called, specifies the property of the context to be passed to the text generation function.
        If not called, the context object itself will be passed to the text generation function.

        :param prop: name of the context property
        :return: this factory, to allow call chaining
        """
        self._rootProperty = prop
        return self

    def __call__(self, evalFn, *args, isProperty=False, **kwargs):
        """ Internal function call mechanism that implements the syntax expansion

        :param evalFn: text generation function or lambda, or the string name of a method / property
                       of the root property object
        :param args: optional args to be passed by position (only used when ``evalFn`` is a string)
        :param kwargs: optional keyword args following Python keyword passing mechanism
                       (only used when ``evalFn`` is a string)
        :param isProperty: if true, interpret evalFn as string name of property, not a function or method
        :return: ``PyfuncText`` instance configured from this factory
        """
        assert evalFn is not None and (isinstance(evalFn, str) or callable(evalFn)), "Function must be provided"

        if isinstance(evalFn, str):
            assert self._rootProperty is not None and len(self._rootProperty.strip()) > 0, \
                "string named functions can only be used on text generators with root property"
            fnName = evalFn

            if isProperty:
                assert not args and not kwargs, "isProperty cannot be true if using arguments"
                # generate lambda with property access, not method call
                evalFn = (lambda root: getattr(root, fnName))
            else:
                # generate lambda calling the named method; empty args / kwargs expand to a no-arg call
                evalFn = (lambda root: getattr(root, fnName)(*args, **kwargs))

        # returns the actual PyfuncText text generator object.
        # Note all syntax expansion is performed once only.
        # initPerBatch must be forwarded, otherwise ``withInitPerBatch`` has no effect
        return PyfuncText(evalFn, init=self._initFn, initPerBatch=self._initPerBatch,
                          name=self._name, rootProperty=self._rootProperty)
| 266 | + |
| 267 | + |
class FakerTextFactory(PyfuncTextFactory):
    """ Factory object for Faker text generator flavored ``PyfuncText`` objects

    :param locale: locale or list of locales passed to the root class constructor as ``locale``.
                   If None, the root class is constructed with no arguments
    :param providers: list of providers
    :param name: name of generated objects. Defaults to ``FakerText``
    :param lib: library import name of Faker library. If none passed, uses ``faker``
    :param rootClass: name of root object class. If none passed, uses ``Faker``

    .. note::
       Both the library name and root object class can be overridden - this is primarily for internal
       testing purposes.
    """

    _FAKER_LIB = "faker"

    _defaultFakerTextFactory = None

    def __init__(self, locale=None, providers=None, name="FakerText", lib=None,
                 rootClass=None):

        super().__init__(name)

        # set up the logger
        self._logger = logging.getLogger("FakerTextFactory")
        self._logger.setLevel(logging.WARNING)

        # setup Faker library to use
        if lib is None:
            lib = self._FAKER_LIB

        # allow overriding the root object class for test purposes
        self._rootObjectClass = "Faker" if rootClass is None else rootClass

        # load the library
        fakerModule = self._loadLibrary(lib)

        # make the initialization function
        initFn = self._mkInitFn(fakerModule, locale, providers)

        self.withInit(initFn)
        self.withRootProperty("faker")

    @classmethod
    def _getDefaultFactory(cls, lib=None, rootClass=None):
        """Class method to get default faker text factory

        Not intended for general use

        The factory is created on first use and cached on the class; ``lib`` and ``rootClass``
        are only consulted when the cached factory does not exist yet.
        """
        if cls._defaultFakerTextFactory is None:
            cls._defaultFakerTextFactory = FakerTextFactory(lib=lib, rootClass=rootClass)
        return cls._defaultFakerTextFactory

    def _mkInitFn(self, libModule, locale, providers):
        """ Make Faker initialization function

        :param libModule: loaded library module providing the root object class
        :param locale: locale string or list of locale strings
        :param providers: providers to load
        :return: function taking a context, which sets ``context.faker`` and adds the providers
        """
        assert libModule is not None, "must have a valid loaded Faker library module"

        fakerClass = getattr(libModule, self._rootObjectClass)

        # define the initialization function for Faker
        def fakerInitFn(ctx):
            if locale is not None:
                ctx.faker = fakerClass(locale=locale)
            else:
                ctx.faker = fakerClass()

            if providers is not None:
                for provider in providers:
                    ctx.faker.add_provider(provider)

        return fakerInitFn

    def _loadLibrary(self, lib):
        """ Load faker library if not already loaded

        :param lib: library name of Faker library
        :return: the loaded module, or None if ``lib`` is None
        :raises DataGenError: if a ``RuntimeError`` occurs while loading the library
        """
        # NOTE(review): ``importlib.import_module`` reports a missing module with ``ImportError``,
        # which is not caught below and propagates unchanged - confirm whether it should also be
        # translated to ``DataGenError``.
        try:
            if lib is None:
                return None

            assert isinstance(lib, str) and len(lib.strip()), f"Library ``{lib}`` must be a valid library name"

            # modules loaded earlier are cached in this module's globals under the library name
            if lib in globals():
                return globals()[lib]

            fakerModule = importlib.import_module(lib)
            globals()[lib] = fakerModule
            return fakerModule
        except RuntimeError as err:
            # chain explicitly so the original failure is reported as the cause
            raise DataGenError("Could not load or initialize Faker library", err) from err
| 365 | + |
| 366 | + |
| 367 | + |
| 368 | + |
| 369 | + |
def fakerText(mname, *args, _lib=None, _rootClass=None, **kwargs):
    """Build a Faker-backed text generator through the shared default ``FakerTextFactory``.

    :param mname: passed to the default factory as the item to expand (e.g. a Faker method name)
    :param args: positional arguments forwarded to the factory call
    :param _lib: library name handed to ``FakerTextFactory._getDefaultFactory``
    :param _rootClass: root class name handed to ``FakerTextFactory._getDefaultFactory``
    :param kwargs: keyword arguments forwarded to the factory call
    :returns: instance of PyfuncText for use with Faker

    ``fakerText("sentence")`` is same as ``FakerTextFactory()("sentence")``
    """
    factory = FakerTextFactory._getDefaultFactory(lib=_lib, rootClass=_rootClass)
    textGenerator = factory(mname, *args, **kwargs)
    return textGenerator
0 commit comments