7
7
from ..core import platform_version
8
8
from .._services .model_repository import ModelRepository as modelRepo
9
9
10
+
10
11
# %%
11
12
class ScoreCode ():
12
-
13
+
13
14
@classmethod
14
15
def writeScoreCode (cls , inputDF , targetDF , modelPrefix ,
15
16
predictMethod , modelFileName ,
@@ -18,26 +19,26 @@ def writeScoreCode(cls, inputDF, targetDF, modelPrefix,
18
19
otherVariable = False , model = None , isH2OModel = False , missingValues = False ,
19
20
scoreCAS = True ):
20
21
'''
21
- Writes a Python score code file based on training data used to generate the model
22
- pickle file. The Python file is included in the ZIP file that is imported or registered
23
- into the common model repository. The model can then be used by SAS applications,
22
+ Writes a Python score code file based on training data used to generate the model
23
+ pickle file. The Python file is included in the ZIP file that is imported or registered
24
+ into the common model repository. The model can then be used by SAS applications,
24
25
such as SAS Open Model Manager.
25
-
26
+
26
27
The score code that is generated is designed to be a working template for any
27
- Python model, but is not guaranteed to work out of the box for scoring, publishing,
28
+ Python model, but is not guaranteed to work out of the box for scoring, publishing,
28
29
or validating the model.
29
-
30
- Note that for categorical variables, the variable is split into the possible
31
- categorical values of the variable. Also, by default it does NOT include a catch-all
32
- [catVar]_Other variable to store any missing values or any values not found in the
33
- training data set. If you have missing values or values not included in your training
30
+
31
+ Note that for categorical variables, the variable is split into the possible
32
+ categorical values of the variable. Also, by default it does NOT include a catch-all
33
+ [catVar]_Other variable to store any missing values or any values not found in the
34
+ training data set. If you have missing values or values not included in your training
34
35
data set, you must set the OtherVariable option to True.
35
-
36
+
36
37
Both the inputDF and targetDF dataframes have the following stipulations:
37
38
* Column names must be a valid Python variable name.
38
39
* For categorical columns, the values must be a valid Python variable name.
39
40
If either of these conditions is broken, an exception is raised.
40
-
41
+
41
42
Parameters
42
43
----------
43
44
inputDF : DataFrame
@@ -48,11 +49,11 @@ def writeScoreCode(cls, inputDF, targetDF, modelPrefix,
48
49
The `DataFrame` object contains the training data for the target variable.
49
50
modelPrefix : string
50
51
The variable for the model name that is used when naming model files.
51
- (For example: hmeqClassTree + [Score.py || .pickle]).
52
+ (For example: hmeqClassTree + [Score.py || .pickle]).
52
53
predictMethod : string
53
54
User-defined prediction method for score testing. This should be
54
- in a form such that the model and data input can be added using
55
- the format() command.
55
+ in a form such that the model and data input can be added using
56
+ the format() command.
56
57
For example: '{}.predict_proba({})'.
57
58
modelFileName : string
58
59
Name of the model file that contains the model.
@@ -92,10 +93,11 @@ def writeScoreCode(cls, inputDF, targetDF, modelPrefix,
92
93
Python score code wrapped in DS2 and prepared for CAS scoring or publishing.
93
94
'dmcas_packagescorecode.sas' (for SAS Viya 3.5 models)
94
95
Python score code wrapped in DS2 and prepared for SAS Microanalytic Service scoring or publishing.
95
- '''
96
+ '''
97
+
96
98
# Call REST API to check SAS Viya version
97
99
isViya35 = (platform_version () == '3.5' )
98
-
100
+
99
101
# Initialize modelID to remove unbound variable warnings
100
102
modelID = None
101
103
@@ -118,76 +120,86 @@ def upload_and_copy_score_resources(model, files):
118
120
else :
119
121
model = modelRepo .get_model (model )
120
122
modelID = model ['id' ]
121
-
123
+
122
124
# From the input dataframe columns, create a list of input variables, then check for viability
123
125
inputVarList = list (inputDF .columns )
124
126
for name in inputVarList :
125
127
if not str (name ).isidentifier ():
126
128
raise SyntaxError ('Invalid column name in inputDF. Columns must be ' +
127
129
'valid as Python variables.' )
128
130
newVarList = list (inputVarList )
129
- inputDtypesList = list (inputDF .dtypes )
130
-
131
+ inputDtypesList = list (inputDF .dtypes )
132
+
131
133
# Set the location for the Python score file to be written, then open the file
132
134
zPath = Path (pyPath )
133
135
pyPath = Path (pyPath ) / (modelPrefix + 'Score.py' )
134
136
with open (pyPath , 'w' ) as cls .pyFile :
135
-
137
+
136
138
# For H2O models, include the necessary packages
137
139
if isH2OModel :
138
140
cls .pyFile .write ('''\
139
141
import h2o
140
142
import gzip, shutil, os''' )
141
- # Import math for imputation; pickle for serialized models; pandas for data management; numpy for computation
143
+ # Import math for imputation; pickle for serialized models; pandas for data management; numpy for
144
+ # computation
142
145
cls .pyFile .write ('''\n
143
146
import math
144
147
import pickle
145
148
import pandas as pd
146
149
import numpy as np''' )
147
- # In SAS Viya 4.0 and SAS Open Model Manager, a settings.py file is generated that points to the resource location
150
+ # In SAS Viya 4.0 and SAS Open Model Manager, a settings.py file is generated that points to the resource
151
+ # location
148
152
if not isViya35 :
149
153
cls .pyFile .write ('''\n
150
154
import settings''' )
151
-
155
+
152
156
# Use a global variable for the model in order to load from memory only once
153
157
cls .pyFile .write ('''\n \n
154
158
global _thisModelFit''' )
155
-
159
+
156
160
# For H2O models, include the server initialization, or h2o.connect() call to use an H2O server
157
161
if isH2OModel :
158
162
cls .pyFile .write ('''\n
159
163
h2o.init()''' )
160
164
161
165
# For each case of SAS Viya version and H2O model or not, load the model file as variable _thisModelFit
162
- if isViya35 and not isH2OModel :
166
+ if isViya35 and isH2OModel :
163
167
cls .pyFile .write ('''\n
164
- with gzip.open('/models/resources/viya/{modelID}/{modelFileName}', 'r') as fileIn, open('/models/resources/viya/{modelID}/{modelZipFileName}', 'wb') as fileOut:
168
+ with gzip.open('/models/resources/viya/{modelID}/{modelFileName}', 'r') as fileIn, open('/models/resources/viya/{
169
+ modelID}/{modelZipFileName}', 'wb') as fileOut:
165
170
shutil.copyfileobj(fileIn, fileOut)
166
171
os.chmod('/models/resources/viya/{modelID}/{modelZipFileName}', 0o777)
167
172
_thisModelFit = h2o.import_mojo('/models/resources/viya/{modelID}/{modelZipFileName}')''' .format (
168
173
modelID = modelID ,
169
174
modelFileName = modelFileName ,
170
175
modelZipFileName = modelFileName [:- 4 ] + 'zip'
171
176
))
177
+ elif isViya35 and not isH2OModel :
178
+ cls .pyFile .write ('''\n
179
+ with open('/models/resources/viya/{modelID}/{modelFileName}', 'rb') as _pFile:
180
+ _thisModelFit = pickle.load(_pfile)''' .format (modelID = modelID , modelFileName = modelFileName ))
172
181
elif not isViya35 and not isH2OModel :
173
182
cls .pyFile .write ('''\n
174
183
with open(settings.pickle_path + '{modelFileName}', 'rb') as _pFile:
175
184
_thisModelFit = pickle.load(_pFile)''' .format (modelFileName = modelFileName ))
176
185
elif not isViya35 and isH2OModel :
177
186
cls .pyFile .write ('''\n
178
- with gzip.open(settings.pickle_path + '{modelFileName}', 'r') as fileIn, open(settings.pickle_path + '{modelZipFileName}', 'wb') as fileOut:
187
+ with gzip.open(settings.pickle_path + '{modelFileName}', 'r') as fileIn, open(settings.pickle_path + '{
188
+ modelZipFileName}', 'wb') as fileOut:
179
189
shutil.copyfileobj(fileIn, fileOut)
180
190
os.chmod(settings.pickle_path + '{modelZipFileName}', 0o777)
181
191
_thisModelFit = h2o.import_mojo(settings.pickle_path + '{modelZipFileName}')''' .format (modelFileName = modelFileName ,
182
- modelZipFileName = modelFileName [:- 4 ] + 'zip'
183
- ))
184
- # Create the score function with variables from the input dataframe provided and create the output variable line for SAS Model Manager
192
+ modelZipFileName = modelFileName [
193
+ :- 4 ] + 'zip' ))
194
+ # Create the score function with variables from the input dataframe provided and create the output
195
+ # variable line for SAS Model Manager
185
196
cls .pyFile .write ('''\n
186
197
def score{modelPrefix}({inputVarList}):
187
198
"Output: {metrics}"''' .format (modelPrefix = modelPrefix ,
188
- inputVarList = ', ' .join (inputVarList ),
189
- metrics = ', ' .join (metrics )))
190
- # As a check for missing model variables, run a try/except block that reattempts to load the model in as a variable
199
+ inputVarList = ', ' .join (inputVarList ),
200
+ metrics = ', ' .join (metrics )))
201
+ # As a check for missing model variables, run a try/except block that reattempts to load the model in as
202
+ # a variable
191
203
cls .pyFile .write ('''\n
192
204
try:
193
205
_thisModelFit
@@ -209,7 +221,7 @@ def score{modelPrefix}({inputVarList}):
209
221
elif not isViya35 and isH2OModel :
210
222
cls .pyFile .write ('''
211
223
_thisModelFit = h2o.import_mojo(settings.pickle_path + '{}')''' .format (modelFileName [:- 4 ] + 'zip' ))
212
-
224
+
213
225
if missingValues :
214
226
# For each input variable, impute for missing values based on variable dtype
215
227
for i , dTypes in enumerate (inputDtypesList ):
@@ -222,7 +234,7 @@ def score{modelPrefix}({inputVarList}):
222
234
{inputVar} = {inputVarMode}
223
235
except TypeError:
224
236
{inputVar} = {inputVarMode}''' .format (inputVar = inputVarList [i ],
225
- inputVarMode = float (list (inputDF [inputVarList [i ]].mode ())[0 ])))
237
+ inputVarMode = float (list (inputDF [inputVarList [i ]].mode ())[0 ])))
226
238
else :
227
239
cls .pyFile .write ('''\n
228
240
try:
@@ -239,10 +251,10 @@ def score{modelPrefix}({inputVarList}):
239
251
categoryStr = 'Other'\n ''' .format (inputVar = inputVarList [i ]))
240
252
241
253
tempVar = cls .splitStringColumn (inputDF [inputVarList [i ]],
242
- otherVariable )
254
+ otherVariable )
243
255
newVarList .remove (inputVarList [i ])
244
256
newVarList .extend (tempVar )
245
-
257
+
246
258
# For non-H2O models, insert the model into the provided predictMethod call
247
259
if not isH2OModel :
248
260
predictMethod = predictMethod .format ('_thisModelFit' , 'inputArray' )
@@ -301,10 +313,10 @@ def score{modelPrefix}({inputVarList}):
301
313
cls .pyFile .write ('''\n
302
314
{} = float(prediction[1][2])
303
315
{} = prediction[1][0]''' .format (metrics [0 ], metrics [1 ]))
304
-
316
+
305
317
cls .pyFile .write ('''\n
306
318
return({}, {})''' .format (metrics [0 ], metrics [1 ]))
307
-
319
+
308
320
# For SAS Viya 3.5, the model is first registered to SAS Model Manager, then the model UUID can be
309
321
# added to the score code and reuploaded to the model file contents
310
322
if isViya35 :
@@ -335,34 +347,34 @@ def score{modelPrefix}({inputVarList}):
335
347
model = modelRepo .get_model (modelID )
336
348
model ['scoreCodeType' ] = 'ds2MultiType'
337
349
modelRepo .update_model (model )
338
-
350
+
339
351
def splitStringColumn (cls , inputSeries , otherVariable ):
340
352
'''
341
353
Splits a column of string values into a number of new variables equal
342
354
to the number of unique values in the original column (excluding None
343
355
values). It then writes to a file the statements that tokenize the newly
344
356
defined variables.
345
-
357
+
346
358
Here is an example: Given a series named strCol with values ['A', 'B', 'C',
347
359
None, 'A', 'B', 'A', 'D'], designates the following new variables:
348
360
strCol_A, strCol_B, strCol_D. It then writes the following to the file:
349
361
strCol_A = np.where(val == 'A', 1.0, 0.0)
350
362
strCol_B = np.where(val == 'B', 1.0, 0.0)
351
363
strCol_D = np.where(val == 'D', 1.0, 0.0)
352
-
364
+
353
365
Parameters
354
366
---------------
355
367
inputSeries : string series
356
368
Series with the string dtype.
357
369
cls.pyFile : file (class variable)
358
370
Open python file to write into.
359
-
371
+
360
372
Returns
361
373
---------------
362
374
newVarList : string list
363
375
List of all new variable names split from unique values.
364
376
'''
365
-
377
+
366
378
uniqueValues = inputSeries .unique ()
367
379
uniqueValues = list (filter (None , uniqueValues ))
368
380
uniqueValues = [x for x in uniqueValues if str (x ) != 'nan' ]
@@ -375,50 +387,50 @@ def splitStringColumn(cls, inputSeries, otherVariable):
375
387
newVarList .append ('{}_{}' .format (inputSeries .name , uniq ))
376
388
cls .pyFile .write ('''
377
389
{0} = np.where(categoryStr == '{1}', 1.0, 0.0)''' .format (newVarList [i ], uniq ))
378
-
390
+
379
391
if ('Other' not in uniqueValues ) and otherVariable :
380
392
newVarList .append ('{}_Other' .format (inputSeries .name ))
381
393
cls .pyFile .write ('''
382
394
{}_Other = np.where(categoryStr == 'Other', 1.0, 0.0)''' .format (inputSeries .name ))
383
-
395
+
384
396
return newVarList
385
-
397
+
386
398
def checkIfBinary(inputSeries):
    '''
    Decide whether a two-valued numeric series is flagged as binary.

    Parameters
    ---------------
    inputSeries : float or int series
        A series with numeric values.

    Returns
    ---------------
    isBinary : boolean
        False when value_counts() does not report exactly two distinct values.
        Otherwise False when both 1.0 and 0.0 pass the `in` test against the
        float-cast series, and True when at least one of them does not.
    '''

    # Guard clause: anything other than exactly two distinct values is never binary.
    if inputSeries.value_counts().size != 2:
        return False

    # Cast once; both membership tests below use the same float view.
    asFloat = inputSeries.astype('float')

    # NOTE(review): `in` applied to a pandas Series tests the index labels, not
    # the stored values -- confirm that this is the intended check.
    hasOneAndZero = float(1) in asFloat and float(0) in asFloat

    return not hasOneAndZero
413
-
425
+
414
426
def convertMAStoCAS (MASCode , modelId ):
415
- '''Using the generated score.sas code from the Python wrapper API,
427
+ '''Using the generated score.sas code from the Python wrapper API,
416
428
convert the SAS Microanalytic Service based code to CAS compatible.
417
429
418
430
Parameters
419
431
----------
420
432
MASCode : str
421
- String representation of the packagescore.sas DS2 wrapper
433
+ String representation of the packagescore.sas DS2 wrapper
422
434
modelId : str or dict
423
435
The name or id of the model, or a dictionary representation of
424
436
the model
@@ -436,16 +448,17 @@ def convertMAStoCAS(MASCode, modelId):
436
448
outputString = outputString + 'varchar(100) '
437
449
else :
438
450
outputString = outputString + 'double '
439
- outputString = outputString + outVar ['name' ] + ';\n '
451
+ outputString = outputString + outVar ['name' ] + ';\n '
440
452
start = MASCode .find ('score(' )
441
453
finish = MASCode [start :].find (');' )
442
- scoreVars = MASCode [start + 6 :start + finish ]
443
- inputString = ' ' .join ([x for x in scoreVars .split (' ' ) if (x != 'double' and x != 'in_out' and x != 'varchar(100)' )])
454
+ scoreVars = MASCode [start + 6 :start + finish ]
455
+ inputString = ' ' .join (
456
+ [x for x in scoreVars .split (' ' ) if (x != 'double' and x != 'in_out' and x != 'varchar(100)' )])
444
457
endBlock = 'method run();\n set SASEP.IN;\n score({});\n end;\n enddata;' .format (inputString )
445
458
replaceStrings = {'package pythonScore / overwrite=yes;' : 'data sasep.out;' ,
446
459
'dcl int resultCode revision;' : 'dcl double resultCode revision;\n ' + outputString ,
447
460
'endpackage;' : endBlock }
448
461
replaceStrings = dict ((re .escape (k ), v ) for k , v in replaceStrings .items ())
449
462
pattern = re .compile ('|' .join (replaceStrings .keys ()))
450
463
casCode = pattern .sub (lambda m : replaceStrings [re .escape (m .group (0 ))], MASCode )
451
- return casCode
464
+ return casCode
0 commit comments