Fix missing h2o/viya scenario on model import

jlwalke2 · jlwalke2 · commit d72ef198c373 · 2021-06-09T16:37:16.000-04:00
diff --git a/src/sasctl/pzmm/writeScoreCode.py b/src/sasctl/pzmm/writeScoreCode.py
@@ -7,9 +7,10 @@
 from ..core import platform_version
 from .._services.model_repository import ModelRepository as modelRepo
 
+
 # %%
 class ScoreCode():
-    
+
     @classmethod
     def writeScoreCode(cls, inputDF, targetDF, modelPrefix,
                        predictMethod, modelFileName,
@@ -18,26 +19,26 @@ def writeScoreCode(cls, inputDF, targetDF, modelPrefix,
                        otherVariable=False, model=None, isH2OModel=False, missingValues=False,
                        scoreCAS=True):
         '''
-        Writes a Python score code file based on training data used to generate the model 
-        pickle file. The Python file is included in the ZIP file that is imported or registered 
-        into the common model repository. The model can then be used by SAS applications, 
+        Writes a Python score code file based on training data used to generate the model
+        pickle file. The Python file is included in the ZIP file that is imported or registered
+        into the common model repository. The model can then be used by SAS applications,
         such as SAS Open Model Manager.
-        
+
         The score code that is generated is designed to be a working template for any
-        Python model, but is not guaranteed to work out of the box for scoring, publishing, 
+        Python model, but is not guaranteed to work out of the box for scoring, publishing,
         or validating the model.
-        
-        Note that for categorical variables, the variable is split into the possible 
-        categorical values of the variable. Also, by default it does NOT include a catch-all 
-        [catVar]_Other variable to store any missing values or any values not found in the 
-        training data set. If you have missing values or values not included in your training 
+
+        Note that for categorical variables, the variable is split into the possible
+        categorical values of the variable. Also, by default it does NOT include a catch-all
+        [catVar]_Other variable to store any missing values or any values not found in the
+        training data set. If you have missing values or values not included in your training
         data set, you must set the OtherVariable option to True.
-        
+
         Both the inputDF and targetDF dataframes have the following stipulations:
         * Column names must be a valid Python variable name.
         * For categorical columns, the values must be a valid Python variable name.
         If either of these conditions is broken, an exception is raised.
-        
+
         Parameters
         ----------
         inputDF : DataFrame
@@ -48,11 +49,11 @@ def writeScoreCode(cls, inputDF, targetDF, modelPrefix,
             The `DataFrame` object contains the training data for the target variable.
         modelPrefix : string
             The variable for the model name that is used when naming model files.
-            (For example: hmeqClassTree + [Score.py || .pickle]).      
+            (For example: hmeqClassTree + [Score.py || .pickle]).
         predictMethod : string
             User-defined prediction method for score testing. This should be
-            in a form such that the model and data input can be added using 
-            the format() command. 
+            in a form such that the model and data input can be added using
+            the format() command.
             For example: '{}.predict_proba({})'.
         modelFileName : string
             Name of the model file that contains the model.
@@ -92,10 +93,11 @@ def writeScoreCode(cls, inputDF, targetDF, modelPrefix,
             Python score code wrapped in DS2 and prepared for CAS scoring or publishing.
         'dmcas_packagescorecode.sas' (for SAS Viya 3.5 models)
             Python score code wrapped in DS2 and prepared for SAS Microanalyic Service scoring or publishing.
-        '''       
+        '''
+
         # Call REST API to check SAS Viya version
         isViya35 = (platform_version() == '3.5')
-        
+
         # Initialize modelID to remove unbound variable warnings
         modelID = None
 
@@ -118,76 +120,86 @@ def upload_and_copy_score_resources(model, files):
             else:
                 model = modelRepo.get_model(model)
                 modelID = model['id']
-        
+
         # From the input dataframe columns, create a list of input variables, then check for viability
         inputVarList = list(inputDF.columns)
         for name in inputVarList:
             if not str(name).isidentifier():
                 raise SyntaxError('Invalid column name in inputDF. Columns must be ' +
                                   'valid as Python variables.')
         newVarList = list(inputVarList)
-        inputDtypesList = list(inputDF.dtypes)        
-    
+        inputDtypesList = list(inputDF.dtypes)
+
         # Set the location for the Python score file to be written, then open the file
         zPath = Path(pyPath)
         pyPath = Path(pyPath) / (modelPrefix + 'Score.py')
         with open(pyPath, 'w') as cls.pyFile:
-        
+
             # For H2O models, include the necessary packages
             if isH2OModel:
                 cls.pyFile.write('''\
 import h2o
 import gzip, shutil, os''')
-            # Import math for imputation; pickle for serialized models; pandas for data management; numpy for computation    
+            # Import math for imputation; pickle for serialized models; pandas for data management; numpy for
+            # computation
             cls.pyFile.write('''\n
 import math
 import pickle
 import pandas as pd
 import numpy as np''')
-            # In SAS Viya 4.0 and SAS Open Model Manager, a settings.py file is generated that points to the resource location
+            # In SAS Viya 4.0 and SAS Open Model Manager, a settings.py file is generated that points to the resource
+            # location
             if not isViya35:
                 cls.pyFile.write('''\n
 import settings''')
-                
+
             # Use a global variable for the model in order to load from memory only once
             cls.pyFile.write('''\n\n
 global _thisModelFit''')
-            
+
             # For H2O models, include the server initialization, or h2o.connect() call to use an H2O server
             if isH2OModel:
                 cls.pyFile.write('''\n
 h2o.init()''')
 
             # For each case of SAS Viya version and H2O model or not, load the model file as variable _thisModelFit
-            if isViya35 and not isH2OModel:
+            if isViya35 and isH2OModel:
                 cls.pyFile.write('''\n
-with gzip.open('/models/resources/viya/{modelID}/{modelFileName}', 'r') as fileIn, open('/models/resources/viya/{modelID}/{modelZipFileName}', 'wb') as fileOut:
+with gzip.open('/models/resources/viya/{modelID}/{modelFileName}', 'r') as fileIn, open('/models/resources/viya/{
+modelID}/{modelZipFileName}', 'wb') as fileOut:
     shutil.copyfileobj(fileIn, fileOut)
 os.chmod('/models/resources/viya/{modelID}/{modelZipFileName}', 0o777)
 _thisModelFit = h2o.import_mojo('/models/resources/viya/{modelID}/{modelZipFileName}')'''.format(
                     modelID=modelID,
                     modelFileName=modelFileName,
                     modelZipFileName=modelFileName[:-4] + 'zip'
                 ))
+            elif isViya35 and not isH2OModel:
+                cls.pyFile.write('''\n
+with open('/models/resources/viya/{modelID}/{modelFileName}', 'rb') as _pFile:
+    _thisModelFit = pickle.load(_pfile)'''.format(modelID=modelID, modelFileName=modelFileName))
             elif not isViya35 and not isH2OModel:
                 cls.pyFile.write('''\n
 with open(settings.pickle_path + '{modelFileName}', 'rb') as _pFile:
     _thisModelFit = pickle.load(_pFile)'''.format(modelFileName=modelFileName))
             elif not isViya35 and isH2OModel:
                 cls.pyFile.write('''\n
-with gzip.open(settings.pickle_path + '{modelFileName}', 'r') as fileIn, open(settings.pickle_path + '{modelZipFileName}', 'wb') as fileOut:
+with gzip.open(settings.pickle_path + '{modelFileName}', 'r') as fileIn, open(settings.pickle_path + '{
+modelZipFileName}', 'wb') as fileOut:
     shutil.copyfileobj(fileIn, fileOut)
 os.chmod(settings.pickle_path + '{modelZipFileName}', 0o777)
 _thisModelFit = h2o.import_mojo(settings.pickle_path + '{modelZipFileName}')'''.format(modelFileName=modelFileName,
-                                                                                                 modelZipFileName=modelFileName[:-4] + 'zip'
-                                                                                                 ))
-            # Create the score function with variables from the input dataframe provided and create the output variable line for SAS Model Manager
+                                                                                       modelZipFileName=modelFileName[
+                                                                                                        :-4] + 'zip'))
+            # Create the score function with variables from the input dataframe provided and create the output
+            # variable line for SAS Model Manager
             cls.pyFile.write('''\n
 def score{modelPrefix}({inputVarList}):
     "Output: {metrics}"'''.format(modelPrefix=modelPrefix,
-                                             inputVarList=', '.join(inputVarList),
-                                             metrics=', '.join(metrics)))
-            # As a check for missing model variables, run a try/except block that reattempts to load the model in as a variable
+                                  inputVarList=', '.join(inputVarList),
+                                  metrics=', '.join(metrics)))
+            # As a check for missing model variables, run a try/except block that reattempts to load the model in as
+            # a variable
             cls.pyFile.write('''\n
     try:
         _thisModelFit
@@ -209,7 +221,7 @@ def score{modelPrefix}({inputVarList}):
             elif not isViya35 and isH2OModel:
                 cls.pyFile.write('''
         _thisModelFit = h2o.import_mojo(settings.pickle_path + '{}')'''.format(modelFileName[:-4] + 'zip'))
-            
+
             if missingValues:
                 # For each input variable, impute for missing values based on variable dtype
                 for i, dTypes in enumerate(inputDtypesList):
@@ -222,7 +234,7 @@ def score{modelPrefix}({inputVarList}):
             {inputVar} = {inputVarMode}
     except TypeError:
         {inputVar} = {inputVarMode}'''.format(inputVar=inputVarList[i],
-                                             inputVarMode=float(list(inputDF[inputVarList[i]].mode())[0])))
+                                              inputVarMode=float(list(inputDF[inputVarList[i]].mode())[0])))
                         else:
                             cls.pyFile.write('''\n
     try:
@@ -239,10 +251,10 @@ def score{modelPrefix}({inputVarList}):
         categoryStr = 'Other'\n'''.format(inputVar=inputVarList[i]))
 
                         tempVar = cls.splitStringColumn(inputDF[inputVarList[i]],
-                                                         otherVariable)
+                                                        otherVariable)
                         newVarList.remove(inputVarList[i])
                         newVarList.extend(tempVar)
-    
+
             # For non-H2O models, insert the model into the provided predictMethod call
             if not isH2OModel:
                 predictMethod = predictMethod.format('_thisModelFit', 'inputArray')
@@ -301,10 +313,10 @@ def score{modelPrefix}({inputVarList}):
                 cls.pyFile.write('''\n
     {} = float(prediction[1][2])
     {} = prediction[1][0]'''.format(metrics[0], metrics[1]))
-            
+
             cls.pyFile.write('''\n
     return({}, {})'''.format(metrics[0], metrics[1]))
-        
+
         # For SAS Viya 3.5, the model is first registered to SAS Model Manager, then the model UUID can be
         # added to the score code and reuploaded to the model file contents
         if isViya35:
@@ -335,34 +347,34 @@ def score{modelPrefix}({inputVarList}):
                 model = modelRepo.get_model(modelID)
                 model['scoreCodeType'] = 'ds2MultiType'
                 modelRepo.update_model(model)
-            
+
     def splitStringColumn(cls, inputSeries, otherVariable):
         '''
         Splits a column of string values into a number of new variables equal
         to the number of unique values in the original column (excluding None
         values). It then writes to a file the statements that tokenize the newly
         defined variables.
-        
+
         Here is an example: Given a series named strCol with values ['A', 'B', 'C',
         None, 'A', 'B', 'A', 'D'], designates the following new variables:
         strCol_A, strCol_B, strCol_D. It then writes the following to the file:
             strCol_A = np.where(val == 'A', 1.0, 0.0)
             strCol_B = np.where(val == 'B', 1.0, 0.0)
             strCol_D = np.where(val == 'D', 1.0, 0.0)
-                    
+
         Parameters
         ---------------
         inputSeries : string series
             Series with the string dtype.
         cls.pyFile : file (class variable)
             Open python file to write into.
-            
+
         Returns
         ---------------
         newVarList : string list
             List of all new variable names split from unique values.
         '''
-        
+
         uniqueValues = inputSeries.unique()
         uniqueValues = list(filter(None, uniqueValues))
         uniqueValues = [x for x in uniqueValues if str(x) != 'nan']
@@ -375,50 +387,50 @@ def splitStringColumn(cls, inputSeries, otherVariable):
             newVarList.append('{}_{}'.format(inputSeries.name, uniq))
             cls.pyFile.write('''
     {0} = np.where(categoryStr == '{1}', 1.0, 0.0)'''.format(newVarList[i], uniq))
-                
+
         if ('Other' not in uniqueValues) and otherVariable:
             newVarList.append('{}_Other'.format(inputSeries.name))
             cls.pyFile.write('''
     {}_Other = np.where(categoryStr == 'Other', 1.0, 0.0)'''.format(inputSeries.name))
-            
+
         return newVarList
-    
+
     def checkIfBinary(inputSeries):
         '''
         Checks a pandas series to determine whether the values are binary or nominal.
-        
+
         Parameters
         ---------------
         inputSeries : float or int series
             A series with numeric values.
-        
+
         Returns
         ---------------
         isBinary : boolean
             The returned value is True if the series values are binary, and False if the series values
             are nominal.
         '''
-        
+
         isBinary = False
         binaryFloat = [float(1), float(0)]
-        
+
         if inputSeries.value_counts().size == 2:
-            if (binaryFloat[0] in inputSeries.astype('float') and 
-                binaryFloat[1] in inputSeries.astype('float')):
+            if (binaryFloat[0] in inputSeries.astype('float') and
+                    binaryFloat[1] in inputSeries.astype('float')):
                 isBinary = False
             else:
                 isBinary = True
-                
+
         return isBinary
-        
+
     def convertMAStoCAS(MASCode, modelId):
-        '''Using the generated score.sas code from the Python wrapper API, 
+        '''Using the generated score.sas code from the Python wrapper API,
         convert the SAS Microanalytic Service based code to CAS compatible.
 
         Parameters
         ----------
         MASCode : str
-            String representation of the packagescore.sas DS2 wrapper 
+            String representation of the packagescore.sas DS2 wrapper
         modelId : str or dict
             The name or id of the model, or a dictionary representation of
             the model
@@ -436,16 +448,17 @@ def convertMAStoCAS(MASCode, modelId):
                 outputString = outputString + 'varchar(100) '
             else:
                 outputString = outputString + 'double '
-            outputString = outputString  + outVar['name'] + ';\n'
+            outputString = outputString + outVar['name'] + ';\n'
         start = MASCode.find('score(')
         finish = MASCode[start:].find(');')
-        scoreVars = MASCode[start+6:start+finish]
-        inputString = ' '.join([x for x in scoreVars.split(' ') if (x != 'double' and x != 'in_out' and x != 'varchar(100)')])
+        scoreVars = MASCode[start + 6:start + finish]
+        inputString = ' '.join(
+            [x for x in scoreVars.split(' ') if (x != 'double' and x != 'in_out' and x != 'varchar(100)')])
         endBlock = 'method run();\n    set SASEP.IN;\n    score({});\nend;\nenddata;'.format(inputString)
         replaceStrings = {'package pythonScore / overwrite=yes;': 'data sasep.out;',
                           'dcl int resultCode revision;': 'dcl double resultCode revision;\n' + outputString,
                           'endpackage;': endBlock}
         replaceStrings = dict((re.escape(k), v) for k, v in replaceStrings.items())
         pattern = re.compile('|'.join(replaceStrings.keys()))
         casCode = pattern.sub(lambda m: replaceStrings[re.escape(m.group(0))], MASCode)
-        return casCode
+        return casCode