Last updated: Jan 17, 2024
With the Extension Model node, you can run R or Python for Spark scripts to build and score results.
Note that many of the properties and much of the information on this page are only applicable to SPSS Modeler Desktop streams.
Python for Spark example
# Get the stream object from the Modeler scripting API.
import modeler.api
stream = modeler.script.stream()
# Create the node; "extension_build" is passed for both arguments of create()
# (presumably the node type and the node name -- confirm against the scripting API).
node = stream.create("extension_build", "extension_build")
# syntax_type selects which script runs; choose Python for this example.
node.setPropertyValue("syntax_type", "Python")
build_script = """
import json
import spss.pyspark.runtime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.tree import DecisionTree
# Runtime context object; the input data, the Spark context and the
# model-content setters used later in this script are all reached through it.
cxt = spss.pyspark.runtime.getContext()
df = cxt.getSparkInputData()
# Copy of df.dtypes; iterated later as (column name, column type) pairs.
schema = df.dtypes[:]
# Name of the column to predict and names of the columns used as predictors.
target = "Drug"
predictors = ["Age","BP","Sex","Cholesterol","Na","K"]
# Build the per-column metadata for a single row.
#   row    - indexable row whose positions line up with the entries of schema
#   schema - sequence of (column name, column type) pairs
# Returns a list with one entry per schema column:
#   - a one-element set holding the row's value when the type is 'string'
#   - a (value, value) pair for every other type, which metaReduce treats as
#     the starting (min, max) range
# Raises IndexError if row has fewer values than schema has columns.
def metaMap(row,schema):
    meta = []
    for col, (cname, ctype) in enumerate(schema):
        value = row[col]
        if ctype == 'string':
            meta.append({value})
        else:
            meta.append((value, value))
    return meta
# Merge two per-column metadata lists (as produced by metaMap or by a previous
# metaReduce call) into one.
#   meta1, meta2 - metadata lists with one entry per schema column
#   schema       - sequence of (column name, column type) pairs
# Returns a new list with one entry per schema column:
#   - the union of both sets when the type is 'string'
#   - (smaller of the two minimums, larger of the two maximums) otherwise
# Neither input list is modified.
def metaReduce(meta1,meta2,schema):
    meta = []
    for col, (cname, ctype) in enumerate(schema):
        left = meta1[col]
        right = meta2[col]
        if ctype == 'string':
            meta.append(left.union(right))
        else:
            meta.append((min(left[0], right[0]), max(left[1], right[1])))
    return meta
# Compute metadata for the whole dataset: metaMap turns each row into
# per-column metadata and metaReduce merges two such lists, so the result has
# one entry per column (a set of values for 'string' columns, a (min, max)
# pair for the others).
metadata = df.rdd.map(lambda row: metaMap(row,schema)).reduce(lambda x,y:metaReduce(x,y,schema))
# Return a list with the elements of v when v is a set; return any other
# value unchanged.
def setToList(v):
    if isinstance(v,set):
        return list(v)
    return v
# Replace every set of observed values with a list so that values can be
# located by position with .index() and the metadata can be passed to
# json.dumps. A list comprehension is used rather than map() because map()
# returns a lazy iterator on Python 3, which cannot be indexed or measured
# with len() as the rest of this script requires.
metadata = [setToList(x) for x in metadata]
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(metadata)
# Map each column name to its position in the schema.
lookup = {column[0]: i for i, column in enumerate(schema)}
# Convert one input row into a LabeledPoint.
#   dm         - per-column metadata; a column whose entry is a list is
#                treated as categorical, any other entry as numeric
#   lookup     - dict mapping column name to column position
#   target     - name of the target column
#   predictors - predictor column names, in the order the features are emitted
#   row        - row indexable by column position
# The label is the position of the row's target value within the target
# column's metadata entry. A categorical predictor is encoded the same way
# (position within its list); a numeric predictor is passed through unchanged.
# .index() raises ValueError when a value is not present in the metadata.
def row2LabeledPoint(dm,lookup,target,predictors,row):
    target_index = lookup[target]
    tval = dm[target_index].index(row[target_index])
    pvals = []
    for predictor in predictors:
        predictor_index = lookup[predictor]
        categories = dm[predictor_index]
        value = row[predictor_index]
        if isinstance(categories, list):
            pvals.append(categories.index(value))
        else:
            pvals.append(value)
    return LabeledPoint(tval,DenseVector(pvals))
# count number of target classes
# (despite its name, predictorClassCount is the length of the metadata entry
# for the target column; it is passed as numClasses to the classifier below)
predictorClassCount = len(metadata[lookup[target]])
# Extract categorical predictor information from the data model.
#   dm         - per-column metadata; a list entry marks a categorical column
#   lookup     - dict mapping column name to column position
#   predictors - predictor column names, in feature order
# Returns a dict mapping the position of each categorical predictor within
# predictors to the number of entries in that column's metadata list.
# Predictors whose metadata entry is not a list are left out.
def getCategoricalFeatureInfo(dm,lookup,predictors):
    info = {}
    for i, predictor in enumerate(predictors):
        categories = dm[lookup[predictor]]
        if isinstance(categories, list):
            info[i] = len(categories)
    return info
# Map every row of the dataframe to a LabeledPoint; the resulting RDD is the
# training input.
lps = df.rdd.map(lambda row: row2LabeledPoint(metadata,lookup,target,predictors,row))

# Which features are categorical, and how many categories each one has.
categoricalInfo = getCategoricalFeatureInfo(metadata, lookup, predictors)

# Train the decision tree classifier.
treeModel = DecisionTree.trainClassifier(
    lps,
    numClasses=predictorClassCount,
    categoricalFeaturesInfo=categoricalInfo,
    impurity='gini',
    maxDepth=5,
    maxBins=100)

# Save the trained tree into a temporary folder obtained from the context and
# register that folder as the "TreeModel" model content.
_outputPath = cxt.createTemporaryFolder()
treeModel.save(cxt.getSparkContext(), _outputPath)
cxt.setModelContentFromPath("TreeModel", _outputPath)

# Store the metadata (as JSON) and the tree's debug string with the model.
# The chained call is wrapped in parentheses so no backslash continuation is needed.
(cxt.setModelContentFromString("model.dm",json.dumps(metadata), mimeType="application/json")
    .setModelContentFromString("model.structure",treeModel.toDebugString()))
"""
# Assign the script text to python_build_syntax, the node's Python scripting
# syntax for model building.
node.setPropertyValue("python_build_syntax", build_script)
R example
# syntax_type selects which script runs; choose R for this example.
node.setPropertyValue("syntax_type", "R")
# r_build_syntax holds the R scripting syntax for model building; this script
# assigns the result of an lm() call on modelerData (Na against K) to modelerModel.
node.setPropertyValue("r_build_syntax", """modelerModel <- lm(modelerData$Na~modelerData$K,modelerData)
modelerDataModel
modelerModel
""")
| extensionmodelnode Properties | Values | Property description |
|---|---|---|
| syntax_type | R Python | Specify which script runs: R or Python (R is the default). |
| r_build_syntax | string | The R scripting syntax for model building. |
| r_score_syntax | string | The R scripting syntax for model scoring. |
| python_build_syntax | string | The Python scripting syntax for model building. |
| python_score_syntax | string | The Python scripting syntax for model scoring. |
| convert_flags | StringsAndDoubles LogicalValues | Option to convert flag fields. |
| convert_missing | flag | Option to convert missing values to R NA value. |
| convert_datetime | flag | Option to convert variables with date or datetime formats to R date/time formats. |
| convert_datetime_class | POSIXct POSIXlt | Options to specify to what format variables with date or datetime formats are converted. |
| output_html | flag | Option to display graphs in the R model nugget. |
| output_text | flag | Option to write R console text output to the R model nugget. |