Last updated: Jan 17, 2024
With the Extension Import node, you can run R or Python for Spark scripts to import data.
Python for Spark example
import modeler.api
# Create an Extension Import node in the current stream and switch it to
# Python for Spark syntax.
stream = modeler.script.stream()
node = stream.create("extension_importer", "extension_importer")
node.setPropertyValue("syntax_type", "Python")

# Python for Spark script run by the node. It publishes a fixed schema and,
# when no input data is available, builds a ten-row sample data frame.
python_script = """
import spss.pyspark
from pyspark.sql.types import *

cxt = spss.pyspark.runtime.getContext()

# Schema of the data produced by this node
_schema = StructType([StructField('id', LongType(), nullable=False), \
                      StructField('age', LongType(), nullable=True), \
                      StructField('Sex', StringType(), nullable=True), \
                      StructField('BP', StringType(), nullable=True), \
                      StructField('Cholesterol', StringType(), nullable=True), \
                      StructField('K', DoubleType(), nullable=True), \
                      StructField('Na', DoubleType(), nullable=True), \
                      StructField('Drug', StringType(), nullable=True)])

if cxt.isComputeDataModelOnly():
    # Only the data model is requested: publish the schema, produce no rows
    cxt.setSparkOutputSchema(_schema)
else:
    df = cxt.getSparkInputData()
    if df is None:
        # No input data: build the sample rows locally
        drugList=[(1,23,'F','HIGH','HIGH',0.792535,0.031258,'drugY'), \
                  (2,47,'M','LOW','HIGH',0.739309,0.056468,'drugC'),\
                  (3,47,'M','LOW','HIGH',0.697269,0.068944,'drugC'),\
                  (4,28,'F','NORMAL','HIGH',0.563682,0.072289,'drugX'),\
                  (5,61,'F','LOW','HIGH',0.559294,0.030998,'drugY'),\
                  (6,22,'F','NORMAL','HIGH',0.676901,0.078647,'drugX'),\
                  (7,49,'F','NORMAL','HIGH',0.789637,0.048518,'drugY'),\
                  (8,41,'M','LOW','HIGH',0.766635,0.069461,'drugC'),\
                  (9,60,'M','NORMAL','HIGH',0.777205,0.05123,'drugY'),\
                  (10,43,'M','LOW','NORMAL',0.526102,0.027164,'drugY')]
        sqlcxt = cxt.getSparkSQLContext()
        rdd = cxt.getSparkContext().parallelize(drugList)
        # Parenthesised single-argument print works on Python 2 and Python 3
        print('pyspark read data count = '+str(rdd.count()))
        df = sqlcxt.createDataFrame(rdd, _schema)
    cxt.setSparkOutputData(df)
"""
node.setPropertyValue("python_syntax", python_script)
R example
node.setPropertyValue("syntax_type", "R")
R_script = """# 'JSON Import' Node v1.0 for IBM SPSS Modeler
# 'RJSONIO' package created by Duncan Temple Lang - http://cran.r-project.org/web/packages/RJSONIO
# 'plyr' package created by Hadley Wickham http://cran.r-project.org/web/packages/plyr
# Node developer: Danil Savine - IBM Extreme Blue 2014
# Description: This node allows you to import into SPSS a table data from a JSON.
# Install function for packages
# Load a package, installing it from CRAN first when it is not available.
#
# x: package name, given either unquoted (packages(plyr)) or as a string.
# Returns TRUE invisibly once the package is attached.
# Stops with an error when the package still cannot be loaded after the
# install attempt, instead of letting the script fail later on a missing
# function.
packages <- function(x) {
  # Capture the argument as written so unquoted package names work
  pkg <- as.character(substitute(x))
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkgs = pkg, repos = "https://cran.r-project.org")
    if (!require(pkg, character.only = TRUE)) {
      stop("Package '", pkg, "' could not be installed or loaded",
           call. = FALSE)
    }
  }
  invisible(TRUE)
}
# packages
packages(RJSONIO)
packages(plyr)
### This function is used to generate automatically the dataModel
# Build the data model describing the columns of a data frame.
#
# data: a data frame.
# Returns a data frame with one column per field of `data` and six rows:
# fieldName, fieldLabel, fieldStorage, fieldMeasure, fieldFormat, fieldRole.
# fieldStorage is "integer", "real" or "string"; every other entry except
# fieldName is the empty string. When `data` has no rows, a warning is
# printed and every fieldStorage is "string". When `data` has no columns,
# an empty data frame is returned.
getMetaData <- function (data) {
  noRows <- nrow(data) <= 0
  if (noRows) {
    print("Warning : modelerData has no line, all fieldStorage fields set to strings")
  }

  # Map a single column (a vector, not a one-column data frame) to its
  # storage name.
  getStorage <- function(x) {
    if (noRows) {
      return("string")
    }
    # typeof() reports "integer" for a factor, so factors are handled first
    if (is.factor(x)) {
      return("string")
    }
    switch(typeof(unlist(x)),
           integer = "integer",
           double = "real",
           character = "string",
           "string")
  }

  # seq_len() yields an empty sequence for a data frame without columns;
  # data[[i]] extracts the column itself so is.factor() can recognise it.
  col <- lapply(seq_len(ncol(data)), function(i) {
    c(fieldName = names(data)[i],
      fieldLabel = "",
      fieldStorage = getStorage(data[[i]]),
      fieldMeasure = "",
      fieldFormat = "",
      fieldRole = "")
  })
  mdm <- do.call(cbind, col)
  mdm <- data.frame(mdm)
  return(mdm)
}
# From JSON to a list
# Read every line of the file, join the lines into one string and parse that
# string with fromJSON (presumably the RJSONIO function loaded earlier in this
# script) into json.list.
txt <- readLines('C:/test.json')
formatedtxt <- paste(txt, collapse = '')
json.list <- fromJSON(formatedtxt)
# Apply path to json.list
# The condition is the first piece of the literal 'true' after splitting it on
# a newline; the split string is a single newline character, which is why the
# literal spans two source lines.
# NOTE(review): 'true' and 'id_array' look like values substituted from node
# settings - confirm.
if(strsplit(x='true', split='
' ,fixed=TRUE)[[1]][1]) {
# Comma-separated list of element names to walk down inside json.list
path.list <- unlist(strsplit(x='id_array', split=','))
i = 1
# NOTE(review): i only advances when a name is found; there is no explicit
# exit for a name that never appears.
while(i<length(path.list)+1){
if(is.null(getElement(json.list, path.list[i]))){
# Name not present at this level: step into the first element and retry
json.list <- json.list[[1]]
}else{
# Name present: descend into it and move on to the next path component
json.list <- getElement(json.list, path.list[i])
i <- i+1
}
}
}
# From list to dataframe via unlisted json
# Each element of json.list is flattened with unlist() into a named vector,
# turned into a one-row data frame and appended to 'filled' with rbind.fill
# (presumably the plyr function loaded earlier in this script).
i <-1
filled <- data.frame()
while(i < length(json.list)+ 1){
unlisted.json <- unlist(json.list[[i]])
to.fill <- data.frame(t(as.data.frame(unlisted.json, row.names = names(unlisted.json))), stringsAsFactors=FALSE)
filled <- rbind.fill(filled,to.fill)
i <- 1 + i
}
# Export to SPSS Modeler Data
# modelerData holds the imported table and modelerDataModel its data model as
# built by getMetaData; both are printed.
# NOTE(review): presumably these are the variables the Extension Import node
# reads back - confirm.
modelerData <- filled
print(modelerData)
modelerDataModel <- getMetaData(modelerData)
print(modelerDataModel)
"""
node.setPropertyValue("r_syntax", R_script)
| extensionimportnode properties | Data type | Property description |
|---|---|---|
| syntax_type | R Python | Specify which script runs – R or Python (R is the default). |
| r_syntax | string | The R scripting syntax to run. |
| python_syntax | string | The Python scripting syntax to run. |