
# Example: Estimator, Transformer, and Param usage with LogisticRegression.
#
# Assumes `spark` is an already-created SparkSession (as in the pyspark
# shell) -- it is not defined in this script.
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Prepare training data from a list of (label, features) tuples.
# NOTE(review): the row literals were missing from this copy of the script;
# the rows below follow the Spark ML guide example -- confirm they are the
# intended data.
training = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs
# for this LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# We may alternatively specify parameters using a Python dictionary as a
# paramMap.
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.

# You can combine paramMaps, which are Python dictionaries.
paramMap2 = {lr.probabilityCol: "myProbability"}  # Change output column name.
paramMapCombined = paramMap.copy()
# Merge in the column rename; without this the "myProbability" column
# selected below would not exist in the model output.
paramMapCombined.update(paramMap2)

# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())

# Prepare test data.
# NOTE(review): as with the training set, these rows were missing from this
# copy and follow the Spark ML guide example -- confirm.
test = spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5])),
], ["label", "features"])

# Make predictions on test data using the Transformer.transform() method.
# LogisticRegression.transform will only use the 'features' column.
# Note that model2.transform() outputs a "myProbability" column instead of the
# usual 'probability' column since we renamed the lr.probabilityCol parameter
# previously.
prediction = model2.transform(test)
selected = prediction.select("features", "label", "myProbability", "prediction")
for row in selected.collect():
    print(row)
# Example: a three-stage ML Pipeline (Tokenizer -> HashingTF -> LogisticRegression).
#
# Assumes `spark` is an already-created SparkSession (as in the pyspark
# shell) -- it is not defined in this script.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Prepare training documents from a list of (id, text, label) tuples.
# Ids are plain ints: the Python 2 `0L` long-literal form is a syntax error
# on Python 3.
# NOTE(review): ints are expected to map to the same Spark long column type
# as the old long literals -- confirm against the Spark SQL type-mapping docs.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer,
# hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Chain the stages by feeding the tokenizer's output column into HashingTF.
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop"),
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "prediction")
for row in selected.collect():
