import sc.implicits._
// Map each raw record to (label, f1..f4): the churn label is an enum value
// that enum2Double turns into a Double; the remaining four fields are taken
// positionally from the feature array. Then lift the RDD into a DataFrame.
val vectorData = dataRDD
  .map { record =>
    val label = enum2Double("是否已流失", record._1)
    (label, record._2(0), record._2(1), record._2(2), record._2(3))
  }
  .toDF("loss", "gender", "age", "grade", "region")
//indexing columns
// One StringIndexer stage per categorical column; each writes its numeric
// index into a new "<column>_index" column.
val stringColumns = Array("gender", "age", "grade", "region")
val index_transformers: Array[org.apache.spark.ml.PipelineStage] =
  stringColumns.map { colName =>
    new StringIndexer()
      .setInputCol(colName)
      .setOutputCol(colName + "_index")
  }
// Add the rest of your pipeline like VectorAssembler and algorithm
// Fit only the indexing stages and materialize an indexed DataFrame here so
// the generated "*_index" column names can be discovered below; the full
// pipeline (indexers + encoders) is rebuilt and refit afterwards.
val index_pipeline = new Pipeline().setStages(index_transformers)
val index_model = index_pipeline.fit(vectorData)
val df_indexed = index_model.transform(vectorData)
//encoding columns
// Pick up exactly the columns produced by the StringIndexer stages.
// NOTE: the original filter used `contains "index"`, which would also match
// any *source* column whose name merely contains "index"; endsWith("_index")
// selects only the indexer outputs (identical result for the current schema).
val indexColumns = df_indexed.columns.filter(_.endsWith("_index"))
// One OneHotEncoder per indexed column, emitting a "<column>_vec" vector.
val one_hot_encoders: Array[org.apache.spark.ml.PipelineStage] = indexColumns.map(
  cname => new OneHotEncoder()
    .setInputCol(cname)
    .setOutputCol(s"${cname}_vec")
)
// Full pipeline: index every categorical column, then one-hot encode it.
val pipeline = new Pipeline().setStages(index_transformers ++ one_hot_encoders)
val model = pipeline.fit(vectorData)
// Build one LabeledPoint per row: label = "loss", features = the four
// one-hot vectors concatenated into a single dense vector.
// NOTE(review): getAs[SparseVector] assumes OneHotEncoder emitted sparse
// vectors (its usual output); a dense vector here would throw — confirm.
model.transform(vectorData)
  .select("loss", "gender_index_vec", "age_index_vec", "grade_index_vec", "region_index_vec")
  .map { row =>
    val encodedCols = Array("gender_index_vec", "age_index_vec", "grade_index_vec", "region_index_vec")
    // flatMap dedupes the four hand-written getAs(...).toArray ++ chains.
    val features = encodedCols.flatMap(c => row.getAs[SparseVector](c).toArray)
    // "loss" is already a Double (produced by enum2Double above), so read it
    // directly instead of the original toString().toDouble round-trip, which
    // was wasteful and fragile.
    ml.feature.LabeledPoint(row.getAs[Double]("loss"), ml.linalg.Vectors.dense(features))
  }
Source:
http://blog.csdn.net/pan_haufei/article/details/72903667
Good luck!