// 初学机器学习,有一个关于决策树二元分类的问题:请大牛帮忙注释一下下面每行代码的作用,跪谢!
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
// Read train.tsv as an RDD with one element per line of text.
val rawDataWithHeader = sc.textFile("train.tsv")
import org.apache.spark.rdd.RDD
// Drop the first line of partition 0 and pass every other partition through
// unchanged. Presumably that first line is the column header -- this relies on
// the header being the first line of the first partition.
val rawData = rawDataWithHeader.mapPartitionsWithIndex { (idx, iter) =>
  if (idx == 0) iter.drop(1) else iter
}
// Split each line into its tab-separated fields.
val lines = rawData.map(_.split("\t"))
// Collect the distinct values of column 3 to the driver and number them
// 0, 1, 2, ...; the resulting value -> index map drives the one-hot encoding
// of that column further down.
val categoriesMap = lines.map(fields => fields(3)).distinct.collect.zipWithIndex.toMap
import org.apache.spark.mllib.linalg.Vectors
// Turn every training row into a LabeledPoint (label + dense feature vector).
val labelpointRDD = lines.map { fields =>
  // Remove all double-quote characters from every field.
  val trFields = fields.map(_.replaceAll("\"", ""))
  // One-hot encode column 3: a zero-filled array with one slot per known
  // category, with the slot for this row's category set to 1.
  // The size argument must stay on the same line as Array.ofDim[Double];
  // on a separate line Scala parses it as an unrelated statement.
  val categoryFeaturesArray = Array.ofDim[Double](categoriesMap.size)
  // The lookup uses the raw (still quoted) field because categoriesMap was
  // built from the raw fields as well.
  val categoryIdx = categoriesMap(fields(3))
  categoryFeaturesArray(categoryIdx) = 1
  // Columns 4 up to (but excluding) the last one are numeric features;
  // a "?" placeholder is replaced by 0.0.
  val numericalFeatures = trFields
    .slice(4, fields.size - 1)
    .map(d => if (d == "?") 0.0 else d.toDouble)
  // The last column holds the label.
  val label = trFields(fields.size - 1).toInt
  // Feature vector = one-hot category slots followed by the numeric columns.
  LabeledPoint(label, Vectors.dense(categoryFeaturesArray ++ numericalFeatures))
}
// NOTE(review): DateTime and Duration are used below but never imported in the
// visible part of the file. The constructors used (new DateTime(),
// new Duration(start, end)) match Joda-Time, so the import is made explicit
// here -- confirm joda-time is on the classpath.
import org.joda.time.{DateTime, Duration}
// Randomly split the labeled data into roughly 80% / 10% / 10% for
// training, validation and testing.
val Array(trainData, validationData, testData) =
  labelpointRDD.randomSplit(Array(0.8, 0.1, 0.1))
val startTime = new DateTime()
// Train a decision-tree classifier. Positional arguments, in order:
//   trainData          - training set
//   2                  - number of classes
//   Map[Int, Int]()    - categorical feature info (empty: every feature is
//                        treated as continuous)
//   "entropy"          - impurity measure
//   5                  - maximum tree depth
//   5                  - maximum number of bins
val model = DecisionTree.trainClassifier(trainData, 2, Map[Int, Int](), "entropy", 5, 5)
val endTime = new DateTime()
// Elapsed time between the two timestamps taken around the training call.
val duration = new Duration(startTime, endTime)
// Load test.tsv the same way train.tsv was loaded.
// NOTE(review): these vals reuse the names rawDataWithHeader / rawData / lines
// defined earlier for the training data. That is accepted when the script is
// entered into spark-shell (later definitions shadow earlier ones) but would be
// a duplicate-definition error inside a single compiled scope.
val rawDataWithHeader = sc.textFile("test.tsv")
// Drop the first line of partition 0 (presumably the header row).
val rawData = rawDataWithHeader.mapPartitionsWithIndex { (idx, iter) =>
  if (idx == 0) iter.drop(1) else iter
}
// Split each line into its tab-separated fields.
val lines = rawData.map(_.split("\t"))
// Predict a class for the first 20 rows of the test file and print one line
// per row. take(20) brings the rows to the driver, so the map below runs
// locally and dataRDD ends up as a plain Array[Unit], not an RDD.
val dataRDD = lines.take(20).map { fields =>
  // Remove all double-quote characters from every field.
  val trFields = fields.map(_.replaceAll("\"", ""))
  // One-hot encode column 3 using the category index built from the training
  // data. The size argument must stay on the same line as Array.ofDim[Double].
  val categoryFeaturesArray = Array.ofDim[Double](categoriesMap.size)
  // A category that never occurred in the training data has no slot; leave the
  // one-hot part all zeros instead of failing with NoSuchElementException.
  categoriesMap.get(fields(3)).foreach(idx => categoryFeaturesArray(idx) = 1)
  // Every column from index 4 to the end is a numeric feature here (no label
  // column is sliced off, unlike the training rows); "?" is replaced by 0.0.
  val numericalFeatures = trFields
    .slice(4, fields.size)
    .map(d => if (d == "?") 0.0 else d.toDouble)
  // Column 0 is read as the URL of the row.
  val url = trFields(0)
  val features = Vectors.dense(categoryFeaturesArray ++ numericalFeatures)
  val predict = model.predict(features).toInt
  // Translate the numeric class into a readable description.
  val predictDesc = predict match {
    case 0 => "ephemeral"
    case 1 => "evergreen"
    case other => s"unknown($other)" // guards against a MatchError
  }
  // The original printed only the description after the "url:" prefix and
  // never used `url`; print both.
  println(s"url: $url ==> $predictDesc")
}