1,258
社区成员
发帖
与我相关
我的任务
分享
// Spark shell session: per-key aggregation over an RDD of (String, Int) pairs.
// NOTE(review): `c` is used on the next line, but its only visible definition is the
// last statement of this transcript (and it gets a higher RDD id). Presumably an
// earlier definition with the same data existed in the session -- confirm.

// Wrap every value as (value, 0) so each key carries two Int fields into the reduce step.
scala> val cm = c.map(e => (e._1, (e._2, 0)))
cm: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[25] at map at <console>:23
// Merge the values of each key:
//   - first field  = sum of the operands' first fields;
//   - second field = sum of the operands' first fields, each halved with Int division.
//     The operands' own second fields (e1._2, e2._2) are never read.
// NOTE(review): the second field is derived from partial sums, so its value depends on
// the order and grouping in which elements are merged (e.g. merging a's values as
// (1,3) then 2 would give 3 instead of the 2 shown below). reduceByKey expects an
// associative and commutative function -- confirm what this field is meant to compute.
scala> val cr = cm.reduceByKey((e1, e2) => (e1._1 + e2._1, e1._1/2 + e2._1/2))
cr: org.apache.spark.rdd.RDD[(String, (Int, Int))] = ShuffledRDD[26] at reduceByKey at <console>:25
// Flatten (key, (first, second)) into a plain triple (key, first, second).
scala> val cz = cr.map(e => (e._1, e._2._1, e._2._2))
cz: org.apache.spark.rdd.RDD[(String, Int, Int)] = MapPartitionsRDD[27] at map at <console>:27
// Materialize the result as an Array. Key "c" has a single element and still shows the
// initial 0 in its last field, consistent with the merge function not being applied to it.
scala> cz.collect
res15: Array[(String, Int, Int)] = Array((b,3,1), (a,6,2), (c,1,0))
// Input RDD: six (key, value) pairs over the keys "a", "b" and "c".
scala> val c = sc.parallelize(List(("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("c", 1)))
c: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[28] at parallelize at <console>:21