1,258
社区成员
发帖
与我相关
我的任务
分享
scala> spark.version
res18: String = 2.4.3
scala> val ds = Seq(Person(1,"Jason",34,null,1),Person(1,"Jason1",null,"Dev",2),Person(1,null,28,"DBA",3),Person(2,"Tom",20,null,1),Person(2,"Tom1",null,"Cooker",2)).toDS
ds: org.apache.spark.sql.Dataset[Person] = [id: bigint, name: string ... 3 more fields]
scala> ds.show(false)
+---+------+----+------+---+
|id |name |age |job |rn |
+---+------+----+------+---+
|1 |Jason |34 |null |1 |
|1 |Jason1|null|Dev |2 |
|1 |null |28 |DBA |3 |
|2 |Tom |20 |null |1 |
|2 |Tom1 |null|Cooker|2 |
+---+------+----+------+---+
scala> // Collapse each id's rows into one record that holds, per field, the non-null value with the highest rn.
scala> // mapGroups is used instead of reduceGroups: a pairwise merge that keeps only the larger rn forgets
scala> // which rn each field came from, so it is not associative (merging rn=1 with rn=3 first lets rn=1's
scala> // name beat rn=2's) and the result would depend on the order in which Spark combines the rows.
scala> ds.groupByKey(_.id).mapGroups { (_, rows) =>
     |   // The group is materialised so it can be sorted by rn; each group holds at least one row.
     |   rows.toVector.sortBy(_.rn).reduceLeft { (merged, next) =>
     |     // `next` has the higher rn: its non-null fields win, its nulls fall back to the merge so far.
     |     // copy keeps next.id (identical within the group) and next.rn (the highest seen so far).
     |     next.copy(
     |       name = if (next.name == null) merged.name else next.name,
     |       age = if (next.age == null) merged.age else next.age,
     |       job = if (next.job == null) merged.job else next.job)
     |   }
     | }.show(false)
+---+------+---+------+---+
|id |name |age|job |rn |
+---+------+---+------+---+
|1 |Jason1|28 |DBA |3 |
|2 |Tom1 |20 |Cooker|2 |
+---+------+---+------+---+