291,254
社区成员
import org.apache.spark.sql.SparkSession
/**
 * Step 1: load the tab-separated trip table and strip stray `@` / `$` characters
 * from every column.
 *
 * Reads `/root/data.csv` (first row is the header, fields separated by tabs),
 * exposes it as the temp view `data`, and prints the cleaned rows with `show()`.
 */
object Step1 {

  /**
   * Entry point. Runs the cleaning query on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Step1").master("local").getOrCreate()
    // try/finally: the session is stopped even when reading the file or running the query fails.
    try {
      /**********begin**********/
      val frame = spark.read.option("header", true).option("delimiter", "\t").csv("/root/data.csv")
      // createOrReplaceTempView does not fail when a view named "data" already exists
      // in a session that getOrCreate() handed back.
      frame.createOrReplaceTempView("data")

      // Removes every run of '@' and every run of '$' from the value.
      // A String-typed UDF receives SQL NULL as a null reference, so the input is wrapped
      // in Option and NULL is passed through unchanged instead of throwing.
      spark.udf.register("cleanData", (x: String) =>
        Option(x).map(_.replaceAll("\\@+", "").replaceAll("\\$+", "")).orNull
      )

      spark.sql(
        """
          |select cleanData(TRIP_ID) as TRIP_ID,cleanData(CALL_TYPE) as CALL_TYPE,cleanData(ORIGIN_CALL) as ORIGIN_CALL,
          |cleanData(TAXI_ID) as TAXI_ID,cleanData(ORIGIN_STAND) as ORIGIN_STAND ,cleanData(TIMESTAMP) as TIMESTAMP,
          |cleanData(POLYLINE) as POLYLINE
          |from data
        """.stripMargin).show()
      /**********end**********/
    } finally {
      spark.stop()
    }
  }
}
import com.alibaba.fastjson.JSON
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StringType
/**
 * Step 2: derive per-trip features from the trip table and count calls per day.
 *
 * Reads `/root/data2.csv` (first row is the header, fields separated by tabs) and then:
 *  1. converts TIMESTAMP into a `yyyy-MM-dd` string column named TIME;
 *  2. extracts the first and last element of the POLYLINE JSON array as
 *     startLocation / endLocation;
 *  3. computes the trip duration as (number of points - 1) * 15 seconds;
 *  4. counts rows per (CALL_TYPE, TIME), ordered ascending by CALL_TYPE then TIME.
 *
 * Each intermediate result is printed with `show()`.
 */
object Step2 {

  /**
   * Entry point. Runs the four transformation steps on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Step2").master("local").getOrCreate()
    spark.sparkContext.setLogLevel("error")
    // try/finally: the session is stopped even when reading the file or a query fails.
    try {
      /**********begin**********/
      val frame = spark.read.option("header", true).option("delimiter", "\t").csv("/root/data2.csv")
      // createOrReplaceTempView does not fail when the view name is already taken
      // in a session that getOrCreate() handed back.
      frame.createOrReplaceTempView("data")

      // 1. Convert the timestamp column into a yyyy-MM-dd date string.
      spark.sql(
        "select TRIP_ID,CALL_TYPE,ORIGIN_CALL,TAXI_ID,ORIGIN_STAND,POLYLINE, " +
          "from_unixtime(TIMESTAMP,'yyyy-MM-dd') as TIME from data"
      ).createOrReplaceTempView("data2")
      spark.sql("select * from data2").show()

      // Parses a POLYLINE value into its JSON array. Yields None when the parser returns
      // no array (e.g. for a null input) or when the array has no elements, so the UDFs
      // below produce SQL NULL for such rows instead of throwing.
      val parsePoints = (polyline: String) =>
        Option(JSON.parseArray(polyline)).filterNot(_.isEmpty)

      // 2. Split POLYLINE into two columns: startLocation (first point) and endLocation (last point).
      spark.udf.register("startLocation", (x: String) =>
        parsePoints(x).map(points => points.get(0).toString)
      )
      spark.udf.register("endLocation", (x: String) =>
        parsePoints(x).map(points => points.get(points.size() - 1).toString)
      )
      spark.sql(
        """
          |select TRIP_ID,CALL_TYPE,ORIGIN_CALL,TAXI_ID,ORIGIN_STAND,POLYLINE,TIME,
          |startLocation(POLYLINE) as startLocation,endLocation(POLYLINE) as endLocation
          |from data2
        """.stripMargin).createOrReplaceTempView("data3")
      spark.sql("select * from data3").show()

      // 3. Trip duration: total travel time is defined as (number of points - 1) * 15 seconds.
      //    Example: a POLYLINE with 101 points lasts (101 - 1) * 15 = 1500 seconds.
      //    A missing or empty POLYLINE has no defined duration and yields NULL.
      spark.udf.register("timeLen", (x: String) =>
        parsePoints(x).map(points => (points.size() - 1) * 15)
      )
      // startLocation / endLocation are already columns of data3, so they are selected
      // directly rather than re-parsing POLYLINE through the UDFs a second time.
      spark.sql(
        """
          |select TRIP_ID,CALL_TYPE,ORIGIN_CALL,TAXI_ID,ORIGIN_STAND,POLYLINE,TIME,
          |startLocation,endLocation,timeLen(POLYLINE) as timeLen
          |from data3
        """.stripMargin).createOrReplaceTempView("data4")
      spark.sql("select * from data4").show()

      // 4. Count each call type per day, sorted ascending by CALL_TYPE, then TIME.
      spark.sql(
        """
          |select CALL_TYPE ,TIME,count(1) as num from data4 group by TIME,CALL_TYPE order by CALL_TYPE,TIME
        """.stripMargin).show()
      /**********end**********/
    } finally {
      spark.stop()
    }
  }
}