import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SaveMode

// Set up the runtime environment
val conf = new SparkConf().setAppName("LDA Test")
  .setMaster("spark://master:7077")
  .setJars(Seq("E:\\Intellij\\Projects\\MachineLearning\\MachineLearning.jar"))
val sc = new SparkContext(conf)
Logger.getRootLogger().setLevel(Level.WARN)
// Connect to the MySQL database; the case classes mirror the table schemas
case class Article(id: Int, article: String)
case class Word(id: Int, word: String)
case class Vocabulary(id: Int, word_id: Int, article_id: Int, count: Int)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val url = "jdbc:mysql://192.168.1.101:3306/machinelearning"
val table_article = "article"
val table_word = "word"
val table_vocabulary = "vocabulary"
val reader = sqlContext.read.format("jdbc")
  .option("url", url)
  .option("driver", "com.mysql.jdbc.Driver")
  .option("user", "hadoop")
  .option("password", "root")
// DataFrameReader is a mutable builder, so dbtable can be swapped between loads
val df_article = reader.option("dbtable", table_article).load()
val df_word = reader.option("dbtable", table_word).load()
val df_vocabulary = reader.option("dbtable", table_vocabulary).load()
// Pull the existing vocabulary rows to the driver for the duplicate check below
val vocuRDD = df_vocabulary.collect()
df_article.show()
df_word.show()
df_vocabulary.show()
// Connection properties for the JDBC writes below
val prop = new java.util.Properties
prop.setProperty("user", "hadoop")
prop.setProperty("password", "root")
prop.setProperty("driver", "com.mysql.jdbc.Driver")
// Read the sample data from HDFS and count occurrences of each word
val textRDD = sc.textFile("hdfs://master:9000/ml/data/Article_01.txt")
val wordRDD = textRDD.flatMap(line => line.split("\\s+")).map(word => (word, 1)).reduceByKey((x, y) => x + y)
println("-----------------------------------------------------------------------")
wordRDD.collect().foreach(p => {
println(p)
var isWrite = true
breakable{
vocuRDD.foreach(q => {
if (p._1.toInt == q.getInt(1)){
// 已经存在这个单词则不写入
isWrite = false
break
}
}
)
}
// 该单词写入表vocabulary
if (isWrite){
prop.setProperty("Word",p._1)
df_vocabulary.write.mode(SaveMode.Append).jdbc(url, table_vocabulary, prop)
// df_vocabulary.save
}
})
df_vocabulary.write.mode(SaveMode.Append).jdbc(url, table_vocabulary, prop)
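
Collecting both the vocabulary table and the word counts to the driver only scales to small datasets. A minimal sketch of a distributed alternative, assuming the same column layout as above, keys both sides by word_id and drops the already-seen keys with subtractByKey:

val existing = df_vocabulary.rdd.map(r => (r.getInt(1), ()))    // (word_id, ())
val candidates = wordRDD.map { case (w, c) => (w.toInt, c) }    // (word_id, count)
// Keep only word ids with no row in the vocabulary table, then append them
val fresh = candidates.subtractByKey(existing)
  .map { case (wid, c) => (wid, 1, c) }                         // article_id = 1, as above
  .toDF("word_id", "article_id", "count")
fresh.write.mode(SaveMode.Append).jdbc(url, table_vocabulary, prop)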
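
The app is named "LDA Test" but the code above stops at populating the vocabulary table. The natural next step is to turn its (article_id, word_id, count) rows into a document-term corpus for MLlib's LDA. This is only a sketch under the assumptions already made (numeric word ids within [0, vocabSize)); k = 3 and 20 iterations are arbitrary illustrative choices:

import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

// Build RDD[(docId, term-count vector)] from the vocabulary rows;
// word_id is assumed to lie in [0, vocabSize)
val vocabSize = df_word.count().toInt
val corpus = df_vocabulary.rdd
  .map(r => (r.getInt(2).toLong, (r.getInt(1), r.getInt(3).toDouble)))
  .groupByKey()
  .map { case (docId, counts) => (docId, Vectors.sparse(vocabSize, counts.toSeq)) }

// Fit an LDA model with an arbitrary number of topics
val ldaModel = new LDA().setK(3).setMaxIterations(20).run(corpus)
println(s"Learned ${ldaModel.k} topics over a vocabulary of ${ldaModel.vocabSize} words")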