spark处理时间长,代码参数如下,求教如何提升效率
spark处理数据,处理时间太久,求教如何改变可以提升程序性能??????
详情如下:
数据1:百G或者TB级
数据2:10万条数据
条件:数据1包含数据2中的数据
结果:百G数据处理时间8小时左右,TB级各种文件找不到错误
提交命令:
spark-submit --class com.spark.rdd.UrlAnalyseTest2 --master yarn-client --num-executors 30 --executor-memory 10G --executor-cores 5 --driver-memory 3g --conf spark.driver.maxResultSize=2g --conf spark.akka.frameSize=50 --conf spark.storage.memoryFraction=0.5 --conf spark.default.parallelism=500 --conf spark.yarn.executor.memoryOverhead=2048 --conf spark.core.connection.ack.wait.timeout=300 --conf spark.shuffle.memoryFraction=0.1 test.jar /home1 /QueueD1 10000 /QueueD2
代码如下:
SparkConf sc=new SparkConf().setAppName("urlanalyseTest2 ");
JavaSparkContext jsc=new JavaSparkContext(sc);
JavaRDD<String> data1=jsc.textFile(args[0]);
// args[1]: path of the rule file, read as one String per line.
JavaRDD<String> ruleLines = jsc.textFile(args[1]);
// Keep only the lines that split into exactly three '|'-separated fields.
JavaRDD<String> wellFormedRules = ruleLines.filter(new Function<String, Boolean>() {
    @Override
    public Boolean call(String line) throws Exception {
        int fieldCount = line.split("\\|").length;
        return fieldCount == 3;
    }
});
// Bring the surviving rule lines back to the driver as a plain list ...
List<String> url1 = wellFormedRules.collect();
// ... and publish that list to the executors as a broadcast variable.
final Broadcast<List<String>> bc = jsc.broadcast(url1);
// Keep only input lines that split into exactly 26 '|'-separated fields
// and whose first (index 0) and last (index 25) fields are both non-empty.
JavaRDD<String> data2 = data1.filter(new Function<String, Boolean>() {
    @Override
    public Boolean call(String line) throws Exception {
        String[] fields = line.split("\\|");
        if (fields.length != 26) {
            return false;
        }
        if (fields[25].isEmpty()) {
            return false;
        }
        return !fields[0].isEmpty();
    }
});
// Project every validated line down to "<field 0>|<field 25>".
JavaRDD<String> firstAndLastField = data2.map(new Function<String, String>() {
    @Override
    public String call(String line) throws Exception {
        String[] fields = line.split("\\|");
        StringBuilder pair = new StringBuilder(fields[0]);
        pair.append("|").append(fields[25]);
        return pair.toString();
    }
});
// args[2]: number of partitions to redistribute the projected records into.
JavaRDD<String> url = firstAndLastField.repartition(Integer.parseInt(args[2]));
// For every "<phone>|<url>" record, find the FIRST broadcast rule (in list order)
// whose leading '|'-separated field occurs as a substring of the record's url,
// and emit "<full rule line>|<phone>". Records that match no rule emit nothing.
JavaRDD<String> data3 = url.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
    @Override
    public Iterable<String> call(Iterator<String> stringIterator) throws Exception {
        List<String> rules = bc.value();

        // Extract each rule's search key once per partition. Doing the split inside
        // the record loop repeats identical work for every (record, rule) pair,
        // which dominates the run time when the rule list is large.
        String[] needles = new String[rules.size()];
        for (int i = 0; i < needles.length; i++) {
            needles[i] = rules.get(i).split("\\|")[0];
        }

        List<String> matches = new ArrayList<String>();
        while (stringIterator.hasNext()) {
            String[] record = stringIterator.next().split("\\|");
            String phone = record[0];
            String recordUrl = record[1];
            for (int i = 0; i < needles.length; i++) {
                if (recordUrl.contains(needles[i])) {
                    // First match wins; add it directly instead of going through an
                    // empty-string sentinel that would need a string comparison.
                    matches.add(rules.get(i) + "|" + phone);
                    break;
                }
            }
        }
        return matches;
    }
}).filter(new Function<String, Boolean>() {
    @Override
    public Boolean call(String v1) throws Exception {
        // Keep only non-null, non-empty results. String content must be checked with
        // isEmpty()/equals(): the != operator compares object identity, and chaining
        // such checks with || accepts every value.
        return v1 != null && !v1.isEmpty();
    }
});
// args[3]: output directory; this action triggers execution of the whole pipeline.
data3.saveAsTextFile(args[3]);