关于mapreduce，再reduce方法中使用两次foreach来遍历values

星夜丶晚晚 2019-03-13 10:55:17

想用mapreduce找出几天中，每一天的气温最高的记录；代码思路很简单，日期作为key，时间和温度作为value。然后在reduce方法中对values先遍历一遍找到最高温度max，再遍历一遍values，找到最高温度的记录，但是很可惜，程序能输出，但是输出文件没有内容，不知是何缘由。个人怀疑是不是不能使用两次遍历。
输入文件中的格式是：日期时间温度
例：
2017-06-23 08 12
2017-06-23 12 25
代码如下：

import java.io.IOException;

import java.net.URI;



import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;



import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;



public class MaxTemperature {

	public static class MaxTemperatureMap extends Mapper<Object, Text, Text, Text>

	{

		@Override 

		public void map(Object key,Text value,Context context) throws IOException,InterruptedException

		{

			String[] str = value.toString().split(" ");

			context.write(new Text(str[0]), new Text(str[1]+str[2]));

		}

	}



	public static class MaxTemperatureReduce extends Reducer<Text, Text, Text, Text> 

	{



		@Override 

		public void reduce(Text key ,Iterable<Text> values ,Context context) throws IOException,InterruptedException

		{

			//先找到最高温度max

                        int max=Integer.MIN_VALUE;

			for(Text value:values) {

				  max=Math.max(max, Integer.parseInt(value.toString().substring(2)));



				}

//			context.write(key, new Text(String.valueOf(max)));

//再找到取到最高温度max时的记录，输出到文件

			for(Text value:values) {

				if (max==Integer.parseInt(value.toString().substring(2))) 

				{

					context.write(key, value);

				}

			}

		

//			context.write(new Text(key.toString()+":"+value.toString().substring(0, 2)), new IntWritable(max));

		}

	}

	

	public static void main(String[] args) throws Exception {

//		设置HDFS

		String ipName = "127.0.0.1";

		String hdfs = "hdfs://"+ipName+":9000";

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		String jobName = "MaxTemperature";

		removeOutput(conf,hdfs);

		

		job.setJarByClass(MaxTemperature.class);

		job.setMapperClass(MaxTemperatureMap.class);

		job.setMapOutputKeyClass(Text.class);

		job.setMapOutputValueClass(Text.class);

		

//		job.setCombinerClass(MaxTemperatureCombine.class);

		job.setReducerClass(MaxTemperatureReduce.class);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(Text.class);

		



		//3.设置作业输入和输出路径

		String dataDir = "/workspace/flowStatistics/date_data";				//实验数据目录	

		String outputDir = "/workspace/flowStatistics/maxTemperature";	//实验输出目录

		Path inPath = new Path(hdfs + dataDir);

		Path outPath = new Path(hdfs + outputDir);



		FileInputFormat.addInputPath(job, inPath);

		FileOutputFormat.setOutputPath(job, outPath);

		

		System.out.println("Job: " + jobName + " is running...");

		if(job.waitForCompletion(true)) {

			System.out.println("success!");

			System.exit(0);

		} else {

			System.out.println("failed!");

			System.exit(1);

		}

	}

	// 这个方法是为了 避免因为output文件已经存在而报错。直接删掉上一次运行的output文件夹。

		private static void removeOutput(Configuration conf, String ipPre)

				throws IOException {

			String outputPath = ipPre + "/workspace/flowStatistics/maxTemperature";

			FileSystem fs = FileSystem.get(URI.create(outputPath), conf);

			Path path = new Path(outputPath);

			if (fs.exists(path)) {

				fs.deleteOnExit(path);

			}

			fs.close();

		}

}