mapreduce 去重的问题怎么解决？

wzl189 2014-06-14 07:05:47

john 89
tom 100
mary 100
mary 200
tom 20
———–
我刚学mapreduce，正在练习，上面这个我计算了很久也不对，就是对第一列去重，去重后应该是3
如果用mapreduce计算成功后，part-00000 的文件内容是：
3
请问下，这个mapreduce怎么写啊？

...全文

1388 15 打赏收藏转发到动态举报

写回复

用AI写文章

15 条回复

切换为时间正序

请发表友善的回复…

发表回复

dary2015 2015-04-10

打赏
举报

不错，太感谢了

pandayp 2015-03-27

打赏
举报

求教，那如果是5列数据，用其中的三列来去重呢。列之间\t 分割，行之间\n分割。map不会写呀怎么取出三列数据

撸大湿 2014-06-21

打赏
举报

map 输出 key 用“班级 + 分隔符 + 姓名”；重写 grouping 实现二次排序，如果 reduce num > 1 还需要重写 partition；reduce 略作修改，增加一个姓名变量，比较当前姓名和前一个姓名是否一致，如果不一致则计数器 += 1。代码就不贴了，LZ多思考一下，这种简单的MR不难解决

wzl189 2014-06-20

打赏
举报

引用 11 楼 tntzbzc 的回复:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that counts the distinct values found in the first column of
 * a two-column, whitespace-separated text input.
 *
 * <p>The mapper emits the first column as the key, the shuffle groups equal
 * keys together, and the single reducer counts how many groups it receives.
 * The job output is one line containing that count.
 */
public class wzl189_distinct {
	/**
	 * Emits the first column of each well-formed input line as the output key.
	 * The value carries no information, so {@link NullWritable} is used.
	 */
	public static class MyMapper extends
			Mapper<Object, Text, Text, NullWritable> {

		// Reused for every record so map() does not allocate a new Text each time.
		Text outKey = new Text();

		/**
		 * Splits the line into columns and writes the first column as the key.
		 *
		 * @param key     input key supplied by the input format (unused)
		 * @param value   one line of input text
		 * @param context task context used to emit the output pair
		 */
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {

			// Split on any run of whitespace so that tabs, repeated spaces and
			// leading/trailing blanks do not cause valid rows to be dropped.
			String[] fields = value.toString().trim().split("\\s+");
			if (fields.length != 2) {
				// Not a two-column row (this also covers blank lines): skip it.
				return;
			}
			outKey.set(fields[0]);
			context.write(outKey, NullWritable.get());
		}
	}

	/**
	 * Counts the number of distinct keys it receives and writes that number
	 * once, when the task finishes. The total is only the global distinct count
	 * when the job runs with exactly one reduce task, as configured in main().
	 */
	public static class MyReducer extends
			Reducer<Text, NullWritable, LongWritable, NullWritable> {

		// Number of reduce() invocations, i.e. number of distinct keys seen.
		long myCount = 0L;

		/**
		 * Called once per distinct key; the values are irrelevant, so only the
		 * invocation itself is counted.
		 */
		@Override
		public void reduce(Text key, Iterable<NullWritable> values,
				Context context) throws IOException, InterruptedException {
			++myCount;
		}

		/**
		 * Writes the accumulated count after the last key has been processed.
		 */
		@Override
		public void cleanup(Context context) throws IOException,
				InterruptedException {
			context.write(new LongWritable(myCount), NullWritable.get());
		}
	}

	/**
	 * Configures and runs the job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		// Validate the command line before doing any other work.
		if (args.length != 2) {
			System.err.println("Usage: <in> <out>");
			System.exit(2);
		}

		Configuration conf = new Configuration();
		// The original value repeated -Xmx ("-Xmx350m -Xmx1024m"); only one
		// maximum heap size can apply, so the larger, final one is kept.
		conf.set("mapred.child.java.opts", "-Xmx1024m");

		Job job = Job.getInstance(conf, "wzl189_distinct");
		// A single reducer is required: each reducer only counts its own keys.
		job.setNumReduceTasks(1);
		job.setInputFormatClass(TextInputFormat.class);
		job.setJarByClass(wzl189_distinct.class);
		job.setMapperClass(MyMapper.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);

		job.setReducerClass(MyReducer.class);
		// Must match the reducer's declared output types (LongWritable, NullWritable).
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

reduce阶段只用一个计数器就行了

太感谢了，你了解这么多啊，我都搞了2周，没有结果，想再请教最后一个问题：假如第一列是姓名，第二列是班级（先不管我这个需求是否合理），数据为：john 100、john 100、mary 100、mary 200、tom 200，想统计出如下结果，即按班级统计去重后的人数：100 2、200 2。这个mapreduce怎么写啊？望高手最后再解答下，万分感谢了。

撸大湿 2014-06-20

打赏
举报

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that counts the distinct values found in the first column of
 * a two-column, whitespace-separated text input.
 *
 * <p>The mapper emits the first column as the key, the shuffle groups equal
 * keys together, and the single reducer counts how many groups it receives.
 * The job output is one line containing that count.
 */
public class wzl189_distinct {
	/**
	 * Emits the first column of each well-formed input line as the output key.
	 * The value carries no information, so {@link NullWritable} is used.
	 */
	public static class MyMapper extends
			Mapper<Object, Text, Text, NullWritable> {

		// Reused for every record so map() does not allocate a new Text each time.
		Text outKey = new Text();

		/**
		 * Splits the line into columns and writes the first column as the key.
		 *
		 * @param key     input key supplied by the input format (unused)
		 * @param value   one line of input text
		 * @param context task context used to emit the output pair
		 */
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {

			// Split on any run of whitespace so that tabs, repeated spaces and
			// leading/trailing blanks do not cause valid rows to be dropped.
			String[] fields = value.toString().trim().split("\\s+");
			if (fields.length != 2) {
				// Not a two-column row (this also covers blank lines): skip it.
				return;
			}
			outKey.set(fields[0]);
			context.write(outKey, NullWritable.get());
		}
	}

	/**
	 * Counts the number of distinct keys it receives and writes that number
	 * once, when the task finishes. The total is only the global distinct count
	 * when the job runs with exactly one reduce task, as configured in main().
	 */
	public static class MyReducer extends
			Reducer<Text, NullWritable, LongWritable, NullWritable> {

		// Number of reduce() invocations, i.e. number of distinct keys seen.
		long myCount = 0L;

		/**
		 * Called once per distinct key; the values are irrelevant, so only the
		 * invocation itself is counted.
		 */
		@Override
		public void reduce(Text key, Iterable<NullWritable> values,
				Context context) throws IOException, InterruptedException {
			++myCount;
		}

		/**
		 * Writes the accumulated count after the last key has been processed.
		 */
		@Override
		public void cleanup(Context context) throws IOException,
				InterruptedException {
			context.write(new LongWritable(myCount), NullWritable.get());
		}
	}

	/**
	 * Configures and runs the job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		// Validate the command line before doing any other work.
		if (args.length != 2) {
			System.err.println("Usage: <in> <out>");
			System.exit(2);
		}

		Configuration conf = new Configuration();
		// The original value repeated -Xmx ("-Xmx350m -Xmx1024m"); only one
		// maximum heap size can apply, so the larger, final one is kept.
		conf.set("mapred.child.java.opts", "-Xmx1024m");

		Job job = Job.getInstance(conf, "wzl189_distinct");
		// A single reducer is required: each reducer only counts its own keys.
		job.setNumReduceTasks(1);
		job.setInputFormatClass(TextInputFormat.class);
		job.setJarByClass(wzl189_distinct.class);
		job.setMapperClass(MyMapper.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);

		job.setReducerClass(MyReducer.class);
		// Must match the reducer's declared output types (LongWritable, NullWritable).
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

reduce阶段只用一个计数器就行了

撸大湿 2014-06-20

打赏
举报

我晚点写个完整例子给你

wzl189 2014-06-20

打赏
举报

引用 1 楼 tntzbzc 的回复:

map 以第一列为 key，value 无所谓；在 reduce class 中初始化一个计数器，每次调用 reduce 方法时计数器加一；最后在 reduce 的 cleanup 方法中 commit 计数器就可以了

谢谢了，请教下，你说的这个map我知道怎么写了，但是这个reduce怎么写啊？

wzl189 2014-06-20

打赏
举报

引用 7 楼 wzl189 的回复:

[quote=引用 4 楼 tjytad1982 的回复:] 学习

public static class Map extends Mapper<LongWritable, Text, Text, Text> { public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); try { String[] lineSplit = line.split("\t"); context.write(new Text(lineSplit[0]), new Text("")); context.write(new Text("uniq") ,new Text(lineSplit[0]) ); } catch (java.lang.ArrayIndexOutOfBoundsException e) { context.getCounter(Counter.LINESKIP).increment(1); return; } } } public static class Reduce extends Reducer<Text, Text, Text, Text> { private Set<String> count = new HashSet<String>(); public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { for(Text value:values){ count.add(value.toString()); } context.write("uniq", new Text(count.size()+"")); } } ------------------------- 这个问题纠结我2周了，这个方面的学习资料太少了，我的map和reduce是这样写的，但是数据量大一些，就会内存溢出，我想我这个思路是错误的你说的 “必须用reduce去group后的key才能得到去重效果 ”，这个 map和reduce是具体怎么写的啊？[/quote] -------------刚才写的mapreduce错了，以这个为准

wzl189 2014-06-20

打赏
举报

引用 4 楼 tjytad1982 的回复:

学习

public static class Map extends Mapper<LongWritable, Text, Text, Text> { public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); try { String[] lineSplit = line.split("\t"); context.write(new Text(lineSplit[0]), new Text("")); } catch (java.lang.ArrayIndexOutOfBoundsException e) { context.getCounter(Counter.LINESKIP).increment(1); return; } } } public static class Reduce extends Reducer<Text, Text, Text, Text> { private Set<String> count = new HashSet<String>(); public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { for(Text value:values){ count.add(value.toString()); } context.write(key, new Text("")); } } ------------------------- 这个问题纠结我2周了，这个方面的学习资料太少了，我的map和reduce是这样写的，但是数据量大一些，就会内存溢出，我想我这个思路是错误的你说的 “必须用reduce去group后的key才能得到去重效果 ”，这个 map和reduce是具体怎么写的啊？