How can I remove duplicate records from a file with MapReduce?

黄色沙琪玛 2014-07-18 04:22:14
File data format:
Name,CardNo,Descriot,CtfTp,CtfId,Gender,Birthday,Address
彬,,,ID,xxxxx21059,M,20101001,,,F,,CHN,,,,,,,,139,19:46:02,4000000
珊,,,ID,xxxxx811090189,M,20101001,,,F,,CHN,,,,,,,,158
世,,,ID,4xxxxx1772,M,19880129,,,F,,CHN,,,,,,,,152108,2,1,,,,,0,2010-10-19
超,,,ID,4xxxxx1772,M,20101001,,,F,,CHN,,,,,,,,1585091,,,mc06o.cn,,,,,,,,0,2010-10-19 19:46:04,4000003
磊,,,ID,4xxxxx1772,M,19831011,,,F,,CHN,,,,,,,,159159,362,,565408,,,,,,,,0,2010-10-19 19:46:05,4000004
芳,,,ID,xxxxx811090189,M,20101001,,,F,,CHN,,,,,,,,1506:05,4005
Requirement: after reading the file, I want to deduplicate the records based on the CtfTp (ID-card) field, keeping only the first record for each ID and deleting the rest.
After deduplication the data should look like this:
彬,,,ID,xxxxx21059,M,20101001,,,F,,CHN,,,,,,,,139,19:46:02,4000000
珊,,,ID,xxxxx811090189,M,20101001,,,F,,CHN,,,,,,,,158
世,,,ID,4xxxxx1772,M,19880129,,,F,,CHN,,,,,,,,152108,2,1,,,,,0,2010-10-19

Note: the above is my data file format. Name, CardNo, Descriot, CtfTp, CtfId, Gender, Birthday, Address are the fields, matching the data lines below; empty fields are simply separated by commas (","). CtfTp corresponds to the ID-card information, and I need to remove the records with duplicate ID-card numbers: whenever a duplicate ID appears, delete that record and keep only the first one.
10 replies
xuhui32 2014-08-10
So this is the legendary guest-registration data leaked from that XX hotel, isn't it?
wulinshishen
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.commons.lang.StringUtils; // assuming commons-lang for isBlank
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class RemoveDuplicate {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.out.println("error");
			System.exit(2);
		}
		Job job = Job.getInstance(conf);
		job.setJarByClass(RemoveDuplicate.class);
		job.setMapperClass(RemoveDuplicateMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setReducerClass(RemoveDuplicateReducer.class);
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.out.println(job.waitForCompletion(true) ? 0 : 1);
	}
}

class RemoveDuplicateMapper extends Mapper<LongWritable, Text, Text, Text> {

	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		if (StringUtils.isBlank(line)) {
			return;
		}
		// key = text before the first ':' in the line
		StringTokenizer token = new StringTokenizer(line, ":");
		String id = token.nextToken();
		context.write(new Text(id), value);
	}
}

class RemoveDuplicateReducer extends Reducer<Text, Text, NullWritable, Text> {

	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		// emit only the first record for each key
		for (Text value : values) {
			context.write(NullWritable.get(), value);
			break;
		}
	}
}
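One caveat about the mapper above: the StringTokenizer splits on ":", so the key is whatever precedes the first colon in each line. For the comma-separated data in the question, keying on the CtfId column (for example String id = line.split(",", -1)[4];) would probably match the requirement more closely.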
黄色沙琪玛 2014-07-19
Quoting wulinshishen's reply in post #8 (the code above):
Thanks, marking the thread as resolved.
黄色沙琪玛 2014-07-18
Quoting wulinshishen's reply in post #4:
Map emits <CtfTp, current record line>; Reduce picks one record per key to output.

package cn.trimData;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * @author z
 * @version created: 2014 11:08:02 PM
 * Description: removes duplicate ID-card records
 * version 1.0
 */
public class ReDuplicate {

	static class DupMapper extends Mapper<Object,Text,Text,Text>{
		@Override
		protected void map(Object key, Text value,Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String outKey = "";
			if(line.indexOf("Name") >= 0){
				return ;
			}
			String[] str1 = line.split(",");
			if(str1.length == 8){
				outKey = str1[4];
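				// note: outKey.length() can never be < 0; this check was probably meant to be == 0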
				if(outKey.length() < 0){
					return ;
				}
			}
			context.write(new Text(outKey), new Text(line));
		}
	}
	
	static class DupReduce extends Reducer<Text,Text,NullWritable,Text>{
		@Override
		protected void reduce(Text key, Iterable<Text> values,
				Context context)
				throws IOException, InterruptedException {
			Text value = new Text("");
			Text next = values.iterator().next();
			for(Text outvalue:values){
				next = outvalue;
			}
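			// 'values' here is the framework's Iterable (ReduceContextImpl$ValueIterable),
			// so casting it to Text fails; this is the ClassCastException reported below.
			// Writing 'next' (a single Text) was probably the intent.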
			context.write(NullWritable.get(), (Text) values);
		}
	}
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		 String otherArgs[] = (new GenericOptionsParser(conf, args)).getRemainingArgs();
	        if(otherArgs.length != 2)
	        {
	            System.out.println(" ");
	            System.exit(1);
	        }
	        Job job = new Job(conf, "new ");
	        job.setJarByClass(ReDuplicate.class);
	        job.setMapperClass(DupMapper.class);
	        job.setReducerClass(DupReduce.class);
	        job.setMapOutputKeyClass(Text.class);
	        job.setOutputKeyClass(NullWritable.class);
	        
	        job.setOutputKeyClass(Text.class);
	        job.setOutputKeyClass(Text.class);
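	        // the three setOutputKeyClass calls above overwrite each other;
	        // setMapOutputValueClass / setOutputValueClass were probably intended here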
	        
	        FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
	        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	        System.out.println((job.waitForCompletion(true)? 0 :1));
	}

}
Exception 1 that gets thrown:

org.apache.hadoop.mapreduce.task.ReduceContextImpl$ValueIterable cannot be cast to org.apache.hadoop.io.Text
	at cn.trimData.ReDuplicate$DupReduce.reduce(ReDuplicate.java:54)
That is, the cast in context.write(NullWritable.get(), (Text) values); inside the reduce method throws a ClassCastException. Exception 2:

 ERROR hdfs.DFSClient (DFSClient.java:closeAllFilesBeingWritten(712)) - Failed to close file /testout/010/_temporary/_attempt_local1576430195_0001_r_000000_0/part-r-00000
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException): No lease on /testout/010/_temporary/_attempt_local1576430195_0001_r_000000_0/part-r-00000: File does not exist. Holder DFSClient_NONMAPREDUCE_1155748347_1 does not have any open files.
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2543)
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2535)
These exceptions are thrown when writing the output.
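For reference: the cast fails because values is the whole Iterable handed to reduce, not a single Text, which is exactly what exception 1 says; the LeaseExpiredException is most likely just fallout from the failed reduce attempt, whose temporary output file had already been cleaned up. A minimal corrected reducer, doing the same thing as the RemoveDuplicateReducer in wulinshishen's post above (the class name here is only illustrative):

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Writes exactly one record per key instead of casting the Iterable itself.
class KeepFirstReducer extends Reducer<Text, Text, NullWritable, Text> {
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		for (Text value : values) {
			context.write(NullWritable.get(), value);
			break;
		}
	}
}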
黄色沙琪玛 2014-07-18
Quoting liqi_wj's reply in post #2:
Wouldn't this be simpler to write in Hive?
The file is very large, so Hive isn't a good fit. Deduplication is only the first step; there are many more steps afterwards: splitting by region, splitting by gender, and so on.
黄色沙琪玛 2014-07-18
Quoting liqi_wj's reply in post #1:
Are you sure the expected result you gave is correct?
The result I want is to check by CtfTp whether an ID-card number is duplicated; if it is, keep the first record and delete the others.
wulinshishen 2014-07-18
Map emits <CtfTp, current record line>; Reduce picks one record per key to output.
liqi_wj 2014-07-18
If your input comes from more than one file, or there is more than one map task, you can't really tell which record counts as the "first" one anyway, can you?
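One way to pin down "first" in that case is to carry the map input offset along with the record and let the reducer keep the line with the smallest offset. A rough sketch, assuming a single input file (byte offsets then give a global order; with several input files you would also need a file identifier), with illustrative class names; it would be wired into a Job the same way as the drivers shown above (Text map output key/value, NullWritable/Text output):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Mapper: key = CtfId column, value = "byteOffset<TAB>original line".
class FirstRecordMapper extends Mapper<LongWritable, Text, Text, Text> {
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		if (line.isEmpty() || line.startsWith("Name,")) {
			return; // skip blank lines and the header
		}
		String[] fields = line.split(",", -1);
		if (fields.length < 5) {
			return;
		}
		String ctfId = fields[4]; // CtfId is the 5th column in the stated format
		context.write(new Text(ctfId), new Text(key.get() + "\t" + line));
	}
}

// Reducer: among all lines sharing a CtfId, keep the one with the smallest offset.
class FirstRecordReducer extends Reducer<Text, Text, NullWritable, Text> {
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		long bestOffset = Long.MAX_VALUE;
		String bestLine = null;
		for (Text value : values) {
			String[] parts = value.toString().split("\t", 2);
			long offset = Long.parseLong(parts[0]);
			if (offset < bestOffset) {
				bestOffset = offset;
				bestLine = parts[1];
			}
		}
		if (bestLine != null) {
			context.write(NullWritable.get(), new Text(bestLine));
		}
	}
}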
liqi_wj 2014-07-18
Wouldn't this be simpler to write in Hive?
liqi_wj 2014-07-18
Are you sure the expected result you gave is correct?
