How can I remove duplicate records from a file with MapReduce?

黄色沙琪玛 2014-07-18 04:22:14
File data format:
Name,CardNo,Descriot,CtfTp,CtfId,Gender,Birthday,Address
彬,,,ID,xxxxx21059,M,20101001,,,F,,CHN,,,,,,,,139,19:46:02,4000000
珊,,,ID,xxxxx811090189,M,20101001,,,F,,CHN,,,,,,,,158
世,,,ID,4xxxxx1772,M,19880129,,,F,,CHN,,,,,,,,152108,2,1,,,,,0,2010-10-19
超,,,ID,4xxxxx1772,M,20101001,,,F,,CHN,,,,,,,,1585091,,,mc06o.cn,,,,,,,,0,2010-10-19 19:46:04,4000003
磊,,,ID,4xxxxx1772,M,19831011,,,F,,CHN,,,,,,,,159159,362,,565408,,,,,,,,0,2010-10-19 19:46:05,4000004
芳,,,ID,xxxxx811090189,M,20101001,,,F,,CHN,,,,,,,,1506:05,4005
Requirement: after reading the file, I want to deduplicate the records based on the CtfTp (ID-card) field, keeping only the first record for each ID and deleting the rest.
After deduplication the data should look like this:
彬,,,ID,xxxxx21059,M,20101001,,,F,,CHN,,,,,,,,139,19:46:02,4000000
珊,,,ID,xxxxx811090189,M,20101001,,,F,,CHN,,,,,,,,158
世,,,ID,4xxxxx1772,M,19880129,,,F,,CHN,,,,,,,,152108,2,1,,,,,0,2010-10-19

Note: the above is my data file format. Name, CardNo, Descriot, CtfTp, CtfId, Gender, Birthday, Address are the fields, matching the data lines below; empty fields are simply separated by commas (","). CtfTp corresponds to the ID-card information, and I need to remove the records with duplicate ID-card numbers: whenever a duplicate ID appears, delete that record and keep only the first one.
10 replies
xuhui32 2014-08-10
So this is the legendary guest-registration data leaked from that XX hotel, isn't it?
wulinshishen
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.commons.lang.StringUtils; // assuming commons-lang for isBlank
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class RemoveDuplicate {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.out.println("error");
			System.exit(2);
		}
		Job job = Job.getInstance(conf);
		job.setJarByClass(RemoveDuplicate.class);
		job.setMapperClass(RemoveDuplicateMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setReducerClass(RemoveDuplicateReducer.class);
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.out.println(job.waitForCompletion(true) ? 0 : 1);
	}
}

class RemoveDuplicateMapper extends Mapper<LongWritable, Text, Text, Text> {

	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		if (StringUtils.isBlank(line)) {
			return;
		}
		// key = text before the first ':' in the line
		StringTokenizer token = new StringTokenizer(line, ":");
		String id = token.nextToken();
		context.write(new Text(id), value);
	}
}

class RemoveDuplicateReducer extends Reducer<Text, Text, NullWritable, Text> {

	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		// emit only the first record for each key
		for (Text value : values) {
			context.write(NullWritable.get(), value);
			break;
		}
	}
}
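One caveat about the mapper above: the StringTokenizer splits on ":", so the key is whatever precedes the first colon in each line. For the comma-separated data in the question, keying on the CtfId column (for example String id = line.split(",", -1)[4];) would probably match the requirement more closely.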
黄色沙琪玛 2014-07-19
Quoting wulinshishen's reply in post #8 (the code above):
Thanks, marking the thread as resolved.
黄色沙琪玛 2014-07-18
Quoting wulinshishen's reply in post #4:
Map emits <CtfTp, current record line>; Reduce picks one record per key to output.

package cn.trimData;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * @author z
 * @version created: 2014 11:08:02 PM
 * Description: removes duplicate ID-card records
 * version 1.0
 */
public class ReDuplicate {

	static class DupMapper extends Mapper<Object,Text,Text,Text>{
		@Override
		protected void map(Object key, Text value,Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String outKey = "";
			if(line.indexOf("Name") >= 0){
				return ;
			}
			String[] str1 = line.split(",");
			if(str1.length == 8){
				outKey = str1[4];
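				// note: outKey.length() can never be < 0; this check was probably meant to be == 0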
				if(outKey.length() < 0){
					return ;
				}
			}
			context.write(new Text(outKey), new Text(line));
		}
	}
	
	static class DupReduce extends Reducer<Text,Text,NullWritable,Text>{
		@Override
		protected void reduce(Text key, Iterable<Text> values,
				Context context)
				throws IOException, InterruptedException {
			Text value = new Text("");
			Text next = values.iterator().next();
			for(Text outvalue:values){
				next = outvalue;
			}
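			// 'values' here is the framework's Iterable (ReduceContextImpl$ValueIterable),
			// so casting it to Text fails; this is the ClassCastException reported below.
			// Writing 'next' (a single Text) was probably the intent.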
			context.write(NullWritable.get(), (Text) values);
		}
	}
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		 String otherArgs[] = (new GenericOptionsParser(conf, args)).getRemainingArgs();
	        if(otherArgs.length != 2)
	        {
	            System.out.println(" ");
	            System.exit(1);
	        }
	        Job job = new Job(conf, "new ");
	        job.setJarByClass(ReDuplicate.class);
	        job.setMapperClass(DupMapper.class);
	        job.setReducerClass(DupReduce.class);
	        job.setMapOutputKeyClass(Text.class);
	        job.setOutputKeyClass(NullWritable.class);
	        
	        job.setOutputKeyClass(Text.class);
	        job.setOutputKeyClass(Text.class);
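	        // the three setOutputKeyClass calls above overwrite each other;
	        // setMapOutputValueClass / setOutputValueClass were probably intended here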
	        
	        FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
	        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	        System.out.println((job.waitForCompletion(true)? 0 :1));
	}

}
Exception 1 that gets thrown:

org.apache.hadoop.mapreduce.task.ReduceContextImpl$ValueIterable cannot be cast to org.apache.hadoop.io.Text
	at cn.trimData.ReDuplicate$DupReduce.reduce(ReDuplicate.java:54)
That is, the cast in context.write(NullWritable.get(), (Text) values); inside the reduce method throws a ClassCastException. Exception 2:

 ERROR hdfs.DFSClient (DFSClient.java:closeAllFilesBeingWritten(712)) - Failed to close file /testout/010/_temporary/_attempt_local1576430195_0001_r_000000_0/part-r-00000
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException): No lease on /testout/010/_temporary/_attempt_local1576430195_0001_r_000000_0/part-r-00000: File does not exist. Holder DFSClient_NONMAPREDUCE_1155748347_1 does not have any open files.
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2543)
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkLease(FSNamesystem.java:2535)
These exceptions are thrown when writing the output.
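For reference: the cast fails because values is the whole Iterable handed to reduce, not a single Text, which is exactly what exception 1 says; the LeaseExpiredException is most likely just fallout from the failed reduce attempt, whose temporary output file had already been cleaned up. A minimal corrected reducer, doing the same thing as the RemoveDuplicateReducer in wulinshishen's post above (the class name here is only illustrative):

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Writes exactly one record per key instead of casting the Iterable itself.
class KeepFirstReducer extends Reducer<Text, Text, NullWritable, Text> {
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		for (Text value : values) {
			context.write(NullWritable.get(), value);
			break;
		}
	}
}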
黄色沙琪玛 2014-07-18
Quoting liqi_wj's reply in post #2:
Wouldn't this be simpler to write in Hive?
The file is very large, so Hive isn't a good fit. Deduplication is only the first step; there are many more steps afterwards: splitting by region, splitting by gender, and so on.
黄色沙琪玛 2014-07-18
Quoting liqi_wj's reply in post #1:
Are you sure the expected result you gave is correct?
The result I want is to check by CtfTp whether an ID-card number is duplicated; if it is, keep the first record and delete the others.
wulinshishen 2014-07-18
Map emits <CtfTp, current record line>; Reduce picks one record per key to output.
liqi_wj 2014-07-18
If your input comes from more than one file, or there is more than one map task, you can't really tell which record counts as the "first" one anyway, can you?
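One way to pin down "first" in that case is to carry the map input offset along with the record and let the reducer keep the line with the smallest offset. A rough sketch, assuming a single input file (byte offsets then give a global order; with several input files you would also need a file identifier), with illustrative class names; it would be wired into a Job the same way as the drivers shown above (Text map output key/value, NullWritable/Text output):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Mapper: key = CtfId column, value = "byteOffset<TAB>original line".
class FirstRecordMapper extends Mapper<LongWritable, Text, Text, Text> {
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		if (line.isEmpty() || line.startsWith("Name,")) {
			return; // skip blank lines and the header
		}
		String[] fields = line.split(",", -1);
		if (fields.length < 5) {
			return;
		}
		String ctfId = fields[4]; // CtfId is the 5th column in the stated format
		context.write(new Text(ctfId), new Text(key.get() + "\t" + line));
	}
}

// Reducer: among all lines sharing a CtfId, keep the one with the smallest offset.
class FirstRecordReducer extends Reducer<Text, Text, NullWritable, Text> {
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		long bestOffset = Long.MAX_VALUE;
		String bestLine = null;
		for (Text value : values) {
			String[] parts = value.toString().split("\t", 2);
			long offset = Long.parseLong(parts[0]);
			if (offset < bestOffset) {
				bestOffset = offset;
				bestLine = parts[1];
			}
		}
		if (bestLine != null) {
			context.write(NullWritable.get(), new Text(bestLine));
		}
	}
}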
liqi_wj 2014-07-18
Wouldn't this be simpler to write in Hive?
liqi_wj 2014-07-18
Are you sure the expected result you gave is correct?
