About generating HFile files with MapReduce

没有女人缘的男人 2013-05-27 11:33:36
I process data with MapReduce and then load it into HBase. Because Put is slow, I switched to generating HFiles and bulk loading them instead, but when generating the HFiles I keep getting the following error:

java.io.IOException: Added a key not lexically larger than previous key=\x00\x02Mi\x0BsearchIndexuserId\x00\x00\x01>\xD5\xD6\xF3\xA3\x04, lastkey=\x00\x01w\x0BsearchIndexuserId\x00\x00\x01>\xD5\xD6\xF3\xA3\x04
at org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter.checkKey(AbstractHFileWriter.java:203)
at org.apache.hadoop.hbase.io.hfile.HFileWriterV2.append(HFileWriterV2.java:328)
at org.apache.hadoop.hbase.io.hfile.HFileWriterV2.append(HFileWriterV2.java:293)
at org.apache.hadoop.hbase.regionserver.StoreFile$Writer.append(StoreFile.java:962)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat$1.write(HFileOutputFormat.java:167)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat$1.write(HFileOutputFormat.java:123)
at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:587)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at com.ciwong.test.BuildTest$BuildIndexAfterReduce.reduce(BuildTest.java:110)
at com.ciwong.test.BuildTest$BuildIndexAfterReduce.reduce(BuildTest.java:1)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:649)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:417)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
at org.apache.hadoop.mapred.Child.main(Child.java:249)

My driver code:

Configuration conf2 = new Configuration();
conf2.set("mapred.input.dir", args[2]);
conf2.set("mapred.output.dir", args[3]);
Job jobAfter = new Job(conf2);
jobAfter.setJarByClass(BuildTest.class);
jobAfter.setMapperClass(BuildIndexAfterMap.class);
jobAfter.setReducerClass(BuildIndexAfterReduce.class);
jobAfter.setNumReduceTasks(1);
jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
jobAfter.setMapOutputValueClass(Text.class);
jobAfter.setSortComparatorClass(ByteArrayComparator.class);
jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
jobAfter.setOutputFormatClass(HFileOutputFormat.class);
jobAfter.setInputFormatClass(TextInputFormat.class);
HTable table=new HTable(conf,conf.get("tableName"));
HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
jobAfter.waitForCompletion(true);

Mapper code:

public static class BuildIndexAfterMap extends
        Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {

    @Override
    public void map(LongWritable key, Text value, Context output)
            throws IOException, InterruptedException {
        String outKey = value.toString().split("\t")[0];
        String outValue = value.toString().split("\t")[1];
        if (!outKey.trim().isEmpty()) {
            output.write(new ImmutableBytesWritable(Bytes.toBytes(outKey)), new Text(outValue));
        }
    }
}

Reducer code:

public static class BuildIndexAfterReduce extends
        Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {

    @Override
    public void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context output)
            throws IOException, InterruptedException {
        String outValue = "";
        Iterator<Text> iterator = values.iterator();
        while (iterator.hasNext()) {
            String v = iterator.next().toString();
            outValue = v + Constants.FIELD_SEPERATOR + outValue;
        }
        KeyValue column = new KeyValue(key.get(),
                Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
                Bytes.toBytes(outValue));
        output.write(key, column);
    }
}


Could anyone please take a look and tell me what's going wrong? Much appreciated.
15 replies
blackproof 2014-06-03
jobAfter.setMapOutputValueClass(Text.class); — this line is wrong. The map output value class can only be Put or KeyValue.
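A minimal sketch of the mapper signature that advice implies, assuming the 0.94-era HBase API used elsewhere in this thread; the class name and column names are illustrative, not from the original post:

    // needs: org.apache.hadoop.hbase.KeyValue, org.apache.hadoop.hbase.io.ImmutableBytesWritable,
    //        org.apache.hadoop.hbase.util.Bytes, org.apache.hadoop.io.LongWritable,
    //        org.apache.hadoop.io.Text, org.apache.hadoop.mapreduce.Mapper, java.io.IOException
    public static class KeyValueEmittingMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 2 || fields[0].trim().isEmpty()) {
                return; // skip malformed lines
            }
            byte[] row = Bytes.toBytes(fields[0]);
            // emit a KeyValue directly, so the map output value class matches
            // what HFileOutputFormat.configureIncrementalLoad() expects
            KeyValue kv = new KeyValue(row, Bytes.toBytes("searchIndex"),
                    Bytes.toBytes("userId"), Bytes.toBytes(fields[1]));
            context.write(new ImmutableBytesWritable(row), kv);
        }
    }

With a KeyValue (or Put) map output value, configureIncrementalLoad() can, as far as I recall of that API, install KeyValueSortReducer (or PutSortReducer) and the TotalOrderPartitioner itself, so no custom reducer or comparators are needed for ordering.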
「已注销」 2014-03-21
Attaching my code:

package com.cs.HFileGenerator;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TestHFileToHBase {

    public static class TestHFileToHBaseMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        // private Text t = new Text();

        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split(",", -1);
            byte[] subject = Bytes.toBytes(values[0]);   // subject
            byte[] predicate = Bytes.toBytes(values[1]); // predicate
            byte[] object = Bytes.toBytes(values[2]);    // object
            Put tmpPut = new Put(subject);
            ImmutableBytesWritable rowKey = new ImmutableBytesWritable(subject);
            KeyValue kvProtocol = new KeyValue(subject, predicate, object, "1".getBytes());
            context.write(rowKey, kvProtocol);
            // KeyValue kvSrcip = new KeyValue(row, "SRCIP".getBytes(),
            //         "SRCIP".getBytes(), values[1].getBytes());
            // context.write(k, kvSrcip);
            // HFileOutputFormat.getRecordWriter
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "TestHFileToHBase");
        job.setJarByClass(TestHFileToHBase.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);
        job.setMapperClass(TestHFileToHBaseMapper.class);
        job.setReducerClass(KeyValueSortReducer.class);
        // job.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.HFileOutputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        // job.setNumReduceTasks(4);
        // job.setPartitionerClass(org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner.class);
        // HBaseAdmin admin = new HBaseAdmin(conf);
        HTable table = new HTable(conf, "AssistantProfessor_S_PO");
        HFileOutputFormat.configureIncrementalLoad(job, table);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
「已注销」 2014-03-21
Generating HFiles for HBase keeps failing. What is the reason? Please help, this is urgent!

14/03/21 20:17:31 WARN mapred.LocalJobRunner: job_local_0001
java.lang.IllegalArgumentException: Can't read partitions file
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:116)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:62)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:117)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:677)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:756)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
Caused by: java.io.FileNotFoundException: File _partition.lst does not exist.
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:397)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:251)
at org.apache.hadoop.fs.FileSystem.getLength(FileSystem.java:796)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1475)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1470)
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.readPartitions(TotalOrderPartitioner.java:301)
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:88)
... 6 more
14/03/21 20:17:32 INFO mapred.JobClient: Job complete: job_local_0001
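Background, not a diagnosis of the post above: TotalOrderPartitioner reads its split points from a partitions SequenceFile. HFileOutputFormat.configureIncrementalLoad() normally writes that file from the table's region boundaries and registers it with the job, so "Can't read partitions file" typically means the runtime (here LocalJobRunner) cannot see that file. A hedged sketch of pointing the partitioner at an explicit partitions file; the path is made up for illustration:

    // Assumption: partitions.lst is an existing SequenceFile of split keys on the
    // job's default FileSystem; the path below is purely illustrative.
    Path partitionsFile = new Path("/tmp/hbase-bulkload/partitions.lst");
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsFile);
    job.setPartitionerClass(TotalOrderPartitioner.class);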
撸大湿 2013-05-27
The KeyValues in your HFile are not sorted. HFile bulk loading requires them to be sorted.
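For reference, sorting within each reduce call is essentially what the stock org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer does; a minimal sketch of the idea (mine, assuming a mapper that already emits KeyValue, as suggested above):

    // needs java.util.TreeSet in addition to the HBase/Hadoop imports already used here
    public static class SortingKeyValueReducer extends
            Reducer<ImmutableBytesWritable, KeyValue, ImmutableBytesWritable, KeyValue> {
        @Override
        protected void reduce(ImmutableBytesWritable row, Iterable<KeyValue> kvs,
                Context context) throws IOException, InterruptedException {
            // collect the row's cells, then emit them in ascending KeyValue order
            // so HFileWriter never sees a key smaller than the previous one
            TreeSet<KeyValue> sorted = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
            for (KeyValue kv : kvs) {
                sorted.add(kv.clone()); // the iterator reuses objects, so copy
            }
            for (KeyValue kv : sorted) {
                context.write(row, kv);
            }
        }
    }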
I see — it's probably that my value contains too much data. Building the string with plain String concatenation is fine for small amounts of data, but it breaks down once the volume gets large.
Thanks a lot for the pointers, moderator!
Quoting reply #8 by tntzbzc:
Try switching to StringBuilder:
public static class BuildIndexAfterReduce
			extends
			Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
		@Override
		public void reduce(ImmutableBytesWritable key, Iterable<Text> values,
				Context output) throws IOException, InterruptedException {
			StringBuilder outValue = new StringBuilder("");
			Iterator<Text> iterator = values.iterator();
			while (iterator.hasNext()) {
				String v = iterator.next().toString();
				outValue.append(v + Constants.FIELD_SEPERATOR);
			}
			KeyValue column = new KeyValue(key.get(),
					Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
					Bytes.toBytes(outValue.toString()));
			output.write(key, column);
		}
	}
Wow, the difference really is huge: it finished in 1 minute 25 seconds, whereas before it ran for several hours. In everyday use I've never felt such a big gap between String and StringBuilder, and I thought String performance had been optimized quite well since JDK 1.6, yet the gap is still this large. I do know that String literals live in the constant pool, but why is the difference so dramatic when running under MapReduce? What is the underlying principle here?
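A hedged explanation, not from the thread: each `outValue = v + Constants.FIELD_SEPERATOR + outValue` allocates a brand-new String and copies every character accumulated so far, so concatenating n values per key costs O(n^2) character copies plus the garbage they create, while StringBuilder.append grows a buffer in place for amortized O(n). With many values per row key in a reduce call, the quadratic version dominates the task time. A minimal, self-contained sketch of the two strategies:

    // Toy comparison of the two concatenation strategies used in this thread.
    // Absolute numbers vary by JVM; the point is quadratic vs. linear growth.
    public class ConcatDemo {
        public static void main(String[] args) {
            int n = 100000;

            long t0 = System.currentTimeMillis();
            String s = "";
            for (int i = 0; i < n; i++) {
                s = "value" + "," + s;          // copies the whole accumulated string: O(n^2) overall
            }
            long t1 = System.currentTimeMillis();

            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < n; i++) {
                sb.append("value").append(","); // appends in place: amortized O(n) overall
            }
            String s2 = sb.toString();
            long t2 = System.currentTimeMillis();

            System.out.println("String concat: " + (t1 - t0) + " ms, length " + s.length());
            System.out.println("StringBuilder: " + (t2 - t1) + " ms, length " + s2.length());
        }
    }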
撸大湿 2013-05-27
Try switching to StringBuilder:
public static class BuildIndexAfterReduce
			extends
			Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
		@Override
		public void reduce(ImmutableBytesWritable key, Iterable<Text> values,
				Context output) throws IOException, InterruptedException {
			StringBuilder outValue = new StringBuilder("");
			Iterator<Text> iterator = values.iterator();
			while (iterator.hasNext()) {
				String v = iterator.next().toString();
				outValue.append(v + Constants.FIELD_SEPERATOR);
			}
			KeyValue column = new KeyValue(key.get(),
					Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
					Bytes.toBytes(outValue.toString()));
			output.write(key, column);
		}
	}
My cluster setup is three HBase nodes and three Hadoop nodes. Judging from the JobTracker logs, it's not the load into HBase that is slow — it's the writing of the HFiles, and that part is extremely slow.
Quoting reply #5 by tntzbzc:
I suspect the ByteArrayComparator.class passed to setSortComparatorClass is the problem — try removing it first.
My keys used to be strings of Chinese characters; I have now switched to the string's hashCode. For small input (a little over 100 KB) the HFile is generated without problems, but on slightly larger data (1.7 GB) it gets very slow — after more than an hour the reduce phase is only at 70%.


public static class BuildIndexAfterMap extends
			Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {

		@Override
		public void map(LongWritable key, Text value, Context output)
				throws IOException, InterruptedException {
			String outKey = value.toString().split("\t")[0].trim();
			String outValue = value.toString().split("\t")[1].trim();
			if (!outKey.trim().isEmpty()) {
				output.write(new ImmutableBytesWritable(Bytes.toBytes(outKey.hashCode())),
						new Text(outValue));
			}
		}
	}

public static class BuildIndexAfterReduce
			extends
			Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
		@Override
		public void reduce(ImmutableBytesWritable key, Iterable<Text> values,
				Context output) throws IOException, InterruptedException {
			String outValue = "";
			Iterator<Text> iterator = values.iterator();
			while (iterator.hasNext()) {
				String v = iterator.next().toString();
				outValue = v + Constants.FIELD_SEPERATOR + outValue;
			}
			KeyValue column = new KeyValue(key.get(),
					Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
					Bytes.toBytes(outValue));
			output.write(key, column);
		}
	}


Configuration conf2 = new Configuration();
			conf2.set("mapred.input.dir", args[2]);
			conf2.set("mapred.output.dir", args[3]);
			Job jobAfter = new Job(conf2);
			jobAfter.setJarByClass(BuildTest.class);
			jobAfter.setMapperClass(BuildIndexAfterMap.class);
			jobAfter.setReducerClass(BuildIndexAfterReduce.class);
			jobAfter.setNumReduceTasks(10);
			jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
			jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
			jobAfter.setMapOutputValueClass(Text.class);
//			jobAfter.setSortComparatorClass(ByteArrayComparator.class);
//			jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
			FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
			jobAfter.setOutputFormatClass(HFileOutputFormat.class);
			jobAfter.setInputFormatClass(TextInputFormat.class);
			HTable table = new HTable(conf, conf.get("tableName"));
			HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
			jobAfter.waitForCompletion(true);


撸大湿 2013-05-27
I suspect the ByteArrayComparator.class passed to setSortComparatorClass is the problem — try removing it first.
Quoting reply #3 by tntzbzc:
You're writing with a single reducer. Drop setSortComparatorClass and setGroupingComparatorClass and try again:

                        Configuration conf2 = new Configuration();
                        conf2.set("mapred.input.dir", args[2]);
                        conf2.set("mapred.output.dir", args[3]);
                        Job jobAfter = new Job(conf2);
                        jobAfter.setJarByClass(BuildTest.class);
                        jobAfter.setMapperClass(BuildIndexAfterMap.class);
                        jobAfter.setReducerClass(BuildIndexAfterReduce.class);
                        jobAfter.setNumReduceTasks(1);
                        jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
                        jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
                        jobAfter.setMapOutputValueClass(Text.class);
                        //jobAfter.setSortComparatorClass(ByteArrayComparator.class);
                        //jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
                        FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
                        jobAfter.setOutputFormatClass(HFileOutputFormat.class);
                        jobAfter.setInputFormatClass(TextInputFormat.class);
                        HTable table=new HTable(conf,conf.get("tableName"));
                        HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
                        jobAfter.waitForCompletion(true);
But if I remove them, doesn't that mean I'm no longer specifying any sort order? It would just fall back to the default ordering?
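A hedged aside, not from the thread: the default is not "no order". ImmutableBytesWritable is a WritableComparable whose natural ordering is an unsigned, byte-by-byte lexicographic comparison of the key bytes, which is the order HFiles expect; the suspicion in this thread is that the custom ByteArrayComparator is what disturbs it. A minimal sketch of that default comparison, with made-up row keys:

    // Bytes.compareTo gives the same unsigned lexicographic ordering that
    // ImmutableBytesWritable's natural compareTo produces.
    byte[] rowA = Bytes.toBytes("user_0001");
    byte[] rowB = Bytes.toBytes("user_0002");
    ImmutableBytesWritable a = new ImmutableBytesWritable(rowA);
    ImmutableBytesWritable b = new ImmutableBytesWritable(rowB);
    System.out.println(a.compareTo(b));              // negative: rowA sorts first
    System.out.println(Bytes.compareTo(rowA, rowB)); // same sign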
撸大湿 2013-05-27
You're writing with a single reducer. Drop setSortComparatorClass and setGroupingComparatorClass and try again:

                        Configuration conf2 = new Configuration();
                        conf2.set("mapred.input.dir", args[2]);
                        conf2.set("mapred.output.dir", args[3]);
                        Job jobAfter = new Job(conf2);
                        jobAfter.setJarByClass(BuildTest.class);
                        jobAfter.setMapperClass(BuildIndexAfterMap.class);
                        jobAfter.setReducerClass(BuildIndexAfterReduce.class);
                        jobAfter.setNumReduceTasks(1);
                        jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
                        jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
                        jobAfter.setMapOutputValueClass(Text.class);
                        //jobAfter.setSortComparatorClass(ByteArrayComparator.class);
                        //jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
                        FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
                        jobAfter.setOutputFormatClass(HFileOutputFormat.class);
                        jobAfter.setInputFormatClass(TextInputFormat.class);
                        HTable table=new HTable(conf,conf.get("tableName"));
                        HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
                        jobAfter.waitForCompletion(true);
Quoting reply #1 by tntzbzc:
The KeyValues in your HFile are not sorted. HFile bulk loading requires them to be sorted.
I already specified a sort comparator class — why is the ordering still wrong?
