About generating HFile files with MapReduce

没有女人缘的男人 2013-05-27 11:33:36
I process data with MapReduce and then load it into HBase. Because Put is slow, I switched to generating HFiles and bulk loading them instead, but when generating the HFiles I keep getting the following error:

java.io.IOException: Added a key not lexically larger than previous key=\x00\x02Mi\x0BsearchIndexuserId\x00\x00\x01>\xD5\xD6\xF3\xA3\x04, lastkey=\x00\x01w\x0BsearchIndexuserId\x00\x00\x01>\xD5\xD6\xF3\xA3\x04
at org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter.checkKey(AbstractHFileWriter.java:203)
at org.apache.hadoop.hbase.io.hfile.HFileWriterV2.append(HFileWriterV2.java:328)
at org.apache.hadoop.hbase.io.hfile.HFileWriterV2.append(HFileWriterV2.java:293)
at org.apache.hadoop.hbase.regionserver.StoreFile$Writer.append(StoreFile.java:962)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat$1.write(HFileOutputFormat.java:167)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat$1.write(HFileOutputFormat.java:123)
at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:587)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at com.ciwong.test.BuildTest$BuildIndexAfterReduce.reduce(BuildTest.java:110)
at com.ciwong.test.BuildTest$BuildIndexAfterReduce.reduce(BuildTest.java:1)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:649)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:417)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
at org.apache.hadoop.mapred.Child.main(Child.java:249)

My driver code:

Configuration conf2 = new Configuration();
conf2.set("mapred.input.dir", args[2]);
conf2.set("mapred.output.dir", args[3]);
Job jobAfter = new Job(conf2);
jobAfter.setJarByClass(BuildTest.class);
jobAfter.setMapperClass(BuildIndexAfterMap.class);
jobAfter.setReducerClass(BuildIndexAfterReduce.class);
jobAfter.setNumReduceTasks(1);
jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
jobAfter.setMapOutputValueClass(Text.class);
jobAfter.setSortComparatorClass(ByteArrayComparator.class);
jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
jobAfter.setOutputFormatClass(HFileOutputFormat.class);
jobAfter.setInputFormatClass(TextInputFormat.class);
HTable table=new HTable(conf,conf.get("tableName"));
HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
jobAfter.waitForCompletion(true);

Mapper code:

public static class BuildIndexAfterMap extends
        Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {

    @Override
    public void map(LongWritable key, Text value, Context output)
            throws IOException, InterruptedException {
        String outKey = value.toString().split("\t")[0];
        String outValue = value.toString().split("\t")[1];
        if (!outKey.trim().isEmpty()) {
            output.write(new ImmutableBytesWritable(Bytes.toBytes(outKey)), new Text(outValue));
        }
    }
}

Reducer code:

public static class BuildIndexAfterReduce extends
        Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {

    @Override
    public void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context output)
            throws IOException, InterruptedException {
        String outValue = "";
        Iterator<Text> iterator = values.iterator();
        while (iterator.hasNext()) {
            String v = iterator.next().toString();
            outValue = v + Constants.FIELD_SEPERATOR + outValue;
        }
        KeyValue column = new KeyValue(key.get(),
                Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
                Bytes.toBytes(outValue));
        output.write(key, column);
    }
}


Could anyone please take a look and tell me what's going wrong? Much appreciated.
15 replies
blackproof 2014-06-03
jobAfter.setMapOutputValueClass(Text.class); — this line is wrong. The map output value class can only be Put or KeyValue.
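A minimal sketch of the mapper signature that advice implies, assuming the 0.94-era HBase API used elsewhere in this thread; the class name and column names are illustrative, not from the original post:

    // needs: org.apache.hadoop.hbase.KeyValue, org.apache.hadoop.hbase.io.ImmutableBytesWritable,
    //        org.apache.hadoop.hbase.util.Bytes, org.apache.hadoop.io.LongWritable,
    //        org.apache.hadoop.io.Text, org.apache.hadoop.mapreduce.Mapper, java.io.IOException
    public static class KeyValueEmittingMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 2 || fields[0].trim().isEmpty()) {
                return; // skip malformed lines
            }
            byte[] row = Bytes.toBytes(fields[0]);
            // emit a KeyValue directly, so the map output value class matches
            // what HFileOutputFormat.configureIncrementalLoad() expects
            KeyValue kv = new KeyValue(row, Bytes.toBytes("searchIndex"),
                    Bytes.toBytes("userId"), Bytes.toBytes(fields[1]));
            context.write(new ImmutableBytesWritable(row), kv);
        }
    }

With a KeyValue (or Put) map output value, configureIncrementalLoad() can, as far as I recall of that API, install KeyValueSortReducer (or PutSortReducer) and the TotalOrderPartitioner itself, so no custom reducer or comparators are needed for ordering.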
「已注销」 2014-03-21
Attaching my code:

package com.cs.HFileGenerator;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TestHFileToHBase {

    public static class TestHFileToHBaseMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        // private Text t = new Text();

        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split(",", -1);
            byte[] subject = Bytes.toBytes(values[0]);   // subject
            byte[] predicate = Bytes.toBytes(values[1]); // predicate
            byte[] object = Bytes.toBytes(values[2]);    // object
            Put tmpPut = new Put(subject);
            ImmutableBytesWritable rowKey = new ImmutableBytesWritable(subject);
            KeyValue kvProtocol = new KeyValue(subject, predicate, object, "1".getBytes());
            context.write(rowKey, kvProtocol);
            // KeyValue kvSrcip = new KeyValue(row, "SRCIP".getBytes(),
            //         "SRCIP".getBytes(), values[1].getBytes());
            // context.write(k, kvSrcip);
            // HFileOutputFormat.getRecordWriter
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "TestHFileToHBase");
        job.setJarByClass(TestHFileToHBase.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);
        job.setMapperClass(TestHFileToHBaseMapper.class);
        job.setReducerClass(KeyValueSortReducer.class);
        // job.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.HFileOutputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        // job.setNumReduceTasks(4);
        // job.setPartitionerClass(org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner.class);
        // HBaseAdmin admin = new HBaseAdmin(conf);
        HTable table = new HTable(conf, "AssistantProfessor_S_PO");
        HFileOutputFormat.configureIncrementalLoad(job, table);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
「已注销」 2014-03-21
Generating HFiles for HBase keeps failing. What is the reason? Please help, this is urgent!

14/03/21 20:17:31 WARN mapred.LocalJobRunner: job_local_0001
java.lang.IllegalArgumentException: Can't read partitions file
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:116)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:62)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:117)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:677)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:756)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
Caused by: java.io.FileNotFoundException: File _partition.lst does not exist.
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:397)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:251)
at org.apache.hadoop.fs.FileSystem.getLength(FileSystem.java:796)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1475)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1470)
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.readPartitions(TotalOrderPartitioner.java:301)
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:88)
... 6 more
14/03/21 20:17:32 INFO mapred.JobClient: Job complete: job_local_0001
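Background, not a diagnosis of the post above: TotalOrderPartitioner reads its split points from a partitions SequenceFile. HFileOutputFormat.configureIncrementalLoad() normally writes that file from the table's region boundaries and registers it with the job, so "Can't read partitions file" typically means the runtime (here LocalJobRunner) cannot see that file. A hedged sketch of pointing the partitioner at an explicit partitions file; the path is made up for illustration:

    // Assumption: partitions.lst is an existing SequenceFile of split keys on the
    // job's default FileSystem; the path below is purely illustrative.
    Path partitionsFile = new Path("/tmp/hbase-bulkload/partitions.lst");
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsFile);
    job.setPartitionerClass(TotalOrderPartitioner.class);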
撸大湿 2013-05-27
The KeyValues in your HFile are not sorted. HFile bulk loading requires them to be sorted.
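For reference, sorting within each reduce call is essentially what the stock org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer does; a minimal sketch of the idea (mine, assuming a mapper that already emits KeyValue, as suggested above):

    // needs java.util.TreeSet in addition to the HBase/Hadoop imports already used here
    public static class SortingKeyValueReducer extends
            Reducer<ImmutableBytesWritable, KeyValue, ImmutableBytesWritable, KeyValue> {
        @Override
        protected void reduce(ImmutableBytesWritable row, Iterable<KeyValue> kvs,
                Context context) throws IOException, InterruptedException {
            // collect the row's cells, then emit them in ascending KeyValue order
            // so HFileWriter never sees a key smaller than the previous one
            TreeSet<KeyValue> sorted = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
            for (KeyValue kv : kvs) {
                sorted.add(kv.clone()); // the iterator reuses objects, so copy
            }
            for (KeyValue kv : sorted) {
                context.write(row, kv);
            }
        }
    }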
I see — it's probably that my value contains too much data. Building the string with plain String concatenation is fine for small amounts of data, but it breaks down once the volume gets large.
Thanks a lot for the pointers, moderator!
Quoting reply #8 by tntzbzc:
Try switching to StringBuilder:
public static class BuildIndexAfterReduce
			extends
			Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
		@Override
		public void reduce(ImmutableBytesWritable key, Iterable<Text> values,
				Context output) throws IOException, InterruptedException {
			StringBuilder outValue = new StringBuilder("");
			Iterator<Text> iterator = values.iterator();
			while (iterator.hasNext()) {
				String v = iterator.next().toString();
				outValue.append(v + Constants.FIELD_SEPERATOR);
			}
			KeyValue column = new KeyValue(key.get(),
					Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
					Bytes.toBytes(outValue.toString()));
			output.write(key, column);
		}
	}
Wow, the difference really is huge: it finished in 1 minute 25 seconds, whereas before it ran for several hours. In everyday use I've never felt such a big gap between String and StringBuilder, and I thought String performance had been optimized quite well since JDK 1.6, yet the gap is still this large. I do know that String literals live in the constant pool, but why is the difference so dramatic when running under MapReduce? What is the underlying principle here?
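A hedged explanation, not from the thread: each `outValue = v + Constants.FIELD_SEPERATOR + outValue` allocates a brand-new String and copies every character accumulated so far, so concatenating n values per key costs O(n^2) character copies plus the garbage they create, while StringBuilder.append grows a buffer in place for amortized O(n). With many values per row key in a reduce call, the quadratic version dominates the task time. A minimal, self-contained sketch of the two strategies:

    // Toy comparison of the two concatenation strategies used in this thread.
    // Absolute numbers vary by JVM; the point is quadratic vs. linear growth.
    public class ConcatDemo {
        public static void main(String[] args) {
            int n = 100000;

            long t0 = System.currentTimeMillis();
            String s = "";
            for (int i = 0; i < n; i++) {
                s = "value" + "," + s;          // copies the whole accumulated string: O(n^2) overall
            }
            long t1 = System.currentTimeMillis();

            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < n; i++) {
                sb.append("value").append(","); // appends in place: amortized O(n) overall
            }
            String s2 = sb.toString();
            long t2 = System.currentTimeMillis();

            System.out.println("String concat: " + (t1 - t0) + " ms, length " + s.length());
            System.out.println("StringBuilder: " + (t2 - t1) + " ms, length " + s2.length());
        }
    }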
撸大湿 2013-05-27
Try switching to StringBuilder:
public static class BuildIndexAfterReduce
			extends
			Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
		@Override
		public void reduce(ImmutableBytesWritable key, Iterable<Text> values,
				Context output) throws IOException, InterruptedException {
			StringBuilder outValue = new StringBuilder("");
			Iterator<Text> iterator = values.iterator();
			while (iterator.hasNext()) {
				String v = iterator.next().toString();
				outValue.append(v + Constants.FIELD_SEPERATOR);
			}
			KeyValue column = new KeyValue(key.get(),
					Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
					Bytes.toBytes(outValue.toString()));
			output.write(key, column);
		}
	}
My cluster setup is three HBase nodes and three Hadoop nodes. Judging from the JobTracker logs, it's not the load into HBase that is slow — it's the writing of the HFiles, and that part is extremely slow.
Quoting reply #5 by tntzbzc:
I suspect the ByteArrayComparator.class passed to setSortComparatorClass is the problem — try removing it first.
My keys used to be strings of Chinese characters; I have now switched to the string's hashCode. For small input (a little over 100 KB) the HFile is generated without problems, but on slightly larger data (1.7 GB) it gets very slow — after more than an hour the reduce phase is only at 70%.


public static class BuildIndexAfterMap extends
			Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {

		@Override
		public void map(LongWritable key, Text value, Context output)
				throws IOException, InterruptedException {
			String outKey = value.toString().split("\t")[0].trim();
			String outValue = value.toString().split("\t")[1].trim();
			if (!outKey.trim().isEmpty()) {
				output.write(new ImmutableBytesWritable(Bytes.toBytes(outKey.hashCode())),
						new Text(outValue));
			}
		}
	}

public static class BuildIndexAfterReduce
			extends
			Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
		@Override
		public void reduce(ImmutableBytesWritable key, Iterable<Text> values,
				Context output) throws IOException, InterruptedException {
			String outValue = "";
			Iterator<Text> iterator = values.iterator();
			while (iterator.hasNext()) {
				String v = iterator.next().toString();
				outValue = v + Constants.FIELD_SEPERATOR + outValue;
			}
			KeyValue column = new KeyValue(key.get(),
					Bytes.toBytes("searchIndex"), Bytes.toBytes("userId"),
					Bytes.toBytes(outValue));
			output.write(key, column);
		}
	}


Configuration conf2 = new Configuration();
			conf2.set("mapred.input.dir", args[2]);
			conf2.set("mapred.output.dir", args[3]);
			Job jobAfter = new Job(conf2);
			jobAfter.setJarByClass(BuildTest.class);
			jobAfter.setMapperClass(BuildIndexAfterMap.class);
			jobAfter.setReducerClass(BuildIndexAfterReduce.class);
			jobAfter.setNumReduceTasks(10);
			jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
			jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
			jobAfter.setMapOutputValueClass(Text.class);
//			jobAfter.setSortComparatorClass(ByteArrayComparator.class);
//			jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
			FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
			jobAfter.setOutputFormatClass(HFileOutputFormat.class);
			jobAfter.setInputFormatClass(TextInputFormat.class);
			HTable table = new HTable(conf, conf.get("tableName"));
			HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
			jobAfter.waitForCompletion(true);


撸大湿 2013-05-27
I suspect the ByteArrayComparator.class passed to setSortComparatorClass is the problem — try removing it first.
Quoting reply #3 by tntzbzc:
You're writing with a single reducer. Drop setSortComparatorClass and setGroupingComparatorClass and try again:

                        Configuration conf2 = new Configuration();
                        conf2.set("mapred.input.dir", args[2]);
                        conf2.set("mapred.output.dir", args[3]);
                        Job jobAfter = new Job(conf2);
                        jobAfter.setJarByClass(BuildTest.class);
                        jobAfter.setMapperClass(BuildIndexAfterMap.class);
                        jobAfter.setReducerClass(BuildIndexAfterReduce.class);
                        jobAfter.setNumReduceTasks(1);
                        jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
                        jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
                        jobAfter.setMapOutputValueClass(Text.class);
                        //jobAfter.setSortComparatorClass(ByteArrayComparator.class);
                        //jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
                        FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
                        jobAfter.setOutputFormatClass(HFileOutputFormat.class);
                        jobAfter.setInputFormatClass(TextInputFormat.class);
                        HTable table=new HTable(conf,conf.get("tableName"));
                        HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
                        jobAfter.waitForCompletion(true);
But if I remove them, doesn't that mean I'm no longer specifying any sort order? It would just fall back to the default ordering?
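A hedged aside, not from the thread: the default is not "no order". ImmutableBytesWritable is a WritableComparable whose natural ordering is an unsigned, byte-by-byte lexicographic comparison of the key bytes, which is the order HFiles expect; the suspicion in this thread is that the custom ByteArrayComparator is what disturbs it. A minimal sketch of that default comparison, with made-up row keys:

    // Bytes.compareTo gives the same unsigned lexicographic ordering that
    // ImmutableBytesWritable's natural compareTo produces.
    byte[] rowA = Bytes.toBytes("user_0001");
    byte[] rowB = Bytes.toBytes("user_0002");
    ImmutableBytesWritable a = new ImmutableBytesWritable(rowA);
    ImmutableBytesWritable b = new ImmutableBytesWritable(rowB);
    System.out.println(a.compareTo(b));              // negative: rowA sorts first
    System.out.println(Bytes.compareTo(rowA, rowB)); // same sign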
撸大湿 2013-05-27
You're writing with a single reducer. Drop setSortComparatorClass and setGroupingComparatorClass and try again:

                        Configuration conf2 = new Configuration();
                        conf2.set("mapred.input.dir", args[2]);
                        conf2.set("mapred.output.dir", args[3]);
                        Job jobAfter = new Job(conf2);
                        jobAfter.setJarByClass(BuildTest.class);
                        jobAfter.setMapperClass(BuildIndexAfterMap.class);
                        jobAfter.setReducerClass(BuildIndexAfterReduce.class);
                        jobAfter.setNumReduceTasks(1);
                        jobAfter.setPartitionerClass(SimpleTotalOrderPartitioner.class);
                        jobAfter.setMapOutputKeyClass(ImmutableBytesWritable.class);
                        jobAfter.setMapOutputValueClass(Text.class);
                        //jobAfter.setSortComparatorClass(ByteArrayComparator.class);
                        //jobAfter.setGroupingComparatorClass(ByteArrayComparator.class);
                        FileOutputFormat.setOutputPath(jobAfter, new Path(args[3]));
                        jobAfter.setOutputFormatClass(HFileOutputFormat.class);
                        jobAfter.setInputFormatClass(TextInputFormat.class);
                        HTable table=new HTable(conf,conf.get("tableName"));
                        HFileOutputFormat.configureIncrementalLoad(jobAfter, table);
                        jobAfter.waitForCompletion(true);
Quoting reply #1 by tntzbzc:
The KeyValues in your HFile are not sorted. HFile bulk loading requires them to be sorted.
I already specified a sort comparator class — why is the ordering still wrong?
