问题导读:
1.如何通过mr生成HFile
2.改进后的HFileOutputFormat有什么新增特性?
3.如何将生成的HFile入库到HBase?
一、MR生成HFile文件
package insert.tools.hfile;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TestHFileToHBase {
public static class TestHFileToHBaseMapper extends Mapper {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] values = value.toString().split("/t", 2);
byte[] row = Bytes.toBytes(values[0]);
ImmutableBytesWritable k = new ImmutableBytesWritable(row);
KeyValue kvProtocol = new KeyValue(row, "PROTOCOLID".getBytes(), "PROTOCOLID".getBytes(), values[1]
.getBytes());
context.write(k, kvProtocol);
// KeyValue kvSrcip = new KeyValue(row, "SRCIP".getBytes(),
// "SRCIP".getBytes(), values[1].getBytes());
// context.write(k, kvSrcip);
// HFileOutputFormat.getRecordWriter
}
}
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = HBaseConfiguration.create();
Job job = new Job(conf, "TestHFileToHBase");
job.setJarByClass(TestHFileToHBase.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(KeyValue.class);
job.setMapperClass(TestHFileToHBaseMapper.class);
job.setReducerClass(KeyValueSortReducer.class);
// job.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.HFileOutputFormat.class);
job.setOutputFormatClass(HFileOutputFormat.class);
// job.setNumReduceTasks(4);
// job.setPartitionerClass(org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner.class);
// HBaseAdmin admin = new HBaseAdmin(conf);
// HTable table = new HTable(conf, "hua");
HFileOutputFormat.configureIncrementalLoad(job, table);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
} 复制代码
二、改进后的HFileOutputFormat
源码中的HFileOutputFormat只适合一次生成一个列族的HFile,改进后的HFileOutputFormat适合同时为多个列族生成HFile文件。有add标签的是在源码上新增的代码,有revise标签的是在源码上修改的代码。参考:https://review.cloudera.org/r/12 ... 977#file17977line93
/**
* Copyright 2009 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package insert.tools.hfile;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.base.Preconditions;
/**
* Writes HFiles. Passed KeyValues must arrive in order. Currently, can only
* write files to a single column family at a time. Multiple column families
* requires coordinating keys cross family. Writes current time as the sequence
* id for the file. Sets the major compacted attribute on created hfiles.
*
* @see KeyValueSortReducer
*/
public class HFileOutputFormat extends
FileOutputFormat {
static Log LOG = LogFactory.getLog(HFileOutputFormat.class);
public RecordWriter getRecordWriter(
final TaskAttemptContext context) throws IOException,
InterruptedException {
// Get the path of the temporary output file
final Path outputPath = FileOutputFormat.getOutputPath(context);
final Path outputdir = new FileOutputCommitter(outputPath, context)
.getWorkPath();
Configuration conf = context.getConfiguration();
final FileSystem fs = outputdir.getFileSystem(conf);
// These configs. are from hbase-*.xml
// revise
// final long maxsize = conf.getLong("hbase.hregion.max.filesize",
// 268435456);
// final int blocksize = conf.getInt("hfile.min.blocksize.size", 65536);
final long maxsize = conf.getLong("hbase.hregion.max.filesize",
HConstants.DEFAULT_MAX_FILE_SIZE);
final int blocksize = conf.getInt("hfile.min.blocksize.size",
HFile.DEFAULT_BLOCKSIZE);
// -revise
// Invented config. Add to hbase-*.xml if other than default
// compression.
final String compression = conf.get("hfile.compression",
Compression.Algorithm.NONE.getName());
return new RecordWriter() {
// Map of families to writers and how much has been output on the
// writer.
private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
Bytes.BYTES_COMPARATOR);
private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
private final byte[] now = Bytes
.toBytes(System.currentTimeMillis());
// add
private boolean rollRequested = false;
// -add
public void write(ImmutableBytesWritable row, KeyValue kv)
throws IOException {
// add
// null input == user explicitly wants to flush
if (row == null && kv == null) {
rollWriters();
return;
}
byte[] rowKey = kv.getRow();
// -add
long length = kv.getLength();
byte[] family = kv.getFamily();
WriterLength wl = this.writers.get(family);
// revise
// if (wl == null
// || ((length + wl.written) >= maxsize)
// && Bytes.compareTo(this.previousRow, 0,
// this.previousRow.length, kv.getBuffer(), kv
// .getRowOffset(), kv.getRowLength()) != 0) {
// // Get a new writer.
// Path basedir = new Path(outputdir, Bytes.toString(family));
// if (wl == null) {
// wl = new WriterLength();
// this.writers.put(family, wl);
// if (this.writers.size() > 1)
// throw new IOException("One family only");
// // If wl == null, first file in family. Ensure family
// // dir exits.
// if (!fs.exists(basedir))
// fs.mkdirs(basedir);
// }
// wl.writer = getNewWriter(wl.writer, basedir);
// LOG
// .info("Writer="
// + wl.writer.getPath()
// + ((wl.written == 0) ? "" : ", wrote="
// + wl.written));
// wl.written = 0;
// }
// If this is a new column family, verify that the directory
// exists
if (wl == null) {
fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
}
// If any of the HFiles for the column families has reached
// maxsize, we need to roll all the writers
if (wl != null && wl.written + length >= maxsize) {
this.rollRequested = true;
}
// This can only happen once a row is finished though
if (rollRequested
&& Bytes.compareTo(this.previousRow, rowKey) != 0) {
rollWriters();
}
// create a new HLog writer, if necessary
if (wl == null || wl.writer == null) {
wl = getNewWriter(family);
}
// we now have the proper HLog writer. full steam ahead
// -revise
kv.updateLatestStamp(this.now);
wl.writer.append(kv);
wl.written += length;
// Copy the row so we know when a row transition.
// revise
// this.previousRow = kv.getRow();
this.previousRow = rowKey;
// -revise
}
// revise
// /*
// * Create a new HFile.Writer. Close current if there is one.
// *
// * @param writer
// *
// * @param familydir
// *
// * @return A new HFile.Writer.
// *
// * @throws IOException
// */
// private HFile.Writer getNewWriter(final HFile.Writer writer,
// final Path familydir) throws IOException {
// close(writer);
// return new HFile.Writer(fs, StoreFile.getUniqueFile(fs,
// familydir), blocksize, compression,
// KeyValue.KEY_COMPARATOR);
// }
private void rollWriters() throws IOException {
for (WriterLength wl : this.writers.values()) {
if (wl.writer != null) {
LOG.info("Writer="
+ wl.writer.getPath()
+ ((wl.written == 0) ? "" : ", wrote="
+ wl.written));
close(wl.writer);
}
wl.writer = null;
wl.written = 0;
}
this.rollRequested = false;
}
/*
* Create a new HFile.Writer.
*
* @param family
*
* @return A WriterLength, containing a new HFile.Writer.
*
* @throws IOException
*/
private WriterLength getNewWriter(byte[] family) throws IOException {
WriterLength wl = new WriterLength();
Path familydir = new Path(outputdir, Bytes.toString(family));
wl.writer = new HFile.Writer(fs, StoreFile.getUniqueFile(fs,
familydir), blocksize, compression,
KeyValue.KEY_COMPARATOR);
this.writers.put(family, wl);
return wl;
}
// -revise
private void close(final HFile.Writer w) throws IOException {
if (w != null) {
w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes
.toBytes(System.currentTimeMillis()));
w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY, Bytes
.toBytes(context.getTaskAttemptID().toString()));
w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes
.toBytes(true));
w.close();
}
}
// revise
// public void close(TaskAttemptContext c) throws IOException,
// InterruptedException {
// for (Map.Entry e : this.writers
// .entrySet()) {
// close(e.getValue().writer);
// }
// }
public void close(TaskAttemptContext c) throws IOException,
InterruptedException {
for (WriterLength wl : this.writers.values()) {
close(wl.writer);
}
}
// -revise
};
}
/*
* Data structure to hold a Writer and amount of data written on it.
*/
static class WriterLength {
long written = 0;
HFile.Writer writer = null;
}
/**
* Return the start keys of all of the regions in this table, as a list of
* ImmutableBytesWritable.
*/
private static List getRegionStartKeys(HTable table)
throws IOException {
byte[][] byteKeys = table.getStartKeys();
ArrayList ret = new ArrayList(
byteKeys.length);
for (byte[] byteKey : byteKeys) {
ret.add(new ImmutableBytesWritable(byteKey));
}
return ret;
}
/**
* Write out a SequenceFile that can be read by TotalOrderPartitioner that
* contains the split points in startKeys.
*
* @param partitionsPath
* output path for SequenceFile
* @param startKeys
* the region start keys
*/
private static void writePartitions(Configuration conf,
Path partitionsPath, List startKeys)
throws IOException {
Preconditions.checkArgument(!startKeys.isEmpty(), "No regions passed");
// We're generating a list of split points, and we don't ever
// have keys < the first region (which has an empty start key)
// so we need to remove it. Otherwise we would end up with an
// empty reducer with index 0
TreeSet sorted = new TreeSet(
startKeys);
ImmutableBytesWritable first = sorted.first();
Preconditions
.checkArgument(
first.equals(HConstants.EMPTY_BYTE_ARRAY),
"First region of table should have empty start key. Instead has: %s",
Bytes.toStringBinary(first.get()));
sorted.remove(first);
// Write the actual file
FileSystem fs = partitionsPath.getFileSystem(conf);
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
partitionsPath, ImmutableBytesWritable.class,
NullWritable.class);
try {
for (ImmutableBytesWritable startKey : sorted) {
writer.append(startKey, NullWritable.get());
}
} finally {
writer.close();
}
}
/**
* Configure a MapReduce Job to perform an incremental load into the given
* table. This
*
*
Inspects the table to configure a total order partitioner
*
Uploads the partitions file to the cluster and adds it to the
* DistributedCache
*
Sets the number of reduce tasks to match the current number of
* regions
*
Sets the output key/value class to match HFileOutputFormat's
* requirements
*
Sets the reducer up to perform the appropriate sorting (either
* KeyValueSortReducer or PutSortReducer)
*
* The user should be sure to set the map output value class to either
* KeyValue or Put before running this function.
*/
public static void configureIncrementalLoad(Job job, HTable table)
throws IOException {
Configuration conf = job.getConfiguration();
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(KeyValue.class);
job.setOutputFormatClass(HFileOutputFormat.class);
// Based on the configured map output class, set the correct reducer to
// properly
// sort the incoming values.
// TODO it would be nice to pick one or the other of these formats.
if (KeyValue.class.equals(job.getMapOutputValueClass())) {
job.setReducerClass(KeyValueSortReducer.class);
} else if (Put.class.equals(job.getMapOutputValueClass())) {
job.setReducerClass(PutSortReducer.class);
} else {
LOG.warn("Unknown map output value type:"
+ job.getMapOutputValueClass());
}
LOG.info("Looking up current regions for table " + table);
List startKeys = getRegionStartKeys(table);
LOG.info("Configuring " + startKeys.size() + " reduce partitions "
+ "to match current region count");
job.setNumReduceTasks(startKeys.size());
Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_"
+ System.currentTimeMillis());
LOG.info("Writing partition information to " + partitionsPath);
FileSystem fs = partitionsPath.getFileSystem(conf);
writePartitions(conf, partitionsPath, startKeys);
partitionsPath.makeQualified(fs);
URI cacheUri;
try {
cacheUri = new URI(partitionsPath.toString() + "#"
+ TotalOrderPartitioner.DEFAULT_PATH);
} catch (URISyntaxException e) {
throw new IOException(e);
}
DistributedCache.addCacheFile(cacheUri, conf);
DistributedCache.createSymlink(conf);
LOG.info("Incremental table output configured.");
}
} 复制代码
三、MR生成HFile的注意事项
1. 无论是map还是reduce作为最终的输出结果,输出的key和value的类型应该是:<ImmutableBytesWritable, KeyValue> 或者 <ImmutableBytesWritable, Put>。
2. Map或者reduce的输出类型是KeyValue 或Put对应KeyValueSortReducer或PutSortReducer。
3. MR例子中job.setOutputFormatClass(HFileOutputFormat.class); HFileOutputFormat是改进后的mr,可适用于多列族同时生成HFile文件,源码中只适合一次对单列族组织成HFile文件。
4. MR例子中HFileOutputFormat.configureIncrementalLoad(job, table);自动对job进行配置,其中使用的TotalOrderPartitioner需要先对key进行整体排序,然后划分到每个reduce中,保证每一个reducer中key的最小值到最大值的区间范围互相不会有交集。
因为入库到HBase的时候,作为一个整体的Region,key是绝对有序的。
5. MR例子中最后生成HFile存储在HDFS上,输出路径下的子目录是各个列族。如果对HFile进行入库HBase,相当于move HFile到HBase的Region中,HFile子目录的列族内容没有了。
四、HFile入库到HBase
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
public class TestLoadIncrementalHFileToHBase {
// private static final byte[] TABLE = Bytes.toBytes("hua");
// private static final byte[] QUALIFIER = Bytes.toBytes("PROTOCOLID");
// private static final byte[] FAMILY = Bytes.toBytes("PROTOCOLID");
public static void main(String[] args) throws IOException {
Configuration conf = HBaseConfiguration.create();
// byte[] TABLE = Bytes.toBytes("hua");
byte[] TABLE = Bytes.toBytes(args[0]);
HTable table = new HTable(TABLE);
LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
loader.doBulkLoad(new Path(args[1]), table);
// loader.doBulkLoad(new Path("/hua/testHFileResult/"), table);
}
} 复制代码
五、HFile入库到HBase注意事项
1. 通过HBase中LoadIncrementalHFiles的doBulkLoad方法,对生成的HFile文件入库。上面示例程序的第一个命令行参数是表名,第二个参数是HFile的路径(即以上MR生成HFile的输出路径);doBulkLoad方法本身的参数顺序则是先HFile路径、后HTable表对象。也可以一个个列族地录入到HBase中对应的表列族。
2. 如何入库的相关链接:
http://hbase.apache.org/docs/r0.89.20100726/bulk-loads.html
http://hbase.apache.org/docs/r0. ... e-summary.html#bulk
http://genius-bai.javaeye.com/blog/641927
3. 入库分为代码入库以及脚本入库。代码入库有两种,一种是
hadoop jar hbase-VERSION.jar completebulkload /myoutput mytable;
另外一种是通过以上的TestLoadIncrementalHFileToHBase类。
脚本入库为:jruby $HBASE_HOME/bin/loadtable.rb hbase-mytable hadoop-hbase-hfile-outputdir。