
Using MapReduce to convert files into a SequenceFile

Posted by xiaobaiyang on 2015-6-16 23:25:55

Three classes are listed below: the job driver together with its mapper, a custom WholeFileInputFormat that keeps each file from being split, and a WholeFileRecordReader that reads a file's full contents as one value. The mapper then writes one record per file into the SequenceFile, with the file path as the key and the file's bytes as the value.
package com.it.mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SmallFilesToSequenceFileConverter extends Configured implements
        Tool {
    static class SequenceFileMapper extends
            Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
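        // Emits one record per input file: the key is the file's path (captured once in setup), the value is the file's raw bytes.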
        private Text filenameKey;

        @Override
        protected void setup(Context context) {
            InputSplit split = context.getInputSplit();
            Path path = ((FileSplit) split).getPath();
            filenameKey = new Text(path.toString());
        }

        @Override
        public void map(NullWritable key, BytesWritable value, Context context)
                throws IOException, InterruptedException {
            context.write(filenameKey, value);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String inputuri = args[0];
        String outputuri = args[1];
        Configuration conf = getConf();

        FileSystem fileSystem = FileSystem.get(new URI(outputuri), conf);
        if (fileSystem.exists(new Path(outputuri))) {
            fileSystem.delete(new Path(outputuri), true);
        }

        Job job = new Job(conf,
                SmallFilesToSequenceFileConverter.class.getSimpleName());
        job.setJarByClass(SmallFilesToSequenceFileConverter.class);
        // job.setJobName("SmallFilesToSequenceFileConverter");

        // FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.setInputPaths(job, new Path(inputuri));

        job.setMapperClass(SequenceFileMapper.class);

        FileOutputFormat.setOutputPath(job, new Path(outputuri));

        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
        System.exit(exitCode);
    }
}
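
Once this driver is packaged into a jar together with the WholeFileInputFormat and WholeFileRecordReader classes below, the job is submitted in the usual way, with the input directory and the output directory as the two arguments, for example: hadoop jar smallfiles.jar com.it.mapreduce.SmallFilesToSequenceFileConverter /user/hadoop/smallfiles /user/hadoop/seq-out (the jar name and paths here are placeholders, not values from the original post).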


package com.it.mapreduce;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {  
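    // Each input file becomes exactly one split and one record; the actual reading is delegated to WholeFileRecordReader.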
    // Prevent splitting so that every file is processed whole
    @Override  
    protected boolean isSplitable(JobContext context, Path file) {  
        return false;  
    }  
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
    }
}
package com.it.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable>{  
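    // Reads one whole file as a single record: the key is NullWritable and the value holds the file's bytes.
    // The entire file is buffered in memory, so this is only intended for small files.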
    private FileSplit fileSplit;  
    private Configuration conf;  
    private BytesWritable value = new BytesWritable();  
    private boolean processed = false;  
    @Override  
    public void close() throws IOException {  
        // do nothing  
    }  
  
    @Override  
    public NullWritable getCurrentKey() throws IOException,  
            InterruptedException {  
        return NullWritable.get();  
    }  
  
    @Override  
    public BytesWritable getCurrentValue() throws IOException,  
            InterruptedException {  
        return value;  
    }  
  
    @Override  
    public float getProgress() throws IOException, InterruptedException {  
        return processed ? 1.0f : 0.0f;
    }  
  
    @Override  
    public void initialize(InputSplit split, TaskAttemptContext context)  
            throws IOException, InterruptedException {  
        this.fileSplit = (FileSplit) split;  
        this.conf = context.getConfiguration();  
    }  
  
    // processed records whether the single key/value pair has already been produced
    @Override  
    public boolean nextKeyValue() throws IOException, InterruptedException {  
        if (!processed) {  
            byte[] contents = new byte[(int) fileSplit.getLength()];  
            Path file = fileSplit.getPath();  
            FileSystem fs = file.getFileSystem(conf);  
            FSDataInputStream in = null;  
            try {  
              in = fs.open(file);  
              // Read the file's entire contents into the contents array using IOUtils.readFully.
              IOUtils.readFully(in, contents, 0, contents.length);
              // BytesWritable is a byte sequence that can be used as a key or value (ByteWritable, by contrast, holds a single byte); copy contents into value.
              value.set(contents, 0, contents.length);
            } finally {  
              IOUtils.closeStream(in);  
            }  
            processed = true;  
            return true;  
          }  
          return false;  
    }  
}  
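
To sanity-check the job output, the resulting SequenceFile can be read back with a small standalone program. The sketch below is only an illustration (the class name SequenceFileDump is made up, and the path to a part file under the output directory, e.g. part-r-00000, is passed as the first argument); it prints each stored file path and the number of bytes kept for it.

package com.it.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDump {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        FileSystem fs = path.getFileSystem(conf);
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, conf);
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            // Each record is one original small file: its path and its raw contents.
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value.getLength() + " bytes");
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
}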



1 comment

arsenduan replied on 2015-6-16 23:47:00
Thanks for sharing.