分享

Hadoop MapReduce纵表转横表 与 横表转纵表

laozhao0 发表于 2014-8-30 11:22:29 [显示全部楼层] 回帖奖励 阅读模式 关闭右栏 0 6251



输入数据如下:以\t分隔


  1. 0-3岁育儿百科 书 23
  2. 0-5v液位传感器 5
  3. 0-5轴承 2
  4. 0-6个月奶粉 23
  5. 0-6个月奶粉c2c报告 23
  6. 0-6个月奶粉在线购物排名 23
  7. 0-6个月奶粉市场前景 23
  8. 0-6个月配方奶粉 23
  9. 0.001g电子天平 5
  10. 0.01t化铝炉 2
  11. 0.01吨熔铝合金炉 2
  12. 0.03吨化镁炉 25
  13. 0.03吨电磁炉 11
复制代码

其中左侧是搜索词,右侧是类别,可看成是数据库中的纵表,现需要将输入转成横表,即 类名\t语句1\t语句2...,这样的格式。
MapReduce最适合做这样的事情了。因为经常用到,记录一下。Hive表中的数据要转成横表的时候,单独写个MR来处理就很方便了。



  1. package seg;
  2. import java.io.IOException;
  3. import org.apache.hadoop.conf.Configuration;
  4. import org.apache.hadoop.conf.Configured;
  5. import org.apache.hadoop.fs.FileSystem;
  6. import org.apache.hadoop.fs.Path;
  7. import org.apache.hadoop.io.LongWritable;
  8. import org.apache.hadoop.io.Text;
  9. import org.apache.hadoop.mapreduce.Job;
  10. import org.apache.hadoop.mapreduce.Mapper;
  11. import org.apache.hadoop.mapreduce.Reducer;
  12. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  13. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  14. import org.apache.hadoop.util.GenericOptionsParser;
  15. import org.apache.hadoop.util.Tool;
  16. import org.apache.hadoop.util.ToolRunner;
  17. /**
  18. * @author zhf
  19. * @email zhf.thu@gmail.com
  20. * @version 创建时间:2014年8月24日 上午9:56:45
  21. */
  22. public class Vertical2Horizontal extends Configured implements Tool{
  23. public static void main(String[] args) throws Exception {
  24. int exitCode = ToolRunner.run(new Vertical2Horizontal(), args);
  25. System.exit(exitCode);
  26. }
  27. @Override
  28. public int run(String[] arg0) throws Exception {
  29. String[] args = new GenericOptionsParser(arg0).getRemainingArgs();
  30. if(args.length != 2){
  31. System.out.println("Usage:seg.Horizontal2Vertical <input> <output>");
  32. System.exit(1);
  33. }
  34. Configuration conf = new Configuration();
  35. FileSystem fs = FileSystem.get(conf);
  36. if(fs.exists(new Path(args[1])))
  37. fs.delete(new Path(args[1]),true);
  38. Job job = new Job(conf);
  39. job.setJarByClass(getClass());
  40. job.setMapperClass(HVMapper.class);
  41. job.setReducerClass(HVReducer.class);
  42. job.setMapOutputKeyClass(Text.class);
  43. job.setMapOutputValueClass(Text.class);
  44. job.setOutputKeyClass(Text.class);
  45. job.setOutputValueClass(Text.class);
  46. FileInputFormat.addInputPath(job, new Path(args[0]));
  47. FileOutputFormat.setOutputPath(job, new Path(args[1]));
  48. return job.waitForCompletion(true) ? 0:1;
  49. }
  50. public static class HVMapper extends Mapper<LongWritable,Text,Text,Text>{
  51. private Text text = new Text();
  52. private Text clazz = new Text();
  53. public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
  54. String line = value.toString();
  55. String params[] = line.split("\t");
  56. text.set(params[0]);
  57. clazz.set(params[1]);
  58. context.write(clazz,text);
  59. }
  60. }
  61. public static class HVReducer extends Reducer<Text,Text,Text,Text>{
  62. private Text result = new Text();
  63. public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
  64. String tmp = "";
  65. for(Text val : values){
  66. tmp += val + "\t";
  67. }
  68. result.set(tmp.trim());
  69. context.write(key, result);
  70. }
  71. }
  72. }
复制代码

输出:
  1. 莱舍万 服装美学 莱芜劳保服装    南京羽绒服特卖会        螃蟹的秘密品牌内衣店    螃蟹的秘密内衣专卖店
复制代码

今天又要用到了横表转纵表,记录一下。横表转纵表只需要一个Mapper就可以了,完全没有难度,就是把数据打散开就好了。
  1. package seg;
  2. import java.io.IOException;
  3. import org.apache.hadoop.conf.Configuration;
  4. import org.apache.hadoop.conf.Configured;
  5. import org.apache.hadoop.fs.FileSystem;
  6. import org.apache.hadoop.fs.Path;
  7. import org.apache.hadoop.io.LongWritable;
  8. import org.apache.hadoop.io.Text;
  9. import org.apache.hadoop.mapreduce.Job;
  10. import org.apache.hadoop.mapreduce.Mapper;
  11. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  12. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  13. import org.apache.hadoop.util.GenericOptionsParser;
  14. import org.apache.hadoop.util.Tool;
  15. import org.apache.hadoop.util.ToolRunner;
  16. /**
  17. * @ClassName: Horizontal2Vertical
  18. * @Description: 横表转纵表
  19. * @date 2014年8月27日 下午2:01:35
  20. *
  21. */
  22. public class Horizontal2Vertical extends Configured implements Tool {
  23. public static void main(String[] args) throws Exception {
  24. int exitCode = ToolRunner.run(new Horizontal2Vertical(), args);
  25. System.exit(exitCode);
  26. }
  27. @Override
  28. public int run(String[] arg0) throws Exception {
  29. String[] args = new GenericOptionsParser(arg0).getRemainingArgs();
  30. if(args.length != 2){
  31. System.err.println("Usage : TableTransferMR <input> <output>");
  32. }
  33. Configuration conf = new Configuration();
  34. FileSystem fs = FileSystem.get(conf);
  35. if(fs.exists(new Path(args[1])))
  36. fs.delete(new Path(args[1]),true);
  37. Job job = new Job(conf);
  38. job.setJarByClass(Horizontal2Vertical.class);
  39. job.setMapperClass(TableMapper.class);
  40. job.setNumReduceTasks(0);
  41. job.setMapOutputKeyClass(Text.class);
  42. job.setMapOutputValueClass(Text.class);
  43. job.setOutputKeyClass(Text.class);
  44. job.setOutputValueClass(Text.class);
  45. FileInputFormat.addInputPath(job, new Path(args[0]));
  46. FileOutputFormat.setOutputPath(job, new Path(args[1]));
  47. return job.waitForCompletion(true) ? 0:1;
  48. }
  49. public static class TableMapper extends Mapper<LongWritable,Text,Text,Text>{
  50. public Text baseinfo = new Text();
  51. public Text filter = new Text();
  52. public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
  53. String line = value.toString().trim();
  54. String[] params = line.split("\t");
  55. String dspid = params[0];
  56. String token = params[1];
  57. String userseq = params[2];
  58. String ip = params[3];
  59. String filters = params[8];
  60. String platform = params[9];
  61. baseinfo.set(dspid+"\t"+token+"\t"+userseq+"\t"+ip);
  62. String[] fs = filters.split("\\|");
  63. for(String f : fs){
  64. filter.set(f+"\t"+platform);
  65. context.write(baseinfo, filter);
  66. }
  67. }
  68. }
  69. }
复制代码




Hadoop MapReduce纵表转横表 与 横表转纵表


没找到任何评论,期待你打破沉寂

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

关闭

推荐上一条 /2 下一条