package com.duplicate;

import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

- public class OdsEmp {
- public static class Map extends Mapper<Object, Text, Text, Text>{
-
- private Text k = new Text();
- private Text vs = new Text();
-
- public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
- String v = value.toString();
- String[] token = v.split(",");
- k.set(token[0] + "," + token[1]); //使用 i_date + "," + v_emp_id组合key值
-
- vs.set(token[3] + "," + token[4]); // 使用 i_amr1 + "," + i_amr2组合value值
- context.write(k, vs);
- }
- }
-
- public static class Reduce extends Reducer<Text,Text,NullWritable,Text>{
-
- public void reduce(Text key,Iterable<Text> values, Context context)throws IOException,InterruptedException{
- FloatWritable i_amr1 = new FloatWritable();
- FloatWritable i_amr2 = new FloatWritable();
-
- // 进行同一组的sum
- for(Text tx : values){
- String[] split = tx.toString().split(",");
- i_amr1.set( i_amr1.get() + Float.parseFloat(split[0]));
- i_amr2.set(i_amr2.get() + Float.parseFloat(split[1]));
- }// End for
-
- Text value = new Text();
- value.set(key + "," + i_amr1 + "," + i_amr2); // 以i_date,v_emp_id,v_proj_id,i_amr1,i_amr2输出
- context.write(null, value);
- }
- }
-
- /**
- * @param args
- * @throws IOException
- * @throws ClassNotFoundException
- * @throws InterruptedException
- */
- public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
- // TODO Auto-generated method stub
- Configuration conf = new Configuration();
- Job job = new Job(conf,"OdsEmp");
- job.setJarByClass(OdsEmp.class);
- job.setMapperClass(Map.class);
- job.setReducerClass(Reduce.class);
-
- job.setOutputKeyClass(NullWritable.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job, new Path(args[0]));
- FileOutputFormat.setOutputPath(job, new Path(args[1]));
- System.exit(job.waitForCompletion(true)?0:1);
-
- }
-
- }
The code above implements the group-and-sum step. To produce sorted output, feed this job's output into a second MapReduce job that performs the sort.