Questions this post addresses:
1. How do you tell whether a program is running locally or on the cluster?
2. What do you need to watch out for when running a program from Eclipse?
After I run the wordcount job from Eclipse, the job does not show up on the Hadoop monitoring page, and when I check cluster resource usage, CPU and memory on the compute nodes are basically idle, while CPU and memory on the machine running Eclipse hit 100%. The wordcount program was written in Eclipse on my own machine. What is going on here? Is the MapReduce job not running on the cluster at all, but on my own machine instead? Thanks!
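A quick way to settle question 1 up front (a minimal sketch using only the standard Hadoop 2 client API; nothing in it is specific to this cluster): print the client-side settings that decide where a job runs, and look at the job ID after submission.

import org.apache.hadoop.conf.Configuration;

// Probe the two client-side settings that decide where a MapReduce job runs.
// With nothing but the defaults on the classpath, mapreduce.framework.name
// falls back to "local" (the in-process LocalJobRunner) and fs.defaultFS
// to "file:///" -- i.e. the job runs entirely on your own machine.
public class RunModeProbe {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        System.out.println("mapreduce.framework.name = "
                + conf.get("mapreduce.framework.name", "local"));
        System.out.println("fs.defaultFS = "
                + conf.get("fs.defaultFS", "file:///"));
        // The job ID after submission is another giveaway:
        //   job_local1387652383_0001 -> LocalJobRunner (your own machine)
        //   job_1398745623456_0001   -> YARN cluster
    }
}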
The wordcount code is as follows:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            System.out.println(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken()); // take the next token and emit it with a count of 1
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        // System.setProperty("hadoop.home.dir", "F:/hadoop/hadoop-2.3.0");
        Configuration conf = new Configuration();
        // conf.set("mapred.job.tracker", "192.168.1.181:9001");
        // conf.set("yarn.resourcemanager.address", "192.168.1.181:9001");

        String[] otherArgs = new String[2];
        otherArgs[0] = "hdfs://192.168.1.181:9000/data/sortin1";  // input; both URIs address the NameNode at 192.168.1.181:9000
        otherArgs[1] = "hdfs://192.168.1.181:9000/data/meanout/"; // output; must not exist yet

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);

        // job.setMaxMapAttempts(10); // maximum attempts per map task; this does not fix how many map tasks the job runs
        // job.setNumReduceTasks(5);  // number of reduce tasks, i.e. the number of output files
        // job.monitorAndPrintJob();

        job.setMapperClass(TokenizerMapper.class); // the user-defined map function
        job.setCombinerClass(IntSumReducer.class); // combine map output locally to cut network traffic
        job.setReducerClass(IntSumReducer.class);  // the user-defined reduce function

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.out.println("Job start!");
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        if (job.waitForCompletion(true)) {
            System.out.println("ok!");
        } else {
            System.out.println("error!");
            System.exit(0);
        }
    }
}
The Hadoop version is 2.3.0, using the YARN framework.
mapred-site.xml is as follows:
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.address</name>
    <value>192.168.1.181:9001</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>192.168.1.181:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>192.168.1.181:19888</value>
  </property>
  <property>
    <name>mapred.local.dir</name>
    <value>file:///data1/hdfs1/mapred/local</value>
  </property>
  <property>
    <name>mapred.system.dir</name>
    <value>file:///data1/hdfs1/mapred/system</value>
  </property>
</configuration>
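(Two of these are worth flagging: mapred.local.dir and mapred.system.dir are the old Hadoop 1 property names, deprecated in Hadoop 2 in favor of mapreduce.cluster.local.dir and mapreduce.jobtracker.system.dir; and mapreduce.jobtracker.address is ignored altogether once mapreduce.framework.name is set to yarn, since YARN has no JobTracker.)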
yarn-site.xml is as follows:
<configuration>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>192.168.1.181</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>192.168.1.181:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>192.168.1.181:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>192.168.1.181:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>192.168.1.181:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>192.168.1.181:8088</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>
Solution:
After digging into it and comparing setups, I found that the code above, run as-is, executes locally, on the machine where Eclipse is installed. I had deployed the same Hadoop configuration as the cluster on that machine (back when I first hooked Eclipse up to Hadoop for MapReduce, I built an identical environment locally), and once that local environment was up, MapReduce ran fine, but it was running locally. To actually run on the cluster, there is no need for a cluster-identical Hadoop setup on the local machine at all. What is needed instead is to add the cluster configuration in the Java code:
conf.set("fs.default.name", "hdfs://192.168.1.181:9000");
conf.set("hadoop.job.user","hadoop");
conf.set("mapreduce.framework.name","yarn");
// conf.set("mapred.job.tracker","192.168.1.187:9001"); // use the setting below instead of this one: this is the old property name; I am on Hadoop 2.3.0, and the official configuration docs use the mapreduce.jobtracker.address name below
conf.set("mapreduce.jobtracker.address","192.168.1.187:9001");
conf.set("yarn.resourcemanager.hostname", "192.168.1.187");
conf.set("yarn.resourcemanager.admin.address", "192.168.1.187:8033");
conf.set("yarn.resourcemanager.address", "192.168.1.187:8032");
conf.set("yarn.resourcemanager.resource-tracker.address", "192.168.1.187:8036");
conf.set("yarn.resourcemanager.scheduler.address", "192.168.1.187:8030");