// The main job: read a URL from HBase in the reducer and crawl it. Very simple code:
import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public static class CrawlReducer
        extends TableReducer<Text, IntWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // The row key is the URL; store the fetched HTML under content:html.
        Put put = new Put(Bytes.toBytes(key.toString()));
        put.add(Bytes.toBytes("content"), Bytes.toBytes("html"),   // addColumn() on HBase 1.x+
                Bytes.toBytes(startCrawl(key.toString())));
        context.write(NullWritable.get(), put);
    }
}
public static String startCrawl(String url) {
    WebClient webClient = new WebClient(BrowserVersion.CHROME);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(true);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    webClient.getOptions().setTimeout(5000);
    HtmlPage page = null;
    try {
        page = webClient.getPage(url); // <-- this is the line that fails inside reduce
        Thread.sleep(5000);            // give asynchronous JavaScript time to finish
    } catch (Exception e) {
        // FailingHttpStatusCodeException, MalformedURLException, IOException, ...
        e.printStackTrace();
    } finally {
        webClient.close(); // closeAllWindows() on older HtmlUnit versions
    }
    // Guard against the NPE the original page.getPage().asXml() threw on failure.
    return page == null ? "" : page.asXml();
}
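The driver is not shown above; for context, here is a minimal sketch of how such a job is typically wired up with TableMapReduceUtil. The table names ("urls", "pages") and the UrlMapper are assumptions for illustration, not part of the original code:

// Hypothetical driver sketch; table names and the mapper are illustrative only.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class CrawlJob {

    // Assumed mapper: the row key of the "urls" table is the URL itself.
    public static class UrlMapper extends TableMapper<Text, IntWritable> {
        @Override
        protected void map(ImmutableBytesWritable row, Result value, Context context)
                throws IOException, InterruptedException {
            context.write(new Text(row.copyBytes()), new IntWritable(1));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = Job.getInstance(conf, "crawl");
        job.setJarByClass(CrawlJob.class);

        Scan scan = new Scan();
        scan.setCaching(100);        // fetch rows in batches
        scan.setCacheBlocks(false);  // recommended for MapReduce scans

        TableMapReduceUtil.initTableMapperJob("urls", scan,
                UrlMapper.class, Text.class, IntWritable.class, job);
        TableMapReduceUtil.initTableReducerJob("pages", CrawlReducer.class, job);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that initTableMapperJob/initTableReducerJob only ship the HBase-side jars to the tasks; third-party libraries such as HtmlUnit still have to reach the task classpath via -libjars or a fat jar.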
The crawler works fine when run standalone, but as soon as the page download is moved into reduce it fails. I've been fiddling with this for two days with no luck, which is really frustrating. Can any expert point me to a fix? Searching for MapReduce pitfalls turns up basically nothing useful; every result just copy-pastes a big pile of WordCount examples. Speechless...
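For reference, "standalone" here means a plain main() along the lines of the sketch below (an assumption about how it was tested; the URL is only a placeholder), run with the same HtmlUnit jars on the local classpath:

// Minimal standalone check of startCrawl(); the URL is a placeholder.
public static void main(String[] args) {
    String html = startCrawl("http://example.com/");
    System.out.println(html.isEmpty() ? "crawl failed" : html);
}

If this runs locally but the same getPage() call blows up inside the reducer, the failed task attempt's logs should show the actual stack trace (a NoClassDefFoundError there, for example, would point at jars missing from the task classpath rather than at the crawl code itself).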