采集类修改 在开始介绍分布式采集之前,我们需要对之前介绍的采集类添加一些方法,也就是返回上一篇博客中介绍的小说 JavaBean,具体源码还请参照个人网站上的博客源码。 1.简介页 简介页需要添加一个方法,让它返回简介页的数据信息,具体如下:
view plaincopy to clipboardprint?
- /**
- * @return
- * @Author:lulei
- * @Description: 分析简介页,获取简介页数据
- */
- public NovelIntroModel getNovelIntro() {
- NovelIntroModel bean = new NovelIntroModel();
- bean.setMd5Id(ParseMD5.parseStrToMd5L32(this.pageUrl));
- bean.setName(getName());
- bean.setAuthor(getAuthor());
- bean.setDescription(getDesc());
- bean.setType(getType());
- bean.setLastChapter(getLatestChapter());
- bean.setChapterlisturl(getChapterListUrl());
- bean.setWordCount(getWordCount());
- bean.setKeyWords(keyWords());
- return bean;
- }
view plaincopy to clipboardprint?
- /**
- * @return
- * @Author:lulei
- * @Description: 分析阅读页,获取阅读页数据
- */
- public NovelReadModel getNovelRead(){
- NovelReadModel novel = new NovelReadModel();
- novel.setTitle(getTitle());
- novel.setWordCount(getWordCount());
- novel.setContent(getContent());
- return novel;
- }
各页采集线程类 在实现分布式采集的时候,就需要编写各个页面的采集线程类,让它来控制各页面的采集业务,下面我们就一一介绍: 1.更新列表页线程 这个线程的主要功能就是监控更新列表页的数据,提取页面上的简介页URL,认为它们是有更新的页面,将对应的信息持久化到数据库中,具体实现如下:
view plaincopy to clipboardprint?
- /**
- *@Description: 更新列表页线程
- */
- package com.lulei.crawl.novel.zongheng;
- import java.util.List;
- import java.util.concurrent.TimeUnit;
- import com.lulei.db.novel.zongheng.ZonghengDb;
- public class UpdateListThread extends Thread{
- private boolean flag = false;
- private String url;//抓取的更新列表页URL
- private int frequency;//采集频率
- public UpdateListThread(String name, String url, int frequency){
- super(name);
- this.url = url;
- this.frequency = frequency;
- }
- @Override
- public void run() {
- flag = true;
- ZonghengDb db = new ZonghengDb();
- while (flag){
- try {
- UpdateList updateList = new UpdateList(url);
- List<String> urls = updateList.getPageUrls(true);
- db.saveInfoUrls(urls);
- TimeUnit.SECONDS.sleep(frequency);
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- super.run();
- }
- public static void main(String[] args) {
- // TODO Auto-generated method stub
- UpdateListThread thread = new UpdateListThread("llist", "http://book.zongheng.com/store/c0/c0/b9/u0/p1/v0/s9/t0/ALL.html", 60);
- thread.start();
- }
- }
view plaincopy to clipboardprint?
- /**
- *@Description: 小说简介信息线程
- */
- package com.lulei.crawl.novel.zongheng;
- import java.util.List;
- import java.util.concurrent.TimeUnit;
- import com.lulei.crawl.novel.zongheng.model.NovelIntroModel;
- import com.lulei.db.novel.zongheng.ZonghengDb;
- public class IntroPageThread extends Thread {
- private boolean flag = false;
- public IntroPageThread(String name) {
- super(name);
- }
- @Override
- public void run() {
- flag = true;
- try {
- ZonghengDb db = new ZonghengDb();
- while (flag) {
- //随机获取一个待采集的简介页url
- String url = db.getRandIntroPageUrl(1);
- if (url != null) {
- IntroPage intro = new IntroPage(url);
- NovelIntroModel bean = intro.getNovelIntro();
- //采集小说章节列表页信息
- ChapterPage chapterPage = new ChapterPage(bean.getChapterlisturl());
- List<String[]> chapters = chapterPage.getChaptersInfo();
- bean.setChapterCount(chapters == null ? 0 : chapters.size());
- //更新小说简介信息
- db.updateInfo(bean);
- //插入待采集的章节列表
- db.saveChapters(chapters);
- //如果本次有待采集的资源,睡眠一个时间,没有待采集的资源,睡眠另一个时间
- TimeUnit.MILLISECONDS.sleep(500);
- }else {
- TimeUnit.MILLISECONDS.sleep(1000);
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) {
- // TODO Auto-generated method stub
- IntroPageThread thread = new IntroPageThread("novelinfo");
- thread.start();
- }
- }
view plaincopy to clipboardprint?
- /**
- *@Description: 小说阅读页线程
- */
- package com.lulei.crawl.novel.zongheng;
- import java.util.concurrent.TimeUnit;
- import com.lulei.crawl.novel.zongheng.model.NovelChapterModel;
- import com.lulei.crawl.novel.zongheng.model.NovelReadModel;
- import com.lulei.db.novel.zongheng.ZonghengDb;
- import com.lulei.util.ParseMD5;
- public class ReadPageThread extends Thread {
- private boolean flag = false;
- public ReadPageThread(String name) {
- super(name);
- }
- @Override
- public void run() {
- flag = true;
- ZonghengDb db = new ZonghengDb();
- while (flag) {
- try {
- //随机获取待采集的阅读页
- NovelChapterModel chapter = db.getRandReadPageUrl(1);
- if (chapter != null) {
- ReadPage read = new ReadPage(chapter.getUrl());
- NovelReadModel novel = read.getNovelRead();
- if (novel == null) {
- continue;
- }
- novel.setChapterId(chapter.getChapterId());
- novel.setTime(chapter.getTime());
- novel.setUrl(chapter.getUrl());
- //保存阅读页信息
- db.saveNovelRead(novel);
- //将状态修改为不需要采集
- db.updateChapterState(ParseMD5.parseStrToMd5L32(novel.getUrl()), 0);
- //如果本次有待采集的资源,睡眠一个时间,没有待采集的资源,睡眠另一个时间
- TimeUnit.MILLISECONDS.sleep(500);
- } else {
- TimeUnit.MILLISECONDS.sleep(1000);
- }
- } catch(Exception e){
- e.printStackTrace();
- }
- }
- }
- public static void main(String[] args) {
- ReadPageThread thread = new ReadPageThread("novel read page");
- thread.start();
- }
- }
分布式采集 上面已经介绍完了各个线程完成的工作,下面就需要一个类来控制管理这些线程,让其运行起来,具体代码如下:
view plaincopy to clipboardprint?
- /**
- *@Description:
- */
- package com.lulei.crawl.novel.zongheng;
- import java.util.List;
- import com.lulei.crawl.novel.zongheng.model.CrawlListInfo;
- import com.lulei.db.novel.zongheng.ZonghengDb;
- public class CrawStart {
- private static boolean booleanCrawlList = false;
- private static boolean booleanCrawlIntro = false;
- //简介页采集线程数目
- private static int crawlIntroThreadNum = 2;
- private static boolean booleanCrawlRead = false;
- //阅读页采集线程数目
- private static int crawlReadThreadNum = 10;
- /**
- * @Author:lulei
- * @Description: 更新列表页采集
- */
- public void startCrawlList(){
- if (booleanCrawlList) {
- return;
- }
- booleanCrawlList = true;
- ZonghengDb db = new ZonghengDb();
- List<CrawlListInfo> infos = db.getCrawlListInfos();
- if (infos == null) {
- return;
- }
- for (CrawlListInfo info : infos) {
- if (info.getUrl() == null || "".equals(info.getUrl())) {
- continue;
- }
- UpdateListThread thread = new UpdateListThread(info.getInfo(), info.getUrl(), info.getFrequency());
- thread.start();
- }
- }
- /**
- * @Author:lulei
- * @Description: 小说简介页和章节列表页
- */
- public void startCrawlIntro() {
- if (booleanCrawlIntro) {
- return;
- }
- booleanCrawlIntro = true;
- for (int i = 0; i < crawlIntroThreadNum; i++) {
- IntroPageThread thread = new IntroPageThread("novel info thread" + i);
- thread.start();
- }
- }
- /**
- * @Author:lulei
- * @Description: 小说阅读页
- */
- public void startCrawlRead() {
- if (booleanCrawlRead) {
- return;
- }
- booleanCrawlRead = true;
- for (int i = 0; i < crawlReadThreadNum; i++) {
- ReadPageThread thread = new ReadPageThread("novel read page" + i);
- thread.start();
- }
- }
- public static void main(String[] args) {
- CrawStart start = new CrawStart();
- start.startCrawlList();
- start.startCrawlIntro();
- start.startCrawlRead();
- }
- }
运行结果 通过上面的这几个步骤,纵横小说的分布式采集程序已经完成,下面就为大家展示一下采集后的数据库截图
转载链接:http://www.llwjy.com/blogdetail/ ... ce07e2fb2d768f.html