#! /usr/bin/env python2.7
#coding=utf-8
import re
import urllib
import urllib2
def baiduspider(pages, keys):
    # e.g. http://www.baidu.com/s?wd=idev&pn=10&ie=utf-8
    # open the output file once, instead of truncating it on every page
    f = open('url.txt', 'w+b')
    for ls in range(pages + 1):
        offset = ls * 10  # Baidu shows 10 results per page
        url = ('http://www.baidu.com/s?ie=utf-8&wd=' + urllib.quote(keys) +
               '&pn=' + str(offset))
        # e.g. <span class="g">www.<b>idev</b>.com.cn/ 2013-7-12 </span>
        # if the domain contains the keyword, Baidu wraps it in <b> tags,
        # so strip those tags before matching
        tmp = '<b>%s</b>' % keys
        html = urllib2.urlopen(url).read().replace(tmp, keys)
        # compile the regex, grouping the domain part up to the first '/';
        # findall() then returns just the captured domains
        purge = re.compile(r'<span class="g">(.*?)/.*?</span>')
        result = re.findall(purge, html)
        # print each domain and write it to the file
        for urlist in result:
            print urlist
            f.write('http://' + urlist + '\n')
    f.close()
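
# Note: search engines sometimes reject urllib2's default User-Agent. What
# follows is a minimal sketch of a fetch helper that sends a browser-like
# header; the helper name and header value are assumptions, not part of the
# original script.
def fetch(url):
    # build a request carrying a browser-like User-Agent header
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urllib2.urlopen(req).read()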
def googlespider(pages, keys):
    # Left as an empty stub in the source; the body below is a minimal
    # sketch mirroring baiduspider. It assumes Google's legacy
    # /search?q=...&start=N pagination and <cite>...</cite> result markup,
    # both of which are assumptions and change over time.
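    for ls in range(pages + 1):
        offset = ls * 10  # Google also pages 10 results at a time
        url = ('http://www.google.com/search?q=' + urllib.quote(keys) +
               '&start=' + str(offset))
        html = fetch(url)  # uses the hedged fetch() helper above
        # <cite> wraps the displayed result URL in Google's classic markup
        purge = re.compile(r'<cite>(.*?)</cite>')
        for urlist in re.findall(purge, html):
            print urlist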
def printinfo():
    print '''
    #######################################################
    #                     Spider EXP                      #
    #                auto search and test                 #
    #                   @ www.idev.pw                     #
    #                                                     #
    #######################################################
    '''
if __name__ == "__main__":
    printinfo()
    pages = int(raw_input('Pages to search: '))
    keywords = raw_input('Keyword: ')
    way = raw_input('Choose search engine (1.baidu 2.google): (1 or 2) ')
    # raw_input() returns a string, so compare against '2' rather than 2
    if way == '2':
        googlespider(pages=pages, keys=keywords)
    else:
        baiduspider(pages=pages, keys=keywords)
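
# Example session (a sketch; actual output depends on live search pages):
#   Pages to search: 1
#   Keyword: idev
#   Choose search engine (1.baidu 2.google): (1 or 2) 1
#   ... prints matched domains and writes them to url.txt ...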