查询收录数,收录率,未收录链接,收录的链接
需要安装pycurl模块
需要查询的URL放在url.csv里面文件必须是utf-8格式
运行BDshoulu.py文件
在Windows下面的命令提示符下运行会乱码,print的内容会乱码,请自行转码,不影响结果。
遇到验证码的时候会停止5分钟重新查
线程建议不要开太多,否则会导致封IP
- #coding:utf-8
- import pycurl,re,StringIO
- import threading,Queue,time
- class caiji:
- #打开网页 url:网页URL
- def html(self,url):
- while 1:
- try:
- b=StringIO.StringIO()
- c=pycurl.Curl()
- c.setopt(pycurl.URL,url) #打开URL
- c.setopt(pycurl.FOLLOWLOCATION,2) #允许跟踪来源,有参数:1和2
- c.setopt(pycurl.ENCODING, 'gzip') #开启gzip压缩提高下载速度
- c.setopt(pycurl.NOSIGNAL, True) #开启后多线程不会报错
- c.setopt(pycurl.MAXREDIRS,1) #最大重定向次数,0表示不重定向
- c.setopt(pycurl.CONNECTTIMEOUT,60) #链接超时
- c.setopt(pycurl.TIMEOUT,30) #下载超时
- c.setopt(pycurl.USERAGENT,'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)')
- #pycurl.USERAGENT 模拟浏览器
- c.setopt(pycurl.WRITEFUNCTION, b.write) #回调写入字符串缓存
- c.perform() #执行上述访问网址的操作
- # print c.getinfo(pycurl.HTTP_CODE)
- c.close()
- html=b.getvalue() #读取b中的数据
- return html #跳出并返回html
- except:
- continue
- wurl=open(r"url1.csv",'a')
- caiji=caiji()
- class count:
- def __init__(self):
- self.shoulu=0
- self.wshoulu=0
- self.i=0
- self.lock=threading.Lock()
- def c_wshoulu(self):
- self.lock.acquire()
- self.wshoulu+=1
- wshoulu=self.wshoulu
- self.lock.release()
- return wshoulu
- def c_sl(self):
- self.lock.acquire()
- self.shoulu+=1
- shoulu=self.shoulu
- self.lock.release()
- return shoulu
- def c_i(self):
- self.lock.acquire()
- self.i+=1
- i=self.i
- self.lock.release()
- return i
- count=count()
- class th(threading.Thread):
- def __init__(self,qurl):
- threading.Thread.__init__(self)
- self.qurl=qurl
- self.lock=threading.Lock()
- self.cond=threading.Condition()
- def run(self):
- while 1:
- ddc=self.qurl.get()
- if ddc is None:
- break
- while 1:
- bdhtm=caiji.html('http://www.baidu.com/s?wd='+ddc)
- self.lock.acquire()
- if '百度为您找到相关结果约' in bdhtm:
- i=count.c_i()
- print '第%s条, %s ,收录'% (i,ddc)
- wurl.writelines('第%s条, %s ,收录\n'% (i,ddc))
- count.c_sl()
- break
- elif '抱歉,没有找到与' in bdhtm:
- i=count.c_i()
- print '第%s条, %s ,未收录'% (i,ddc)
- wurl.writelines('第%s条, %s ,未收录\n'% (i,ddc))
- count.c_wshoulu()
- break
- elif 'http://verify.baidu.com/' in bdhtm:
- print ddc,'出现验证码,等待5分钟后自动开始'
- self.lock.release()
- time.sleep(500)
- continue
- else:
- print 'Error'
- break
- self.lock.release()
- qurl=Queue.Queue(0)
- threadCount=6 #开启线程数,默认6个线程
- ths=[]
- for t in range(threadCount):
- thread=th(qurl)
- thread.start()
- ths.append(thread)
- for ddc in open(r'url.csv'): #导入需要查询的URL文件,格式必须是utf-8
- ddc=ddc[0:-1]
- qurl.put(ddc)
- for tt in range(threadCount):
- qurl.put(None)
- for t in ths:
- t.join()
- sl=count.c_sl()-1
- print '\n收录率:'+str(round(float(sl)/float(count.c_i()-1)*100,2)),"%"
- print '收录:%s 条'%str(sl)
- print '未收录:%s 条'%str(count.c_wshoulu()-1)
复制代码 |