本帖最后由 badbird 于 2014-6-18 18:21 编辑
# -*- coding: cp936 -*-
import urllib,urllib2,re,sys,requests #引用模块
from bs4 import BeautifulSoup
# Pre-compiled pattern matching the text between a tag's '>' and '<'
# (a crude alternative to asking BeautifulSoup for the node's text).
findid = re.compile('>.+<')


def get_url(url):
    """Scrape every Baidu-ranked keyword for *url* from aizhan.com.

    Walks the paginated results at
    http://baidurank.aizhan.com/baidu/<url>/<page>/position/ and appends
    one "keyword,rank" row per keyword to aizhan100.csv.  Keywords are
    re-encoded utf-8 -> gb2312 so the CSV opens cleanly in Chinese Excel
    (Python 2 byte-string semantics).

    :param url: bare domain to query, e.g. 'www.example.com'
    :returns: None (output goes to the CSV file)
    """
    base = 'http://baidurank.aizhan.com/baidu/%s/%s/position/'

    # Probe successive pages until one no longer contains a keyword cell;
    # 'blue t_l word' is the CSS class of the keyword <td> on aizhan.com.
    # NOTE(review): each probed page is fetched again below — kept to
    # preserve the original behavior, but this doubles the HTTP traffic.
    num = 1
    while 'blue t_l word' in urllib2.urlopen(base % (url, num)).read():
        num += 1

    # One handle for the whole run; 'with' guarantees it is closed.
    # (The original re-opened the file on every page and never closed it.)
    with open("aizhan100.csv", "a") as out:
        for page in range(1, num):
            html = urllib2.urlopen(base % (url, page)).read()  # download page
            soup = BeautifulSoup(html)
            keywords = soup.find_all('td', class_='blue t_l word')
            keyword_rank = soup.find_all('span', class_='pos')

            # Extract the keyword text: second child of the <td>, then
            # strip the surrounding '>'/'<' left by the regex match.
            k_list = []
            for k in keywords:
                k_string = str(list(k)[1])
                k_list.append(findid.findall(k_string)[0]
                              .rstrip('<').lstrip('>')
                              .decode('utf-8').encode('gb2312'))

            # Rank values are plain ASCII digits — no re-encoding needed.
            r_list = [findid.findall(str(r))[0].rstrip('<').lstrip('>')
                      for r in keyword_rank]

            # Pair keywords with ranks positionally; duplicates collapse
            # (dict semantics, as in the original).
            k_rank = dict(zip(k_list, r_list))
            for kw in k_rank:
                out.write(kw + ',' + k_rank[kw] + "\n")


get_url('www.xxxxxx.com')
用了2小时写了一段可以直接提取到在爱占百度排名下的所有网站关键词功能,希望对大家有所帮助。(已本地测试可成功提取,有问题可以直接论坛下探讨) 谢谢
|
评分
-
查看全部评分
|