Just realized I'm out of likes!!! How is class supposed to happen without likes!!!
----------------------------------------------------------------
Sharing a log-analysis script I use day to day. It's simple: it mainly tallies spider crawl and SEO-traffic data per URL pattern, with a bit of breakdown. The macro-level numbers from the Guangnian (光年) log analyzer never felt very useful to me, so I didn't add anything like them. Because I kept bolting on whatever came to mind, I've forgotten what quite a few of the variables are for, so the whole thing looks a bit messy. Performance is so-so: on an i3, a log of a bit over 1 GB takes 3-4 minutes.
Run it as: python <script.py> log_file seo_file
<script.py> is whatever filename you saved the script under; log_file is the access log to analyze; seo_file is the output file the matched SEO-traffic lines get written to (name it whatever you like...).
The script's output includes:
Spider crawl volume (total crawl count)
Crawl volume per page type (broken down by spider; both unique and total crawl counts, so you can compare how heavy repeat crawling is...)
SEO traffic (the search engines counted live in the 'seo_traffic_req' variable; add more yourself if that's not enough)
SEO traffic per page type
Baidu SEO traffic and 360 SEO traffic per page type
Spider status-code summary
Baidu referral keywords
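The regexes match against the request URL, referrer and user-agent in each raw log line, and the status-code tally reads one fixed space-separated field, so the field index has to match your own log format. A minimal standalone check first, assuming a combined-style access log; 'access.log' is just a placeholder filename:

# Print each space-separated field of the first log line with its index,
# to confirm which index holds the HTTP status code (the script below uses index 9).
sample = open('access.log').readline()   # placeholder filename -- point it at your own log
for i, field in enumerate(sample.split(' ')):
    print i, field

The script itself: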
#coding:utf-8
#weblog analytics
import re
import sys
import urllib
import os

input_file, seo_file = sys.argv[1:3]  # input log filename, and output filename for the SEO-traffic lines
seo_url = open(seo_file, 'a')
#fenci = open(fenci_file, 'a')  # uncomment (and define fenci_file) if you want a separate file of Baidu referral keywords
baidu_seo = open('baiduseo.txt', 'a')

# Regexes for the page URLs whose crawl and traffic data you want to track.
# Add one entry per pattern -- one or a hundred, replace them to suit your site.
mulu_re = [
    '/abc/[0-9]+.html',
    '/abc/g[0-9]+/[a-z]+.html'
]

# Spiders to track; replace to suit your needs.
kz_spider = [
    'Baiduspider.*search/spider.html'  # matching only 'Baiduspider' can count fake spiders too, hence the longer pattern
    #'360Spider',
    #'Googlebot',
    #'Sogou'
]
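# Optional sketch: a stricter way to weed out fake Baiduspider hits is a reverse-DNS
# check on the client IP, since genuine Baidu crawlers resolve to hosts under baidu.com
# or baidu.jp. Too slow to run on every log line, but handy for spot-checking suspicious
# IPs. The helper name below is hypothetical, not part of this script:
#
#   import socket
#   def is_real_baiduspider(ip):
#       try:
#           host = socket.gethostbyaddr(ip)[0]
#       except socket.error:
#           return False
#       return host.endswith('.baidu.com') or host.endswith('.baidu.jp')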
weblog = open(input_file).read()

# Extracts the wd=/word= keyword from a Baidu referrer.
word_re = re.compile('\.baidu\.com/.*?(?:wd|word)=(.*?)[&"]')
# Referrers that count as SEO traffic; add more engines here if these aren't enough.
seo_traffic_req = re.compile(r'(so.com/.*?q=|360.cn/.*?q=|baidu.com/.*wd=|baidu.com/.*word=|sogou.com/.*query=|youdao.com/.*q=|yahoo.com/.*p=|bing.com/.*q=|google.com/.*q=)')
baidu_seo_re = re.compile(r'(baidu.com/.*wd=|baidu.com/.*word=)')

seo_traffic = 0
seo_baidu = 0
pagecode = {}
baidupagecode = {}

def spider_zq(spider):
    # Total crawl count for one spider pattern over the whole log.
    req = re.compile(spider)
    data = len(re.findall(req, weblog))
    return data

def url_spider_zq(zz, spider):
    # Unique and total crawl counts of one URL pattern by one spider.
    url_re = zz + '.*' + spider
    req = re.compile(url_re)
    data_one = len(set(re.findall(req, weblog)))  # unique crawl count
    data_two = len(re.findall(req, weblog))       # total crawl count
    #e = '%.2f%%' % (float('%.1f' % (data_two - data_one)) / data_two)
    return data_one, data_two
print "\n"
print "<------------------------------- Total crawl count per spider ---------------------------------->"
for spider in kz_spider:
    print spider + " total crawl count:", spider_zq(spider)
print "\n"
print "<------------------------------- Crawl count per URL pattern, per spider ---------------------------------->"
for spider in kz_spider:
    print spider + " crawl count by URL pattern:", "\n"
    for zz in mulu_re:
        print zz, ":", url_spider_zq(zz, spider)
print "\n"
print "<------------------------------- Total SEO traffic ---------------------------------->"
for line in open(input_file):
    data = re.search(seo_traffic_req, line)
    baidu = re.search(baidu_seo_re, line)
    if data:
        seo_traffic += 1
        seo_url.write(line)   # log lines already end with '\n'
    if baidu:
        seo_baidu += 1
        baidu_seo.write(line)
    # Status-code tally for spider hits, printed later as the spider status-code summary.
    if any(re.search(s, line) for s in kz_spider):
        code = line.split(' ')[9]  # index of the HTTP status-code field -- adjust to your log format
        if code.isdigit():
            pagecode[code] = pagecode.get(code, 0) + 1

print 'SEO traffic:', seo_traffic, "\n"
baidu_seo.close()
seo_url.close()
seo_mulu = open(seo_file).read()
baiduseo = open('baiduseo.txt').read()

print "<------------------------------- SEO traffic per URL pattern ---------------------------------->"
print "Site SEO traffic by URL pattern:", "\n"
for line in mulu_re:
    req = re.compile(line)
    seo_data = len(re.findall(req, seo_mulu))
    print line, seo_data
print "\n"
print "<------------------------------- Baidu SEO traffic per URL pattern ---------------------------------->"
print "Baidu SEO traffic by URL pattern:", "\n"
for line in mulu_re:
    req = re.compile(line)
    seo_data = len(re.findall(req, baiduseo))
    print line, seo_data
print "\n"
print "<------------------------------- 360 SEO traffic per URL pattern ---------------------------------->"
print "360 SEO traffic by URL pattern:", "\n"
for line in mulu_re:
    line_360 = line + ".*(so.com|360.cn)/.*?q="   # URL pattern plus a so.com/360.cn referrer on the same log line
    req = re.compile(line_360)
    seo_data_360 = len(re.findall(req, seo_mulu))
    print line, seo_data_360

print "\n"
print "<------------------------------- Spider status codes ---------------------------------->"
pagecode_sort = sorted(pagecode.iteritems(), key=lambda d: d[1], reverse=True)
print pagecode_sort
print "\n"

os.remove('baiduseo.txt')
os.remove(seo_file)  # delete this line if you want to keep the SEO-traffic lines extracted from the log
# Uncomment the loop below if you want the Baidu referral keywords
# (it reads seo_file, so also delete the os.remove(seo_file) above, and open a 'fenci' output file first).
#for line in open(seo_file):
#    word = re.search(word_re, line)
#    if not word:
#        continue
#    kw = urllib.unquote_plus(word.group(1))
#    if 'ie=utf-8' not in line:  # referrers without ie=utf-8 carry GBK-encoded keywords
#        kw = kw.decode('gb2312', 'ignore').encode('utf-8')
#    fenci.write(kw + "\n")
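If you'd rather not edit the main script, roughly the same extraction can run as a standalone pass over the seo_file it writes (keep that file around, i.e. drop the os.remove(seo_file) line). A minimal sketch reusing the same regex and decoding logic, plus a simple keyword frequency tally I added on top; 'seo_traffic.txt' is a placeholder for whatever seo_file you passed in:

#coding:utf-8
import re
import urllib

word_re = re.compile('\.baidu\.com/.*?(?:wd|word)=(.*?)[&"]')
kw_count = {}
for line in open('seo_traffic.txt'):      # the seo_file written by the main script (placeholder name)
    word = re.search(word_re, line)
    if not word:
        continue
    kw = urllib.unquote_plus(word.group(1))
    if 'ie=utf-8' not in line:            # referrers without ie=utf-8 carry GBK-encoded keywords
        kw = kw.decode('gb2312', 'ignore').encode('utf-8')
    kw_count[kw] = kw_count.get(kw, 0) + 1

# Print keywords sorted by frequency, highest first.
for kw, n in sorted(kw_count.iteritems(), key=lambda d: d[1], reverse=True):
    print kw, n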