想把FreeBuf上介绍的所有工具整理成一个列表,然后找出那些自己感兴趣而又没有了解过的工具。本来想着用BurpSuite简单地抓取一下,但是操作时发现Burp的正则匹配存在问题。于是花了点时间配置Notepad++,用Python写了个脚本来抓取,分享出来给有需要的人,也便于大家了解此类问题的简单处理方法。



#coding=utf-8
'''
Freebuf工具抓取脚本
http://hi.baidu.com/l34rn
'''
import re,sys
from Queue import Empty
from Queue import Queue
from threading import Thread
from time import strftime
from urllib import urlopen
# Listing pages are addressed as baseUrl + <page number>.
baseUrl = 'http://www.freebuf.com/tools/page/'

# Name of the HTML report that main() writes.
outPut = 'FreebufToolsListX.html'

# Exclusive upper bound of the page range queued by main() (pages 1..32).
pageNum = 32 + 1

# Number of WorkThread workers that main() starts.
threadNum = 10

# Shared state filled in while crawling.
urlList = []     # matched link markup collected by spiderIndex()
threadList = []  # WorkThread instances created by main()
urlNum = 0       # running match count shown in the progress line
def spiderIndex(url):
    '''
    Fetch one listing page and collect the tool links found on it.

    The page at ``url`` is downloaded and split into lines. For every line
    matching the tool-entry pattern, the matched text (from the
    ``<dt><a href="`` markup through the rest of that line) is appended to
    the global ``urlList``, the global ``urlNum`` counter is incremented and
    a progress line is rewritten on stdout.

    A failed download is reported on stdout and the page is skipped; a
    response whose status code is not 200 is skipped silently.

    url -- address of the listing page to fetch.
    '''
    global urlNum
    try:
        res = urlopen(url)
    except Exception as e:
        # There is no response object to inspect, so report the failure
        # (timestamp + error) and give up on this page.
        print('[-] [%s] [Error] [%s]' % (strftime('%X'), e))
        return
    try:
        if res.getcode() != 200:
            return
        html = res.read()
    finally:
        # Always release the connection, even on a non-200 status or a
        # failed read.
        res.close()
    pattern = re.compile(r'(<dt><a href=\")(http://www.freebuf.com/tools/\d*\.html)(\" target=\"_blank\">).*')
    for line in html.split('\n'):
        rex = pattern.search(line)
        if rex is not None:
            # NOTE(review): this function is also called from worker threads
            # and the increment is not protected by a lock, so the progress
            # count may be approximate; len(urlList) is the reliable total.
            urlNum += 1
            urlList.append(rex.group())
            sys.stdout.write('\r[*] [%s] [Working] [%s]' % (strftime('%X'), urlNum))
class WorkThread(Thread):
    '''
    Worker thread that crawls listing pages taken from a shared queue.

    Every item taken from the queue is converted to a string, appended to
    ``baseUrl`` and the resulting address is passed to ``spiderIndex``.
    The thread finishes once the queue has no more items.
    '''
    def __init__(self, q):
        '''
        q -- queue shared by all workers; main() fills it with page numbers.
        '''
        Thread.__init__(self)
        self.q = q

    def run(self):
        '''
        Drain the queue, crawling one page per item.
        '''
        while True:
            # Take the next item without blocking. Checking empty() first
            # and then calling a blocking get() is racy: another worker can
            # grab the last item in between, leaving this thread waiting
            # forever and join() never returning.
            try:
                page = self.q.get_nowait()
            except Empty:
                break
            spiderIndex(baseUrl + str(page))
def main():
    '''
    Crawl the Freebuf tool listing and write the collected links to HTML.

    Page numbers 1 .. pageNum-1 are queued, the index page is crawled in the
    calling thread, and then threadNum WorkThread workers drain the queue.
    Once every worker has finished, the entries gathered in urlList are
    written to the file named by outPut and a summary is printed.
    '''
    q = Queue(maxsize=0)  # maxsize=0 means the queue is unbounded
    for page in range(1, pageNum):
        q.put(page)
    print('[+] [%s] [Start]' % strftime('%X'))
    print('[*] [%s] [http://hi.baidu.com/l34rn]' % strftime('%X'))
    spiderIndex('http://www.freebuf.com/tools')

    # Start and join only the workers created by this call; threads already
    # recorded in threadList from an earlier call cannot be started again.
    workers = [WorkThread(q) for _ in range(threadNum)]
    threadList.extend(workers)
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # 'wb' rather than 'ab': the report is a complete HTML document, so
    # appending on a re-run would stack a second document (duplicate meta,
    # title and header) onto the old one. The with-block closes the file
    # even if a write fails.
    with open(outPut, 'wb') as f:
        f.write('<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n')
        f.write('<title>Freebuf Tools List</title>\n')
        f.write('<center><h1><b>Freebuf Tools List</b></h1>\n'
                + 'Time:' + str(strftime("%Y-%b-%d %X"))
                + ' Count:' + str(len(urlList))
                + '</center><hr/>\n<h5>\n')
        for line in urlList:
            f.write(line + '</br>\n')
        f.write('</h5><hr/><center><a href="http://hi.baidu.com/l34rn">Powered By L34Rn</a></center>')
    print('\n[+] [%s] [End] [All Done!]' % strftime('%X'))
    print('[+] [%s] [Save As] [%s]' % (strftime('%X'), outPut))
# Run the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()