# coding: UTF-8
import urllib
import re
import sys
import os
import threading
from time import ctime
#sys.stderr = None
# Fetch one result page and return the list of domains found on it.
def getUrl(url):
    """Download ``url`` and return the domains shown in its result entries.

    A domain is the text at the start of a ``<span class="g">`` element, up to
    the first ``/``.  The page may wrap parts of the domain in ``<b>``/``</b>``
    tags (presumably keyword highlighting); those tags are removed from the
    returned values.

    Args:
        url: Address of the page to download.

    Returns:
        A list of domain strings in page order; duplicates are kept.

    Raises:
        IOError: propagated from the URL opener when the page cannot be
            fetched (on Python 3 this is ``urllib.error.URLError``, a subclass).
    """
    # Function-scope import so the block works on both Python 2 and Python 3
    # without touching the file-level imports.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    page = urlopen(url)
    try:
        srcHtml = page.read()
    finally:
        # The response object is not a context manager on Python 2.
        page.close()

    # Python 3 hands back bytes; the patterns below are text patterns.
    if not isinstance(srcHtml, str):
        srcHtml = srcHtml.decode("utf-8", "ignore")

    # Whole <b>/</b> tags are accepted as units inside the domain.  Listing
    # their characters in a character class (as '<', 'b', '>', '/') would let
    # path separators into the capture and, when stripping, would delete every
    # literal "b" from the domain.
    domainReg = r'<span class="g">((?:</?b>|[a-zA-Z0-9._-])+\.[a-zA-Z]{2,6})/.*?</span>'
    return [re.sub(r'</?b>', '', hit) for hit in re.findall(domainReg, srcHtml)]
# Build the request URL of every result page, e.g. search="Codier's Blog", num=30.
def getPage(search, num, threadNum):
    """Return the Baidu search URLs for the first ``num`` result pages.

    Args:
        search: Search keyword.  It is URL-encoded before being placed in the
            ``wd`` query parameter, so spaces, ``&``, ``#`` and similar
            characters no longer corrupt the query string.
        num: Number of result pages; values <= 0 give an empty list.
        threadNum: Passed through unchanged so the caller receives both
            values in one tuple.

    Returns:
        A ``(pageList, threadNum)`` tuple.  ``pageList[i]`` carries
        ``pn = i * 10``.
    """
    # Function-scope import so the block works on both Python 2 and Python 3.
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2

    # '%' stays unescaped so a keyword that the caller has already
    # percent-encoded is not encoded a second time.
    prefix = "http://www.baidu.com/s?wd=" + quote_plus(search, safe="%") + "&pn="
    pageList = [prefix + str(index * 10) for index in range(num)]
    return pageList, threadNum
# Distribute the page list over worker threads.
def creMultiThread(allPageList, threadNum=2):
    """Run ``getPageListUrl`` over ``allPageList``, threaded for large inputs.

    Fewer than 100 pages are processed directly in the calling thread.
    Otherwise the list is cut into at most ``threadNum`` contiguous slices of
    equal size (the last one may be shorter), one thread is started per
    slice, and the call returns after every thread has finished.

    Args:
        allPageList: List of page URLs to process.
        threadNum: Desired number of threads; values below 1 are treated as 1.
    """
    total = len(allPageList)
    if total < 100:  # small job: not worth starting threads
        getPageListUrl(allPageList)
        return

    workers = max(1, threadNum)
    # Ceiling division in pure integer arithmetic, so no page is dropped when
    # the list does not divide evenly and the slice bounds stay integers.
    chunk = (total + workers - 1) // workers

    threads = []
    for start in range(0, total, chunk):
        thd = threading.Thread(target=getPageListUrl,
                               args=(allPageList[start:start + chunk],))
        thd.daemon = True
        threads.append(thd)
    for thd in threads:
        thd.start()
    for thd in threads:
        thd.join()
# Module-level accumulator: getPageListUrl appends every collected domain here.
resultList = []


def getPageListUrl(pageList):
    """Collect the domains of every page in ``pageList`` into ``resultList``.

    Each page URL is passed to ``getUrl``; every domain it returns is printed
    and then appended to the module-level ``resultList``.  Nothing is
    returned.

    Args:
        pageList: Iterable of page URLs.
    """
    # resultList is only mutated, never rebound, so no ``global`` is needed.
    for pageUrl in pageList:
        for domain in getUrl(pageUrl):
            print(domain)
            resultList.append(domain)
# Write the collected URLs to url.txt, one per line.
def putFile(allUrl):
    """Save ``allUrl`` to ``url.txt`` in the current working directory.

    An existing ``url.txt`` is deleted first (a notice is printed), so the
    file only ever holds the result of the latest call.  The file is opened
    once and closed even if a write fails.  It is created even when
    ``allUrl`` is empty.

    Args:
        allUrl: Iterable of strings; each is written followed by a newline.
    """
    if os.path.exists("url.txt"):
        print(u"己删除历史记录,将重新生成采集url....")
        os.remove("url.txt")
    with open("url.txt", "w") as out:
        out.writelines(url + "\n" for url in allUrl)
# Script entry point: collect, de-duplicate and save the URLs.
# Usage: <script> "keyword" "page count" "thread count"
if __name__ == '__main__':
    # One literal serves as both the run header and the usage help.
    banner = u'''
-----------------------------------------------------
多线程百度URL采集器 python:2.7.10
-----------------------------------------------------
^v^ example:geturl.pyc "关键字" "页数" "线程数" ^v^
^v^ team:www.secbox.cn ^v^
codier_qq:510623849
-----------------------------------------------------
'''
    if len(sys.argv) == 4:
        # getPage returns (page URL list, thread count).
        pages, threadCount = getPage(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
        st = ctime()
        creMultiThread(pages, threadCount)
        # resultList was filled by the workers; the set removes duplicates.
        result = set(resultList)
        print(banner)
        putFile(result)
        print(u"开始时间:"+st)
        print(u"总共采集"+str(len(resultList))+u"条url.........")
        print(u"去除重复剩余"+str(len(result))+u"条url.........")
        print(u"采集结果已经自动保存当前目录 url.txt 文件中..... ^V^")
        print(u"结束时间:"+ctime())
    else:
        # Any other argument count (too few or too many) shows the usage
        # text instead of exiting silently.
        print(banner)
# Reposted from: https://blog.51cto.com/blck4/1671073
# Published by: 全栈程序员-用户IM. Please credit the source when reposting: https://javaforall.cn/109402.html (original link: https://javaforall.cn)
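# [Promotional text from the hosting site] Genuine licence, activates your own account: JetBrains all-products IDE pack, one year of after-sales support, about 0.1 yuan per day.
# [Promotional text from the hosting site] Officially licensed genuine activation: supports every IDE in the JetBrains family, uses your personal JetBrains account...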