代碼沒有執(zhí)行apply_async中添加的函數(shù)就直接結(jié)束了
from bs4 import BeautifulSoup
import random
import requests
import pymongo
import datetime
import random
import time
from multiprocessing import Pool
user_agents = [
'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 '
'Mobile/13B143 Safari/601.1]',
'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/48.0.2564.23 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/48.0.2564.23 Mobile Safari/537.36']
heads = {
'User_Agent': random.choice(user_agents)
}
ipHeads = {
'Upgrade-Insecure-Requests':'1',
'User-Agent':random.choice(user_agents),
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Referer':'http://www.xicidaili.com/nn/',
'Accept-Encoding':'gzip, deflate, sdch',
'Accept-Language':'zh-CN,zh;q=0.8',
}
class douban():
def __init__(self):
self.client = pymongo.MongoClient('localhost', 27017)
self.db = self.client['books']
self.tool = self.client['tool']
self.collectIp = self.tool['ip']
def getFromSQL(self):
item = self.collectIp.find_one({'http': 'http'})
proxies = {}
proxies[item['http']] = 'http://' + item['ip'] + ':' + item['port']
return proxies
def getAllTag(self):
ipDic = {}
url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
proxies = self.getFromSQL()
s = requests.get(url,headers=heads,proxies=proxies)
if s.status_code == 403:
values = list(proxies.values())[0]
ip = values.split('//')[1].split(':')[0]
self.collect.remove({'ip': ip})
proxies = getFromSQL()
s = requests.get(url,headers=heads,proxies=proxies)
soup = BeautifulSoup(s.text,'lxml')
titleTags= soup.find_all('a', class_='tag-title-wrapper')
tagList = soup.find_all('table',class_='tagCol')
href = {}
titleList = []
i = 0
for titleTag in titleTags:
title = titleTag['name']
titleList.append(title)
trs = tagList[i].find_all('tr')
hreflist = []
for tr in trs:
hreflist.append(tr.td.a['href'])
href[title] = hreflist
i = i + 1
return titleList,href
def getAllBookUrl(self,title, hrefDic):
print('a')
collect = self.db[title]
for href in hrefDic[title]:
index = 0
while 1:
url = 'https://book.douban.com' + href +'?start='+ str(index) + '&type=T'
proxies = self.getFromSQL()
s = requests.get(url, headers=heads,proxies=proxies)
if s.status_code == 403:
values = list(proxies.values())[0]
ip = values.split('//')[1].split(':')[0]
collect.remove({'ip': ip})
proxies = self.getFromSQL()
s = requests.get(url,headers=heads,proxies=proxies)
html = s.text
soup = BeautifulSoup(html, 'lxml')
liList= soup.find_all('li',class_='subject-item')
if len(liList):
for li in liList:
id = li.find('a')['href'][32:-1]
collect.insert({'bookId':id})
index += 20
time.sleep(3)
else:
break
if __name__== '__main__':
p = Pool(4)
a = douban()
titleList, hrefDic = a.getAllTag()
for i in range(len(titleList)):
print('開始爬取%s'%titleList[i])
p.apply_async(a.getAllBookUrl, args=(titleList[i],hrefDic))
p.close()
p.join()
# a = douban()
# titleList, hrefDic = a.getAllTag()
# a.getAllBookUrl(titleList[0],hrefDic)
# print('done')
北大青鳥APTECH成立于1999年。依托北京大學(xué)優(yōu)質(zhì)雄厚的教育資源和背景,秉承“教育改變生活”的發(fā)展理念,致力于培養(yǎng)中國IT技能型緊缺人才,是大數(shù)據(jù)專業(yè)的國家
達內(nèi)教育集團成立于2002年,是一家由留學(xué)海歸創(chuàng)辦的高端職業(yè)教育培訓(xùn)機構(gòu),是中國一站式人才培養(yǎng)平臺、一站式人才輸送平臺。2014年4月3日在美國成功上市,融資1
北大課工場是北京大學(xué)校辦產(chǎn)業(yè)為響應(yīng)國家深化產(chǎn)教融合/校企合作的政策,積極推進“中國制造2025”,實現(xiàn)中華民族偉大復(fù)興的升級產(chǎn)業(yè)鏈。利用北京大學(xué)優(yōu)質(zhì)教育資源及背
博為峰,中國職業(yè)人才培訓(xùn)領(lǐng)域的先行者
曾工作于聯(lián)想擔(dān)任系統(tǒng)開發(fā)工程師,曾在博彥科技股份有限公司擔(dān)任項目經(jīng)理從事移動互聯(lián)網(wǎng)管理及研發(fā)工作,曾創(chuàng)辦藍懿科技有限責(zé)任公司從事總經(jīng)理職務(wù)負責(zé)iOS教學(xué)及管理工作。
浪潮集團項目經(jīng)理。精通Java與.NET 技術(shù), 熟練的跨平臺面向?qū)ο箝_發(fā)經(jīng)驗,技術(shù)功底深厚。 授課風(fēng)格 授課風(fēng)格清新自然、條理清晰、主次分明、重點難點突出、引人入勝。
精通HTML5和CSS3;Javascript及主流js庫,具有快速界面開發(fā)的能力,對瀏覽器兼容性、前端性能優(yōu)化等有深入理解。精通網(wǎng)頁制作和網(wǎng)頁游戲開發(fā)。
具有10 年的Java 企業(yè)應(yīng)用開發(fā)經(jīng)驗。曾經(jīng)歷任德國Software AG 技術(shù)顧問,美國Dachieve 系統(tǒng)架構(gòu)師,美國AngelEngineers Inc. 系統(tǒng)架構(gòu)師。