進程池未執(zhí)行apply_async中添加的函數(shù)就直接結(jié)束了

代碼沒有執(zhí)行apply_async中添加的函數(shù)就直接結(jié)束了

from bs4 import BeautifulSoup
import random
import requests
import pymongo
import datetime
import random
import time
from multiprocessing import Pool

user_agents = [
'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 '
'Mobile/13B143 Safari/601.1]',
'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/48.0.2564.23 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/48.0.2564.23 Mobile Safari/537.36']
heads = {
    'User_Agent': random.choice(user_agents)
}
ipHeads = {
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':random.choice(user_agents),
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer':'http://www.xicidaili.com/nn/',
    'Accept-Encoding':'gzip, deflate, sdch',
    'Accept-Language':'zh-CN,zh;q=0.8',
}

class douban():
    def __init__(self):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.db = self.client['books']
        self.tool = self.client['tool']
        self.collectIp = self.tool['ip']

    def getFromSQL(self):
        item = self.collectIp.find_one({'http': 'http'})
        proxies = {}
        proxies[item['http']] = 'http://' + item['ip'] + ':' + item['port']
        return proxies

    def getAllTag(self):
        ipDic = {}
        url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        proxies = self.getFromSQL()
        s = requests.get(url,headers=heads,proxies=proxies)
        if s.status_code == 403:
            values = list(proxies.values())[0]
            ip = values.split('//')[1].split(':')[0]
            self.collect.remove({'ip': ip})
            proxies = getFromSQL()
            s = requests.get(url,headers=heads,proxies=proxies)
        soup = BeautifulSoup(s.text,'lxml')
        titleTags= soup.find_all('a', class_='tag-title-wrapper')
        tagList = soup.find_all('table',class_='tagCol')
        href = {}
        titleList = []
        i = 0
        for titleTag in titleTags:
            title = titleTag['name']
            titleList.append(title)
            trs = tagList[i].find_all('tr')
            hreflist = []
            for tr in trs:
                hreflist.append(tr.td.a['href'])
            href[title] = hreflist
            i = i + 1
        return titleList,href

    def getAllBookUrl(self,title, hrefDic):
        print('a')
        collect = self.db[title]
        for href in hrefDic[title]:
            index = 0
            while 1:
                url = 'https://book.douban.com' + href +'?start='+ str(index) + '&type=T'
                proxies = self.getFromSQL()
                s = requests.get(url, headers=heads,proxies=proxies)
                if s.status_code == 403:
                    values = list(proxies.values())[0]
                    ip = values.split('//')[1].split(':')[0]
                    collect.remove({'ip': ip})
                    proxies = self.getFromSQL()
                    s = requests.get(url,headers=heads,proxies=proxies)
                html = s.text
                soup = BeautifulSoup(html, 'lxml')
                liList= soup.find_all('li',class_='subject-item')
                if  len(liList):
                    for li in liList:
                        id = li.find('a')['href'][32:-1]
                        collect.insert({'bookId':id})
                    index += 20
                    time.sleep(3)
                else:
                    break



if __name__== '__main__':
    p = Pool(4)
    a = douban()
    titleList, hrefDic = a.getAllTag()
    for i in range(len(titleList)):
        print('開始爬取%s'%titleList[i])
        p.apply_async(a.getAllBookUrl, args=(titleList[i],hrefDic))
    p.close()
    p.join()
    # a = douban()
    # titleList, hrefDic = a.getAllTag()
    # a.getAllBookUrl(titleList[0],hrefDic)
    # print('done')

回答

編輯回答