python 出現 OSError: raw write() returned invalid length 10

使用BeautifulSoup解析html

# Print the first run of CJK characters found in the first <a> of every table
# cell of the parsed page.

# Compiled once up front; the pattern is loop-invariant, so rebuilding it for
# every cell that contains a link was wasted work.
reg = re.compile(r'[\u2E80-\u9FFF]+')

soup = BeautifulSoup(data, 'html.parser')
tables = soup.find_all('table')
for table in tables:
    for tr in table.find_all('tr'):
        for td in tr.find_all('td'):
            link = td.find('a')
            if link is None:
                # Cell without a link: nothing to report.
                continue
            # The search runs over str(link), i.e. the tag rendered as a
            # string, not only over its visible text.
            matchObj = reg.search(str(link))
            if matchObj:
                print(matchObj.group())

輸出的時候開始會輸出一段數據，但後面會重複最後一段數據後報錯
圖片描述

回答

編輯回答

尕筱澄

試試下面這段代碼，應該是可行的。

from bs4 import BeautifulSoup

# NOTE(review): the file name below looks garbled by text extraction (stray
# "(jí)" / "(qū)" fragments) -- confirm it matches the saved page on disk.
#
# Open the page in binary mode and hand the raw bytes to BeautifulSoup so it
# detects the document's encoding itself. A text-mode open() without
# `encoding=` decodes with the locale's default codec, which raises or yields
# mojibake whenever that codec differs from the page's real encoding.
with open("縣（中國縣級(jí)行政區(qū)）_百度百科.html", "rb") as f:
    soup = BeautifulSoup(f, 'html.parser')

# Print the text of every <a> whose direct parent is a <td>. The document is
# fully parsed by the constructor, so the file no longer needs to stay open.
for a in soup.find_all('a'):
    if a.parent.name == "td":
        print(a.get_text())

2017年10月28日 07:05

編輯回答

逗婦惱

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import urllib.request
from bs4 import BeautifulSoup
import re
import string
from pypinyin import pinyin, Style
from model import cities
import json
import codecs

# Scrape county names (Baidu Baike) and city names (Chinese Wikipedia), group
# them by the first pinyin letter of each name, and dump the index to test.js.

countryUrl = "https://baike.baidu.com/item/%E5%8E%BF/7258656"  # counties
cityUrl = "https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD%E5%9F%8E%E5%B8%82%E5%88%97%E8%A1%A8"  # cities
countryRequest = urllib.request.Request(countryUrl)
cityRequest = urllib.request.Request(cityUrl)

# Use the responses as context managers so the connections are closed as soon
# as the bodies have been read.
with urllib.request.urlopen(countryRequest) as countryResponse:
    countryData = countryResponse.read()
with urllib.request.urlopen(cityRequest) as cityResponse:
    cityData = cityResponse.read()

# The raw bytes go straight to BeautifulSoup; no manual decode is needed.
countrySoup = BeautifulSoup(countryData, 'html.parser')
citySoup = BeautifulSoup(cityData, 'html.parser')
tables = countrySoup.find_all('table')
lis = citySoup.find_all('li', class_="")

# Compiled once; both scraping loops below share the same pattern.
reg = re.compile(r'[\u2E80-\u9FFF]+')


def _first_cjk_run(tag):
    """Return the first run of CJK characters in str(tag), or None if absent."""
    matchObj = reg.search(str(tag))
    return matchObj.group() if matchObj else None


def _bucket_letter(name):
    """Return the upper-cased first pinyin letter of the first character of name."""
    firstP = pinyin(name, style=Style.FIRST_LETTER)
    return str(firstP[0][0]).upper()


# One bucket per letter A-Z.
city = cities([])
for word in string.ascii_uppercase:
    city.addCharacter(word)

# Counties: the first link of every table cell on the Baidu Baike page.
for table in tables:
    for tr in table.find_all('tr'):
        for td in tr.find_all('td'):
            link = td.find('a')
            if link is None:
                continue
            matchStr = _first_cjk_run(link)
            if matchStr is not None:
                city.addCountry(_bucket_letter(matchStr), {'name': matchStr})

# The four municipalities are fixed entries. Add them once, before the loop:
# previously they were re-submitted on every <li> iteration (relying on
# addCity's duplicate check) and were skipped entirely when `lis` was empty.
city.addCity('B', {'name': '北京市'})
city.addCity('T', {'name': '天津市'})
city.addCity('S', {'name': '上海市'})
city.addCity('C', {'name': '重慶市'})

# Only <li> items whose first following node is one of these labels are used.
# NOTE(review): these literals look garbled by text extraction (stray "(jí)")
# and are kept byte-for-byte here -- verify them against the live page text.
listLabels = ('副省級(jí)市：', '地級(jí)市：', '縣級(jí)市：')
for li in lis:
    if str(li.next) not in listLabels:
        continue
    for link in li.find_all('a'):
        matchStr = _first_cjk_run(link)
        if matchStr is not None:
            city.addCity(_bucket_letter(matchStr), {'name': matchStr})

# Sort and serialize before opening the output file, so a failure in either
# step cannot leave a truncated test.js behind. The result is bound to a new
# name instead of rebinding (and thereby shadowing) the `json` module.
city.sortByValue()
payload = json.dumps(city.arr, ensure_ascii=False)
with codecs.open('test.js', 'w', 'utf-8') as f:
    f.write(payload)

model.py（原文標為 city.js；上面的腳本以 `from model import cities` 匯入這個類）

class cities(object):
    """Index of place entries grouped into named buckets.

    ``arr`` is a list of ``{'name': <label>, 'cities': [<entry>, ...]}``
    dicts. The list handed to the constructor is stored as-is and mutated in
    place by the methods below.
    """

    def __init__(self, arr):
        self.arr = arr

    def _bucketsNamed(self, label):
        """Lazily yield every bucket in ``self.arr`` whose 'name' equals *label*."""
        return (bucket for bucket in self.arr if bucket['name'] == label)

    def addCountry(self, name, city):
        """Add *city* to each bucket called *name*, counties only.

        The entry is appended only when it is not already in the bucket and
        its 'name' contains the character '縣'.
        """
        for bucket in self._bucketsNamed(name):
            entries = bucket['cities']
            if city not in entries and '縣' in city['name']:
                entries.append(city)

    def addCharacter(self, name):
        """Append a new, empty bucket labelled *name*."""
        self.arr.append({'name': name, 'cities': []})

    def addCity(self, name, city):
        """Add *city* to each bucket called *name* unless it is already there."""
        for bucket in self._bucketsNamed(name):
            entries = bucket['cities']
            if city not in entries:
                entries.append(city)

    def sortByValue(self):
        """Sort the entries of every bucket in place by their 'name' value."""
        for bucket in self.arr:
            bucket['cities'].sort(key=lambda entry: entry['name'])

2017年2月6日 14:55