# -*- coding: utf-8 -*-
import time
import json
import tempfile
from PIL import Image
from selenium import webdriver
# 要想調(diào)用鍵盤按鍵操作需要引入keys包
from selenium.webdriver.common.keys import Keys
class CreditSpider(object):
    """Interactive Selenium/PhantomJS scraper for the credit query page.

    The flow started by ``get_credit`` is: load the page, click through to a
    result page, screenshot it, crop the captcha region to ``codeImage.png``,
    ask the operator to type the captcha on stdin, submit it, then read the
    result table cells and print them as a list of dicts.
    """

    def __init__(self):
        """Start a PhantomJS browser session with a fixed window size."""
        # Uses the PhantomJS binary found via the environment (PATH). If it
        # is not on PATH, pass ``executable_path=...`` to PhantomJS() instead.
        self.driver = webdriver.PhantomJS()
        # Fixed window size; presumably this keeps the page layout stable so
        # the absolute crop coordinates in jietu_img() hit the captcha image.
        self.driver.set_window_size(1366, 1098)

    def get_credit(self):
        """Load the query page and process result pages starting at step 1."""
        url = "http://hd.chinatax.gov.cn/fagui/action/InitCredit.do"
        # get() waits for the page load; the extra fixed pause is kept from
        # the original to give the page more time before interacting with it.
        self.driver.get(url)
        time.sleep(2)
        self.on_click(1)

    def on_click(self, n):
        """Run the click/screenshot/captcha/parse cycle for steps ``n`` to 2.

        Step 1 clicks the link whose ``onclick`` is
        ``changeParam('articleField01','')``; any later step clicks the
        "next page" control instead.

        Args:
            n: Step number to start from; the loop stops once it reaches 3.
        """
        while n < 3:
            if n == 1:
                # The XPath must start with "//" (descendant axis); a
                # "http://" prefix is not valid XPath.
                self.driver.find_element_by_xpath(
                    """//a[@onclick="changeParam('articleField01','')"]"""
                ).click()
            else:
                # NOTE(review): this title is written in Traditional Chinese;
                # confirm it matches the attribute actually served by the
                # page (a Simplified Chinese title would be spelled
                # differently and would not match).
                self.driver.find_element_by_xpath('//*[@title="下一頁"]').click()
            time.sleep(2)
            # Full-page snapshot; jietu_img() crops the captcha out of it.
            self.driver.save_screenshot("yzm.png")
            self.jietu_img(n)
            self.click_yzm(n)
            n += 1

    def click_yzm(self, n):
        """Ask the operator for the captcha, submit it and parse the result.

        Args:
            n: Current step number, passed through to ``parse_page``.
        """
        print('the path is {}'.format(n))
        element = self.driver.find_element_by_id("verifyCode")
        element.clear()
        # The captcha is solved by a human looking at codeImage.png.
        yzm = input("please input code: ")
        element.send_keys(yzm)
        # Confirm button of the captcha dialog.
        self.driver.find_element_by_xpath(
            """//*[@id="layui-layer1"]/div[3]/a[1]"""
        ).click()
        time.sleep(4)
        self.parse_page(n)

    def parse_page(self, n):
        """Collect the text of every result cell and hand it to ``save_res``.

        When no result cells are found, step ``n`` is run again through
        ``on_click`` and nothing is saved for this attempt. Any exception is
        printed rather than raised, so one failing page does not abort the
        run.

        Args:
            n: Current step number, used for the retry.
        """
        try:
            cells = self.driver.find_elements_by_xpath(
                '//td[@class="sv_hei"]//tr/td'
            )
            if not cells:
                # Presumably the captcha was rejected: repeat this step and
                # stop here, so that save_res() is not called with an empty
                # list.
                # NOTE(review): the retry is recursive, so when it returns
                # the outer on_click() loop still continues from n + 1 and
                # may repeat later steps.
                self.on_click(n)
                return
            self.save_res([cell.text for cell in cells])
        except Exception as e:
            print(e)

    def save_res(self, res_list):
        """Group the scraped cell texts into records and print them.

        The first three entries and the last entry of ``res_list`` are
        skipped (presumably header cells and a trailing pager/footer cell);
        the remaining entries are read three at a time as ``NSSBH``,
        ``NSRMC`` and ``YEAR``. An incomplete trailing group is ignored, and
        a list that is too short yields an empty result instead of raising.
        ``res_list`` itself is left unmodified.

        Args:
            res_list: Flat list of cell texts in document order.
        """
        # Slice instead of ``del`` so the caller's list is untouched and
        # short inputs cannot raise IndexError.
        rows = res_list[3:-1]
        cont_list = [
            {
                'NSSBH': rows[i],
                'NSRMC': rows[i + 1],
                'YEAR': rows[i + 2],
            }
            # Stop two short of the end so only complete triples are read.
            for i in range(0, len(rows) - 2, 3)
        ]
        # Records are only printed; nothing is written to disk here.
        print(cont_list)

    def jietu_img(self, n):
        """Crop the captcha area of ``yzm.png`` and save it as ``codeImage.png``.

        Args:
            n: Current step number; step 1 uses a crop box two pixels higher
                than the later steps.
        """
        # Box is (start x, start y, end x, end y) in screenshot pixels.
        bbox = (531, 510, 731, 560) if n == 1 else (531, 512, 731, 562)
        # The context manager closes the screenshot file once the cropped
        # region has been written out.
        with Image.open('yzm.png') as png:
            region = png.crop(bbox)  # region is a new image object
            # NOTE(review): purpose of this pause is unclear; kept as in the
            # original.
            time.sleep(1)
            region.save('codeImage.png')
if __name__ == '__main__':
    # Run the scraper once and report the elapsed wall-clock time in seconds.
    started = time.time()
    spider = CreditSpider()
    try:
        spider.get_credit()
    finally:
        # Always end the browser session, even when scraping fails, so the
        # PhantomJS process is not left running in the background.
        spider.driver.quit()
    print(time.time() - started)
報錯:
Traceback (most recent call last):
File "credit_spider.py", line 122, in <module>
a.get_credit()
File "credit_spider.py", line 31, in get_credit
self.on_click(1)
File "credit_spider.py", line 36, in on_click
self.driver.find_element_by_xpath("""//a[@onclick="changeParam('articleField01','')"]""").click()
File "/root/anaconda3/lib/python3.6/site-packages/selenium/webdriver/remote/webelement.py", line 80, in click
self._execute(Command.CLICK_ELEMENT)
File "/root/anaconda3/lib/python3.6/site-packages/selenium/webdriver/remote/webelement.py", line 501, in _execute
return self._parent.execute(command, params)
File "/root/anaconda3/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 308, in execute
self.error_handler.check_response(response)
File "/root/anaconda3/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py", line 194, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementNotVisibleException: Message: {"errorMessage":"Element is not currently visible and may not be manipulated","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"81","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:48854","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"id\": \":wdc:1531296937655\", \"sessionId\": \"9443d970-84e2-11e8-9955-092f7358cfea\"}","url":"/click","urlParsed":{"anchor":"","query":"","file":"click","directory":"/","path":"/click","relative":"/click","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/click","queryKey":{},"chunks":["click"]},"urlOriginal":"/session/9443d970-84e2-11e8-9955-092f7358cfea/element/:wdc:1531296937655/click"}}
Screenshot: available via screen
這是什么原因?求大神指導,謝謝了!!
運行了下你的代碼使用 Chrome 運行也報錯, 找不到標簽
你貼出來的錯誤也是找不到標簽, 建議使用 chrome 運行一次,如果 chrome 運行沒問題
改回PhantomJS 運行還報錯,就把 page_source 保存下來看看獲取的頁面結(jié)果
# Grab the page HTML and save it to a file so it can be inspected.
result = driver.page_source
with open('tmp.html', 'w', encoding='utf-8') as f:
    f.write(result)
剛好前兩天整理的, Linux下跑 PhantomJS... PhantomJS+Selenium爬取數據
北大青鳥APTECH成立于1999年。依托北京大學優質雄厚的教育資源和背景,秉承“教育改變生活”的發展理念,致力于培養中國IT技能型緊缺人才,是大數據專業的國家
達內教育集團成立于2002年,是一家由留學海歸創辦的高端職業教育培訓機構,是中國一站式人才培養平臺、一站式人才輸送平臺。2014年4月3日在美國成功上市,融資1
北大課工場是北京大學校辦產業為響應國家深化產教融合/校企合作的政策,積極推進“中國制造2025”,實現中華民族偉大復興的升級產業鏈。利用北京大學優質教育資源及背
博為峰,中國職業人才培訓領域的先行者
曾工作于聯想擔任系統開發工程師,曾在博彥科技股份有限公司擔任項目經理從事移動互聯網管理及研發工作,曾創辦藍懿科技有限責任公司從事總經理職務負責iOS教學及管理工作。
浪潮集團項目經理。精通Java與.NET 技術, 熟練的跨平臺面向對象開發經驗,技術功底深厚。 授課風格 授課風格清新自然、條理清晰、主次分明、重點難點突出、引人入勝。
精通HTML5和CSS3;Javascript及主流js庫,具有快速界面開發的能力,對瀏覽器兼容性、前端性能優化等有深入理解。精通網頁制作和網頁游戲開發。
具有10 年的Java 企業應用開發經驗。曾經歷任德國Software AG 技術顧問,美國Dachieve 系統架構師,美國AngelEngineers Inc. 系統架構師。