解决 webdriver 与 chrome 版本不匹配的问题
爬虫三大神器工具包:
from selenium import webdriver
from lxml import etree
import requests
常见的场景:
- 用 selenium 来操作登录,获取 cookie
- 解析 cookie
- 用 requests 携带 cookie 发起请求
无头模式
# Headless mode: pass the '--headless' switch to Chrome through ChromeOptions.
options = webdriver.ChromeOptions()
options.add_argument('--headless')

# Start Chrome with those options, driven by a specific chromedriver binary.
browser = webdriver.Chrome(
    options=options,
    executable_path='/usr/local/bin/chromedriver95',
)
from selenium import webdriver

# Minimal smoke test: start Chrome through an explicitly chosen chromedriver
# binary, load a page, then shut everything down.
# (Alternative: webdriver.Chrome() with no arguments, which does not pin the
# chromedriver path.)
browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver95')
try:
    browser.get('https://baidu.com')
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and the finally block guarantees it runs even if the page load fails.
    browser.quit()
from selenium import webdriver
from selenium.webdriver.common.by import By

# Locate the same element (id="su") in three different ways and print the
# results: by id, by CSS selector, and by XPath.
browser = webdriver.Chrome()
try:
    browser.get('https://baidu.com')

    # find_element(By.<strategy>, value) replaces the deprecated
    # find_element_by_* helpers and returns a single WebElement.
    commitBtn = browser.find_element(By.ID, 'su')
    commitBtn2 = browser.find_element(By.CSS_SELECTOR, '#su')
    # find_elements (plural) returns a list of every match.
    commitBtn3 = browser.find_elements(By.XPATH, '//*[@id="su"]')

    print(commitBtn)
    print(commitBtn2)
    print(commitBtn3)
finally:
    # Always end the WebDriver session, even when a lookup above raises.
    browser.quit()
<selenium.webdriver.remote.webelement.WebElement (session="a6a468ca0a719bcbb16b36faa98909c6", element="9648feb9-6848-4779-80c3-ba52422613a7")>
<selenium.webdriver.remote.webelement.WebElement (session="a6a468ca0a719bcbb16b36faa98909c6", element="9648feb9-6848-4779-80c3-ba52422613a7")>
[<selenium.webdriver.remote.webelement.WebElement (session="a6a468ca0a719bcbb16b36faa98909c6", element="9648feb9-6848-4779-80c3-ba52422613a7")>]
from selenium import webdriver
from selenium.webdriver.common.by import By

# Collect the text of every <span class="title-content-title"> on the Baidu
# home page using Selenium's own element lookup.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com/')
    lis2 = browser.find_elements(
        By.XPATH, '//span[@class="title-content-title"]')
    # Read .text while the session is still alive; the result is stored in
    # `titles` rather than `list` so the builtin list type is not shadowed.
    titles = [element.text for element in lis2]
    print(titles)
finally:
    # Always end the WebDriver session, even when the lookup above raises.
    browser.quit()
['菅义伟退选 谁会成日本下任首相', '日媒:菅义伟继任者或为岸田文雄', '入学要求房产套内面积大于60平', '日本科学家藤岛昭加入上海理工', '黑龙江查寝干部学姐出面回应', '汶川地震失去右腿女孩残奥夺金']
from selenium import webdriver
from lxml import etree

# Use Selenium only to obtain the page's HTML, then extract the
# <span class="title-content-title"> texts offline with lxml.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com/')
    html = browser.page_source
finally:
    # The browser is no longer needed once the HTML string is captured.
    browser.quit()

# Parse the captured HTML and pull out the text nodes of the title spans.
source = etree.HTML(html)
# Stored in `titles` rather than `list` so the builtin is not shadowed.
titles = source.xpath('//span[@class="title-content-title"]/text()')
print(titles)
['菅义伟退选 谁会成日本下任首相', '日媒:菅义伟继任者或为岸田文雄', '入学要求房产套内面积大于60平', '日本科学家藤岛昭加入上海理工', '黑龙江查寝干部学姐出面回应', '汶川地震失去右腿女孩残奥夺金']
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Type a query into the search field (id="kw"), wait, then click the
# submit button (id="su").
browser = webdriver.Chrome()
try:
    browser.get('https://baidu.com')
    commitBtn2 = browser.find_element(By.CSS_SELECTOR, '#su')
    # Named `search_box` rather than `input` so the builtin is not shadowed.
    search_box = browser.find_element(By.XPATH, '//*[@id="kw"]')
    search_box.send_keys('这是一个输入内容测试')
    # Wait 3 seconds before clicking the submit button.
    time.sleep(3)
    commitBtn2.click()
finally:
    # Always end the WebDriver session, even when a step above raises.
    browser.quit()
import time

import pandas as pd
from selenium import webdriver

# Shared headless Chrome instance used by get_content() below.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(
    options=options,
    executable_path='/usr/local/bin/chromedriver95',
)
# Element lookups on this driver will retry for up to 30 seconds.
browser.implicitly_wait(30)
def get_content() -> pd.DataFrame:
    """Scrape the Weibo real-time hot-search table with the shared browser.

    Loads the hot-search page in the module-level ``browser``, waits, and
    parses the first HTML table of the rendered page with pandas.

    Returns:
        A DataFrame holding rows 1 through 10 of that table, restricted to
        the '序号' and '关键词' columns.

    Note:
        The module-level ``browser`` is shut down before returning (also on
        failure), so a new driver must be created before calling this again.
    """
    # Function-scope import keeps this block self-contained.
    from io import StringIO

    print('----------- 正在爬取微博热搜数据数据 -------------')
    url = 'https://s.weibo.com/top/summary?cate=realtimehot&sudaref=s.weibo.com&display=0&retcode=6102'
    try:
        browser.get(url)
        # NOTE(review): fixed 10 s pause, presumably to let the page finish
        # rendering/redirecting before the source is read -- confirm.
        time.sleep(10)
        html = browser.page_source
    finally:
        # Release the browser and driver process even if loading fails.
        browser.quit()

    # Newer pandas deprecates passing a literal HTML string to read_html;
    # wrapping it in StringIO works on old and new versions alike.
    tables = pd.read_html(StringIO(html))
    hot_search = tables[0]
    # Keep rows at positions 1..10 (row 0 is skipped) and the two columns
    # of interest.
    df = hot_search[1:11][['序号', '关键词']]
    return df
# Run the scraper. As the last expression of a notebook cell, the returned
# DataFrame is what gets rendered as the cell output.
get_content()
----------- 正在爬取微博热搜数据数据 -------------
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| | 序号 | 关键词 |
|---|---|---|
1 | 1.0 | 双十一也是人民空军生日 2200001 |
2 | 2.0 | 72秒致敬人民空军 1806154 |
3 | 3.0 | 双十一成交额 1571659 |
4 | 4.0 | 西安90后女孩10元买彩票中1千万 942009 |
5 | 5.0 | 双十一 897841 |
6 | 6.0 | 被董洁的一句再见整破防了 剧集 846109 |
7 | 7.0 | 小学生打119骂人消防员找上门送道德经 624882 |
8 | 8.0 | 潘粤明现场追星鬼吹灯作者 综艺 542251 |
9 | 9.0 | 空军战机20秒变装秀 534718 |
10 | 10.0 | 美国当心切香肠切了自己的手 476593 |
# ----- Work around page loads that block the script -----
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# No copy is made: the assignment writes into the very dict object that
# DesiredCapabilities.CHROME refers to, and `desired_capabilities` is just
# another name for that same object.
# NOTE(review): presumably the intent is that Chrome drivers created after
# this point pick up pageLoadStrategy "none" -- confirm against the Selenium
# version in use.
DesiredCapabilities.CHROME["pageLoadStrategy"] = "none"
desired_capabilities = DesiredCapabilities.CHROME
# ----- End of page-load workaround -----