爬虫selenium

以下设置基于 selenium==3.6.0

Selenium官方文档

谷歌浏览器设置

通用部分

1
2
3
4
5
# Shared setup: Selenium webdriver plus the Chrome options helper.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
# chrome_options = webdriver.ChromeOptions()  # equivalent alternative constructor

常用设置

1
2
3
4
5
6
7
8
9
10
11
# Commonly used Chrome options for headless scraping.
# All Chromium command-line switches take a leading "--"; the original
# snippet omitted it on some arguments, which is inconsistent with
# --headless above (ChromeDriver may normalize, but explicit is safer).

# Run Chrome without a visible window (headless mode).
chrome_options.add_argument('--headless')
# Disable GPU hardware acceleration (recommended alongside --headless).
chrome_options.add_argument('--disable-gpu')
# Tell the Blink engine not to load images.
chrome_options.add_argument('--blink-settings=imagesEnabled=false')
# Custom User-Agent.  NOTE(review): `user_agent` must be defined before
# this snippet runs — it is not defined in this article's code.
chrome_options.add_argument('--user-agent=' + user_agent)
# Route all traffic through an HTTP proxy.
chrome_options.add_argument('--proxy-server=192.168.0.28:808')
driver = webdriver.Chrome(executable_path='../phantom/chromedriver',
                          chrome_options=chrome_options)

Chrome可设置的参数列表List of Chromium Command Line Switches

禁止图片加载

1
2
3
4
5
6
# Disable image loading via Chrome content-settings prefs (2 = block).
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(chrome_options=chrome_options)

# Alternatively, use the Blink engine switch:
chrome_options.add_argument('blink-settings=imagesEnabled=false')

代理ip设置

1
2
3
4
5
6
7
8
9
10
11
12
13
# Static proxy IP configured through Chrome desired capabilities.
# Placeholder fixed: the original read "proxy_host:proxy:port" (typo);
# replace with a real "host:port" value before use.
PROXY = "proxy_host:proxy_port"
desired_capabilities = chrome_options.to_capabilities()
desired_capabilities['proxy'] = {
    "httpProxy": PROXY,
    "ftpProxy": PROXY,
    "sslProxy": PROXY,
    "noProxy": None,            # addresses that should bypass the proxy
    "proxyType": "MANUAL",      # use the explicit entries above
    "class": "org.openqa.selenium.Proxy",
    "autodetect": False
}
driver = webdriver.Chrome(desired_capabilities=desired_capabilities)

cookie设置

1
2
3
4
5
6
7
8
# Delete every cookie in the current browser session.
driver.delete_all_cookies()
# Add a cookie so the next navigation carries it.
driver.add_cookie({'name':'ABC','value':'DEF'})
driver.get_cookies()

# Open a new browser window via injected JavaScript.
driver.execute_script('window.open("https://www.baidu.com");')

火狐浏览器设置

通用部分

1
2
3
4
5
6
# Shared setup: Selenium webdriver plus the Firefox options helper.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

firefox_options = Options()
# firefox_options = webdriver.FirefoxOptions()  # equivalent alternative
# profile = webdriver.FirefoxProfile()          # per-profile preferences, used below

常用设置

1
2
3
4
5
# Run Firefox headless; --disable-gpu mirrors the Chrome setup.
firefox_options.add_argument('--headless')
firefox_options.add_argument('--disable-gpu')
# options.add_argument('window-size=1200x600')
# driver = webdriver.Firefox(executable_path='../geckodriver', firefox_options=firefox_options)
driver = webdriver.Firefox(firefox_options=firefox_options)

代理ip设置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

def get_firefox_profile_with_proxy_set(profile, proxy_host):
    """Configure *profile* to route HTTP and SSL traffic through *proxy_host*.

    :param profile: a ``webdriver.FirefoxProfile`` (or any object exposing
        ``set_preference(key, value)``)
    :param proxy_host: proxy address as an ``"ip:port"`` string
    :return: the same *profile*, with proxy preferences applied

    Fix: the original pasted two ``set_preference`` calls onto one line
    (a syntax error) and had lost its indentation.
    """
    proxy_list = proxy_host.split(':')
    agent_ip = proxy_list[0]
    agent_port = proxy_list[1]
    # 1 = manual proxy configuration.
    profile.set_preference('network.proxy.type', 1)

    # Share a single proxy configuration across all protocols.
    profile.set_preference('network.proxy.share_proxy_settings', True)
    profile.set_preference('network.proxy.http', agent_ip)
    profile.set_preference('network.proxy.http_port', int(agent_port))
    profile.set_preference('network.proxy.ssl', agent_ip)
    profile.set_preference('network.proxy.ssl_port', int(agent_port))
    # localhost must bypass the proxy, otherwise Selenium cannot talk to
    # the local webdriver server.
    profile.set_preference('network.proxy.no_proxies_on', 'localhost,127.0.0.1')
    profile.set_preference('network.http.use-cache', False)

    return profile


# Build a profile, apply the proxy settings, and launch Firefox with it.
profile = webdriver.FirefoxProfile()
profile = get_firefox_profile_with_proxy_set(profile, '10.12.11.110:8088')
driver = webdriver.Firefox(firefox_profile=profile)

userAgent设置

1
2
3
# Override Firefox's User-Agent via a profile preference.
profile = webdriver.FirefoxProfile()
# NOTE(review): `user_agent` must be defined before this snippet runs.
profile.set_preference("general.useragent.override", user_agent)
driver = webdriver.Firefox(firefox_profile=profile)

PhantomJS设置

参考链接github

通用部分

1
2
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

常用设置

1
2
3
4
5
6
# Override PhantomJS's default User-Agent via desired capabilities.
dcap = DesiredCapabilities.PHANTOMJS.copy()
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
)
driver = webdriver.PhantomJS(desired_capabilities=dcap)

headers和代理ip设置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Custom request headers plus a SOCKS5 proxy for PhantomJS.
# Fix: the for-loop body had lost its indentation (syntax error).
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Connection': 'keep-alive'
}
dcap = DesiredCapabilities.PHANTOMJS.copy()

# Map each header into PhantomJS's customHeaders capability namespace.
for key, value in headers.items():
    dcap['phantomjs.page.customHeaders.{}'.format(key)] = value

# Proxy flags are passed to the PhantomJS binary on its command line.
service_args = [
    '--proxy=127.0.0.1:9999',
    '--proxy-type=socks5',
]
driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)

代理ip设置

方式一:

1
2
3
4
5
# Method 1: pass proxy flags directly to the PhantomJS binary.
service_args = [
'--proxy=127.0.0.1:9999',
'--proxy-type=socks5',
]
driver = webdriver.PhantomJS(service_args=service_args)

方式二:

1
2
3
4
5
6
7
8
9
10
11
12
# Method 2: configure the proxy through Selenium's Proxy helper.
proxy = Proxy({
'proxyType': ProxyType.MANUAL,
'httpProxy': '101.236.23.202:8866'
})
# Build a fresh desired-capabilities dict.
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
# Merge the proxy settings into the capabilities.
proxy.add_to_capabilities(desired_capabilities)
driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
# A new session must be started for changed settings to take effect.
driver.start_session(desired_capabilities)
driver.get('http://xxxx')

userAgent设置

方式一:

1
# Method 1: mutate the shared default capabilities — affects every
# PhantomJS driver created afterwards in this process.
webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

方式二:

1
2
3
4
# Method 2: set the User-Agent on a private copy of the capabilities,
# leaving the shared defaults untouched.
dcap = DesiredCapabilities.PHANTOMJS.copy()
dcap['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

driver = webdriver.PhantomJS(desired_capabilities=dcap)

禁用图片设置

1
2
3
4
5
6
# Disable image loading in PhantomJS.
dcap = DesiredCapabilities.PHANTOMJS.copy()

# Skipping images makes page crawls much faster.
dcap["phantomjs.page.settings.loadImages"] = False

driver = webdriver.PhantomJS(desired_capabilities=dcap)

PhantomJS 代理ip动态切换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.support.wait import WebDriverWait
import time


def selenium_ip(ip):
    """Demo: switch the PhantomJS proxy IP at runtime via start_session.

    Fixes over the original: the docstring used full-width CJK quotes
    (a syntax error), ``start_session`` was called with the undefined
    name ``desired_capabilities`` instead of ``dcap`` (NameError), and
    the indentation had been stripped.

    NOTE(review): the ``ip`` parameter is unused — the demo uses
    hard-coded proxy addresses; kept for interface compatibility.
    """
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': '101.236.23.202:8866'
    })
    dcap = DesiredCapabilities.PHANTOMJS.copy()
    proxy.add_to_capabilities(dcap)
    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    # A new session is required for the capabilities to take effect.
    driver.start_session(dcap)
    driver.get('http://www.baidu.com')
    time.sleep(10)
    print(driver.get_cookies())
    print(driver.session_id)

    print('------------切换ip------------')
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': '101.236.35.98:8866'
    })
    dcap = DesiredCapabilities.PHANTOMJS.copy()
    proxy.add_to_capabilities(dcap)
    # Start a new session with the replacement proxy.
    driver.start_session(dcap)
    driver.get('http://www.baidu.com')
    time.sleep(10)
    print(driver.get_cookies())
    print(driver.session_id)

    print('------------还原为系统代理------------')
    # ProxyType.DIRECT = connect directly, no proxy.
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.DIRECT
    dcap = DesiredCapabilities.PHANTOMJS.copy()
    proxy.add_to_capabilities(dcap)
    driver.start_session(dcap)
    driver.get('http://xxxx')

结合scrapy使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from scrapy.http import HtmlResponse
from logging import getLogger
import requests

class TMTTRandomSpiderMiddleware(object):
    """Scrapy downloader middleware that renders pages with PhantomJS
    behind a randomly chosen proxy IP fetched from a proxy-pool service.

    Fixes over the original: restored indentation, removed the mutable
    default argument ``service_args=[]``, imported the missing ``signals``
    name in ``from_crawler`` (previously a NameError), and made sure
    ``self.browser``/``self.wait`` exist even when no proxy is available
    at construction time (previously a later AttributeError).
    """

    def __init__(self, proxy_url, timeout=60, service_args=None, max_try_count=3):
        """
        :param proxy_url: URL of the proxy-pool endpoint returning "ip:port"
        :param timeout: page-load / explicit-wait timeout in seconds
        :param service_args: extra command-line args for the PhantomJS binary
        :param max_try_count: how many times to retry loading a page
        """
        self.logger = getLogger(__name__)
        self.proxy_url = proxy_url
        self.timeout = timeout
        self.max_try_count = max_try_count
        # Never use a mutable default argument; normalize None here.
        if service_args is None:
            service_args = []
        dcap = DesiredCapabilities.PHANTOMJS.copy()
        random_ip = self.get_random_proxy()
        if random_ip:
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': 'https://{}'.format(random_ip)
            })
            # Merge the proxy settings into the capabilities.
            proxy.add_to_capabilities(dcap)
        # Create the browser unconditionally so self.browser and self.wait
        # always exist, even when the proxy pool was unreachable.
        self.browser = webdriver.PhantomJS(desired_capabilities=dcap,
                                           service_args=service_args)
        # self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def spider_closed(self):
        """Shut the browser down when the spider closes."""
        self.browser.close()
        self.browser.quit()
        self.logger.info('browser quit')
        print('browser quit')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings and hook spider_closed."""
        # Imported locally; the original referenced `signals` without any
        # import, which raised NameError at runtime.
        from scrapy import signals
        s = cls(
            proxy_url=crawler.settings.get('PROXY_URL'),
            timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
            service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'),
            # NOTE(review): 'MAX_TYR_COUNT' looks like a typo of
            # MAX_TRY_COUNT, but it is kept as-is because it must match
            # the key used in the project's settings file.
            max_try_count=crawler.settings.get('MAX_TYR_COUNT'),
        )
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def get_random_proxy(self):
        """Fetch one "ip:port" proxy string from the proxy pool.

        :return: the proxy string on success, otherwise None/False.
        """
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError:
            return False
        return None  # explicit: a non-200 response yields no proxy

    def execute_changeip(self):
        """Switch the running browser to a freshly fetched proxy IP."""
        random_ip = self.get_random_proxy()
        if random_ip:
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': 'https://{}'.format(random_ip)
            })
            dcap = DesiredCapabilities.PHANTOMJS.copy()
            # Merge the proxy settings into the capabilities.
            proxy.add_to_capabilities(dcap)
            # Start a new session so the proxy change takes effect.
            self.browser.start_session(dcap)
            self.logger.info('PhantomJS execute_changeip: {}'.format(random_ip))

    def process_request(self, request, spider):
        """
        Render the page with PhantomJS.

        :param request: Request object; must carry meta['located'] — the id
            of the element whose presence marks a successful load
        :param spider: Spider object
        :return: HtmlResponse on success/failure, or None to let the next
            middleware handle the request
        """
        located = request.meta.get('located')
        if not located:
            # No marker element configured — defer to the next handler.
            return None
        self.logger.info('PhantomJS is Starting Request: {}'.format(request.url))
        try_count = 0
        random_ip = self.get_random_proxy()
        if random_ip:
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': 'https://{}'.format(random_ip)
            })
            dcap = DesiredCapabilities.PHANTOMJS.copy()
            # Merge the proxy settings into the capabilities.
            proxy.add_to_capabilities(dcap)
            # Start a new session so this request uses the fresh proxy.
            self.browser.start_session(dcap)
            self.logger.info('PhantomJS execute_changeip: {}'.format(random_ip))
        while try_count < self.max_try_count:
            try_count += 1
            self.logger.info('PhantomJS is try_count: {}'.format(try_count))
            self.browser.get(request.url)
            try:
                locator = (By.ID, located)
                self.wait.until(EC.presence_of_element_located(locator))
                try_count = self.max_try_count
                return HtmlResponse(url=request.url, body=self.browser.page_source,
                                    request=request, encoding='utf-8', status=200)
            except TimeoutException:
                if try_count == self.max_try_count:
                    # self.execute_changeip()
                    self.logger.info('PhantomJS is try_count Failed')
                    return HtmlResponse(url=request.url, status=500, request=request)

常用useragent

chrome

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",

Safari

1
2
3
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",

Firefox

1
2
3
4
5
6
"Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"

以上内容参考文章:

python爬虫之如何随机更换User-Agent

Selenium Python3 请求头配置

selenium设置proxy、headers(phantomjs、Chrome、Firefox)

stackoverflow

知乎

github