HTTP proxies are one of the key tools for working around anti-scraping measures. The sections below show how to use proxies in different scenarios and how to validate them.
Using proxies with requests
- Proxy without authentication
import requests
proxy1 = {'http': 'http://117.90.51.49:42668', 'https': 'http://117.90.51.49:42668'}
resp = requests.get('http://httpbin.org/ip', proxies=proxy1)
print(resp.json())
- Proxy requiring authentication
import requests
proxy1 = {'http': 'http://user:passwd@106.15.95.236:9187', 'https': 'http://user:passwd@106.15.95.236:9187'}
resp = requests.get('http://httpbin.org/ip', proxies=proxy1)
print(resp.json())
# {'origin': '101.47.19.29, 106.15.95.236'}
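Beyond per-request proxies, requests also honors a session-level proxies attribute and the standard HTTP_PROXY/HTTPS_PROXY environment variables; a brief sketch reusing the placeholder proxy above:
import os
import requests

# Session-level: every request made through this session goes via the proxy
session = requests.Session()
session.proxies = {'http': 'http://user:passwd@106.15.95.236:9187',
                   'https': 'http://user:passwd@106.15.95.236:9187'}
print(session.get('http://httpbin.org/ip').json())

# Environment variables: requests reads HTTP_PROXY/HTTPS_PROXY
# when trust_env is enabled (the default)
os.environ['HTTP_PROXY'] = 'http://user:passwd@106.15.95.236:9187'
print(requests.get('http://httpbin.org/ip').json())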
Using proxies with selenium + a browser
PhantomJS
Note: PhantomJS support was deprecated in Selenium 3.8 and removed in Selenium 4, so the snippet below assumes an older Selenium release.
## selenium + PhantomJS with a proxy
from selenium import webdriver

driver = webdriver.PhantomJS(
    # executable_path='/usr/local/bin/phantomjs',
    service_args=[
        '--ignore-ssl-errors=true',
        '--proxy=106.15.95.236:9187',  # IP:port
        '--proxy-type=http',
        '--proxy-auth=user:passwd',    # add this line if the proxy requires authentication
    ])
url = 'http://httpbin.org/ip'
driver.get(url)
html = driver.page_source
html
driver.quit()
Chrome
- Proxy without authentication
## selenium + Chrome with a proxy
# Reference: https://blog.csdn.net/zwq912318834/article/details/78933910
# Make sure both Chrome and chromedriver are up to date
import os
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# options.set_headless(headless=True)  # older API, equivalent to --headless
options.add_argument('--lang=zh-CN')
options.add_argument('--ignore-certificate-errors')  # Chrome flag replacing PhantomJS's --ignore-ssl-errors
options.add_argument('--proxy-server=http://218.66.151.87:31868')
driver = webdriver.Chrome(
    executable_path=os.path.expanduser('~/tools/chromedriver'),  # '~' is not expanded by Selenium itself
    chrome_options=options)
url = 'http://httpbin.org/ip'
driver.get(url)
html = driver.page_source
html
driver.quit()
#driver.close()
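Chrome renders the httpbin JSON wrapped in HTML, so a small helper (hypothetical, not in the original) can extract the echoed origin IP from page_source to confirm the proxy is actually in use:
import json
import re

def origin_ip(page_source):
    """Pull the JSON body httpbin returns out of the HTML Chrome renders."""
    m = re.search(r'\{.*\}', page_source, re.S)
    return json.loads(m.group(0))['origin'] if m else None

# print(origin_ip(html))  # should print the proxy's IP, not your own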
- Proxy requiring authentication
In Chrome, a proxy that requires authentication has to be configured through an extension.
# Verification: failed
import os
import re
import time
import zipfile
from selenium import webdriver

# Chrome proxy-auth extension template: https://github.com/revotu/selenium-chrome-auth-proxy
CHROME_PROXY_HELPER_DIR = 'chrome-proxy-helper'
# Directory for the generated per-proxy Chrome extension files
CUSTOM_CHROME_PROXY_EXTENSIONS_DIR = 'chrome-proxy-extensions'

def get_chrome_proxy_extension(proxy):
    """Build a Chrome extension (zip) configured with the given proxy
    (with username/password authentication).
    proxy - the proxy to use, in the form: username:password@ip:port
    """
    m = re.compile(r'([^:]+):([^@]+)@([\d\.]+):(\d+)').search(proxy)
    if m:
        # Pull out the individual proxy parameters
        username = m.groups()[0]
        password = m.groups()[1]
        ip = m.groups()[2]
        port = m.groups()[3]
        # Create a customized Chrome proxy extension (zip file)
        if not os.path.exists(CUSTOM_CHROME_PROXY_EXTENSIONS_DIR):
            os.mkdir(CUSTOM_CHROME_PROXY_EXTENSIONS_DIR)
        extension_file_path = os.path.join(
            CUSTOM_CHROME_PROXY_EXTENSIONS_DIR,
            '{}.zip'.format(proxy.replace(':', '_')))
        if not os.path.exists(extension_file_path):
            # The extension file does not exist yet; create it
            zf = zipfile.ZipFile(extension_file_path, mode='w')
            zf.write(os.path.join(CHROME_PROXY_HELPER_DIR, 'manifest.json'), 'manifest.json')
            # Substitute the proxy parameters into the template
            background_content = open(os.path.join(CHROME_PROXY_HELPER_DIR, 'background.js')).read()
            background_content = background_content.replace('%proxy_host', ip)
            background_content = background_content.replace('%proxy_port', port)
            background_content = background_content.replace('%username', username)
            background_content = background_content.replace('%password', password)
            zf.writestr('background.js', background_content)
            zf.close()
        return extension_file_path
    else:
        raise Exception('Invalid proxy format. Should be username:password@ip:port')

if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    # Add the custom proxy extension (configured with username/password auth)
    options.add_extension(get_chrome_proxy_extension(proxy='username:password@ip:port'))
    driver = webdriver.Chrome(chrome_options=options)
    # Visit an IP-echo site to check whether the proxy took effect
    driver.get('http://httpbin.org/ip')
    print(driver.page_source)
    time.sleep(15)
    driver.quit()
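Since the extension approach above is marked as failed, one alternative worth noting (not from the original text) is the selenium-wire package, which intercepts browser traffic itself and supports an upstream proxy with credentials; a minimal sketch, assuming selenium-wire is installed and using placeholder credentials:
# pip install selenium-wire
from seleniumwire import webdriver  # drop-in wrapper around selenium's webdriver

seleniumwire_options = {
    'proxy': {
        'http': 'http://username:password@ip:port',
        'https': 'http://username:password@ip:port',
        'no_proxy': 'localhost,127.0.0.1',
    }
}
driver = webdriver.Chrome(seleniumwire_options=seleniumwire_options)
driver.get('http://httpbin.org/ip')
print(driver.page_source)
driver.quit()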
Using proxies in practice
Since no proxy is valid 100% of the time, in practice you need to maintain a proxy pool and cull it based on observed validity. The following uses the response from the Mogu proxy (蘑菇代理) API as an example.
import requests
IP_proxy = 'http://mvip.piping.mogumiao.com/proxy/api/get_ip_bs?appKey=4162021207ase8293a0a2d853bf27fd12&count=5&expiryDate=0&format=1'
ip_pool = requests.get(url=IP_proxy)
ip_pool = ip_pool.json()
ips = ip_pool['msg']
ips
# [{'ip': '60.184.203.95', 'port': '40887'},
# {'ip': '115.217.165.163', 'port': '43841'},
# {'ip': '113.93.103.139', 'port': '36542'},
# {'ip': '180.116.154.245', 'port': '34463'},
# {'ip': '117.69.200.122', 'port': '43095'}]
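Each entry carries only ip and port; a small helper (hypothetical name, not part of the API) converts one entry into the proxies mapping that requests expects:
def to_proxies(entry):
    """Convert one {'ip': ..., 'port': ...} entry into a requests proxies dict."""
    addr = 'http://{}:{}'.format(entry['ip'], entry['port'])
    return {'http': addr, 'https': addr}

# Example: route a request through the first proxy in the pool
# requests.get('http://httpbin.org/ip', proxies=to_proxies(ips[0]), timeout=2)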
Continuously fetching proxies from the API
Since the Mogu proxy API can be unstable or rate-limited, the following helper retries the call several times.
import time

def ips_proxy(trys=20, sleeps=2):
    """Poll the proxy API up to `trys` times, waiting `sleeps` seconds between failed attempts."""
    try_num = 1
    ipros = None
    while try_num <= trys:
        try:
            try_num = try_num + 1
            ip_pool = requests.get(url=IP_proxy)
            ip_pool = ip_pool.json()
            # print(ip_pool)
            if ip_pool['code'] == '0':  # the API returns the code as a string
                ipros = ip_pool['msg']
                break
            else:
                time.sleep(sleeps)
                continue
        except:
            time.sleep(sleeps)
    return ipros
ips_proxy(2)
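ips_proxy returns None when every attempt fails, so callers should guard against an empty result before extending the pool, e.g.:
ips = ips_proxy(trys=5)
if ips is None:
    raise RuntimeError('could not fetch any proxies from the API')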
Validating proxy availability
import telnetlib

## Check whether an HTTP proxy is reachable at ip:port
# (a plain TCP connect test; it does not prove the proxy actually relays traffic)
def test_IP(ip, port):
    try:
        telnetlib.Telnet(ip, port=port, timeout=1)
    except:
        return False
    else:
        return True

# Filter unreachable proxies out of the pool
def filter_validIP(ips):
    res = []
    for i in ips:
        ip = i['ip']
        port = i['port']
        if test_IP(ip, port):
            res.append(i)
    return res
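The telnet check above only proves the port accepts connections; a stricter, slower check (a sketch, not in the original) sends a real request through the proxy and verifies the response:
import requests

def test_IP_http(ip, port, timeout=3):
    """Return True if an actual HTTP request succeeds through the proxy."""
    addr = 'http://{}:{}'.format(ip, port)
    try:
        r = requests.get('http://httpbin.org/ip',
                         proxies={'http': addr, 'https': addr},
                         timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False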
Updating the proxy pool
Assuming ips is the in-memory proxy-pool variable, it can be updated in the course of making requests, based on request latency and the returned results.
header_code = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive'
}
import random

def get_html_withproxy(url, trys=10):
    """Fetch url through a random proxy from the pool, evicting proxies that fail."""
    try_num = 1
    global ips
    while try_num <= trys:
        try:
            # Pick a random proxy from the pool
            randi = random.sample(range(0, len(ips)), 1)
            ip_t = ips[randi[0]]
            ip = ip_t['ip']
            port = ip_t['port']
            proxies = ip + ':' + port
            proxys = {'http': 'http://' + proxies, 'https': 'http://' + proxies}
            u1 = requests.get(url, headers=header_code, proxies=proxys, timeout=2)
        except:
            # The request failed: drop this proxy and try another one
            if ips:
                ips.pop(randi[0])
            try_num = try_num + 1
            if len(ips) < 3:  # replenish the pool when it runs low
                new_ips = ips_proxy()
                if new_ips:
                    ips.extend(new_ips)  # extend() mutates in place; never assign its None return value
        else:
            break
    if try_num <= trys:
        return u1
    else:
        return None
ip_pool = requests.get(url=IP_proxy)
ip_pool = ip_pool.json()
ips = ip_pool['msg']
# ips = filter_validIP(ips)
## Hot-selling products page
url = 'http://www.huaxintrust.com/productlist.asp?page=1&pid=1'
rvt = get_html_withproxy(url)
rvt
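get_html_withproxy returns None once all attempts are exhausted, so check the result before using it:
if rvt is not None:
    print(rvt.status_code)
    print(rvt.text[:200])  # first part of the page
else:
    print('all proxy attempts failed')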