HTTP proxies are one of the key tools for working around anti-scraping measures. The sections below show how to use proxies in different scenarios and how to validate them.
Using proxies with requests
- Proxy without authentication
import requests
proxy1 = {'http': 'http://117.90.51.49:42668', 'https': 'http://117.90.51.49:42668'}
resp = requests.get('http://httpbin.org/ip', proxies=proxy1)
print(resp.json())
- Proxy requiring authentication
import requests
proxy1 = {'http': 'http://user:passwd@106.15.95.236:9187', 'https': 'http://user:passwd@106.15.95.236:9187'}
resp = requests.get('http://httpbin.org/ip', proxies=proxy1)
print(resp.json())
# {'origin': '101.47.19.29, 106.15.95.236'}
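Beyond per-request proxies, requests also honors a session-level proxies attribute and the standard HTTP_PROXY/HTTPS_PROXY environment variables; a brief sketch reusing the placeholder proxy above:
import os
import requests

# Session-level: every request made through this session goes via the proxy
session = requests.Session()
session.proxies = {'http': 'http://user:passwd@106.15.95.236:9187',
                   'https': 'http://user:passwd@106.15.95.236:9187'}
print(session.get('http://httpbin.org/ip').json())

# Environment variables: requests reads HTTP_PROXY/HTTPS_PROXY
# when trust_env is enabled (the default)
os.environ['HTTP_PROXY'] = 'http://user:passwd@106.15.95.236:9187'
print(requests.get('http://httpbin.org/ip').json())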
Using proxies with selenium + a browser
PhantomJS
Note: PhantomJS support was deprecated in Selenium 3.8 and removed in Selenium 4, so the snippet below assumes an older Selenium release.
## selenium + PhantomJS with a proxy
from selenium import webdriver

driver = webdriver.PhantomJS(
    # executable_path='/usr/local/bin/phantomjs',
    service_args=[
        '--ignore-ssl-errors=true',
        '--proxy=106.15.95.236:9187',  # IP:port
        '--proxy-type=http',
        '--proxy-auth=user:passwd',    # add this line if the proxy requires authentication
    ])
url = 'http://httpbin.org/ip'
driver.get(url)
html = driver.page_source
html
driver.quit()
Chrome
- Proxy without authentication
## selenium + Chrome with a proxy
# Reference: https://blog.csdn.net/zwq912318834/article/details/78933910
# Make sure both Chrome and chromedriver are up to date
import os
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# options.set_headless(headless=True)  # older API, equivalent to --headless
options.add_argument('--lang=zh-CN')
options.add_argument('--ignore-certificate-errors')  # Chrome flag replacing PhantomJS's --ignore-ssl-errors
options.add_argument('--proxy-server=http://218.66.151.87:31868')
driver = webdriver.Chrome(
    executable_path=os.path.expanduser('~/tools/chromedriver'),  # '~' is not expanded by Selenium itself
    chrome_options=options)
url = 'http://httpbin.org/ip'
driver.get(url)
html = driver.page_source
html
driver.quit()
#driver.close()
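Chrome renders the httpbin JSON wrapped in HTML, so a small helper (hypothetical, not in the original) can extract the echoed origin IP from page_source to confirm the proxy is actually in use:
import json
import re

def origin_ip(page_source):
    """Pull the JSON body httpbin returns out of the HTML Chrome renders."""
    m = re.search(r'\{.*\}', page_source, re.S)
    return json.loads(m.group(0))['origin'] if m else None

# print(origin_ip(html))  # should print the proxy's IP, not your own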
- Proxy requiring authentication
In Chrome, a proxy that requires authentication has to be configured through an extension.
# Verification: failed
import os
import re
import time
import zipfile
from selenium import webdriver

# Chrome proxy-auth extension template: https://github.com/revotu/selenium-chrome-auth-proxy
CHROME_PROXY_HELPER_DIR = 'chrome-proxy-helper'
# Directory for the generated per-proxy Chrome extension files
CUSTOM_CHROME_PROXY_EXTENSIONS_DIR = 'chrome-proxy-extensions'

def get_chrome_proxy_extension(proxy):
    """Build a Chrome extension (zip) configured with the given proxy
    (with username/password authentication).
    proxy - the proxy to use, in the form: username:password@ip:port
    """
    m = re.compile(r'([^:]+):([^@]+)@([\d\.]+):(\d+)').search(proxy)
    if m:
        # Pull out the individual proxy parameters
        username = m.groups()[0]
        password = m.groups()[1]
        ip = m.groups()[2]
        port = m.groups()[3]
        # Create a customized Chrome proxy extension (zip file)
        if not os.path.exists(CUSTOM_CHROME_PROXY_EXTENSIONS_DIR):
            os.mkdir(CUSTOM_CHROME_PROXY_EXTENSIONS_DIR)
        extension_file_path = os.path.join(
            CUSTOM_CHROME_PROXY_EXTENSIONS_DIR,
            '{}.zip'.format(proxy.replace(':', '_')))
        if not os.path.exists(extension_file_path):
            # The extension file does not exist yet; create it
            zf = zipfile.ZipFile(extension_file_path, mode='w')
            zf.write(os.path.join(CHROME_PROXY_HELPER_DIR, 'manifest.json'), 'manifest.json')
            # Substitute the proxy parameters into the template
            background_content = open(os.path.join(CHROME_PROXY_HELPER_DIR, 'background.js')).read()
            background_content = background_content.replace('%proxy_host', ip)
            background_content = background_content.replace('%proxy_port', port)
            background_content = background_content.replace('%username', username)
            background_content = background_content.replace('%password', password)
            zf.writestr('background.js', background_content)
            zf.close()
        return extension_file_path
    else:
        raise Exception('Invalid proxy format. Should be username:password@ip:port')

if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    # Add the custom proxy extension (configured with username/password auth)
    options.add_extension(get_chrome_proxy_extension(proxy='username:password@ip:port'))
    driver = webdriver.Chrome(chrome_options=options)
    # Visit an IP-echo site to check whether the proxy took effect
    driver.get('http://httpbin.org/ip')
    print(driver.page_source)
    time.sleep(15)
    driver.quit()
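Since the extension approach above is marked as failed, one alternative worth noting (not from the original text) is the selenium-wire package, which intercepts browser traffic itself and supports an upstream proxy with credentials; a minimal sketch, assuming selenium-wire is installed and using placeholder credentials:
# pip install selenium-wire
from seleniumwire import webdriver  # drop-in wrapper around selenium's webdriver

seleniumwire_options = {
    'proxy': {
        'http': 'http://username:password@ip:port',
        'https': 'http://username:password@ip:port',
        'no_proxy': 'localhost,127.0.0.1',
    }
}
driver = webdriver.Chrome(seleniumwire_options=seleniumwire_options)
driver.get('http://httpbin.org/ip')
print(driver.page_source)
driver.quit()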
Using proxies in practice
Since no proxy is valid 100% of the time, in practice you need to maintain a proxy pool and cull it based on observed validity. The following uses the response from the Mogu proxy (蘑菇代理) API as an example.
import requests
IP_proxy = 'http://mvip.piping.mogumiao.com/proxy/api/get_ip_bs?appKey=4162021207ase8293a0a2d853bf27fd12&count=5&expiryDate=0&format=1'
ip_pool = requests.get(url=IP_proxy)
ip_pool = ip_pool.json()
ips = ip_pool['msg']
ips
# [{'ip': '60.184.203.95', 'port': '40887'},
# {'ip': '115.217.165.163', 'port': '43841'},
# {'ip': '113.93.103.139', 'port': '36542'},
# {'ip': '180.116.154.245', 'port': '34463'},
# {'ip': '117.69.200.122', 'port': '43095'}]
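Each entry carries only ip and port; a small helper (hypothetical name, not part of the API) converts one entry into the proxies mapping that requests expects:
def to_proxies(entry):
    """Convert one {'ip': ..., 'port': ...} entry into a requests proxies dict."""
    addr = 'http://{}:{}'.format(entry['ip'], entry['port'])
    return {'http': addr, 'https': addr}

# Example: route a request through the first proxy in the pool
# requests.get('http://httpbin.org/ip', proxies=to_proxies(ips[0]), timeout=2)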
Continuously fetching proxies from the API
Since the Mogu proxy API can be unstable or rate-limited, the following helper retries the call several times.
import time

def ips_proxy(trys=20, sleeps=2):
    """Poll the proxy API up to `trys` times, waiting `sleeps` seconds between failed attempts."""
    try_num = 1
    ipros = None
    while try_num <= trys:
        try:
            try_num = try_num + 1
            ip_pool = requests.get(url=IP_proxy)
            ip_pool = ip_pool.json()
            # print(ip_pool)
            if ip_pool['code'] == '0':  # the API returns the code as a string
                ipros = ip_pool['msg']
                break
            else:
                time.sleep(sleeps)
                continue
        except:
            time.sleep(sleeps)
    return ipros
ips_proxy(2)
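ips_proxy returns None when every attempt fails, so callers should guard against an empty result before extending the pool, e.g.:
ips = ips_proxy(trys=5)
if ips is None:
    raise RuntimeError('could not fetch any proxies from the API')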
Validating proxy availability
import telnetlib

## Check whether an HTTP proxy is reachable at ip:port
# (a plain TCP connect test; it does not prove the proxy actually relays traffic)
def test_IP(ip, port):
    try:
        telnetlib.Telnet(ip, port=port, timeout=1)
    except:
        return False
    else:
        return True

# Filter unreachable proxies out of the pool
def filter_validIP(ips):
    res = []
    for i in ips:
        ip = i['ip']
        port = i['port']
        if test_IP(ip, port):
            res.append(i)
    return res
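The telnet check above only proves the port accepts connections; a stricter, slower check (a sketch, not in the original) sends a real request through the proxy and verifies the response:
import requests

def test_IP_http(ip, port, timeout=3):
    """Return True if an actual HTTP request succeeds through the proxy."""
    addr = 'http://{}:{}'.format(ip, port)
    try:
        r = requests.get('http://httpbin.org/ip',
                         proxies={'http': addr, 'https': addr},
                         timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False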
Updating the proxy pool
Assuming ips is the in-memory proxy-pool variable, it can be updated in the course of making requests, based on request latency and the returned results.
header_code = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive'
}
import random

def get_html_withproxy(url, trys=10):
    """Fetch url through a random proxy from the pool, evicting proxies that fail."""
    try_num = 1
    global ips
    while try_num <= trys:
        try:
            # Pick a random proxy from the pool
            randi = random.sample(range(0, len(ips)), 1)
            ip_t = ips[randi[0]]
            ip = ip_t['ip']
            port = ip_t['port']
            proxies = ip + ':' + port
            proxys = {'http': 'http://' + proxies, 'https': 'http://' + proxies}
            u1 = requests.get(url, headers=header_code, proxies=proxys, timeout=2)
        except:
            # The request failed: drop this proxy and try another one
            if ips:
                ips.pop(randi[0])
            try_num = try_num + 1
            if len(ips) < 3:  # replenish the pool when it runs low
                new_ips = ips_proxy()
                if new_ips:
                    ips.extend(new_ips)  # extend() mutates in place; never assign its None return value
        else:
            break
    if try_num <= trys:
        return u1
    else:
        return None
ip_pool = requests.get(url=IP_proxy)
ip_pool = ip_pool.json()
ips = ip_pool['msg']
# ips = filter_validIP(ips)
## Hot-selling products page
url = 'http://www.huaxintrust.com/productlist.asp?page=1&pid=1'
rvt = get_html_withproxy(url)
rvt
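get_html_withproxy returns None once all attempts are exhausted, so check the result before using it:
if rvt is not None:
    print(rvt.status_code)
    print(rvt.text[:200])  # first part of the page
else:
    print('all proxy attempts failed')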