R Web Scraping: Using Proxies

2018/04/12

Categories: web scraping Tags: proxies httr rvest

rvest is the workhorse of web scraping in R: it makes locating and extracting data from HTML straightforward.

It has no proxy support of its own, though, so proxies have to be added at the request level through httr.
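In its simplest form that just means passing use_proxy() into the request and handing the httr response to rvest. A minimal sketch (the proxy address here is a dummy; substitute one from your provider):

library(httr)
library(rvest)

## 1.2.3.4:8080 is a placeholder proxy
resp = GET('http://www.huaxintrust.com/productlist.asp?page=1&pid=1', use_proxy('1.2.3.4', 8080))
read_html(resp)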

Using a proxy directly in the request

library(httr)
library(jsonlite)   # read_json() below
library(rvest)      # read_html() below

## request headers, supplied as a named character vector
h <- c("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
       "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
       "gzip, deflate",
       "keep-alive")
names(h) <- c("User-Agent", "Accept", "Accept-Encoding", "Connection")

## fetch a batch of proxies, retrying until the API returns a valid response
get_ips = function(){
  while(TRUE){
    IP_proxy = 'http://mvip.piping.mogumiao.com/proxy/api/get_ip_bs?appKey=41620212070f4853bf27fd12&count=20&expiryDate=0&format=1'
    ips = read_json(IP_proxy)
    
    if(ips$code == '0'){   # code '0' means the API returned a batch of proxies
      break
    }else{
      Sys.sleep(5)         # back off before asking again
    }
  }
  
  ips = ips[['msg']]
  
  ## flatten the list of ip/port pairs into a data frame
  ips = as.data.frame(do.call(rbind, ips))
  ips$ip = as.character(ips$ip)
  ips$port = as.numeric(ips$port)
  return(ips)
}
ips = get_ips()

# port              ip
# 1  39358 125.109.194.199
# 2  22420  180.122.148.51
# 3  35374  121.234.90.107
# 4  24793     1.196.3.186
# 5  36472  115.213.206.29


get_html_withproxy = function(url, ips, trys = 10){
  try_num = 1
  while (try_num <= trys) {
    tryCatch({
      ## pick a random proxy from the pool and fire the request
      randi = sample(c(1:nrow(ips)), size = 1)
      ip = ips$ip[randi]
      port = ips$port[randi]
      u1 = GET(url, use_proxy(ip, port), timeout(1), add_headers(.headers = h))
      break
    }, error = function(e) {
      ## the proxy failed or timed out: drop it and count the attempt
      ips <<- ips[-randi, ]
      try_num <<- try_num + 1
      
      ## top the pool up when it runs low
      if(nrow(ips) < 5){
        ips <<- rbind(ips, get_ips())
      }
    })
  }
  if(try_num <= trys){
    return(read_html(u1))
  }else{
    return(NA)   # every attempt failed
  }
}

url = 'http://www.huaxintrust.com/productlist.asp?page=1&pid=1'
ips = get_ips()
ttt = get_html_withproxy(url, ips, trys = 10)
ttt
# {xml_document}
# <html xmlns="http://www.w3.org/1999/xhtml">
#   [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<title>华信信托 ...
# [2] <body>\r\n    <div id="box">\r\n        <div id="top">\r\n    <div class="top1">\r\n     ...

Maintaining a simple proxy pool (or just a global variable)
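The idea is to keep the current batch of proxies in one shared object and refill it as proxies die. A minimal sketch along those lines, reusing get_ips() from above and assuming the pool lives in a global data frame called pool (the names and the refill threshold are arbitrary):

pool = get_ips()

## draw one proxy at random, topping the pool up when it runs low
next_proxy = function(min_size = 5){
  if(nrow(pool) < min_size){
    pool <<- rbind(pool, get_ips())
  }
  pool[sample(nrow(pool), 1), ]
}

## drop a proxy that failed so it is not picked again
drop_proxy = function(bad_ip){
  pool <<- pool[pool$ip != bad_ip, ]
}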

Verifying that a proxy works

validIP = function(ip, port){
  url = 'http://httpbin.org/ip'   # a reliable endpoint; it also echoes the requesting IP, so you can confirm the proxy is in effect
  res = FALSE
  try({
    u1 = GET(url, use_proxy(ip, port), timeout(1))   # timeout() sets how long to wait for a response
    res = TRUE
  }, silent = TRUE)
  return(res)
}
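For example, checking the first proxy from the sample batch above (the result depends entirely on whether that proxy is still alive):

validIP("125.109.194.199", 39358)
# [1] TRUE    # FALSE once the proxy has expired or times out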

Updating the proxy pool

library(parallel)
library(dplyr)
library(httr)
library(readr)

updateIps = function(oldips = NULL){
  proxy_url = 'http://mvip.piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3276318912478216381273a5cec0&count=10&expiryDate=0&format=2'
  
  ## no existing pool: keep polling until the API hands back a full batch
  if(is.null(oldips)){
    while(TRUE){
      ips = read_table(proxy_url, col_names = FALSE)   # format=2 returns plain ip:port lines
      oldips = ips$X1
      if(length(oldips) == 10){
        break
      }
    }
  }
  
  ## fetch a fresh batch and merge it with the existing pool
  ips = read_table(proxy_url, col_names = FALSE)
  ips = ips$X1
  
  if(length(ips) < 10){
    ltips = oldips
  }else{
    ltips = c(ips, oldips)
  }
  
  ltips = as.list(ltips)
  
  ## check every candidate in parallel (mclapply forks, so mc.cores > 1 only helps on macOS/Linux)
  rr = mclapply(ltips, FUN = function(x){
    ipt = strsplit(x, ":")[[1]]
    validIP(ipt[1], as.integer(ipt[2]))
  }, mc.cores = 5)
  
  ## keep only the proxies that responded
  newips = unlist(ltips)[unlist(rr)]
  
  return(newips)
}

ips = updateIps()      # first call builds the pool from scratch
ips = updateIps(ips)   # later calls refresh it, keeping proxies that still respond
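Putting the pieces together, here is a sketch of a small paging loop that reuses get_ips() and get_html_withproxy() from above (the page range and the 'a' selector are placeholders for whatever you actually want to scrape):

library(rvest)

pages = paste0('http://www.huaxintrust.com/productlist.asp?page=', 1:3, '&pid=1')
ips = get_ips()

results = lapply(pages, function(u){
  doc = get_html_withproxy(u, ips, trys = 10)
  if(!inherits(doc, "xml_document")) return(NA)   # all retries failed for this page
  html_text(html_nodes(doc, "a"))
})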