rvest 是 R 爬虫的利器,可以方便地实现数据的定位与提取。
添加代理需要在 httr 下实现。
直接在请求中使用代理
library(httr)
## Request headers, kept as a named character vector (name = header field).
h <- c(
  "User-Agent" = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
  "Accept-Encoding" = "gzip, deflate",
  "Connection" = "keep-alive"
)
## 持续获取代理
# Fetch a batch of proxies from the provider API, retrying until it reports
# success.
#
# The endpoint is polled until the `code` field of the JSON reply equals "0";
# between unsuccessful polls the function sleeps for 5 seconds. There is no
# upper bound on the number of polls, so this call blocks for as long as the
# provider keeps answering with a non-zero code. Network/parse errors raised
# by the JSON reader are not caught and propagate to the caller.
#
# Returns a data.frame built from the `msg` field of the reply (one row per
# entry), with `ip` coerced to character and `port` coerced to numeric. Any
# other columns are left as produced by `rbind` (list columns).
get_ips <- function() {
  # NOTE(review): the appKey is a credential embedded in source; consider
  # reading it from an environment variable instead.
  IP_proxy <- 'http://mvip.piping.mogumiao.com/proxy/api/get_ip_bs?appKey=41620212070f4853bf27fd12&count=20&expiryDate=0&format=1'

  repeat {
    # read_json() lives in jsonlite, which this file never attaches, so the
    # call is namespace-qualified.
    reply <- jsonlite::read_json(IP_proxy)

    # identical() keeps the condition a single TRUE/FALSE even when `code`
    # is missing from the reply (a bare `==` would make `if` fail).
    if (identical(as.character(reply$code), "0")) {
      break
    }
    Sys.sleep(5)
  }

  # Stack the per-proxy records into rows.
  ips <- as.data.frame(do.call(rbind, reply[["msg"]]))
  ips$ip <- as.character(ips$ip)
  ips$port <- as.numeric(ips$port)
  ips
}
ips <- get_ips()
# Sample of the resulting data frame (actual values differ on every call):
#    port              ip
# 1 39358 125.109.194.199
# 2 22420  180.122.148.51
# 3 35374  121.234.90.107
# 4 24793     1.196.3.186
# 5 36472  115.213.206.29
# Download `url` through a randomly chosen proxy and parse the response as
# HTML, switching to another proxy whenever the request fails.
#
# Arguments:
#   url         Address to fetch.
#   ips         Data frame of proxies with columns `ip` and `port`
#               (the shape returned by get_ips()).
#   trys        Maximum number of request attempts.
#   timeout_sec Per-request timeout in seconds passed to httr::timeout().
#
# Returns the result of read_html() on the first response obtained without
# an error, or NA when all `trys` attempts failed.
#
# Notes:
#   * A proxy whose request raised an error is dropped from the local copy of
#     `ips`; when fewer than 5 proxies remain the pool is topped up with
#     get_ips(). The caller's `ips` object is not modified.
#   * Request headers come from the global vector `h`.
#   * read_html() is not provided by httr; presumably rvest or xml2 is
#     attached elsewhere -- TODO confirm.
get_html_withproxy <- function(url, ips, trys = 10, timeout_sec = 1) {
  for (attempt in seq_len(trys)) {
    # An empty pool cannot be sampled from; refill it before picking.
    if (nrow(ips) == 0) {
      ips <- get_ips()
    }

    pick <- sample.int(nrow(ips), size = 1)

    # NULL marks a failed request so the loop can react without `<<-`
    # or a `break` issued from inside the handler.
    resp <- tryCatch(
      GET(
        url,
        use_proxy(ips$ip[pick], ips$port[pick]),
        timeout(timeout_sec),
        add_headers(.headers = h)
      ),
      error = function(e) NULL
    )

    if (!is.null(resp)) {
      return(read_html(resp))
    }

    # Discard the proxy that just failed and keep the pool from running dry.
    ips <- ips[-pick, , drop = FALSE]
    if (nrow(ips) < 5) {
      ips <- rbind(ips, get_ips())
    }
  }

  NA
}
url <- "http://www.huaxintrust.com/productlist.asp?page=1&pid=1"
ips <- get_ips()
ttt <- get_html_withproxy(url = url, ips = ips, trys = 10)
ttt
# Example of the printed document:
# {xml_document}
# <html xmlns="http://www.w3.org/1999/xhtml">
# [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<title>华信信托 ...
# [2] <body>\r\n <div id="box">\r\n <div id="top">\r\n <div class="top1">\r\n ...
维护简单的代理池(或者全局变量)
代理有效性的验证
# Check whether a proxy is usable by sending one test request through it.
#
# Arguments:
#   ip          Proxy host.
#   port        Proxy port.
#   timeout_sec How long to wait for the response, in seconds.
#
# Returns TRUE when the request completes without raising an error and the
# response status is not an HTTP error (status below 400); FALSE otherwise.
# Failures are expected for dead proxies and are therefore not printed.
validIP <- function(ip, port, timeout_sec = 1) {
  # A site that normally answers requests; this endpoint also echoes the
  # caller's address, which shows whether the proxy was actually used.
  url <- 'http://httpbin.org/ip'

  tryCatch(
    {
      resp <- GET(url, use_proxy(ip, port), timeout(timeout_sec))
      # A reply such as 407/502 generated by a broken proxy is not "valid".
      !http_error(resp)
    },
    error = function(e) FALSE
  )
}
代理池更新
library(parallel)
library(dplyr)
library(httr)
library(readr)
# Refresh the proxy pool: fetch new "ip:port" entries, merge them with the
# existing ones and keep only those that pass validIP().
#
# Arguments:
#   oldips  Character vector of "ip:port" strings from a previous call, or
#           NULL to bootstrap the pool. When NULL, the provider is polled
#           (5 seconds apart) until it returns exactly 10 entries.
#
# Returns a character vector of the "ip:port" entries that validated, or
# NULL when there was nothing to validate.
#
# Notes:
#   * A fresh batch with fewer than 10 entries is ignored and only `oldips`
#     is re-validated.
#   * Validation runs in forked workers via parallel::mclapply(); forking is
#     unavailable on Windows, so a single core is used there.
updateIps <- function(oldips = NULL) {
  # NOTE(review): the appKey is a credential embedded in source; consider
  # reading it from an environment variable instead.
  proxy_url <- 'http://mvip.piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3276318912478216381273a5cec0&count=10&expiryDate=0&format=2'

  # One request to the provider; the reply is read as a single column X1.
  fetch_batch <- function() {
    read_table(proxy_url, col_names = FALSE)$X1
  }

  if (is.null(oldips)) {
    repeat {
      oldips <- fetch_batch()
      if (length(oldips) == 10) {
        break
      }
      # Back off between polls instead of hammering the provider.
      Sys.sleep(5)
    }
  }

  fresh <- fetch_batch()
  candidates <- if (length(fresh) < 10) oldips else c(fresh, oldips)
  # The same proxy can show up in both batches; validate and return it once.
  candidates <- unique(as.character(candidates))

  if (length(candidates) == 0L) {
    return(NULL)
  }

  n_cores <- if (.Platform$OS.type == "windows") 1L else 5L

  # Validate the candidates in parallel.
  checks <- mclapply(
    as.list(candidates),
    FUN = function(entry) {
      parts <- strsplit(entry, ":", fixed = TRUE)[[1]]
      validIP(parts[1], as.integer(parts[2]))
    },
    mc.cores = n_cores
  )

  # A crashed worker yields a non-logical element; treat it as "not valid"
  # rather than letting it corrupt the index.
  is_valid <- vapply(checks, isTRUE, logical(1))
  candidates[is_valid]
}
# Bootstrap the pool, then run one refresh cycle on it.
ips <- updateIps()
ips <- updateIps(oldips = ips)