虚位以待(AD)
虚位以待(AD)
首页 > 软件编程 > Java编程 > java代理实现爬取代理IP的示例

java代理实现爬取代理IP的示例
类别:Java编程   作者:码皇   来源:互联网   点击:

今天小编就为大家分享一篇java代理实现爬取代理IP的示例,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧

整个示例只有一个 Java 文件,直接运行其中的 main 方法即可。需要依赖的 jar 包是 com.alibaba.fastjson(版本 1.2.28)和 Jsoup(版本 1.10.2)。

如果用了pom,那么就是以下两个:

    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.28</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.2</version>
    </dependency>

完整的代码如下:

    package com.tuniu.fcm.facade.IPProxy;
    import com.alibaba.fastjson.JSONObject;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    /** * 获取代理IP,需要 * com.alibaba.fastjson.JSONObject以及Jsoup */public class ProxyCralwerUnusedVPN {
    ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>();
    ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>();
    public static void main(String[] args) {
    ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN();
    /** * 想要获取的代理IP个数,由需求方自行指定。(如果个数太多,将导致返回变慢) */ proxyCrawler.startCrawler(1);
    }
    /** * 暴露给外部模块调用的入口 * @param wantedNumber 调用方期望获取到的代理IP个数 */ public String startCrawler(int wantedNumber) {
    localWantedNumber.set(wantedNumber);
    kuaidailiCom("http://www.xicidaili.com/nn/", 15);
    kuaidailiCom("http://www.xicidaili.com/nt/", 15);
    kuaidailiCom("http://www.xicidaili.com/wt/", 15);
    kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15);
    kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15);
    kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15);
    /** * 构造返回数据 */ ProxyResponse response = new ProxyResponse();
    response.setSuccess("true");
    Map<String, Object> dataInfoMap = new HashMap<String, Object>();
    dataInfoMap.put("numFound", localProxyInfos.get().size());
    dataInfoMap.put("pageNum", 1);
    dataInfoMap.put("proxy", localProxyInfos.get());
    response.setData(dataInfoMap);
    String responseString = JSONObject.toJSON(response).toString();
    System.out.println(responseString);
    return responseString;
    }
    private void kuaidailiCom(String baseUrl, int totalPage) {
    String ipReg = "\d{
    1,3}
    \.\d{
    1,3}
    \.\d{
    1,3}
    \.\d{
    1,3}
    \d{
    1,6}
    ";
    Pattern ipPtn = Pattern.compile(ipReg);
    for (int i = 1;
    i < totalPage;
    i++) {
    if (getCurrentProxyNumber() >= localWantedNumber.get()) {
    return;
    }
    try {
    Document doc = Jsoup.connect(baseUrl + i + "/") .header("Accept", "text/html,application/xhtml+xml,application/xml;
    q=0.9,image/webp,*/*;
    q=0.8") .header("Accept-Encoding", "gzip, deflate, sdch") .header("Accept-Language", "zh-CN,zh;
    q=0.8,en;
    q=0.6") .header("Cache-Control", "max-age=0") .header("User-Agent", "Mozilla/5.0 (Macintosh;
    Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36") .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244;
    _gat=1;
    _ga=GA1.2.1061361785.1462812244") .header("Host", "www.kuaidaili.com") .header("Referer", "http://www.kuaidaili.com/free/outha/") .timeout(30 * 1000) .get();
    Matcher m = ipPtn.matcher(doc.text());
    while (m.find()) {
    if (getCurrentProxyNumber() >= localWantedNumber.get()) {
    break;
    }
    String[] strs = m.group().split(" ");
    if (checkProxy(strs[0], Integer.parseInt(strs[1]))) {
    System.out.println("获取到可用代理IPt" + strs[0] + "t" + strs[1]);
    addProxy(strs[0], strs[1], "http");
    }
    }
    }
    catch (Exception e) {
    e.printStackTrace();
    }
    }
    }
    private static boolean checkProxy(String ip, Integer port) {
    try {
    //http://1212.ip138.com/ic.asp 可以换成任何比较快的网页 Jsoup.connect("http://1212.ip138.com/ic.asp") .timeout(2 * 1000) .proxy(ip, port) .get();
    return true;
    }
    catch (Exception e) {
    return false;
    }
    }
    private int getCurrentProxyNumber() {
    List<ProxyInfo> proxyInfos = localProxyInfos.get();
    if (proxyInfos == null) {
    proxyInfos = new ArrayList<ProxyInfo>();
    localProxyInfos.set(proxyInfos);
    return 0;
    }
    else {
    return proxyInfos.size();
    }
    }
    private void addProxy(String ip, String port, String protocol){
    List<ProxyInfo> proxyInfos = localProxyInfos.get();
    if (proxyInfos == null) {
    proxyInfos = new ArrayList<ProxyInfo>();
    proxyInfos.add(new ProxyInfo(ip, port, protocol));
    }
    else {
    proxyInfos.add(new ProxyInfo(ip, port, protocol));
    }
    }
    }
    /**
     * Mutable bean describing one proxy endpoint.
     *
     * <p>The constructor sets address, port and type. User name and password
     * start as empty strings and {@code is_internet} starts as 1. The accessor
     * names, including {@code getIs_internet}, are part of the bean's interface
     * and are kept as they are.
     */
    class ProxyInfo {

        // Endpoint, supplied through the constructor.
        private String ip;
        private String port;
        private String type;

        // Credentials, empty unless set explicitly.
        private String userName = "";
        private String password = "";

        // Defaults to 1.
        private int is_internet = 1;

        /**
         * @param ip   proxy address
         * @param port proxy port, kept as text
         * @param type proxy type label
         */
        public ProxyInfo(String ip, String port, String type) {
            this.ip = ip;
            this.port = port;
            this.type = type;
        }

        /** @return the proxy address */
        public String getIp() {
            return this.ip;
        }

        /** @param ip the proxy address */
        public void setIp(String ip) {
            this.ip = ip;
        }

        /** @return the proxy port as text */
        public String getPort() {
            return this.port;
        }

        /** @param port the proxy port as text */
        public void setPort(String port) {
            this.port = port;
        }

        /** @return the proxy type label */
        public String getType() {
            return this.type;
        }

        /** @param type the proxy type label */
        public void setType(String type) {
            this.type = type;
        }

        /** @return the user name, empty by default */
        public String getUserName() {
            return this.userName;
        }

        /** @param userName the user name */
        public void setUserName(String userName) {
            this.userName = userName;
        }

        /** @return the password, empty by default */
        public String getPassword() {
            return this.password;
        }

        /** @param password the password */
        public void setPassword(String password) {
            this.password = password;
        }

        /** @return the {@code is_internet} value, 1 by default */
        public int getIs_internet() {
            return this.is_internet;
        }

        /** @param is_internet the new {@code is_internet} value */
        public void setIs_internet(int is_internet) {
            this.is_internet = is_internet;
        }
    }
    /**
     * Response envelope: a {@code success} flag kept as text and a free-form
     * {@code data} map. Both are unset ({@code null}) until assigned.
     */
    class ProxyResponse {

        private Map<String, Object> data;
        private String success;

        /** @return the success flag, or {@code null} if never set */
        public String getSuccess() {
            return this.success;
        }

        /** @param success the success flag */
        public void setSuccess(String success) {
            this.success = success;
        }

        /** @return the payload map as stored (no copy), or {@code null} if never set */
        public Map<String, Object> getData() {
            return this.data;
        }

        /** @param data the payload map; stored by reference */
        public void setData(Map<String, Object> data) {
            this.data = data;
        }
    }

以上这篇java代理实现爬取代理IP的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持脚本之家。

您可能感兴趣的文章:

  • 利用Python爬取可用的代理IP
  • java实现轻量型http代理服务器示例
相关热词搜索: java 爬取代理IP