Skip to content

xwlmdd/ipProxyPool

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

12 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

项目介绍 本项目基于 Spring Boot + MyBatis + Redis + WebMagic + 百度 OCR 的系统架构开发。主要爬取的网站有全网代理、西刺代理、米扑代理、快代理。可用的代理 IP 主要分布在米扑和西刺,其他两个网站可用的较少。其中米扑代理的 IP 端口是图片,这里使用了百度免费 OCR 来识别图片。(有兴趣的同学可以看一下 Tess4J 技术训练识别类库)

环境搭建 该项目采用 Maven 统一构建。首先创建一个 Maven 项目,在 pom.xml 文件中添加以下依赖

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the ipProxyPool application (packaged as a jar). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.mdd</groupId>
    <artifactId>ipproxypool</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>ipProxyPool</name>
    <description>Demo project for Spring Boot</description>

    <!-- Inherits plugin and dependency version management from Spring Boot 1.5.6;
         dependencies below without an explicit <version> rely on it. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <!-- The sources use the diamond operator (new HashMap<>()), so 1.7 is the minimum level. -->
        <java.version>1.7</java.version>
    </properties>

    <dependencies>
        <!-- Redis access -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <!-- MyBatis integration; not managed by the Spring Boot parent, hence the explicit version. -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- MySQL JDBC driver; the runtime scope is commented out, so it is on the compile classpath. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <!--<scope>runtime</scope>-->
        </dependency>
        <!-- Test-only dependencies -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.restdocs</groupId>
            <artifactId>spring-restdocs-mockmvc</artifactId>
            <scope>test</scope>
        </dependency>
        <!--<dependency>-->
        <!--<groupId>org.springframework.boot</groupId>-->
        <!--<artifactId>spring-boot-starter-log42j</artifactId>-->
        <!--</dependency>-->
        <!-- WebMagic crawler framework (core + extension modules, kept at the same version) -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.5.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.5.3</version>
        </dependency>

        <!-- Both commons-lang3 and the older commons-lang 2.6 are declared. They live in
             different packages (org.apache.commons.lang3 vs org.apache.commons.lang), and
             VCodeCheckUtils imports the 2.x StringUtils. -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.4</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/commons-lang/commons-lang -->
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.6</version>
        </dependency>

        <!-- NOTE(review): the sources also import com.alibaba.fastjson, org.apache.commons.io,
             org.apache.http and org.apache.log4j, none of which is declared here. Presumably
             they are resolved transitively (e.g. through webmagic); confirm with
             "mvn dependency:tree" and declare them explicitly if so. -->

    </dependencies>

    <build>
        <plugins>
            <!-- Repackages the jar as an executable Spring Boot application. -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>

            <!-- MyBatis Generator; overwrite=true replaces previously generated files. -->
            <plugin>
                <groupId>org.mybatis.generator</groupId>
                <artifactId>mybatis-generator-maven-plugin</artifactId>
                <version>1.3.2</version>
                <configuration>
                    <verbose>true</verbose>
                    <overwrite>true</overwrite>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
编写Application程序启动类
package com.mdd.proxyip;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Entry point of the proxy IP pool application.
 *
 * <p>{@code @SpringBootApplication} switches on Spring Boot auto-configuration and
 * component scanning from this package; {@code @EnableScheduling} switches on Spring's
 * scheduled-task support.
 */
@SpringBootApplication
@EnableScheduling
public class Application {

    /**
     * Builds the Spring application from this class and starts it.
     *
     * @param args command-line arguments, handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication bootstrap = new SpringApplication(Application.class);
        bootstrap.run(args);
    }
}

关键代码如下:使用 WebMagic 获取代理 IP 详情(全网代理的 IP 分散在不同标签中,需要找规律解析出来;米扑代理的 IP 端口是图片,采用百度 OCR 识别)

/**
 * 云代理网站ip抓取
 * @author xwl 2017.6.3
 */
@Component
public class QuanWanProxyIpCrawler implements PageProcessor {

   private Logger logger = Logger.getLogger(QuanWanProxyIpCrawler.class);

   // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等
   private Site site = Site.me().setCycleRetryTimes(3).setRetryTimes(3).setSleepTime(1000)
         .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0");

   public Site getSite() {
      return site;
   }

   public void process(Page page) {
      Html html = page.getHtml();
      List<String> urlList = page.getHtml().regex("http://www\\.goubanjia\\.com/index\\d*\\.shtml").all();
      // 结果集
      List<ProxyIp> proxyIpList = new ArrayList<ProxyIp>();
      List<String> proxyIpTrList = html.xpath("//*[@id='list']/table/tbody/tr").all();
      if (proxyIpTrList != null && proxyIpTrList.size() > 0) {
         for (String trHtml : proxyIpTrList) {
            ProxyIp proxyIp = new ProxyIp();
            String ip = analyzeIp(trHtml);
            // ip地址
            proxyIp.setProxyIp(ip.substring(0, ip.indexOf(":")));
            // 端口号
            proxyIp.setProxyPort(Integer.parseInt(ip.substring(ip.indexOf(":") + 1, ip.length())));
            analyzeProxyIp(trHtml, proxyIp);
            proxyIpList.add(proxyIp);
         }
         page.putField("proxyIpList", proxyIpList);
      }
      page.addTargetRequests(urlList);
   }

   public static void main(String[] args) {
      Spider.create(new QuanWanProxyIpCrawler()).addUrl("http://www.goubanjia.com/").thread(5).run();
   }
   
   
   /**
    * 该网站ip每个数字分开,没有规律
    * 
    * @param trHtml
    * @return
    */
   private String analyzeIp(String trHtml) {
      trHtml = "<html><head></head><body><table>" + trHtml + "</table></body></html>";
      StringBuffer ip = new StringBuffer();
      if (StringUtils.isBlank(trHtml)) {
         return null;
      }
      List<Node> nodeList = Jsoup.parse(trHtml).getElementsByTag("td").get(0).childNodes();
      if (nodeList == null || nodeList.size() <= 0) {
         return null;
      }
      for (int i = 0; i < nodeList.size() - 1; i++) {
         Node node = nodeList.get(i);
         String nodeHtml = node.outerHtml();
         // display:none;不显示的
         if (StringUtils.isNotBlank(nodeHtml) && nodeHtml.contains("display:none;")
               || nodeHtml.contains("display: none;")) {
            continue;
         }
         String text = CommonUtils.simpleMatch(nodeHtml, ">\\s*(.*)\\s*</");
         ip.append(text);
      }
      String resultIp = ip.toString().replace("null", "").replace(" ", "").trim();
      String port = CommonUtils.simpleMatch(nodeList.get(nodeList.size() - 1).outerHtml(), ">\\s*(.*)\\s*</");
      resultIp = resultIp + ":" + port;
      return resultIp;
   }

   /**
    * 解析除了ip、port之外的属性
    * 
    * @param trHtml
    * @param proxyIp
    * @return
    */
   private void analyzeProxyIp(String trHtml, ProxyIp proxyIp) {
      trHtml = "<html><head></head><body><table>" + trHtml + "</table></body></html>";
      if (StringUtils.isBlank(trHtml)) {
         return;
      }
      try {
         Elements tdElements = Jsoup.parse(trHtml).select("tr > td");
         if (tdElements == null || tdElements.isEmpty()) {
            return;
         }
         // 匿名度
         String anonymity = tdElements.get(1).select("a").text();
         // 类型
         String proxyType = tdElements.get(2).select("a").text();
         // 地址
         StringBuffer ipAddress = new StringBuffer();
         Elements aElements = tdElements.get(3).select("a");
         for (Element a : aElements) {
            ipAddress.append(a.text());
         }
         // 响应速度
         String responseSpeed = tdElements.get(5).text();
         proxyIp.setAnonymity(anonymity);
         proxyIp.setProxyType(proxyType);
         proxyIp.setIpAddress(ipAddress.toString());
         proxyIp.setResponseSpeed(responseSpeed);
      } catch (Exception e) {
         logger.debug("解析失败",e);
      }
   }
}
 米扑代理的 IP 端口是图片【采用百度 OCR 识别】
package com.mdd.proxyip.utils;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.log4j.Logger;
import java.io.*;
import java.util.HashMap;

/**
 * Helpers for reading text out of images through Baidu's OCR REST API.
 *
 * Created by xwl on 2017/8/20.
 */
public class VCodeCheckUtils {

    private static final Logger logger = Logger.getLogger(VCodeCheckUtils.class);

    /**
     * Baidu general-purpose OCR endpoint. The access token is appended per request.
     * Alternative endpoint for web images:
     * https://aip.baidubce.com/rest/2.0/ocr/v1/webimage
     */
    private static final String OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic";

    private static final String TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token";

    /** Character set used to decode the JSON response bodies. */
    private static final String RESPONSE_CHARSET = "UTF-8";

    /** Token fetched once, when the class is loaded; empty when the request failed. */
    private static final String ACCESS_TOKEN = getAccessToken();

    /**
     * Requests an access token from Baidu using the client-credentials grant.
     *
     * <p>The {@code client_id} (API key) and {@code client_secret} (secret key) below are
     * placeholders and must be replaced with the values of your own Baidu application.
     * Keep real credentials out of source control and out of comments.
     *
     * @return the access token, or an empty string when the request fails or the response
     *         contains no token
     */
    public static String getAccessToken() {
        HashMap<String, String> params = new HashMap<>();
        params.put("grant_type", "client_credentials");
        params.put("client_id", "xxxxx");
        params.put("client_secret", "xxxxx");

        HttpRequestData httpRequestData = new HttpRequestData();
        httpRequestData.setRequestMethod("GET");
        httpRequestData.setParams(params);
        httpRequestData.setRequestUrl(TOKEN_URL);

        HttpResponse response = HttpClientUtils.execute(httpRequestData);
        if (response == null) {
            logger.error("Access token request returned no response");
            return "";
        }
        // Always read the body so the underlying connection is released.
        String json = readBody(response);
        if (response.getStatusLine().getStatusCode() != 200 || StringUtils.isBlank(json)) {
            return "";
        }
        JSONObject jsonObject = JSONObject.parseObject(json);
        if (jsonObject == null || jsonObject.isEmpty()) {
            return "";
        }
        String accessToken = jsonObject.getString("access_token");
        return accessToken == null ? "" : accessToken;
    }

    /**
     * Recognises the text shown in an image.
     *
     * @param imageUrl location of the image; it is handed to
     *                 {@code ImageBase64ToStringUtils.imageToStringByBase64} to obtain the
     *                 Base64 payload sent to the OCR service
     * @return the first recognised text entry, or an empty string when no token is
     *         available, the request fails, or nothing was recognised
     */
    public static String OCRVCode(String imageUrl){
        if (StringUtils.isBlank(ACCESS_TOKEN)) {
            logger.error("accessToken为空");
            return "";
        }
        // Build the URL locally: appending to a shared static field would add another
        // "?access_token=..." suffix on every call.
        String requestUrl = OCR_URL + "?access_token=" + ACCESS_TOKEN;

        HashMap<String, String> headers = new HashMap<>();
        headers.put("Content-Type", "application/x-www-form-urlencoded");

        HashMap<String, String> params = new HashMap<>();
        params.put("image", ImageBase64ToStringUtils.imageToStringByBase64(imageUrl));

        HttpRequestData httpRequestData = new HttpRequestData();
        httpRequestData.setHeaders(headers);
        // NOTE(review): lower-case here but "GET" in getAccessToken; confirm that
        // HttpClientUtils treats the method name case-insensitively.
        httpRequestData.setRequestMethod("post");
        httpRequestData.setParams(params);
        httpRequestData.setRequestUrl(requestUrl);

        HttpResponse response = HttpClientUtils.execute(httpRequestData);
        if (response == null) {
            logger.error("请求识别失败!");
            return "";
        }
        String json = readBody(response);
        if (response.getStatusLine().getStatusCode() != 200 || StringUtils.isBlank(json)) {
            logger.error("OCR request failed, HTTP status " + response.getStatusLine().getStatusCode());
            return "";
        }
        logger.debug(json);

        JSONObject jsonObject = JSONObject.parseObject(json);
        JSONArray wordsResult = jsonObject == null ? null : jsonObject.getJSONArray("words_result");
        if (wordsResult == null || wordsResult.isEmpty()) {
            // The body has no words_result array (or an empty one): nothing was recognised.
            logger.error("OCR response contains no words_result: " + json);
            return "";
        }
        String words = wordsResult.getJSONObject(0).getString("words");
        return words == null ? "" : words;
    }

    /**
     * Reads a local image file and encodes its bytes as Base64.
     *
     * @param imageFile path of the image file
     * @return the Base64 text, or an empty string when the file cannot be read
     */
    public static String encodeImgageToBase64(String imageFile) {
        // try-with-resources closes the stream even when reading fails. toByteArray reads
        // to end-of-stream instead of trusting available() and a single read() call.
        try (InputStream in = new FileInputStream(imageFile)) {
            return Base64Util.encode(IOUtils.toByteArray(in));
        } catch (IOException e) {
            logger.error("Failed to read image file " + imageFile, e);
            return "";
        }
    }

    /**
     * Reads the whole response body as text and closes the content stream.
     *
     * @return the body, or an empty string when there is no entity or reading fails
     */
    private static String readBody(HttpResponse response) {
        if (response.getEntity() == null) {
            return "";
        }
        try (InputStream content = response.getEntity().getContent()) {
            return IOUtils.toString(content, RESPONSE_CHARSET);
        } catch (IOException e) {
            logger.error("请求识别失败!", e);
            return "";
        }
    }

    public static void main(String[] args) throws IOException {
//        System.out.println(getAccessToken());
//        System.out.println(OCRVCode("G:ygrandimg.png"));
        System.out.println(OCRVCode("http://proxy.mimvp.com/common/ygrandimg.php?id=7&port=NmTiAmzvMpTI4"));
    }
}

运行如下(存储在redis)

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages