某招聘平台爬虫java

Scroll Down

背景是什么呢? 就是闲得无聊,想学习一下爬虫技术,于是误打误撞学到了selenium。至于selenium是啥玩意呢? 网上有很多资料,很多自动化测试都用这个框架开发,用过python的朋友应该也知道它,这里就不废话了。

准备环境

  1. 因为现在许多网站做了反爬虫机制,所以我这里选型是用到了selenium来爬取目标数据,这里我是模拟“人类”来操作chrome浏览器的,所以这里涉及到系统要安装chrome浏览器
  2. 对于java环境,我是用的maven来管理jar包
  3. 需要准备chromedriver驱动

对于以上三点,这里我只解释第三点:下载的chromedriver版本需要与本机chrome的版本一致。以我的版本为例,如下图:
chrome
chromeDriver地址如下所示:

https://npm.taobao.org/mirrors/chromedriver/

下载之后把chromedriver.exe粘贴到自己的chrome安装路径下
图片

maven依赖的jar包如下:

	<!-- Selenium WebDriver Java bindings, used to drive the Chrome browser -->
	<dependency>
	    <groupId>org.seleniumhq.selenium</groupId>
	    <artifactId>selenium-java</artifactId>
	    <version>3.141.59</version>
	</dependency>
	<!-- Alibaba fastjson -->
	<dependency>
	    <groupId>com.alibaba</groupId>
	    <artifactId>fastjson</artifactId>
	    <version>1.2.47</version>
	</dependency>
	<dependency>
	    <groupId>cn.hutool</groupId>
	    <artifactId>hutool-all</artifactId>
	    <version>4.1.12</version>
	    <scope>compile</scope>
	</dependency>
	<!-- NOTE(review): no <version> is declared for this dependency; presumably it is
	     inherited from a parent POM or a dependencyManagement section - confirm. -->
	<dependency>
	    <groupId>org.apache.commons</groupId>
	    <artifactId>commons-pool2</artifactId>
	</dependency>
	<!-- jsoup, used to parse the HTML of the result pages -->
	<dependency>
	    <groupId>org.jsoup</groupId>
	    <artifactId>jsoup</artifactId>
	    <version>1.11.1</version>
	</dependency>

撸码

/**
 * Entry point of the crawler.
 *
 * <p>Starts a background writer that drains the shared {@code jobs} queue into {@code job.txt}
 * (one tab-separated line per job, UTF-8), then drives Chrome through Selenium: opens the site,
 * submits the search keyword and hands the first result page to {@code getNext}. When crawling
 * ends, for whatever reason, the browser session is quit and the writer is told to stop so the
 * JVM can exit.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
        // Sentinel queued once crawling is over; the writer stops when it takes this exact instance.
        final Job endOfJobs = new Job();

        new Thread(() -> {
            // Resolving through toURI() keeps the path valid when the directory contains spaces;
            // Files.newBufferedWriter encodes as UTF-8, so the output does not depend on the platform charset.
            try (java.io.Writer fileWriter = Files.newBufferedWriter(
                    Paths.get(BossSpider.class.getResource("/").toURI()).resolve("job.txt"))) {
                while (true) {
                    Job take = jobs.take();
                    if (take == endOfJobs) {
                        break;
                    }
                    StringBuilder builder = new StringBuilder();
                    builder
                            .append("公司\t").append(take.getName()).append("\t")
                            .append("职位\t").append(take.getJob()).append("\t")
                            .append("城市\t").append(take.getCity()).append("\t")
                            .append("学历和经验\t").append(take.getEdu()).append("\t")
                            .append("薪资\t").append(take.getSalary()).append("\t\n");
                    fileWriter.write(builder.toString());
                    // Flush per record so the file is usable even if the crawl is killed.
                    fileWriter.flush();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }).start();

        // This must point at your local chromedriver executable.
        System.setProperty("webdriver.chrome.driver", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe");
        // Browser options: traffic is routed through an IP proxy. The target site is served over
        // HTTPS, so the SSL proxy is configured in addition to the plain HTTP proxy.
        // (No headless flag is set here; it could be added to these options.)
        ChromeOptions options = new ChromeOptions();
        String proxyServer = "xxx:80";
        Proxy proxy = new Proxy().setHttpProxy(proxyServer).setSslProxy(proxyServer);
        options.setProxy(proxy);

        ChromeDriver driver = null;
        try {
            driver = new ChromeDriver(options);
            String url = "https://www.zhipin.com/";
            driver.get(url);
            // Locate the search box and the search button with CSS selectors.
            WebElement input = driver.findElement(By.cssSelector("#wrap > div.column-search-panel > div > div > div.search-form > form > div.search-form-con > p > input"));
            WebElement submit = driver.findElement(By.cssSelector("#wrap > div.column-search-panel > div > div > div.search-form > form > button"));
            input.clear();
            // Type the search keyword.
            input.sendKeys("你所搜索的职位");
            // Pause for 2 seconds before clicking, to look less like a bot.
            Thread.sleep(2 * 1000);
            // Click the search button through JavaScript.
            driver.executeScript("arguments[0].click();", submit);
            // Give the result page 5 seconds to load before scraping it.
            Thread.sleep(5 * 1000);
            // Scrape the results, starting at page 1.
            getNext(driver, 1);
        } catch (InterruptedException e) {
            // Restore the interrupt flag for whoever owns this thread.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (driver != null) {
                    // quit() ends the whole WebDriver session; close() would only close the current window.
                    driver.quit();
                }
            } finally {
                // Everything scraped is already queued ahead of the sentinel, so nothing is lost.
                jobs.add(endOfJobs);
            }
        }
    }

获取数据

/**
 * Scrapes the result page currently shown by the driver, queues every job found on it, then
 * clicks the "next" link and repeats until no such link is present.
 *
 * <p>Pages are processed in a loop rather than by recursion, so no file handle or stack frame
 * is kept per visited page. Failures are reported on stderr instead of being silently dropped;
 * as before, this method does not throw to its caller.
 *
 * @param driver  browser session that already displays a search-result page
 * @param pageNow number of the page currently displayed ({@code main} starts at 1); only used
 *                to name the debug copy of the page
 */
static void getNext(ChromeDriver driver, int pageNow) {
        try {
            for (int page = pageNow; ; page++) {
                String pageSource = driver.getPageSource();
                // Optional: keep a local copy so parsing can be debugged without hitting the site again.
                savePageForDebugging(pageSource, page);

                // Parse the page with jsoup and pull the fields out of every job card.
                Elements jobLists = Jsoup.parse(pageSource).getElementsByClass("job-list");
                if (jobLists.isEmpty()) {
                    System.err.println("No job-list element on page " + page + "; stopping.");
                    return;
                }
                for (Element card : jobLists.get(0).getElementsByClass("info-primary")) {
                    queueJob(card);
                }

                // findElements returns an empty list when nothing matches, whereas findElement
                // throws, so this is the check that actually ends the crawl.
                // NOTE(review): if the site keeps a disabled "next" link on the last page this
                // loop will not stop by itself - confirm against the live markup.
                java.util.List<WebElement> nextLinks = driver.findElements(By.className("next"));
                if (nextLinks.isEmpty()) {
                    return;
                }
                driver.executeScript("arguments[0].click();", nextLinks.get(0));
                // Short pause before reading the next page.
                Thread.sleep(1 * 1000);
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
    }

    /** Builds a {@code Job} from one "info-primary" card and queues it; cards with unexpected markup are skipped. */
    private static void queueJob(Element card) {
        try {
            Element limits = card.getElementsByClass("job-limit").get(0);
            Job job = new Job();
            job.setName(card.getElementsByClass("company-text").get(0).text());
            job.setJob(card.getElementsByClass("job-name").get(0).text());
            job.setCity(card.getElementsByClass("job-area-wrapper").text());
            job.setEdu(limits.child(2).text());
            job.setSalary(limits.child(1).text());
            jobs.add(job);
        } catch (IndexOutOfBoundsException e) {
            System.err.println("Skipping job card with unexpected markup: " + e.getMessage());
        }
    }

    /** Writes the page source to {@code <page>t.html} in the classpath root as UTF-8; a failure here never stops the crawl. */
    private static void savePageForDebugging(String pageSource, int page) {
        java.net.URL root = BossSpider.class.getResource("/");
        if (root == null) {
            return;
        }
        try {
            // toURI() yields a valid path on every OS and with spaces in directory names.
            Files.write(Paths.get(root.toURI()).resolve(page + "t.html"),
                    pageSource.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        } catch (IOException | java.net.URISyntaxException | RuntimeException e) {
            e.printStackTrace();
        }
    }
 /**
  * One scraped job posting. A plain mutable bean: every property is a string that is
  * {@code null} until its setter is called.
  */
 static class Job {
        /** Company name (written under the "公司" column). */
        private String name;
        /** Job title (written under the "职位" column). */
        private String job;
        /** City or area of the job (written under the "城市" column). */
        private String city;
        /** Education and experience text (written under the "学历和经验" column). */
        private String edu;
        /** Salary text (written under the "薪资" column). */
        private String salary;

        /** @return the job title */
        public String getJob() { return this.job; }

        /** @param job the job title */
        public void setJob(String job) { this.job = job; }

        /** @return the company name */
        public String getName() { return this.name; }

        /** @param name the company name */
        public void setName(String name) { this.name = name; }

        /** @return the salary text */
        public String getSalary() { return this.salary; }

        /** @param salary the salary text */
        public void setSalary(String salary) { this.salary = salary; }

        /** @return the city or area */
        public String getCity() { return this.city; }

        /** @param city the city or area */
        public void setCity(String city) { this.city = city; }

        /** @return the education and experience text */
        public String getEdu() { return this.edu; }

        /** @param edu the education and experience text */
        public void setEdu(String edu) { this.edu = edu; }
    }

差不多到这一步就结束了

上面的教程仅供学习,以上抓取的数据均未保存,也未使用。本篇文章仅用于记录学习成果,如果商用,后果自负。
如果本篇文章侵权,请联系我 QQ