Implementing a web crawler in Java (scraping job listings from 51job)
As shown in the screenshot, the goal is to scrape the job information below.
Open the developer tools (F12) and inspect the page source:
Most of the content sits under the element with id=resultList.
Each job posting corresponds to an element with class=el.
The next-page URL is the href of the <a> inside an element with class=bk.
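To make those selectors concrete, here is a minimal jsoup sketch based on the structure described above (a rough illustration only; the html variable stands for page source fetched elsewhere):

Document doc = Jsoup.parse(html);
// One .el element per job row under #resultList; the first row is the table header.
Elements rows = doc.select("#resultList .el");
for (Element row : rows) {
    Elements cols = row.select("span"); // job name, company, address, salary, posting date
}
// The "next page" link is the href of the <a> inside a .bk element.
String nextUrl = doc.select(".bk").last().select("a").attr("href");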
Implementation:
Create a new Maven project and add the dependencies (of these, only jsoup is actually used by the code below):
<dependencies>
    <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/ch.hsr/geohash -->
    <dependency>
        <groupId>ch.hsr</groupId>
        <artifactId>geohash</artifactId>
        <version>1.3.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-httpclient/commons-httpclient -->
    <dependency>
        <groupId>commons-httpclient</groupId>
        <artifactId>commons-httpclient</artifactId>
        <version>3.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.8.3</version>
    </dependency>
</dependencies>
Create a bean that encapsulates one job posting:
package cn.jixiang.bean;

// Holds one job posting scraped from the result list.
public class JobBean {

    private String jobName;
    private String company;
    private String address;
    private String salary;
    private String date;

    // Convenience method that populates all fields at once.
    public void set(String jobName, String company, String address, String salary, String date) {
        this.jobName = jobName;
        this.company = company;
        this.address = address;
        this.salary = salary;
        this.date = date;
    }

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getCompany() {
        return company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    @Override
    public String toString() {
        return "jobName=" + jobName + ", company=" + company + ", address=" + address + ", salary=" + salary
                + ", date=" + date;
    }
}
Main implementation:
package cn.jixiang.main;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.jixiang.bean.JobBean;

public class TestMain {

    private static int count = 0;

    public static void main(String[] args) {
        String strUrl = "https://search.51job.com/list/170200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
        Document document = getDom(strUrl);
        List<JobBean> list = getPageInfo(document);
        System.out.println("---------------" + (++count) + "-------------");
        for (JobBean jobBean : list) {
            System.out.println(jobBean);
        }
        getNextPageInfo(document);
    }

    // Follows the "next page" link recursively until no further link exists.
    public static void getNextPageInfo(Document document) {
        Elements elements = document.select(".bk");
        // The second .bk element holds the "next page" link.
        Element element = elements.get(1);
        String strUrl = element.select("a").attr("href");
        // attr() returns an empty string when the attribute is missing (i.e. on the last page).
        if (strUrl == null || strUrl.isEmpty()) {
            return;
        }
        Document dom = getDom(strUrl);
        List<JobBean> list = getPageInfo(dom);
        System.out.println("---------------" + (++count) + "-------------");
        for (JobBean jobBean : list) {
            System.out.println(jobBean);
        }
        try {
            // Pause briefly between requests to avoid hammering the server.
            Thread.sleep(400);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        getNextPageInfo(dom);
    }

    // Extracts one JobBean per job row on the current page.
    public static List<JobBean> getPageInfo(Document document) {
        List<JobBean> list = new ArrayList<JobBean>();
        Elements elements = document.select("#resultList .el");
        // The first .el element is the table header row; skip it.
        elements.remove(0);
        for (Element element : elements) {
            Elements elements2 = element.select("span");
            JobBean jobBean = new JobBean();
            jobBean.set(elements2.get(0).text(), elements2.get(1).text(), elements2.get(2).text(),
                    elements2.get(3).text(), elements2.get(4).text());
            list.add(jobBean);
        }
        return list;
    }

    // Downloads and parses the page at the given URL with a 4-second timeout.
    public static Document getDom(String strUrl) {
        try {
            URL url = new URL(strUrl);
            Document document = Jsoup.parse(url, 4000);
            return document;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
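A side note: getDom fetches the page with Jsoup.parse(url, 4000), which does not send a browser-style User-Agent header, and the site may reject such requests. If that happens, one option is to fetch through Jsoup.connect instead; a minimal sketch of a replacement getDom (the User-Agent string here is only an example):

public static Document getDom(String strUrl) {
    try {
        // Jsoup.connect lets us set a browser-like User-Agent and a timeout before fetching.
        return Jsoup.connect(strUrl)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)") // example UA string
                .timeout(4000)
                .get();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

connect() also allows adding headers, cookies, and a referrer if this simple version is not enough.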
An alternative approach: the URL of each result page differs only in its page number, so we can first read the total number of pages and then loop over the page URLs, scraping each one in turn.
Implementation:
package cn.jixiang.main;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.jixiang.bean.JobBean;

public class TestMain2 {

    public static void main(String[] args) {
        String strUrl = "https://search.51job.com/list/170200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
        Document document = getDom(strUrl);
        int pageNum = getPageNum(document);
        // Each page URL differs only in its page number, so build and fetch them one by one.
        for (int i = 1; i <= pageNum; i++) {
            strUrl = "https://search.51job.com/list/170200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2," + i + ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
            Document document2 = getDom(strUrl);
            List<JobBean> list = getPageInfo(document2);
            System.out.println("-------------" + i + "--------------");
            for (JobBean jobBean : list) {
                System.out.println(jobBean);
            }
        }
    }

    // Reads the total page count from the first .td element, whose text contains the number of pages.
    public static int getPageNum(Document document) {
        Elements elements = document.select(".td");
        String text = elements.get(0).text();
        // Strip everything that is not a digit, leaving only the page count.
        Pattern pattern = Pattern.compile("[^0-9]");
        Matcher matcher = pattern.matcher(text);
        String pageNum = matcher.replaceAll("");
        return Integer.parseInt(pageNum);
    }

    // Extracts one JobBean per job row on the current page.
    public static List<JobBean> getPageInfo(Document document) {
        List<JobBean> list = new ArrayList<JobBean>();
        Elements elements = document.select("#resultList .el");
        // The first .el element is the table header row; skip it.
        elements.remove(0);
        for (Element element : elements) {
            Elements elements2 = element.select("span");
            JobBean jobBean = new JobBean();
            jobBean.set(elements2.get(0).text(), elements2.get(1).text(), elements2.get(2).text(),
                    elements2.get(3).text(), elements2.get(4).text());
            list.add(jobBean);
        }
        return list;
    }

    // Downloads and parses the page at the given URL with a 4-second timeout.
    public static Document getDom(String strUrl) {
        try {
            URL url = new URL(strUrl);
            Document document = Jsoup.parse(url, 4000);
            return document;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
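For clarity, getPageNum works because the text of the first .td element contains the total page count as its only digits; stripping every non-digit character leaves just that number. A small standalone illustration (the sample string below is made up for the example; the real text comes from document.select(".td")):

String text = "共54页，到第页";                    // hypothetical sample of the page-count text
String pageNum = text.replaceAll("[^0-9]", "");   // remove everything that is not a digit -> "54"
System.out.println(Integer.parseInt(pageNum));    // prints 54

Note that this only works if the page count is the only number in that text; if other digits appeared, they would be concatenated and the result would be wrong.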