IDEA 爬虫爬取招聘信息(大数据)
某工厂来学校培训大数据爬虫,先提供一个网页(index.jsp):
<%@ page language="java" import="java.util.*" pageEncoding="utf-8"%>
<html>
<head>
<link rel="stylesheet" href="layui/css/layui.css">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>大数据-烟台徐老师</title>
<script src="js/jquery-3.2.1.min.js"></script>
<script src="layui/layui.js"></script>
<script src="js/indexAjax.js"></script>
<script src="js/indexAjax2.js"></script>
<script src="js/indexAjax3.js"></script>
<script src="js/indexAjax4.js"></script>
<script src="js/indexAjax5.js"></script>
<script>
/**
 * Asks the server to run the crawl exposed at "/scrapy.do" and reports the
 * outcome to the user in a layui dialog.
 *
 * The URL prefix is a JSP EL expression, so it is filled in on the server
 * before this script reaches the browser.
 */
function scrapy() {
    // Shows a message in a layui "layer" dialog, loading the module on demand.
    function notify(message) {
        layui.use('layer', function(){
            layui.layer.alert(message);
        });
    }

    $.ajax({
        url:'${pageContext.request.contextPath }/scrapy.do',
        success:function () {
            notify('爬取成功!');
        },
        // Without an error callback a failed request (404, 500, network down)
        // produced no feedback at all.
        error:function () {
            notify('请求错误!');
        }
    });
}
</script>
<link rel="stylesheet"
href="layui/css/layui.css">
</head>
<body class="layui-layout-body">
<div class="layui-layout layui-layout-admin">
<div class="layui-header">
<div class="layui-logo">
<a href="index.jsp"><img src="images/logoblack.png" /></a>
</div>
<!-- 头部区域(可配合layui已有的水平导航) -->
<ul class="layui-nav layui-layout-left">
<li class="layui-nav-item"><a href="">控制台</a></li>
<li class="layui-nav-item"><a href="">商品管理</a></li>
<li class="layui-nav-item"><a href="">用户</a></li>
<li class="layui-nav-item"><a href="javascript:;">BigData</a>
<dl class="layui-nav-child">
<dd>
<a href="showCrawlerData.jsp" target="main">查询数据</a>
</dd>
<dd>
<a href="recruit/insertZhiRecruit">智联招聘</a>
</dd>
<dd>
<a href="javascript:scrapy();">前程无忧</a>
</dd>
<dd>
<a href="recruit/insertBossRecruit">Boss直聘</a>
</dd>
<dd>
<a href="recruit/delAllRecruit">删库谨慎</a>
</dd>
</dl>
</li>
</ul>
<ul class="layui-nav layui-layout-right">
<li class="layui-nav-item"><a href="javascript:;"> <img
src="images/logo.png" class="layui-nav-img"> 某大牛培训老师 </a>
<dl class="layui-nav-child">
<dd>
<a href="javascript:ajaxRequest5();">基本资料</a>
</dd>
<dd>
<a href="echarts.jsp">安全设置</a>
</dd>
</dl>
</li>
<li class="layui-nav-item"><a href="javascript:;">退了</a></li>
</ul>
</div>
<div class="layui-side layui-bg-black">
<div class="layui-side-scroll">
<!-- 左侧导航区域(可配合layui已有的垂直导航) -->
<ul id="menu" class="layui-nav layui-nav-tree" lay-filter="test">
<li class="layui-nav-item"><a class=""
href="javascript:;">爬虫管理</a>
<dl class="layui-nav-child">
<dd>
<a href="javascript:ajaxRequest();">爬取招聘数据</a>
</dd>
<dd>
<a href="javascript:ajaxRequest3();">爬取行政区域</a>
</dd>
<dd>
<a href="javascript:ajaxRequest2();">爬取图片</a>
</dd>
</dl>
</li>
<li class="layui-nav-item"><a href="javascript:;">数据处理</a>
<dl class="layui-nav-child">
<dd>
<a href="ik/ikData">生成分词</a>
</dd>
<dd>
<a href="ik/ikData">招聘信息分析</a>
</dd>
</dl>
</li>
<li class="layui-nav-item"><a href="javascript:;">大数据处理</a>
<dl class="layui-nav-child">
<dd>
<a href="ik/ikData">生成分词</a>
</dd>
<dd>
<a href="javascript:ajaxRequest4();" target="main">提交数据</a>
</dd>
<dd>
<a href="ik/ikData">大数据统计</a>
</dd>
</dl>
</li>
<li class="layui-nav-item"><a href="javascript:;">系统管理</a>
<dl class="layui-nav-child">
<dd>
<a href="javascript:;">用户管理</a>
</dd>
<dd>
<a href="javascript:;">权限管理</a>
</dd>
</dl>
</li>
</ul>
</div>
</div>
<div class="layui-body" style="padding: 5px">
<!-- 内容主体区域 -->
<div >
<iframe name="main" frameborder="0" width="100%" height="90%" src="welcome.html"></iframe>
</div>
</div>
<div class="layui-footer">
<!-- 底部固定区域 -->
</div>
</div>
<script src="${pageContext.request.contextPath }/layui/layui.js"></script>
<script>
// Page script: once layui's jquery and element modules are loaded, clicking
// any <li> under #menu removes the "layui-nav-itemed" class from its siblings.
layui.use(['jquery','element'], function(){
    var element = layui.element;
    var jq = layui.$;

    function collapseSiblings(){
        jq(this).siblings().removeClass("layui-nav-itemed");
    }

    jq("#menu li").on("click", collapseSiblings);
});
</script>
</body>
</html>
package com.ld.jsoup.servlet;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
//ctrl+o
/**
 * Servlet that crawls 51job search-result pages for the keyword "java" and
 * prints the title of every job posting it finds to standard output.
 *
 * <p>Each request fetches result pages 1 to 10 on the request thread. For every
 * page that loads, a task is submitted to a fixed pool of worker threads; the
 * task follows each job link on that page and prints the posting's
 * {@code <h1>} title. The response body is always the text {@code success},
 * written as soon as the tasks have been submitted (the crawl itself may still
 * be running).
 */
public class JsoupServlet extends HttpServlet {

    /** Number of worker threads used to fetch job-detail pages. */
    private static final int WORKER_THREADS = 5;

    /** First search-result page to crawl (inclusive). */
    private static final int FIRST_PAGE = 1;

    /** Last search-result page to crawl (inclusive). */
    private static final int LAST_PAGE = 10;

    /** Part of the search URL that precedes the page number. */
    private static final String LIST_URL_PREFIX =
            "https://search.51job.com/list/120400%252C010000,000000,0000,00,9,99,java,2,";

    /**
     * Part of the search URL that follows the page number. The parameter is
     * spelled {@code degreefrom}; the previous text "°reefrom" was the result of
     * "&amp;deg" being decoded as an HTML entity.
     */
    private static final String LIST_URL_SUFFIX =
            ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
            + "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
            + "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
            + "&specialarea=00&from=&welfare=";

    /**
     * Starts the crawl and answers with the plain text {@code success}.
     *
     * @param request  the incoming request; only its character encoding is set
     * @param response receives the text {@code success}
     * @throws ServletException declared by the servlet contract
     * @throws IOException      if the response writer cannot be obtained
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("UTF-8");
        response.setContentType("text/html;charset=UTF-8");
        PrintWriter out = response.getWriter();

        ExecutorService executorService = Executors.newFixedThreadPool(WORKER_THREADS);
        try {
            for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
                String url = LIST_URL_PREFIX + page + LIST_URL_SUFFIX;
                try {
                    final Document listing = Jsoup.connect(url).get();
                    executorService.execute(new Runnable() {
                        @Override
                        public void run() {
                            printJobTitles(listing);
                        }
                    });
                } catch (IOException e) {
                    // One unreachable result page must not stop the remaining pages.
                    log("Failed to fetch result page " + url, e);
                }
                System.out.println("-----------------------------------------------------");
            }
        } finally {
            // Lets already-submitted tasks finish, then releases the worker
            // threads; without this every request leaked a whole thread pool.
            executorService.shutdown();
        }

        out.print("success");
        out.close();
    }

    /**
     * Handles POST exactly like GET.
     */
    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        doGet(req, resp);
    }

    /**
     * Follows every job link on one search-result page and prints the text of
     * each {@code div.cn h1} element found on the linked page.
     *
     * @param listing a parsed search-result page
     */
    private void printJobTitles(Document listing) {
        Elements links = listing.select("p.t1.tg1 span a");
        for (Element link : links) {
            String detailUrl = link.absUrl("href");
            if (detailUrl.isEmpty()) {
                // absUrl yields "" when the href cannot be made absolute.
                continue;
            }

            Document detail;
            try {
                detail = Jsoup.connect(detailUrl).get();
            } catch (IOException e) {
                // Skip this posting; previously the code went on with a null
                // document and the task died with a NullPointerException.
                log("Failed to fetch job page " + detailUrl, e);
                continue;
            }

            for (Element title : detail.select("div.cn h1")) {
                // Print the heading from the detail page (the old code printed
                // the link text again and never used the heading element).
                System.out.println("jsoup====" + title.text());
            }
        }
    }
}
/**
 * Sends a GET request to the relative URL "jsoup", asking for a plain-text
 * response. The response text is passed to ifSuccess; if the request itself
 * fails, an alert is shown instead.
 */
function ajaxRequest() {
    var options = {
        url: "jsoup",
        type: "GET",
        dataType: "text",
        success: ifSuccess,
        error: function () {
            alert("请求错误!");
        }
    };
    $.ajax(options);
}
/**
 * Success callback for ajaxRequest: tells the user whether the crawl worked.
 *
 * @param data response body; the text "success" means the crawl succeeded,
 *             anything else is reported as a failure
 */
function ifSuccess(data){
    var message = (data == "success") ? "爬取成功!" : "爬取失败!";
    alert(message);
}
<?xml version="1.0" encoding="UTF-8"?>
<web-app xmlns="http://java.sun.com/xml/ns/javaee"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
http://java.sun.com/xml/ns/javaee/web-app_3_0.xsd"
version="3.0">
<!-- Servlet 3.0 deployment descriptor: declares the welcome page and maps
     five servlets from com.ld.jsoup.servlet to their URLs. -->
<display-name>Archetype Created Web Application</display-name>
<!-- index.jsp is the welcome file. -->
<welcome-file-list>
<welcome-file>index.jsp</welcome-file>
</welcome-file-list>
<!-- JsoupImgServlet is served at /src. -->
<servlet>
<servlet-name>JsoupImgServlet</servlet-name>
<servlet-class>com.ld.jsoup.servlet.JsoupImgServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>JsoupImgServlet</servlet-name>
<url-pattern>/src</url-pattern>
</servlet-mapping>
<!-- JsoupServlet is served at /jsoup; the ajaxRequest() script requests the
     relative URL "jsoup". -->
<servlet>
<servlet-name>JsoupServlet</servlet-name>
<servlet-class>com.ld.jsoup.servlet.JsoupServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>JsoupServlet</servlet-name>
<url-pattern>/jsoup</url-pattern>
</servlet-mapping>
<!-- JobServlet is served at /JobServlet. -->
<servlet>
<servlet-name>JobServlet</servlet-name>
<servlet-class>com.ld.jsoup.servlet.JobServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>JobServlet</servlet-name>
<url-pattern>/JobServlet</url-pattern>
</servlet-mapping>
<!-- uploadFileServlet is served at /uploadFileServlet (the class name really
     does start with a lower-case letter). -->
<servlet>
<servlet-name>uploadFileServlet</servlet-name>
<servlet-class>com.ld.jsoup.servlet.uploadFileServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>uploadFileServlet</servlet-name>
<url-pattern>/uploadFileServlet</url-pattern>
</servlet-mapping>
<!-- JobToFileServlet is served at /JobToFile. -->
<servlet>
<servlet-name>JobToFileServlet</servlet-name>
<servlet-class>com.ld.jsoup.servlet.JobToFileServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>JobToFileServlet</servlet-name>
<url-pattern>/JobToFile</url-pattern>
</servlet-mapping>
</web-app>
点击运行,爬取招聘数据
目前还是刚入门,只能做些比较简单的东西。不过线程池确实很好用;下次学习把爬取结果存入数据库,那样会更实用。