学习Java爬虫Day01-抓取百度实时热点
1.创建一个maven工程
pom文件如下设置
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.hua.cn</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- JDBC driver for MySQL; version must match the target MySQL server -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.37</version>
</dependency>
<!-- MyBatis ORM core -->
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis</artifactId>
<version>3.4.2</version>
</dependency>
<!-- tk.mybatis "mapper" provides generic single-table CRUD, so per-table SQL does not have to be written by hand -->
<!-- https://mvnrepository.com/artifact/tk.mybatis/mapper -->
<dependency>
<groupId>tk.mybatis</groupId>
<artifactId>mapper</artifactId>
<version>4.1.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup — HTML fetching/parsing library used for crawling -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
</dependencies>
</project>
2.创建BaiDuNews实体类
package com.hua.po;
import javax.persistence.Column;
import javax.persistence.Id;
import javax.persistence.Table;
/**
 * Created by hua on 2019/3/31.
 * @description: PO (persistent object) for one Baidu realtime-hotspot entry,
 * mapped to table {@code baidu_news}.
 */
@Table(name = "baidu_news")
public class BaiDuNews {
    // @Id is required by tk.mybatis: without it, every column is treated as
    // part of the primary key and the *ByPrimaryKey methods misbehave.
    @Id
    private Integer id;
    // hot-search keyword text
    private String keyword;
    // category tab the keyword was crawled from (e.g. "实时热点")
    private String type;
    // relevance/trend marker shown on the page
    private String clazz;
    // search index value; column name differs from the field, hence @Column
    @Column(name = "search_num")
    private Integer searchNum;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getKeyword() {
        return keyword;
    }

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getClazz() {
        return clazz;
    }

    public void setClazz(String clazz) {
        this.clazz = clazz;
    }

    public Integer getSearchNum() {
        return searchNum;
    }

    public void setSearchNum(Integer searchNum) {
        this.searchNum = searchNum;
    }
}
3.创建与之对应的数据库表
-- AUTO_INCREMENT was missing: the column comment says "主键自增" (auto-increment PK),
-- and the Java code never sets id before insert, so the PK must auto-generate.
CREATE TABLE `baidu_news` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键自增',
  `keyword` varchar(255) DEFAULT NULL COMMENT '关键字',
  `type` varchar(255) DEFAULT NULL COMMENT '类型',
  `clazz` varchar(255) DEFAULT NULL COMMENT '新闻相关性',
  `search_num` int(11) DEFAULT NULL COMMENT '搜索指数',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
4.创建mybatis配置文件
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
<settings>
<!-- camel-case auto-mapping is currently DISABLED (left commented out);
     column mapping relies on @Column annotations instead -->
<!--<setting name="mapUnderscoreToCamelCase" value="false"/>-->
<setting name="defaultStatementTimeout" value="60"/>
</settings>
<typeAliases>
<!-- must match the package where BaiDuNews lives (was com.itcast.po, which does not exist in this project) -->
<package name="com.hua.po"/>
</typeAliases>
<!-- 配置环境, 指定数据库连接信息 -->
<environments default="local">
<environment id="local">
<transactionManager type="JDBC"/>
<dataSource type="POOLED">
<property name="driver" value="com.mysql.jdbc.Driver"/>
<!-- raw '&' is illegal in XML attribute values; it must be escaped as '&amp;'
     or the config file fails to parse -->
<property name="url" value="jdbc:mysql://127.0.0.1:3306/hua_crawler?useSSL=false&amp;serverTimezone=GMT%2B8"/>
<property name="username" value="root"/>
<property name="password" value="root"/>
</dataSource>
</environment>
</environments>
<mappers>
<package name="com.hua.mapper"/>
</mappers>
</configuration>
5.创建获取SqlSession的util类
package com.hua.util;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
import tk.mybatis.mapper.entity.Config;
import tk.mybatis.mapper.mapperhelper.MapperHelper;
/**
 * Created by hua on 2019/3/31.
 * Builds the {@link SqlSessionFactory} once (static initializer) from
 * mybatis-config.xml, registers the tk.mybatis generic mappers into the
 * configuration, and hands out sessions via {@link #getSqlSessionLocal()}.
 */
public class MybatisHelper {
    private static SqlSessionFactory sqlSessionFactoryLocal;

    static {
        try {
            sqlSessionFactoryLocal = new SqlSessionFactoryBuilder()
                    .build(Resources.getResourceAsReader("mybatis-config.xml"), "local");
            // SqlSession is Closeable: try-with-resources replaces the manual
            // null-check/finally and guarantees the session is closed even if
            // mapper registration throws.
            try (SqlSession sessionLocal = sqlSessionFactoryLocal.openSession()) {
                MapperHelper mapperHelper = new MapperHelper();
                Config config = new Config();
                // allow @RegisterMapper-style method annotations on mappers
                config.setEnableMethodAnnotation(true);
                // treat empty strings like null in generated WHERE clauses
                config.setNotEmpty(true);
                mapperHelper.setConfig(config);
                // register the generic CRUD templates so BaiduNewsMapper inherits them
                mapperHelper.registerMapper(Mapper.class);
                mapperHelper.registerMapper(MySqlMapper.class);
                mapperHelper.processConfiguration(sessionLocal.getConfiguration());
            }
        } catch (Exception e) {
            // initialization failure is only logged; getSqlSessionLocal() will
            // then NPE — acceptable for this demo, same as the original code
            e.printStackTrace();
        }
    }

    /**
     * @return a new session against the "local" environment; caller must
     *         commit and close it.
     */
    public static SqlSession getSqlSessionLocal() {
        return sqlSessionFactoryLocal.openSession();
    }
}
6.创建接口BaiduNewsMapper并继承Mapper(里面封装了增删改查)
package com.hua.mapper;
import com.hua.po.BaiDuNews;
import tk.mybatis.mapper.common.Mapper;
/**
 * Created by hua on 2019/3/31.
 * Single-table DAO for {@code BaiDuNews}. All CRUD methods (insert, selectAll,
 * selectByPrimaryKey, ...) are inherited from the generic tk.mybatis
 * {@code Mapper}; no SQL needs to be written here.
 */
public interface BaiduNewsMapper extends Mapper<BaiDuNews> {
}
7.创建主功能类Day01_BaiduNewsCrawler
package com.hua.main;
import com.hua.mapper.BaiduNewsMapper;
import com.hua.po.BaiDuNews;
import com.hua.util.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Created by hua on 2019/3/31.
 * @description: 抓取百度时事热点 — crawls Baidu's realtime hot-search pages
 * (http://top.baidu.com/buzz) with jsoup and stores each entry via MyBatis.
 */
public class Day01_BaiduNewsCrawler {

    public static void main(String[] args) throws IOException {
        // try-with-resources: the original leaked the SqlSession whenever
        // Jsoup.connect(...).get() threw before sqlSession.close() was reached.
        try (SqlSession sqlSession = MybatisHelper.getSqlSessionLocal()) {
            // Obtain the single-table mapper for baidu_news.
            BaiduNewsMapper baiduNewsMapper = sqlSession.getMapper(BaiduNewsMapper.class);

            // Entry page: the "实时热点" (realtime hotspot) ranking.
            String url = "http://top.baidu.com/buzz?b=1";
            Document doc = Jsoup.connect(url).get();
            getElementAndInsert(doc, baiduNewsMapper, "实时热点");

            // #flist holds the category tabs; the first two are skipped
            // (they duplicate the entry page already crawled above).
            Elements lis = doc.select("#flist li");
            for (int i = 2; i < lis.size(); i++) {
                Element link = lis.get(i).selectFirst("a");
                // The tab's title doubles as the "type" column value.
                String title = link.attr("title");
                // href is site-relative ("./buzz?..."); strip the leading dot.
                String href = "http://top.baidu.com" + link.attr("href").substring(1);
                doc = Jsoup.connect(href).get();
                getElementAndInsert(doc, baiduNewsMapper, title);
            }

            // Persist everything crawled in this run.
            sqlSession.commit();
        }
    }

    /**
     * Extracts every ranking row from one category page and inserts it.
     *
     * @param doc             parsed category page
     * @param baiduNewsMapper DAO used for the inserts
     * @param type            category label stored in the "type" column
     */
    public static void getElementAndInsert(Document doc, BaiduNewsMapper baiduNewsMapper, String type) {
        // One <tr> per ranking entry inside the main table.
        Elements trs = doc.select("#main > div.mainBody > div > table tr");
        for (Element tr : trs) {
            String keyword = tr.select(".list-title").text();
            String clazz = tr.select(".tc").text();
            // ".last" holds the search index; header/ad rows contain
            // non-numeric text and are skipped.
            String tempNum = tr.select(".last").text();
            int num;
            try {
                num = Integer.parseInt(tempNum);
            } catch (NumberFormatException ignored) {
                // not a data row — skip (leftover debug print removed)
                continue;
            }
            BaiDuNews baiduNews = new BaiDuNews();
            baiduNews.setKeyword(keyword);
            baiduNews.setClazz(clazz);
            baiduNews.setSearchNum(num);
            baiduNews.setType(type);
            baiduNewsMapper.insert(baiduNews);
            System.out.println("入库: " + keyword);
        }
    }
}
8.最后的项目结构是
9.运行程序,测试结果