使用xpath提取页面数据-代码实例
使用xpath提取页面数据,下面简单介绍一个代码实例
随便拿一个网站抓取里面的数据,比如安徽人大中的地方法规:
链接地址:http://www.ahrd.gov.cn/npcweb/web/list.jsp?colId=1366078128953013
右击查看源:
抓取标题时间链接
实例代码如下
public void zqmz_dffg(){
String url="http://www.ahrd.gov.cn/npcweb/web/list.jsp?colId=1366078128953013";
try {
String content = doGet(url);
/**
* htmlcleaner是对html分析提取数据,个人 觉得 htmlcleaner 比 htmlparser 好用。htmlcleaner 的 xpath特好用。
* htmlcleaner 对不规范的html兼容性比较好。
* htmlcleaner.clean()中的参数,可以是文件,可以是url,可以是字符串内容。
*/
HtmlCleaner hc = new HtmlCleaner();
TagNode tn = hc.clean(content);
Document dom = new DomSerializer(new CleanerProperties()).createDOM(tn);
/**
* 1. / 表示绝对路径 表示从xml的根位置开始或子元素(一个层次结构)
* 2. // 表示相对路劲 表示不分任何层次结构的选择元素。
* 3. * 表示通配符 表示匹配所有元素
*/
// 具体参照W3School官方文档:http://www.w3school.com.cn/xpath/index.asp
String pathUri = "//ul[@class='sublist_lm']/li";
XPath xPath = XPathFactory.newInstance().newXPath();
Object result;
result = xPath.evaluate(pathUri, dom, XPathConstants.NODESET);
if (result instanceof NodeList) {
NodeList nodeList = (NodeList) result;
for (int i = 0; i < nodeList; i++) {
Node node = nodeList.item(i);
System.out.println("------------");
//System.out.println("node--value-"+node.);
Object dateNode = xPath.evaluate(pathUri+"[position() = "+(i+1)+"]/i", node, XPathConstants.NODESET);
Object aNode = xPath.evaluate(pathUri+"[position() = "+(i+1)+"]/a", node, XPathConstants.NODESET);
String title = "";
String date = "";
String href = "";
String cont = "";
if(dateNode instanceof NodeList) {
NodeList tempList = (NodeList)dateNode;
if(tempList.getLength() > 0) {
date = tempList.item(0).getTextContent();
}
}
if(aNode instanceof NodeList) {
NodeList tempList = (NodeList)aNode;
if(tempList.getLength() > 0) {
title = tempList.item(0).getTextContent().trim();
Node hrefNode = tempList.item(0).getAttributes().getNamedItem("href");
if(hrefNode != null) {
href = hrefNode.getTextContent();
}
}
}
System.out.println((i+1)+"--标题--"+title+"--时间--"+date+"--链接--"+href);
补充:如果一天抓取一次,对重复数据的处理,我是通过标题来判断的标题相同就不给再次添加
Map<String, String> map=new HashMap<String, String>();
//infoList查取的是数据库中抓取的值,遍历标题存放到map集合中去,然后判断map中含有新抓取的标题直接跳过,没有添加
if(infoList.size()>0){
for(Info infoModel:infoList){
map.put(infoModel.getStrMasTitle(),infoModel.getStrMasTitle());
}
boolean contains = map.containsKey(title);
if (contains) {
System.out.println("在Map集合中包含键名" + title);
continue;
} else {
System.out.println("在Map集合中不包含键名" + title);
this.saveObj(info);
}
}else{
this.saveObj(info);
}
}
}
}catch(Exception e) {
e.printStackTrace();
}
}
/**
* 发送 get请求
*/
public static String doGet(String url) {
CloseableHttpClient httpclient = HttpClients.createDefault();
String result = "";
try {
// 创建httpget.
HttpGet httpget = new HttpGet(url);
// 执行get请求.
RequestConfig requestConfig = RequestConfig.custom()
.setSocketTimeout(5000)
.setConnectTimeout(5000)
.build();//设置请求和传输超时时间
httpget.setConfig(requestConfig);
CloseableHttpResponse response = httpclient.execute(httpget);
try {
// 获取响应实体
HttpEntity entity = response.getEntity();
// 打印响应状态
System.out.println(response.getStatusLine());
if (entity != null) {
result = EntityUtils.toString(entity,"utf-8");
}
} finally {
response.close();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
// 关闭连接,释放资源
try {
httpclient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return result;
}
public static void main(String[] args) {
Tests test=new Tests();
test.zqmz_dffg();
}
jar包下载地址:链接: https://pan.baidu.com/s/16BU-Kld6tn3P66qyapzMDg 密码: rm6s