正则表达式学习 ---- 抓取远程网页并解析HTML
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpStatus;
- import org.apache.commons.httpclient.methods.GetMethod;
- public class HttpClientDemo {
- /**
- *
- * @param url
- * @return
- * @throws Exception
- */
- public static String getHTML(String url) throws Exception {
- HttpClient httpClient = new HttpClient();
- GetMethod getMethod = new GetMethod(url);
- int statusCode = httpClient.executeMethod(getMethod);
- if (statusCode != HttpStatus.SC_OK) {
- System.err.println("Method failed: " + getMethod.getStatusLine());
- return null;
- }
- // 读取内容
- byte[] responseBody = getMethod.getResponseBody();
- getMethod.releaseConnection();
- return new String(responseBody);
- }
- /**
- *
- * @throws Exception
- */
- public static void test(String url) throws Exception{
- String html = getHTML(url);
- Pattern p = null;
- Matcher m = null;
- StringBuffer sb0 = new StringBuffer();
- // ul正则
- String regex = "<ul class=\"d2_9\">([\\s\\S]*<li>)<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]</li>([\\s].*)";
- // 链接正则
- String regexa = "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]";
- p = Pattern.compile(regex);
- // m = p.matcher(sb.toString());
- m = p.matcher(html);
- int count = 0;
- // ul字符串
- while (m.find()) {
- sb0.append(m.group());
- }
- //System.out.println(sb0.toString());
- p = Pattern.compile(regexa);
- m = p.matcher(sb0.toString());
- // 链接地址和标题
- while (m.find()) {
- System.out.println("地址:" + m.group(1));
- System.out.println("标题:" + m.group(2));
- System.out.println("时间:" + m.group(3));
- count++;
- }
- System.out.println("抓取条数:"+count);
- }
- public static void main(String[] args) throws Exception {
- String url = "http://cpc.people.com.cn/GB/194302/194306/index.html";
- test(url);
- }
- }