网页爬虫程序pageSpider

2009-05-05 19:44

该程序（pageSpider.java）仅抓取单个URL所对应的网页内容。程序流程图（原图缺失）及源代码如下：

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;

/**
 * Minimal single-page crawler: opens an HTTP connection to one URL, downloads
 * the response body on a worker thread and prints it to standard output line
 * by line.
 *
 * <p>Constructing an instance performs the whole fetch: the constructor starts
 * the worker thread and blocks until it has finished. Failures are reported
 * with {@code printStackTrace()} / {@code System.err} and never propagate to
 * the caller.
 */
public class pageSpider implements Runnable {

    /** URL fetched by the no-argument constructor. */
    private static final String DEFAULT_URL = "http://www.baidu.com";

    /** Charset name used to decode the response body. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Connection to {@link #url}; stays {@code null} when it could not be created. */
    HttpURLConnection httpUrlConnection;

    /** Raw response body stream; assigned by {@link #run()}. */
    InputStream inputStream;

    /** Line reader over {@link #inputStream}; assigned by {@link #run()}. */
    BufferedReader bufferedReader;

    /** Address of the page to fetch. */
    String url;

    /**
     * Fetches and prints the default page ({@code http://www.baidu.com}).
     */
    public pageSpider() {
        this(DEFAULT_URL);
    }

    /**
     * Fetches and prints the page at the given address.
     *
     * <p>Prints a start marker, downloads the page on a worker thread, waits
     * for that thread to finish, then prints an end marker. When no HTTP
     * connection can be created for {@code url} the download is skipped but
     * both markers are still printed.
     *
     * @param url address of the page to fetch; must use an HTTP(S) scheme
     */
    public pageSpider(String url) {
        this.url = url;
        httpUrlConnection = openHttpConnection(url);

        System.out.println("---------start-----------");

        // Without a connection there is nothing for the worker to do; starting
        // it anyway would only produce a NullPointerException in run().
        if (httpUrlConnection != null) {
            // All fields read by run() are assigned above, before start().
            Thread thread = new Thread(this);
            thread.start();
            try {
                thread.join();
            } catch (InterruptedException e) {
                e.printStackTrace();
                // Restore the flag so callers can still observe the interrupt.
                Thread.currentThread().interrupt();
            }
        }

        System.out.println("----------end------------");
    }

    /**
     * Creates (but does not yet connect) an HTTP connection for {@code url}.
     *
     * @param url address to open
     * @return the connection, or {@code null} if the URL is malformed, the
     *         connection cannot be created, or the URL is not an HTTP(S) URL
     */
    private static HttpURLConnection openHttpConnection(String url) {
        try {
            java.net.URLConnection connection = new URL(url).openConnection();
            if (connection instanceof HttpURLConnection) {
                return (HttpURLConnection) connection;
            }
            // e.g. file: or ftp: URLs yield other URLConnection subclasses.
            System.err.println("Not an HTTP(S) URL: " + url);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Issues a GET request on {@link #httpUrlConnection} and prints every line
     * of the response body to standard output.
     *
     * <p>Does nothing when no connection is available. The reader, the stream
     * and the connection are always released, whether or not the download
     * succeeds.
     */
    @Override
    public void run() {
        if (httpUrlConnection == null) {
            return;
        }

        try {
            // ProtocolException is an IOException, so one handler covers setup,
            // connect and read; the first failure aborts the whole fetch.
            httpUrlConnection.setRequestMethod("GET");
            httpUrlConnection.setUseCaches(true);
            httpUrlConnection.connect();

            inputStream = httpUrlConnection.getInputStream();
            bufferedReader = new BufferedReader(new InputStreamReader(inputStream, PAGE_CHARSET));

            String line;
            while ((line = bufferedReader.readLine()) != null) {
                System.out.println(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (bufferedReader != null) {
                    // Closing the reader also closes the wrapped input stream.
                    bufferedReader.close();
                } else if (inputStream != null) {
                    inputStream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            // Kept outside the try above so a failing close() cannot skip it.
            httpUrlConnection.disconnect();
        }
    }

    /**
     * Program entry point: fetches and prints the default page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new pageSpider();
    }

}

网页爬虫程序pageSpider

相关推荐