// This program fetches the page content for a single URL only (pageSpider.java). The program flow chart is as follows:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
/**
 * Fetches the page behind a single URL over HTTP and prints it, line by line, to standard output.
 *
 * <p>Constructing an instance performs the whole fetch: the constructor creates the connection
 * object, runs {@link #run()} on a worker thread and waits for that thread to finish before
 * returning. Failures are reported on standard error and never propagate to the caller.
 */
public class pageSpider implements Runnable {
    /** Address fetched by the no-argument constructor. */
    private static final String DEFAULT_URL = "http://www.baidu.com";

    /** Charset used to decode the body when the response does not declare a supported one. */
    private static final String DEFAULT_CHARSET = "gb2312";

    /** Connection to {@link #url}; {@code null} when the connection object could not be created. */
    HttpURLConnection httpUrlConnection;
    /** Raw response stream of the most recent fetch; closed once {@link #run()} returns. */
    InputStream inputStream;
    /** Decoding reader over {@link #inputStream}; closed once {@link #run()} returns. */
    BufferedReader bufferedReader;
    /** Address of the page to fetch. */
    String url;

    /** Fetches and prints the default page ({@code http://www.baidu.com}). */
    public pageSpider() {
        this(DEFAULT_URL);
    }

    /**
     * Fetches and prints the page at {@code url}.
     *
     * <p>The start and end marker lines are always printed, even when the URL is malformed or is
     * not an HTTP(S) URL; in those cases the problem is reported and nothing is fetched.
     *
     * @param url address of the page to fetch
     */
    public pageSpider(String url) {
        this.url = url;
        httpUrlConnection = openHttpConnection(url);

        System.out.println("---------start-----------");
        // All fields are assigned before the worker starts, so run() sees a fully set-up object.
        Thread thread = new Thread(this);
        thread.start();
        try {
            thread.join();
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        System.out.println("----------end------------");
    }

    /**
     * Issues a GET request on the prepared connection and prints the response body to standard
     * output. Does nothing when no connection could be created. The connection is always
     * disconnected afterwards, whether or not the request succeeded.
     */
    @Override
    public void run() {
        if (httpUrlConnection == null) {
            return; // the constructor has already reported why there is no connection
        }
        try {
            httpUrlConnection.setRequestMethod("GET");
        } catch (ProtocolException e) {
            e.printStackTrace();
        }
        try {
            httpUrlConnection.setUseCaches(true); // allow cached responses
            httpUrlConnection.connect();
            printBody();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            httpUrlConnection.disconnect();
        }
    }

    /** Reads the response body line by line and echoes each line to standard output. */
    private void printBody() throws IOException {
        try (InputStream in = httpUrlConnection.getInputStream();
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, resolveCharset(httpUrlConnection.getContentType())))) {
            // Keep the package-visible fields pointing at the streams of the latest fetch.
            inputStream = in;
            bufferedReader = reader;
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }

    /** Creates the HTTP connection object for {@code url}, or returns {@code null} after reporting why not. */
    private static HttpURLConnection openHttpConnection(String url) {
        try {
            URLConnection connection = new URL(url).openConnection();
            if (connection instanceof HttpURLConnection) {
                return (HttpURLConnection) connection;
            }
            System.err.println("Not an HTTP(S) URL, nothing to fetch: " + url);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Picks the charset named by the {@code charset} parameter of a Content-Type header value,
     * falling back to {@code gb2312} when the parameter is absent, illegal or unsupported.
     */
    private static String resolveCharset(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        final String key = "charset=";
        for (String part : contentType.split(";")) {
            String parameter = part.trim();
            if (!parameter.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = parameter.substring(key.length()).trim();
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1); // the value may be a quoted string
            }
            try {
                if (Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalArgumentException ignored) {
                // Illegal (e.g. empty) charset name in the header: use the default instead.
            }
        }
        return DEFAULT_CHARSET;
    }

    /** Entry point: fetches and prints the default page. */
    public static void main(String[] args) {
        new pageSpider();
    }
}