抓取 HTTP 网页的全部链接
ASP.NET 中抓取网页的全部链接
效果图:
后台代码实现:
usingSystem;
usingSystem.Collections;
usingSystem.ComponentModel;
usingSystem.Data;
usingSystem.Drawing;
usingSystem.Web;
usingSystem.Web.SessionState;
usingSystem.Web.UI;
usingSystem.Web.UI.WebControls;
usingSystem.Web.UI.HtmlControls;
usingSystem.Net;
usingSystem.IO;
usingSystem.Collections;
usingSystem.Text.RegularExpressions;
namespace getwebsite
{
    /// <summary>
    /// Web Forms page that downloads the document at the URL typed into
    /// <c>TextBox1</c> and writes every distinct absolute <c>http://</c> link
    /// found in it into <c>TextBox2</c>, separated by single spaces.
    /// </summary>
    public class WebForm1 : System.Web.UI.Page
    {
        protected System.Web.UI.WebControls.TextBox TextBox1;
        protected System.Web.UI.WebControls.Button Button1;
        protected System.Web.UI.WebControls.TextBox TextBox2;
        // NOTE(review): presumably validates the URL typed into TextBox1; its
        // configuration lives in the .aspx markup, which is not visible here.
        protected System.Web.UI.WebControls.RegularExpressionValidator RegularExpressionValidator1;

        /// <summary>
        /// Matches absolute http:// URLs: one or more dot-terminated host labels,
        /// a final label, and an optional path/query made of word characters and
        /// <c>. / ? % &amp; = -</c>. The hyphen is placed last in each character
        /// class so that it is always a literal and never part of a range.
        /// </summary>
        private static readonly Regex LinkPattern =
            new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w./?%&=-]*)?", RegexOptions.IgnoreCase);

        private void Page_Load(object sender, System.EventArgs e)
        {
            if (!this.IsPostBack)
            {
            }
            // Put user code to initialize the page here
        }

        #region Web Form Designer generated code
        override protected void OnInit(System.EventArgs e)
        {
            //
            // CODEGEN: This call is required by the ASP.NET Web Form Designer.
            //
            InitializeComponent();
            base.OnInit(e);
        }

        /// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.Button1.Click += new System.EventHandler(this.Button1_Click);
            this.Load += new System.EventHandler(this.Page_Load);
        }
        #endregion

        /// <summary>
        /// Click handler for <c>Button1</c>: fetches the page at the URL in
        /// <c>TextBox1</c> and lists each distinct http:// link once in
        /// <c>TextBox2</c>, in order of first appearance.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        /// <remarks>
        /// Exceptions raised by <c>WebRequest.Create</c> or <c>GetResponse</c>
        /// (malformed URL, network failure) are deliberately not caught here and
        /// propagate exactly as before.
        /// </remarks>
        private void Button1_Click(object sender, System.EventArgs e)
        {
            this.TextBox2.Text = "";

            // Nothing to fetch: leave the output empty rather than letting
            // WebRequest.Create throw on an empty string.
            string web_url = this.TextBox1.Text.Trim();
            if (web_url.Length == 0)
            {
                return;
            }

            // The cast is kept on purpose: it rejects non-HTTP schemes such as
            // file:// by throwing InvalidCastException instead of reading them.
            HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create(web_url);

            // Dispose the response and the reader even when reading fails, so the
            // underlying connection is always released.
            string all_code;
            using (WebResponse all_codeResponse = all_codeRequest.GetResponse())
            using (StreamReader sr = new StreamReader(all_codeResponse.GetResponseStream()))
            {
                all_code = sr.ReadToEnd();
            }

            // Links already emitted. Each new link is recorded here, which is what
            // makes the duplicate filter effective.
            Hashtable seen = new Hashtable();
            System.Text.StringBuilder links = new System.Text.StringBuilder();

            foreach (Match match in LinkPattern.Matches(all_code))
            {
                string name = match.Value;
                if (seen.ContainsKey(name))
                {
                    continue; // filter out repeats
                }

                seen.Add(name, null);
                links.Append(name).Append(" ");
            }

            // Assign once instead of growing the control's text inside the loop.
            this.TextBox2.Text = links.ToString();
        }
    }
}
usingSystem.Collections;
usingSystem.ComponentModel;
usingSystem.Data;
usingSystem.Drawing;
usingSystem.Web;
usingSystem.Web.SessionState;
usingSystem.Web.UI;
usingSystem.Web.UI.WebControls;
usingSystem.Web.UI.HtmlControls;
usingSystem.Net;
usingSystem.IO;
usingSystem.Collections;
usingSystem.Text.RegularExpressions;
namespace getwebsite
{
    /// <summary>
    /// Web Forms page that downloads the document at the URL typed into
    /// <c>TextBox1</c> and writes every distinct absolute <c>http://</c> link
    /// found in it into <c>TextBox2</c>, separated by single spaces.
    /// </summary>
    public class WebForm1 : System.Web.UI.Page
    {
        protected System.Web.UI.WebControls.TextBox TextBox1;
        protected System.Web.UI.WebControls.Button Button1;
        protected System.Web.UI.WebControls.TextBox TextBox2;
        // NOTE(review): presumably validates the URL typed into TextBox1; its
        // configuration lives in the .aspx markup, which is not visible here.
        protected System.Web.UI.WebControls.RegularExpressionValidator RegularExpressionValidator1;

        /// <summary>
        /// Matches absolute http:// URLs: one or more dot-terminated host labels,
        /// a final label, and an optional path/query made of word characters and
        /// <c>. / ? % &amp; = -</c>. The hyphen is placed last in each character
        /// class so that it is always a literal and never part of a range.
        /// </summary>
        private static readonly Regex LinkPattern =
            new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w./?%&=-]*)?", RegexOptions.IgnoreCase);

        private void Page_Load(object sender, System.EventArgs e)
        {
            if (!this.IsPostBack)
            {
            }
            // Put user code to initialize the page here
        }

        #region Web Form Designer generated code
        override protected void OnInit(System.EventArgs e)
        {
            //
            // CODEGEN: This call is required by the ASP.NET Web Form Designer.
            //
            InitializeComponent();
            base.OnInit(e);
        }

        /// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.Button1.Click += new System.EventHandler(this.Button1_Click);
            this.Load += new System.EventHandler(this.Page_Load);
        }
        #endregion

        /// <summary>
        /// Click handler for <c>Button1</c>: fetches the page at the URL in
        /// <c>TextBox1</c> and lists each distinct http:// link once in
        /// <c>TextBox2</c>, in order of first appearance.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        /// <remarks>
        /// Exceptions raised by <c>WebRequest.Create</c> or <c>GetResponse</c>
        /// (malformed URL, network failure) are deliberately not caught here and
        /// propagate exactly as before.
        /// </remarks>
        private void Button1_Click(object sender, System.EventArgs e)
        {
            this.TextBox2.Text = "";

            // Nothing to fetch: leave the output empty rather than letting
            // WebRequest.Create throw on an empty string.
            string web_url = this.TextBox1.Text.Trim();
            if (web_url.Length == 0)
            {
                return;
            }

            // The cast is kept on purpose: it rejects non-HTTP schemes such as
            // file:// by throwing InvalidCastException instead of reading them.
            HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create(web_url);

            // Dispose the response and the reader even when reading fails, so the
            // underlying connection is always released.
            string all_code;
            using (WebResponse all_codeResponse = all_codeRequest.GetResponse())
            using (StreamReader sr = new StreamReader(all_codeResponse.GetResponseStream()))
            {
                all_code = sr.ReadToEnd();
            }

            // Links already emitted. Each new link is recorded here, which is what
            // makes the duplicate filter effective.
            Hashtable seen = new Hashtable();
            System.Text.StringBuilder links = new System.Text.StringBuilder();

            foreach (Match match in LinkPattern.Matches(all_code))
            {
                string name = match.Value;
                if (seen.ContainsKey(name))
                {
                    continue; // filter out repeats
                }

                seen.Add(name, null);
                links.Append(name).Append(" ");
            }

            // Assign once instead of growing the control's text inside the loop.
            this.TextBox2.Text = links.ToString();
        }
    }
}