HtmlAgilityPack+C#做IP代理爬虫(二)
一、查找资料并尽可能多地抓取IP代理数据,并存储到IP代理池
二、从代理池中筛选数据并把有效数据添加到另一张表中,并及时更新
三、定时更新IP代理池
由于网站的IP代理地址需要实时更新,并且程序需要对数据库中的爬虫IP数据及时筛选,并把有效数据插入到新表中,数据验证一定要及时,所以程序一定要做到高并发。
1、检测IP代理是否可用
/// <summary>
/// Checks a batch of proxy addresses in parallel: every non-empty entry is passed to FilterIP.
/// </summary>
/// <param name="ip">
/// The batch to check, passed as <c>object</c>. It must be a sequence of strings,
/// for example a List&lt;string&gt;.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="ip"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="ip"/> is not a sequence of strings.</exception>
public static void SelectIP(object ip)
{
    if (ip == null)
    {
        throw new ArgumentNullException("ip");
    }

    // Accept any string sequence, not only List<string>; existing List<string> callers are unaffected.
    IEnumerable<string> addresses = ip as IEnumerable<string>;
    if (addresses == null)
    {
        // Fail with a message naming the real problem instead of letting
        // Parallel.ForEach complain about a null "source".
        throw new ArgumentException(
            "Expected a sequence of strings but got " + ip.GetType().FullName + ".", "ip");
    }

    Parallel.ForEach(addresses, address =>
    {
        // Blank entries are skipped rather than checked.
        if (!string.IsNullOrEmpty(address))
        {
            FilterIP(address);
        }
    });
}
// Running count of proxy checks: FilterIP increments it once per check (success or failure)
// and prints the value in its console log line.
public static int i = 0;
// Guards the "set colour, then write line" pair so that checks running in parallel
// cannot interleave their console output or print in each other's colour.
private static readonly object consoleLock = new object();

/// <summary>
/// Checks whether a proxy is usable by sending a small form POST through it to the URL
/// returned by GetStr(). The proxy counts as working when a response arrives without an
/// exception being thrown. The outcome is printed to the console with the running check count.
/// </summary>
/// <param name="ip">
/// Proxy address in a form accepted by the WebProxy(string) constructor
/// (presumably "host:port" - confirm against the code that fills the proxy pool).
/// </param>
public static void FilterIP(string ip) // validate one proxy
{
    // A bad target URL is a configuration error, not a proxy failure, so it is built outside the try.
    var request = (HttpWebRequest)WebRequest.Create(GetStr());
    request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate"); // advertise gzip/deflate support
    request.ContentType = "application/x-www-form-urlencoded"; // form-encoded body
    request.AllowAutoRedirect = false; // do not follow redirects
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"; // present as Google Chrome
    request.KeepAlive = false;
    request.Timeout = 7000; // milliseconds
    request.Method = "POST";
    try
    {
        // Inside the try so that one malformed address is reported as a failed proxy
        // instead of aborting the whole parallel batch.
        request.Proxy = new WebProxy(ip);

        var data = Encoding.UTF8.GetBytes("a=10&b=15");
        // The request body must be fully written and closed before the response is requested.
        using (var requestStream = request.GetRequestStream())
        {
            requestStream.Write(data, 0, data.Length);
        }

        // Getting any response without an exception is the success criterion; the body is not read.
        using (var response = request.GetResponse())
        {
            // Atomic increment: this method runs on several threads at once.
            int checkedCount = System.Threading.Interlocked.Increment(ref i);
            lock (consoleLock)
            {
                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine(DateTime.Now.ToLongTimeString() + " " + checkedCount + " Success: " + ip);
            }
        }
    }
    catch (Exception e)
    {
        // Best-effort check: any failure (malformed address, timeout, refused connection,
        // HTTP error) simply marks this proxy as unusable.
        int checkedCount = System.Threading.Interlocked.Increment(ref i);
        lock (consoleLock)
        {
            Console.ForegroundColor = ConsoleColor.White;
            Console.WriteLine(DateTime.Now.ToLongTimeString() + " " + checkedCount + " Fail: " + e.Message);
        }
    }
2、保存可用的IP代理到TipPool中
// Store a usable proxy: split the "ip:port" address and pass both parts to the insert stored procedure.
string[] addressParts = ip.ToString().Split(':');
int port;
if (addressParts.Length < 2 || !int.TryParse(addressParts[1], out port))
{
    // Without a numeric port there is nothing valid to store; report it instead of throwing.
    Console.Error.WriteLine("Skipped malformed proxy address: " + ip);
}
else
{
    SqlParameter[] insertArgs = new SqlParameter[] {
        new SqlParameter("@ip", addressParts[0]),
        new SqlParameter("@port", port)
    };
    string insertProcedure = "[dbo].[S_store_insert_ip]";
    // Run the database call off the calling thread so proxy checking is not held up by it.
    // Task.Run replaces Action.BeginInvoke, which was never paired with EndInvoke (so failures
    // were lost) and is not supported on .NET Core and later.
    Task.Run(() =>
    {
        SqlHelper.ExecuteNonquery(insertProcedure, CommandType.StoredProcedure, insertArgs);
    }).ContinueWith(
        failed => Console.Error.WriteLine("Insert of proxy " + ip + " failed: " + failed.Exception.GetBaseException().Message),
        TaskContinuationOptions.OnlyOnFaulted);
}
}
3、IP不可用时,在新表TipPool中删除IP
// Remove an unusable proxy: split the "ip:port" address and pass both parts to the delete stored procedure.
string[] addressParts = ip.ToString().Split(':');
int port;
if (addressParts.Length < 2 || !int.TryParse(addressParts[1], out port))
{
    // Without a numeric port there is no row to identify; report it instead of throwing.
    Console.Error.WriteLine("Skipped malformed proxy address: " + ip);
}
else
{
    SqlParameter[] deleteArgs = new SqlParameter[] {
        new SqlParameter("@ip", addressParts[0]),
        new SqlParameter("@port", port)
    };
    string deleteProcedure = "[dbo].[S_store_delete_ip]";
    // Run the database call off the calling thread so proxy checking is not held up by it.
    // Task.Run replaces Action.BeginInvoke, which was never paired with EndInvoke (so failures
    // were lost) and is not supported on .NET Core and later.
    Task.Run(() =>
    {
        SqlHelper.ExecuteNonquery(deleteProcedure, CommandType.StoredProcedure, deleteArgs);
    }).ContinueWith(
        failed => Console.Error.WriteLine("Delete of proxy " + ip + " failed: " + failed.Exception.GetBaseException().Message),
        TaskContinuationOptions.OnlyOnFaulted);
}
4、程序的效果是