异步Web抓取F#,出错了?
不太确定是否可以这样做,但是,我的问题是:我的代码有问题吗?它不会像我想的那样快,而且由于我使用了很多异步工作流程,所以我可能会做错某些事情。这里的目标是建立一些可以在一个小时内抓取20 000页的内容。
open System
open System.Text
open System.Net
open System.IO
open System.Text.RegularExpressions
open System.Collections.Generic
open System.ComponentModel
open Microsoft.FSharp
open System.Threading
//This is the Parallel.Fs file
/// A Uri with structural comparison on (scheme, host, port, segments), so it
/// can be stored in F# Set/Map (plain System.Uri has no IComparable).
type ComparableUri (uri: string) =
    inherit System.Uri(uri)

    // The components that define identity for comparison/equality/hashing.
    let elts (uri: System.Uri) =
        uri.Scheme, uri.Host, uri.Port, uri.Segments

    interface System.IComparable with
        member this.CompareTo(uri2) =
            compare (elts this) (elts (uri2 :?> ComparableUri))

    override this.Equals(uri2) =
        // Bug fix: the original unconditionally downcast the argument, so
        // Equals threw InvalidCastException on any non-ComparableUri input
        // instead of returning false.
        match uri2 with
        | :? ComparableUri as other -> compare (elts this) (elts other) = 0
        | _ -> false

    override this.GetHashCode() =
        // Bug fix: the original returned 0 for every instance, turning any
        // hash-based collection into a linked list. Hash the same components
        // used by Equals so the Equals/GetHashCode contract holds.
        hash (elts this)
///////////////////////////////////////////////Functions to retrieve html string//////////////////////////////
// Shared crawl state: uris that failed to download, and uris already fetched.
// NOTE(review): every use site does `lock error`/`lock visited`, but the lock
// is taken on the current Set *value* and each update rebinds the variable to
// a brand-new set — two threads can end up locking different objects, so the
// mutual exclusion is not reliable. A dedicated lock object would be safer.
// TODO confirm.
let mutable error = Set.empty<ComparableUri>
let mutable visited = Set.empty<ComparableUri>
/// Downloads [uri] as an HTML string after sleeping [delay] * 250 ms.
/// On any failure the uri is recorded in the `error` and `visited` sets and
/// the sentinel string "BadUri" is returned (filtered out later by getHref).
let getHtmlPrimitiveAsyncDelay (delay:int) (uri : ComparableUri) =
    async {
        try
            let req = (WebRequest.Create(uri)) :?> HttpWebRequest
            // Some servers reject requests that carry no user agent at all.
            req.UserAgent <- "Mozilla"
            do! Async.Sleep(delay * 250)
            // Bug fix: WebResponse is IDisposable — `use!` disposes it when the
            // async block finishes. The original `let!` leaked the response
            // (and its underlying connection) on every successful request,
            // which throttles a crawler badly once the connection pool fills.
            use! resp = req.AsyncGetResponse()
            Console.WriteLine(uri.AbsoluteUri+" got response after delay "+string delay)
            use stream = resp.GetResponseStream()
            use reader = new StreamReader(stream)
            return reader.ReadToEnd()
        with
        | _ as ex ->
            Console.WriteLine(ex.ToString())
            lock error (fun() -> error <- error.Add uri)
            lock visited (fun() -> visited <- visited.Add uri)
            return "BadUri"
    }
///////////////////////////////////////////////Active Pattern Matching to retreive href//////////////////////////////
/// Multi-match active pattern: when [pat] matches [inp] at least once,
/// yields Some [full text of every match]; otherwise None.
let (|Matches|_|) (pat:string) (inp:string) =
    let m = Regex.Matches(inp, pat)
    // Bug fix: the original applied List.tail here, silently dropping the
    // FIRST link on every page. The "skip element 0" trick only applies to
    // Match.Groups (where group 0 is the whole match) — a MatchCollection
    // contains one genuine match per element.
    if m.Count > 0
    then Some [ for g in m -> g.Value ]
    else None
/// Single-match active pattern: on success yields the captured group values;
/// on failure yields None.
let (|Match|_|) (pat:string) (inp:string) =
    let result = Regex.Match(inp, pat)
    if not result.Success then None
    else
        // Group 0 is always the entire matched text, so drop it and keep
        // only the explicit capture groups.
        [ for g in result.Groups -> g.Value ] |> List.tail |> Some
///////////////////////////////////////////////Find Bad href//////////////////////////////
/// True when the link looks like an e-mail address.
let isEmail (link:string) = link.Contains("@")

/// True when the link uses the "mailto" scheme.
let isMailto (link:string) =
    Seq.length link >= 6 && link.[0..5] = "mailto"

/// True when the link is a javascript: pseudo-link.
let isJavascript (link:string) =
    Seq.length link >= 10 && link.[0..9] = "javascript"

/// True when the link is the sentinel the downloader returns on failure.
let isBadUri (link:string) = link = "BadUri"

/// True when the link is a bare, empty "http://" scheme.
let isEmptyHttp (link:string) = link = "http://"

/// True when the link points into the local file system.
let isFile (link:string) =
    Seq.length link >= 6 && link.[0..5] = "file:/"

/// True when the link contains a pipe character (not valid in a Uri).
let containsPipe (link:string) = link.Contains("|")
/// True when the link is an ad link: either starting with "adlink" directly,
/// or an absolute link to an "adLink" host ("http://adLink...").
let isAdLink (link:string) =
    // Bug fix: the original `elif Seq.length link >= 9` branch was
    // unreachable — any string of length >= 9 also has length >= 6 and took
    // the first branch. Worse, it compared a 9-character slice against the
    // 13-character literal "http://adLink", which could never be equal, so
    // absolute ad links were never filtered.
    (Seq.length link >= 6 && link.[0..5] = "adlink")
    || link.StartsWith("http://adLink")
///////////////////////////////////////////////Find Bad href//////////////////////////////
/// Extracts every href="..." target from [htmlString] and drops the kinds of
/// links the crawler must not follow (mailto, javascript, local files, ad
/// links, the "BadUri" sentinel, empty http, pipes, e-mail addresses).
let getHref (htmlString:string) =
    let urlPat = "href=\"([^\"]+)"
    let rawLinks =
        match htmlString with
        | Matches urlPat urls ->
            urls
            |> List.map (fun href ->
                match href with
                | Match urlPat [link] -> link
                | _ -> failwith "The href was not in correct format, there was more than one match")
        | _ ->
            Console.WriteLine("No links for this page")
            []
    // One pass instead of eight chained filters; predicates are checked in
    // the same order as before and short-circuit on the first hit.
    let unwanted =
        [ isEmail; isMailto; isJavascript; isBadUri
          isEmptyHttp; isFile; containsPipe; isAdLink ]
    rawLinks
    |> List.filter (fun link -> not (unwanted |> List.exists (fun bad -> bad link)))
/// Strips an AJAX fragment ("#...") from [href], keeping only the part
/// before the first '#'.
let treatAjax (href:System.Uri) =
    let text = href.ToString()
    let beforeFragment = text.Split([|"#"|], System.StringSplitOptions.None).[0]
    new Uri(beforeFragment)
// Only follow pages with certain extensions, or ones with no extension at all.
/// Decides whether the crawler should fetch [href], based on the tail of the
/// last path segment. Accepted: segments with no '.' at all, or ending in one
/// of the whitelisted extensions below. The checks inspect the last 4, 5,
/// then 6 characters, from shortest extension length to longest.
let followHref (href:System.Uri) =
    // Extension whitelists, keyed by extension length (2, 3, 4 letters).
    let valid2 = set[".py"]
    let valid3 = set[".php";".htm";".asp"]
    let valid4 = set[".php3";".php4";".php5";".html";".aspx"]
    let arrLength = href.Segments |> Array.length
    // Last path segment (e.g. "index.html"); may also be a directory name.
    let lastExtension = (href.Segments).[arrLength-1]
    let lengthLastExtension = Seq.length lastExtension
    if (lengthLastExtension <= 3) then
        // Too short to carry any whitelisted extension: follow only when it
        // contains no dot at all.
        not(lastExtension.Contains("."))
    else
        // Test for the 2-letter extension case (".py") on the last 4 chars.
        let last4 = lastExtension.[(lengthLastExtension-1)-3..(lengthLastExtension-1)]
        let isValid2 = valid2|>Seq.exists(fun validEnd -> last4.EndsWith(validEnd))
        if isValid2 then
            true
        else
            if lengthLastExtension <= 4 then
                not(last4.Contains("."))
            else
                // Test for the 3-letter extensions (".php", ".htm", ".asp").
                let last5 = lastExtension.[(lengthLastExtension-1)-4..(lengthLastExtension-1)]
                let isValid3 = valid3|>Seq.exists(fun validEnd -> last5.EndsWith(validEnd))
                if isValid3 then
                    true
                else
                    if lengthLastExtension <= 5 then
                        not(last5.Contains("."))
                    else
                        // Test for the 4-letter extensions (".html", ".aspx", ".php3"...).
                        let last6 = lastExtension.[(lengthLastExtension-1)-5..(lengthLastExtension-1)]
                        let isValid4 = valid4|>Seq.exists(fun validEnd -> last6.EndsWith(validEnd))
                        if isValid4 then
                            true
                        else
                            // No whitelisted extension found: follow only when
                            // there is no dot and it is not a mailto link.
                            not(last6.Contains(".")) && not(lastExtension.[0..5] = "mailto")
//Create the correct links/-> add the homepage , make then a comparabel Uri
// Create the correct links: resolve against the page's uri, make them ComparableUri.
/// Resolves raw href strings into ComparableUris. Absolute "http..." links
/// stand alone; everything else is resolved relative to [uri], the page the
/// links were found on. Links that fail to parse are logged, the page is
/// recorded in the `error` set, and the link is dropped.
let hrefLinksToUri (uri:ComparableUri) (hrefLinks:string list) =
    hrefLinks
    // Idiom: List.choose replaces the original's
    // filter(IsSome) |> map(Value) two-pass unwrapping in a single pass.
    |> List.choose (fun link ->
        try
            if Seq.length link >= 4 && link.[0..3] = "http" then
                Some(new Uri(link))
            else
                Some(new Uri(uri, link))
        with
        | _ ->
            // NOTE: logs the offending link (not the exception), as before.
            Console.WriteLine(link)
            lock error (fun() -> error <- error.Add uri)
            None)
    |> List.map (fun u -> new ComparableUri(string u))
// Clean up uris (drop the AJAX fragment) and keep only the ones the crawler
// is allowed to follow, as a set of ComparableUri.
let linksToFollow (hrefUris:ComparableUri list) =
    [ for uri in hrefUris do
        let cleaned = treatAjax uri
        if followHref cleaned then
            yield new ComparableUri(string cleaned) ]
    |> Set.ofList
/// True when [uri] has neither been visited nor recorded as an error yet.
// NOTE(review): `lock visited`/`lock error` locks the current Set object, but
// writers rebind those variables to brand-new sets, so readers and writers may
// hold locks on different objects — the exclusion is not reliable. A dedicated
// lock object shared by all accessors would fix this. TODO confirm.
let needToVisit uri =
    (lock visited (fun() -> not(visited.Contains uri))) && (lock error (fun() -> not(error.Contains uri)))
/// Fetches [uri] (after the requested delay), marks it visited, and returns
/// the set of links found on the page that still need to be visited.
let getLinksToFollowAsyncDelay (delay:int) (uri: ComparableUri) =
    async {
        let! html = getHtmlPrimitiveAsyncDelay delay uri
        lock visited (fun() -> visited <- visited.Add uri)
        return
            html
            |> getHref
            |> hrefLinksToUri uri
            |> linksToFollow
            |> Set.filter needToVisit
    }
/// Returns how many milliseconds the caller should sleep before requesting
/// [uri]'s authority again, aiming for at most one request per 500 ms per
/// authority. Each authority's stopwatch measures the time since its last
/// scheduled request.
let getDelay(uri:ComparableUri) (authorityDelay:Dictionary<string,System.Diagnostics.Stopwatch >) =
    let uriAuthority = uri.Authority
    let hasAuthority, watch = authorityDelay.TryGetValue(uriAuthority)
    if hasAuthority then
        let remaining = TimeSpan(0,0,0,0,500) - watch.Elapsed
        // Bug fix: restart the clock for this authority. The original never
        // reset the stopwatch, so once 500 ms had elapsed after the FIRST
        // request, every later request to the same authority got a zero
        // delay and the throttle stopped throttling — "start the clock at
        // the right time", per the review feedback.
        watch.Restart()
        if remaining.TotalMilliseconds < 0.0 then
            0
        else
            int(remaining.TotalMilliseconds)
    else
        // First time we see this authority: start its clock, no delay needed.
        let temp = System.Diagnostics.Stopwatch()
        temp.Start()
        authorityDelay.Add(uriAuthority, temp)
        0
/// Breadth-first crawl driver: fetches every uri in [uris] in parallel (with
/// a per-authority delay), then recurses on the union of discovered links.
// NOTE(review): despite its name, `maxIteration` is the *current* iteration
// counter — it is incremented on each recursive call and the limit is the
// hard-coded 100. Also `authorityDelay` is rebuilt on every call, so the
// per-authority throttling state is discarded between crawl levels; combined
// with the `delay * 250` sleep in getHtmlPrimitiveAsyncDelay this matches the
// growing-wait behavior discussed in the comments below. TODO confirm.
let rec getLinksToFollowFromSetAsync maxIteration (uris: seq<ComparableUri>) =
    let authorityDelay = Dictionary<string,System.Diagnostics.Stopwatch>()
    if maxIteration = 100 then
        Console.WriteLine("Finished")
    else
        // Group by authority: delay requests that share an authority; others go immediately.
        let stopwatch= System.Diagnostics.Stopwatch()
        stopwatch.Start()
        let newLinks =
            uris
            |> Seq.map( fun uri -> let delay = lock authorityDelay (fun() -> getDelay uri authorityDelay)
                                   getLinksToFollowAsyncDelay delay uri)
            |> Async.Parallel
            |> Async.RunSynchronously
            |> Seq.concat
        stopwatch.Stop()
        Console.WriteLine("\n\n\n\n\n\n\nTimeElapse : "+string stopwatch.Elapsed+"\n\n\n\n\n\n\n\n\n")
        getLinksToFollowFromSetAsync (maxIteration+1) newLinks
// Entry point: seed the crawl with two starting pages.
// NOTE(review): `PSeq` (F# PowerPack parallel sequences) is never opened in
// this file — presumably brought in elsewhere; verify this compiles. The
// first pipeline iterates over a single singleton set, so PSeq adds no
// parallelism here anyway.
seq[set[ComparableUri("http://rue89.com/")]]
|>PSeq.ofSeq
|>PSeq.iter(getLinksToFollowFromSetAsync 0)
getLinksToFollowFromSetAsync 0 (seq[ComparableUri("http://twitter.com/")])
Console.WriteLine("Finished")
一些feedBack会很棒!谢谢(请注意,这只是我为乐趣所做的一切)
我认为罪魁祸首是这一行:do! Async.Sleep(delay * 250)
- 你会逐渐等待越来越长的时间。这是什么原因?
你不想太频繁地请求同一台服务器,否则网站管理员会不高兴你占用了服务器的全部带宽(bandwidth)(http://*.com/questions/3021888/asynchronous-crawling-f)。感谢您查看代码 – jlezard 2010-06-11 16:50:00
同意。如果你立刻开始了所有的事情,那么交错的延迟是有道理的,但是如果你刚刚完成了这一页让你回到相同权威的页面,可能时间已经过去了,但现在你正在等待N + 1秒... – Brian 2010-06-11 16:52:13
我想你想使用'每个权威一个代理'来确保你每秒向该权威发送不超过1个请求(或其他)。但是,您需要代理商或其他人来跟踪“上次您在此发送请求”。现在你只需说“等15秒”即可发送第15个请求,即使第14个请求已在40s前发送。你需要在正确的时间“开始时钟”。 – Brian 2010-06-11 16:53:54
请格式化很长的代码片段,以便它们适合大约50列,并且不需要右滚动。 – Brian 2010-06-11 16:54:54