10 people like it.
Like the snippet!
AsyncSeq - Introduction and Crawler
This snippet demonstrates programming with asynchronous sequences. It contains a (hidden) implementation of the AsyncSeq type and combinators for working with it. More importantly, it demonstrates how to use asynchronous sequences to implement a simple, sequential, on-demand crawler.
1:
2:
3:
4:
5:
6:
7:
|
// Asynchronous sequence producing 1 and then 2. Each number
// becomes available one second after it has been asked for.
let oneTwo =
    let delayMs = 1000
    asyncSeq {
        do! Async.Sleep delayMs
        yield 1
        do! Async.Sleep delayMs
        yield 2
    }
|
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
|
// Addresses of the sample pages to download, in download order.
let urls =
    [ "http://bing.com"
      "http://yahoo.com"
      "http://google.com"
      "http://msn.com" ]
// Asynchronous sequence of (URL, HTML length) pairs. The pages listed
// in 'urls' are fetched one after another, in list order; each
// download is awaited asynchronously (let!) before the next begins.
// A download that fails for any reason is reported with length -1.
let pages =
    asyncSeq {
        // One client is shared by all downloads and disposed afterwards
        use client = new WebClient()
        for address in urls do
            try
                let! body = client.AsyncDownloadString(Uri(address))
                yield address, body.Length
            with _ ->
                yield address, -1
    }
// Start (in the background) a workflow that walks over 'pages'
// and writes every URL together with its length to the console.
Async.Start(
    async {
        for (address, size) in pages do
            printfn "%s (%d)" address size
    })
// Print the URL of every successfully downloaded page whose HTML is
// shorter than 50000 characters. 'pages' reports a failed download
// with the sentinel length -1, so negative lengths are excluded here;
// otherwise every failed URL would be listed as a "small" page.
pages
|> AsyncSeq.filter (fun (_, len) -> len >= 0 && len < 50000)
|> AsyncSeq.map fst
|> AsyncSeq.iter (printfn "%s")
|> Async.Start
|
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
|
open HtmlAgilityPack
open System.Text.RegularExpressions
/// Asynchronously download the document and parse the HTML.
/// Returns Some document on success and None when the download or the
/// parsing throws; the exception itself is deliberately discarded
/// (best-effort crawling).
let downloadDocument url = (*[omit:(...)*)async {
    try
        // 'use' (rather than 'let') disposes the WebClient once the
        // download has finished or failed, so clients are not leaked
        // while crawling many pages.
        use wc = new WebClient()
        let! html = wc.AsyncDownloadString(Uri(url))
        let doc = new HtmlDocument()
        doc.LoadHtml(html)
        return Some doc
    with _ -> return None }(*[/omit]*)
/// Extract all links from the document that start with "http://"
/// The body is elided on this page as (...). According to the hidden
/// implementation shown in the tooltip, it reads the href attribute of
/// every anchor node, cuts the link at the first '?' (when present),
/// and returns an empty list if anything throws.
let extractLinks (doc:HtmlDocument) = (...)
/// Extract the <title> of the web page
/// The body is elided on this page as (...). According to the hidden
/// implementation shown in the tooltip, it returns the trimmed inner
/// text of the title node, or "Untitled" when the page has none.
let getTitle (doc:HtmlDocument) = (...)
/// Crawl the web starting from the specified page.
/// Produces an asynchronous sequence of (url, title) pairs. Pages are
/// visited depth-first: after a page has been yielded, each of its
/// links is followed in turn (together with everything reachable from
/// it) unless that URL has already been visited. Pages that cannot be
/// downloaded are skipped.
let randomCrawl url =
    asyncSeq {
        // The set of seen URLs is created inside the builder rather than
        // once per call to randomCrawl. Assuming the builder delays its
        // body (as computation expressions normally do), a sequence that
        // is consumed a second time starts with an empty set instead of
        // finding the start page already "visited" and yielding nothing.
        let visited = new System.Collections.Generic.HashSet<string>()
        // Visits page and then recursively visits all referenced pages
        let rec loop url =
            asyncSeq {
                // Add returns false for a URL that is already in the set
                if visited.Add(url) then
                    let! doc = downloadDocument url
                    match doc with
                    | Some doc ->
                        // Yield url and title as the next element
                        yield url, getTitle doc
                        // For every link, yield all referenced pages too
                        for link in extractLinks doc do
                            yield! loop link
                    | None -> ()
            }
        yield! loop url
    }
// Crawl from news.bing.com and print the titles of the first 10
// visited pages whose URL does not contain "bing.com". The whole
// pipeline is built from AsyncSeq combinators and started in the
// background.
randomCrawl "http://news.bing.com"
|> AsyncSeq.filter (fun (address, _) -> not (address.Contains("bing.com")))
|> AsyncSeq.map (fun (_, pageTitle) -> pageTitle)
|> AsyncSeq.take 10
|> AsyncSeq.iter (fun pageTitle -> printfn "%s" pageTitle)
|> Async.Start
|
val oneTwo : AsyncSeq<int>
Full name: Script.Samples.oneTwo
val asyncSeq : AsyncSeq.AsyncSeqBuilder
Full name: Script.AsyncSeqExtensions.asyncSeq
Builds an asynchronous sequence using the computation builder syntax
Multiple items
type Async
static member AsBeginEnd : computation:('Arg -> Async<'T>) -> ('Arg * AsyncCallback * obj -> IAsyncResult) * (IAsyncResult -> 'T) * (IAsyncResult -> unit)
static member AwaitEvent : event:IEvent<'Del,'T> * ?cancelAction:(unit -> unit) -> Async<'T> (requires delegate and 'Del :> Delegate)
static member AwaitIAsyncResult : iar:IAsyncResult * ?millisecondsTimeout:int -> Async<bool>
static member AwaitTask : task:Task<'T> -> Async<'T>
static member AwaitWaitHandle : waitHandle:WaitHandle * ?millisecondsTimeout:int -> Async<bool>
static member CancelDefaultToken : unit -> unit
static member Catch : computation:Async<'T> -> Async<Choice<'T,exn>>
static member FromBeginEnd : beginAction:(AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg:'Arg1 * beginAction:('Arg1 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * beginAction:('Arg1 * 'Arg2 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * arg3:'Arg3 * beginAction:('Arg1 * 'Arg2 * 'Arg3 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromContinuations : callback:(('T -> unit) * (exn -> unit) * (OperationCanceledException -> unit) -> unit) -> Async<'T>
static member Ignore : computation:Async<'T> -> Async<unit>
static member OnCancel : interruption:(unit -> unit) -> Async<IDisposable>
static member Parallel : computations:seq<Async<'T>> -> Async<'T []>
static member RunSynchronously : computation:Async<'T> * ?timeout:int * ?cancellationToken:CancellationToken -> 'T
static member Sleep : millisecondsDueTime:int -> Async<unit>
static member Start : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions * ?cancellationToken:CancellationToken -> Task<'T>
static member StartChild : computation:Async<'T> * ?millisecondsTimeout:int -> Async<Async<'T>>
static member StartChildAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions -> Async<Task<'T>>
static member StartImmediate : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartWithContinuations : computation:Async<'T> * continuation:('T -> unit) * exceptionContinuation:(exn -> unit) * cancellationContinuation:(OperationCanceledException -> unit) * ?cancellationToken:CancellationToken -> unit
static member SwitchToContext : syncContext:SynchronizationContext -> Async<unit>
static member SwitchToNewThread : unit -> Async<unit>
static member SwitchToThreadPool : unit -> Async<unit>
static member TryCancelled : computation:Async<'T> * compensation:(OperationCanceledException -> unit) -> Async<'T>
static member CancellationToken : Async<CancellationToken>
static member DefaultCancellationToken : CancellationToken
Full name: Microsoft.FSharp.Control.Async
--------------------
type Async<'T>
Full name: Microsoft.FSharp.Control.Async<_>
static member Async.Sleep : millisecondsDueTime:int -> Async<unit>
val urls : string list
Full name: Script.Samples.urls
val pages : AsyncSeq<string * int>
Full name: Script.Samples.pages
val wc : WebClient
Multiple items
type WebClient =
inherit Component
new : unit -> WebClient
member BaseAddress : string with get, set
member CachePolicy : RequestCachePolicy with get, set
member CancelAsync : unit -> unit
member Credentials : ICredentials with get, set
member DownloadData : address:string -> byte[] + 1 overload
member DownloadDataAsync : address:Uri -> unit + 1 overload
member DownloadFile : address:string * fileName:string -> unit + 1 overload
member DownloadFileAsync : address:Uri * fileName:string -> unit + 1 overload
member DownloadString : address:string -> string + 1 overload
...
Full name: System.Net.WebClient
--------------------
WebClient() : unit
val url : string
val html : string
member WebClient.AsyncDownloadString : address:Uri -> Async<string>
Multiple items
type Uri =
new : uriString:string -> Uri + 5 overloads
member AbsolutePath : string
member AbsoluteUri : string
member Authority : string
member DnsSafeHost : string
member Equals : comparand:obj -> bool
member Fragment : string
member GetComponents : components:UriComponents * format:UriFormat -> string
member GetHashCode : unit -> int
member GetLeftPart : part:UriPartial -> string
...
Full name: System.Uri
--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
property String.Length: int
val async : AsyncBuilder
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.async
val length : int
val printfn : format:Printf.TextWriterFormat<'T> -> 'T
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
static member Async.Start : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
Multiple items
module AsyncSeq
from Script
Module with helper functions for working with asynchronous sequences
--------------------
type AsyncSeq<'T> = Async<AsyncSeqInner<'T>>
Full name: Script.AsyncSeq<_>
An asynchronous sequence represents a delayed computation that can be
started to produce either Cons value consisting of the next element of the
sequence (head) together with the next asynchronous sequence (tail) or a
special value representing the end of the sequence (Nil)
val filter : f:('T -> bool) -> input:AsyncSeq<'T> -> AsyncSeq<'T>
Full name: Script.AsyncSeq.filter
Same as AsyncSeq.filterAsync, but the specified predicate is synchronous
and processes the input element immediately.
val len : int
val map : f:('T -> 'a) -> input:AsyncSeq<'T> -> AsyncSeq<'a>
Full name: Script.AsyncSeq.map
Same as AsyncSeq.mapAsync, but the specified function is synchronous
and returns the result of projection immediately.
val fst : tuple:('T1 * 'T2) -> 'T1
Full name: Microsoft.FSharp.Core.Operators.fst
val iter : f:('T -> unit) -> input:AsyncSeq<'T> -> Async<unit>
Full name: Script.AsyncSeq.iter
Same as AsyncSeq.iterAsync, but the specified function is synchronous
and performs the side-effect immediately.
namespace System
namespace System.Text
namespace System.Text.RegularExpressions
val downloadDocument : url:string -> Async<'a option>
Full name: Script.Samples.downloadDocument
Asynchronously download the document and parse the HTML
val doc : 'a
union case Option.Some: Value: 'T -> Option<'T>
union case Option.None: Option<'T>
val extractLinks : doc:'a -> 'b list
Full name: Script.Samples.extractLinks
Extract all links from the document that start with "http://"
try
[ for a in doc.DocumentNode.SelectNodes("//a") do
if a.Attributes.Contains("href") then
let href = a.Attributes.["href"].Value
if href.StartsWith("http://") then
let endl = href.IndexOf('?')
yield if endl > 0 then href.Substring(0, endl) else href ]
with _ -> []
val getTitle : doc:'a -> string
Full name: Script.Samples.getTitle
Extract the <title> of the web page
let title = doc.DocumentNode.SelectSingleNode("//title")
if title <> null then title.InnerText.Trim() else "Untitled"
val randomCrawl : url:string -> AsyncSeq<string * string>
Full name: Script.Samples.randomCrawl
Crawl the internet starting from the specified page.
From each page follow the first not-yet-visited page.
val visited : HashSet<string>
namespace System.Collections
namespace System.Collections.Generic
Multiple items
type HashSet<'T> =
new : unit -> HashSet<'T> + 3 overloads
member Add : item:'T -> bool
member Clear : unit -> unit
member Comparer : IEqualityComparer<'T>
member Contains : item:'T -> bool
member CopyTo : array:'T[] -> unit + 2 overloads
member Count : int
member ExceptWith : other:IEnumerable<'T> -> unit
member GetEnumerator : unit -> Enumerator<'T>
member GetObjectData : info:SerializationInfo * context:StreamingContext -> unit
...
nested type Enumerator
Full name: System.Collections.Generic.HashSet<_>
--------------------
HashSet() : unit
HashSet(comparer: IEqualityComparer<'T>) : unit
HashSet(collection: IEnumerable<'T>) : unit
HashSet(collection: IEnumerable<'T>, comparer: IEqualityComparer<'T>) : unit
val loop : (string -> AsyncSeq<string * string>)
HashSet.Add(item: string) : bool
val doc : obj option
val doc : obj
val link : string
val title : string
String.Contains(value: string) : bool
val not : value:bool -> bool
Full name: Microsoft.FSharp.Core.Operators.not
val snd : tuple:('T1 * 'T2) -> 'T2
Full name: Microsoft.FSharp.Core.Operators.snd
val take : count:int -> input:AsyncSeq<'T> -> AsyncSeq<'T>
Full name: Script.AsyncSeq.take
Returns the first N elements of an asynchronous sequence
More information