2 people like it.
Like the snippet!
StackOverflowCrawler
Tries to find the best technologies from stackoverflow.
Don't use tags that are too broad (ones that trigger many thousands of requests), or the firewall will block you!
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
|
module StackOverflowCrawler
open System // Let's go with the basic .NET stack.
open System.Net // An async web-request version would be easy to build with this: http://fsharppowerpack.codeplex.com/
open System.IO // String parsing would be better done with this: http://htmlagilitypack.codeplex.com/
open System.Web
/// Downloads the resource at `url` with a blocking HTTP request and
/// returns the whole response body as a string.
let fetch (url : Uri) =
    let request = WebRequest.Create(url) :?> HttpWebRequest
    // Disposing the stream and the reader when they go out of scope also
    // releases the underlying connection.
    use body = request.GetResponse().GetResponseStream()
    use text = new StreamReader(body)
    text.ReadToEnd()
/// Builds the Stack Overflow URL for the given page type (the first path
/// segment, e.g. "questions") and tag expression. The tag expression is
/// URL-encoded before it is appended.
let makeUrl pagetype (tags:string) =
    let encodedTags = HttpUtility.UrlEncode(tags)
    let address = "http://stackoverflow.com/" + pagetype + "/tagged/" + encodedTags
    Uri(address)
/// URL builder for the "questions" listing of a tag expression.
let questions = makeUrl "questions"
/// URL builder for the "unanswered" listing of a tag expression.
let unanswered = makeUrl "unanswered"
/// Reads the number shown inside the page's `<div class="summarycount al">`
/// element and returns it as a float. Comma thousands separators are
/// removed before parsing.
///
/// Raises FormatException when the element or its closing `</div>` cannot be
/// found, or when the enclosed text is not a number.
let sumcount (fetched:string) =
    let marker = "<div class=\"summarycount al\">"
    let markerpos = fetched.IndexOf(marker, StringComparison.Ordinal)
    // Without this check a missing marker (-1) would silently turn into a
    // bogus offset and either parse unrelated text or fail obscurely.
    if markerpos < 0 then
        raise (FormatException "summarycount element not found in fetched page")
    let startpos = markerpos + marker.Length
    let endpos = fetched.IndexOf("</div>", startpos, StringComparison.Ordinal)
    if endpos < 0 then
        raise (FormatException "summarycount element is not closed in fetched page")
    let digits = fetched.Substring(startpos, endpos - startpos).Replace(",", "")
    // The page content does not depend on the machine's regional settings,
    // so parse with the invariant culture.
    Double.Parse(digits, Globalization.CultureInfo.InvariantCulture)
/// Collects the tags that `fetched` links to in combination with `basetag`:
/// for every occurrence of "/questions/tagged/<basetag>+" the text up to the
/// next double quote is taken as one related tag. Tags are returned in the
/// order they appear in the page; duplicates are kept.
///
/// Scanning stops (returning what was found so far) when no further link is
/// present or when a link has no closing double quote.
let relatedtags (basetag:string) (fetched:string) = // more parsing...
    let marker = "/questions/tagged/" + basetag + "+"
    // Tail-recursive scan; `found` accumulates tags in reverse order.
    let rec relativepositions (links:string) (found:string list) =
        // Check for the marker BEFORE computing any offsets, so the final
        // step never indexes past the end of the remaining text.
        match links.IndexOf(marker, StringComparison.Ordinal) with
        | -1 -> List.rev found
        | startpos ->
            let realpos = startpos + marker.Length
            match links.IndexOf('"', realpos) with
            | -1 -> List.rev found
            | endpos ->
                let tag = links.Substring(realpos, endpos - realpos)
                relativepositions (links.Substring realpos) (tag :: found)
    relativepositions fetched []
/// Selects how `checktag` joins a further tag onto the current tag
/// expression.
type surfmode =
    /// Tags are joined with "+".
    | Inclusive
    /// Tags are joined with "+-".
    | Exclusive
/// Crawls Stack Overflow starting from `basetag` (lower-cased) and prints,
/// for every tag expression visited, the ratio of unanswered to total
/// questions.
///
/// A tag expression is only examined when it has at least `minCount`
/// questions. It is reported as accepted when the unanswered ratio is at most
/// `acceptRate`; otherwise the crawl continues with the related tags found on
/// the questions page, joined onto the current expression with "+"
/// (Inclusive) or "+-" (Exclusive).
///
/// Every visited expression costs two blocking HTTP requests.
let checktag (basetag:string) (sm:surfmode) =
    let acceptRate, minCount = 0.02, 1000.0
    let add = match sm with Inclusive -> "+" | Exclusive -> "+-"
    let rec surf (tags:string) (tagsToSurf:string list) =
        let fetchTotalPage = tags |> (questions >> fetch)
        let taggedQuestions = fetchTotalPage |> sumcount
        if taggedQuestions >= minCount then
            let unansweredWithTag = unanswered >> fetch >> sumcount
            let ratio = (unansweredWithTag tags) / taggedQuestions
            // This must be a function: as a plain value it would crawl all
            // pending tags right here, before the ratio is even printed and
            // regardless of whether the current expression is accepted.
            let surfTheRestOfTree () =
                tagsToSurf |> List.iter (fun tag -> surf (tags + add + tag) [])
            printfn "Ratio %f and count %g with tags %s" ratio taggedQuestions tags
            match ratio with
            | r when r <= acceptRate -> printfn "--- Accepted: %s ---" basetag
            | _ ->
                match relatedtags tags fetchTotalPage with
                | first::rest ->
                    printfn "Failed. Trying %d related..." rest.Length
                    surf (tags + add + first) rest
                    surfTheRestOfTree ()
                | [] -> surfTheRestOfTree ()
    surf (basetag.ToLower()) []
    printfn "Everything checked."
//Interactive tests:
//questions "java"
//questions "F#"
//unanswered "java"
//let fetched = questions "java" |> fetch
//fetched |> sumcount
//fetched |> relatedtags "java"
//checktag "F#" surfmode.Exclusive
//checktag "flash+flex" surfmode.Inclusive
//checktag "flash+flex" surfmode.Exclusive
//checktag "java" surfmode.Exclusive // grinds forever and finds nothing?
|
module StackOverflowCrawler
namespace System
namespace System.Net
namespace System.IO
namespace System.Web
val fetch : url:Uri -> string
Full name: StackOverflowCrawler.fetch
val url : Uri
Multiple items
type Uri =
new : uriString:string -> Uri + 5 overloads
member AbsolutePath : string
member AbsoluteUri : string
member Authority : string
member DnsSafeHost : string
member Equals : comparand:obj -> bool
member Fragment : string
member GetComponents : components:UriComponents * format:UriFormat -> string
member GetHashCode : unit -> int
member GetLeftPart : part:UriPartial -> string
...
Full name: System.Uri
--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
val req : HttpWebRequest
type WebRequest =
inherit MarshalByRefObject
member Abort : unit -> unit
member AuthenticationLevel : AuthenticationLevel with get, set
member BeginGetRequestStream : callback:AsyncCallback * state:obj -> IAsyncResult
member BeginGetResponse : callback:AsyncCallback * state:obj -> IAsyncResult
member CachePolicy : RequestCachePolicy with get, set
member ConnectionGroupName : string with get, set
member ContentLength : int64 with get, set
member ContentType : string with get, set
member Credentials : ICredentials with get, set
member EndGetRequestStream : asyncResult:IAsyncResult -> Stream
...
Full name: System.Net.WebRequest
WebRequest.Create(requestUri: Uri) : WebRequest
WebRequest.Create(requestUriString: string) : WebRequest
type HttpWebRequest =
inherit WebRequest
member Abort : unit -> unit
member Accept : string with get, set
member AddRange : range:int -> unit + 7 overloads
member Address : Uri
member AllowAutoRedirect : bool with get, set
member AllowWriteStreamBuffering : bool with get, set
member AutomaticDecompression : DecompressionMethods with get, set
member BeginGetRequestStream : callback:AsyncCallback * state:obj -> IAsyncResult
member BeginGetResponse : callback:AsyncCallback * state:obj -> IAsyncResult
member ClientCertificates : X509CertificateCollection with get, set
...
Full name: System.Net.HttpWebRequest
val stream : Stream
HttpWebRequest.GetResponse() : WebResponse
val reader : StreamReader
Multiple items
type StreamReader =
inherit TextReader
new : stream:Stream -> StreamReader + 9 overloads
member BaseStream : Stream
member Close : unit -> unit
member CurrentEncoding : Encoding
member DiscardBufferedData : unit -> unit
member EndOfStream : bool
member Peek : unit -> int
member Read : unit -> int + 1 overload
member ReadLine : unit -> string
member ReadToEnd : unit -> string
...
Full name: System.IO.StreamReader
--------------------
StreamReader(stream: Stream) : unit
StreamReader(path: string) : unit
StreamReader(stream: Stream, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(stream: Stream, encoding: Text.Encoding) : unit
StreamReader(path: string, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(path: string, encoding: Text.Encoding) : unit
StreamReader(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
StreamReader.ReadToEnd() : string
val makeUrl : pagetype:string -> tags:string -> Uri
Full name: StackOverflowCrawler.makeUrl
val pagetype : string
val tags : string
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = String
Full name: Microsoft.FSharp.Core.string
Multiple items
type HttpUtility =
new : unit -> HttpUtility
static member HtmlAttributeEncode : s:string -> string + 1 overload
static member HtmlDecode : s:string -> string + 1 overload
static member HtmlEncode : s:string -> string + 2 overloads
static member JavaScriptStringEncode : value:string -> string + 1 overload
static member ParseQueryString : query:string -> NameValueCollection + 1 overload
static member UrlDecode : str:string -> string + 3 overloads
static member UrlDecodeToBytes : str:string -> byte[] + 3 overloads
static member UrlEncode : str:string -> string + 3 overloads
static member UrlEncodeToBytes : str:string -> byte[] + 3 overloads
...
Full name: System.Web.HttpUtility
--------------------
HttpUtility() : unit
HttpUtility.UrlEncode(bytes: byte []) : string
HttpUtility.UrlEncode(str: string) : string
HttpUtility.UrlEncode(str: string, e: Text.Encoding) : string
HttpUtility.UrlEncode(bytes: byte [], offset: int, count: int) : string
val questions : (string -> Uri)
Full name: StackOverflowCrawler.questions
val unanswered : (string -> Uri)
Full name: StackOverflowCrawler.unanswered
val sumcount : fetched:string -> float
Full name: StackOverflowCrawler.sumcount
val fetched : string
val startpos : int
String.IndexOf(value: string) : int
String.IndexOf(value: char) : int
String.IndexOf(value: string, comparisonType: StringComparison) : int
String.IndexOf(value: string, startIndex: int) : int
String.IndexOf(value: char, startIndex: int) : int
String.IndexOf(value: string, startIndex: int, comparisonType: StringComparison) : int
String.IndexOf(value: string, startIndex: int, count: int) : int
String.IndexOf(value: char, startIndex: int, count: int) : int
String.IndexOf(value: string, startIndex: int, count: int, comparisonType: StringComparison) : int
val endpos : int
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
type Double =
struct
member CompareTo : value:obj -> int + 1 overload
member Equals : obj:obj -> bool + 1 overload
member GetHashCode : unit -> int
member GetTypeCode : unit -> TypeCode
member ToString : unit -> string + 3 overloads
static val MinValue : float
static val MaxValue : float
static val Epsilon : float
static val NegativeInfinity : float
static val PositiveInfinity : float
...
end
Full name: System.Double
Double.Parse(s: string) : float
Double.Parse(s: string, provider: IFormatProvider) : float
Double.Parse(s: string, style: Globalization.NumberStyles) : float
Double.Parse(s: string, style: Globalization.NumberStyles, provider: IFormatProvider) : float
val relatedtags : basetag:string -> fetched:string -> string list
Full name: StackOverflowCrawler.relatedtags
val basetag : string
val relativepositions : (string -> string list -> string list)
val links : string
val found : string list
type 'T list = List<'T>
Full name: Microsoft.FSharp.Collections.list<_>
val realpos : int
property String.Length: int
val tag : string
type surfmode =
| Inclusive
| Exclusive
Full name: StackOverflowCrawler.surfmode
union case surfmode.Inclusive: surfmode
union case surfmode.Exclusive: surfmode
val checktag : basetag:string -> sm:surfmode -> unit
Full name: StackOverflowCrawler.checktag
val sm : surfmode
val acceptRate : float
val minCount : float
val add : string
val surf : (string -> string list -> unit)
val tagsToSurf : string list
val fetchTotalPage : string
val taggedQuestions : float
val unasweredWithTag : (string -> float)
val ratio : float
val surfTheRestOfTree : unit
val test : (string -> unit)
Multiple items
module List
from Microsoft.FSharp.Collections
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
val iter : action:('T -> unit) -> list:'T list -> unit
Full name: Microsoft.FSharp.Collections.List.iter
val printfn : format:Printf.TextWriterFormat<'T> -> 'T
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
val r : float
val first : string
val rest : string list
property List.Length: int
String.ToLower() : string
String.ToLower(culture: Globalization.CultureInfo) : string
More information