26 people like it.

F# googelsearch

A google search automation.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
56: 
57: 
58: 
59: 
60: 
61: 
62: 
63: 
64: 
65: 
66: 
67: 
68: 
69: 
70: 
71: 
72: 
73: 
74: 
75: 
76: 
77: 
78: 
79: 
open System
open System.IO
open System.Net
open System.Threading
open System.Windows.Forms
open SHDocVw
open mshtml
open System.Windows.Forms
open System.Text.RegularExpressions

let (|Match|_|) (pat:string) (inp:string) =
    let m = Regex.Match(inp, pat) in
    if m.Success
    then Some (List.tail [ for g in m.Groups -> g.Value ])
    else None

let (|Matches|_|) (pat:string) (inp:string) =
    let m = Regex.Matches(inp, pat) in
    if m.Count > 0
    then Some ( [ for g in m -> g.Value ])
    else None



let getPage (url:string) = 
        let (html:HtmlDocument ref) =  ref null

        let handler (sender:obj) (e: WebBrowserDocumentCompletedEventArgs) = 
            let wb = sender :?> (WebBrowser)
            html := (wb.Document)

        use wb = new WebBrowser()
        wb.Visible<-true
        wb.DocumentCompleted.Add(handler wb)
        wb.Navigate(url)

        while wb.ReadyState <> WebBrowserReadyState.Complete do
                    Application.DoEvents()
    
        html

let googleSearch term = 
    let rec googleSearch link pass res = 
        let search = getPage link
        Thread.Sleep(TimeSpan.FromSeconds(5.0))
    
        let potentialLinks = 
            seq{ for link in search.Value.Links do
                     if not(link.OuterHtml.Contains("google")) then 
                         yield link.OuterHtml }
            |>Seq.toArray
   
        let pat = "href=\"([^>]+)\">"

        let links = 
             potentialLinks
             |> Array.filter(fun elem -> elem.Contains("<A class=\"l\" onmousedown="))
             |> Array.map( fun elem -> match elem with 
                                       |Match pat (link::t) -> link
                                       | _ -> failwith "critical parsing error")
        let next = 
             potentialLinks
             |> Array.filter(fun elem -> elem.Contains("<A class=\"fl\"")&& elem.Contains("/search?"))
             |> Array.map( fun elem -> match elem with 
                                       | Match pat (link::t) -> "http://www.google.com"+link
                                       | _ -> failwith "critical parsing error")

        let newRes = res|>Array.append(links)
        
        if (next.Length-1) <= pass then
           newRes
        else
           let newPass = pass+1
           googleSearch ((next.[newPass]).Replace(";","&")) newPass newRes

    googleSearch ("http://www.google.com/search?num=100&hl=en&q="+term+"&btnG=Search&aq=f&aqi=&aql=&oq=") -1 [||]   


googleSearch "F# love" 
namespace System
namespace System.IO
namespace System.Net
namespace System.Threading
namespace System.Windows
namespace System.Windows.Forms
namespace System.Text
namespace System.Text.RegularExpressions
type Match =
  inherit Group
  member Groups : GroupCollection
  member NextMatch : unit -> Match
  member Result : replacement:string -> string
  static member Empty : Match
  static member Synchronized : inner:Match -> Match

Full name: System.Text.RegularExpressions.Match
val pat : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
val inp : string
val m : Match
Multiple items
type Regex =
  new : pattern:string -> Regex + 1 overload
  member GetGroupNames : unit -> string[]
  member GetGroupNumbers : unit -> int[]
  member GroupNameFromNumber : i:int -> string
  member GroupNumberFromName : name:string -> int
  member IsMatch : input:string -> bool + 1 overload
  member Match : input:string -> Match + 2 overloads
  member Matches : input:string -> MatchCollection + 1 overload
  member Options : RegexOptions
  member Replace : input:string * replacement:string -> string + 5 overloads
  ...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
Regex.Match(input: string, pattern: string) : Match
Regex.Match(input: string, pattern: string, options: RegexOptions) : Match
property Group.Success: bool
union case Option.Some: Value: 'T -> Option<'T>
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val tail : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.tail
val g : Group
property Match.Groups: GroupCollection
property Capture.Value: string
union case Option.None: Option<'T>
val m : MatchCollection
Regex.Matches(input: string, pattern: string) : MatchCollection
Regex.Matches(input: string, pattern: string, options: RegexOptions) : MatchCollection
property MatchCollection.Count: int
val g : Match
val getPage : url:string -> HtmlDocument ref

Full name: Script.getPage
val url : string
val html : HtmlDocument ref
type HtmlDocument =
  member ActiveElement : HtmlElement
  member ActiveLinkColor : Color with get, set
  member All : HtmlElementCollection
  member AttachEventHandler : eventName:string * eventHandler:EventHandler -> unit
  member BackColor : Color with get, set
  member Body : HtmlElement
  member Cookie : string with get, set
  member CreateElement : elementTag:string -> HtmlElement
  member DefaultEncoding : string
  member DetachEventHandler : eventName:string * eventHandler:EventHandler -> unit
  ...

Full name: System.Windows.Forms.HtmlDocument
Multiple items
val ref : value:'T -> 'T ref

Full name: Microsoft.FSharp.Core.Operators.ref

--------------------
type 'T ref = Ref<'T>

Full name: Microsoft.FSharp.Core.ref<_>
val handler : (obj -> WebBrowserDocumentCompletedEventArgs -> unit)
val sender : obj
type obj = Object

Full name: Microsoft.FSharp.Core.obj
val e : WebBrowserDocumentCompletedEventArgs
Multiple items
type WebBrowserDocumentCompletedEventArgs =
  inherit EventArgs
  new : url:Uri -> WebBrowserDocumentCompletedEventArgs
  member Url : Uri

Full name: System.Windows.Forms.WebBrowserDocumentCompletedEventArgs

--------------------
WebBrowserDocumentCompletedEventArgs(url: Uri) : unit
val wb : WebBrowser
Multiple items
type WebBrowser =
  inherit WebBrowserBase
  new : unit -> WebBrowser
  member AllowNavigation : bool with get, set
  member AllowWebBrowserDrop : bool with get, set
  member CanGoBack : bool
  member CanGoForward : bool
  member Document : HtmlDocument
  member DocumentStream : Stream with get, set
  member DocumentText : string with get, set
  member DocumentTitle : string
  member DocumentType : string
  ...

Full name: System.Windows.Forms.WebBrowser

--------------------
WebBrowser() : unit
property WebBrowser.Document: HtmlDocument
property Control.Visible: bool
event WebBrowser.DocumentCompleted: IEvent<WebBrowserDocumentCompletedEventHandler,WebBrowserDocumentCompletedEventArgs>
member IObservable.Add : callback:('T -> unit) -> unit
WebBrowser.Navigate(urlString: string) : unit
WebBrowser.Navigate(url: Uri) : unit
WebBrowser.Navigate(urlString: string, newWindow: bool) : unit
WebBrowser.Navigate(url: Uri, newWindow: bool) : unit
WebBrowser.Navigate(urlString: string, targetFrameName: string) : unit
WebBrowser.Navigate(url: Uri, targetFrameName: string) : unit
WebBrowser.Navigate(urlString: string, targetFrameName: string, postData: byte [], additionalHeaders: string) : unit
WebBrowser.Navigate(url: Uri, targetFrameName: string, postData: byte [], additionalHeaders: string) : unit
property WebBrowser.ReadyState: WebBrowserReadyState
type WebBrowserReadyState =
  | Uninitialized = 0
  | Loading = 1
  | Loaded = 2
  | Interactive = 3
  | Complete = 4

Full name: System.Windows.Forms.WebBrowserReadyState
field WebBrowserReadyState.Complete = 4
type Application =
  static member AddMessageFilter : value:IMessageFilter -> unit
  static member AllowQuit : bool
  static member CommonAppDataPath : string
  static member CommonAppDataRegistry : RegistryKey
  static member CompanyName : string
  static member CurrentCulture : CultureInfo with get, set
  static member CurrentInputLanguage : InputLanguage with get, set
  static member DoEvents : unit -> unit
  static member EnableVisualStyles : unit -> unit
  static member ExecutablePath : string
  ...
  nested type MessageLoopCallback

Full name: System.Windows.Forms.Application
Application.DoEvents() : unit
val googleSearch : term:string -> string []

Full name: Script.googleSearch
val term : string
val googleSearch : (string -> int -> string [] -> string [])
val link : string
val pass : int
val res : string []
val search : HtmlDocument ref
Multiple items
type Thread =
  inherit CriticalFinalizerObject
  new : start:ThreadStart -> Thread + 3 overloads
  member Abort : unit -> unit + 1 overload
  member ApartmentState : ApartmentState with get, set
  member CurrentCulture : CultureInfo with get, set
  member CurrentUICulture : CultureInfo with get, set
  member DisableComObjectEagerCleanup : unit -> unit
  member ExecutionContext : ExecutionContext
  member GetApartmentState : unit -> ApartmentState
  member GetCompressedStack : unit -> CompressedStack
  member GetHashCode : unit -> int
  ...

Full name: System.Threading.Thread

--------------------
Thread(start: ThreadStart) : unit
Thread(start: ParameterizedThreadStart) : unit
Thread(start: ThreadStart, maxStackSize: int) : unit
Thread(start: ParameterizedThreadStart, maxStackSize: int) : unit
Thread.Sleep(timeout: TimeSpan) : unit
Thread.Sleep(millisecondsTimeout: int) : unit
Multiple items
type TimeSpan =
  struct
    new : ticks:int64 -> TimeSpan + 3 overloads
    member Add : ts:TimeSpan -> TimeSpan
    member CompareTo : value:obj -> int + 1 overload
    member Days : int
    member Duration : unit -> TimeSpan
    member Equals : value:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member Hours : int
    member Milliseconds : int
    member Minutes : int
    ...
  end

Full name: System.TimeSpan

--------------------
TimeSpan()
TimeSpan(ticks: int64) : unit
TimeSpan(hours: int, minutes: int, seconds: int) : unit
TimeSpan(days: int, hours: int, minutes: int, seconds: int) : unit
TimeSpan(days: int, hours: int, minutes: int, seconds: int, milliseconds: int) : unit
TimeSpan.FromSeconds(value: float) : TimeSpan
val potentialLinks : string []
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
val link : HtmlElement
property Ref.Value: HtmlDocument
property HtmlDocument.Links: HtmlElementCollection
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
property HtmlElement.OuterHtml: string
String.Contains(value: string) : bool
module Seq

from Microsoft.FSharp.Collections
val toArray : source:seq<'T> -> 'T []

Full name: Microsoft.FSharp.Collections.Seq.toArray
val links : string []
type Array =
  member Clone : unit -> obj
  member CopyTo : array:Array * index:int -> unit + 1 overload
  member GetEnumerator : unit -> IEnumerator
  member GetLength : dimension:int -> int
  member GetLongLength : dimension:int -> int64
  member GetLowerBound : dimension:int -> int
  member GetUpperBound : dimension:int -> int
  member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
  member Initialize : unit -> unit
  member IsFixedSize : bool
  ...

Full name: System.Array
val filter : predicate:('T -> bool) -> array:'T [] -> 'T []

Full name: Microsoft.FSharp.Collections.Array.filter
val elem : string
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.map
Multiple items
active recognizer Match: string -> string -> string list option

Full name: Script.( |Match|_| )

--------------------
type Match =
  inherit Group
  member Groups : GroupCollection
  member NextMatch : unit -> Match
  member Result : replacement:string -> string
  static member Empty : Match
  static member Synchronized : inner:Match -> Match

Full name: System.Text.RegularExpressions.Match
val t : string list
val failwith : message:string -> 'T

Full name: Microsoft.FSharp.Core.Operators.failwith
val next : string []
val newRes : string []
val append : array1:'T [] -> array2:'T [] -> 'T []

Full name: Microsoft.FSharp.Collections.Array.append
property Array.Length: int
val newPass : int
Raw view Test code New version

More information

Link:http://fssnip.net/35
Posted:13 years ago
Author:Chief Inspector Clouseau
Tags: google search , webcrawling