
Screen Scraping with the XTract Package

XTract (https://github.com/TahaHachana/XTract) is a simple screen-scraping package for F#. This sample shows how to describe the data model with a record, define the extractors that collect the data using CSS selectors, scrape the data from the target URL(s), and save it.
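
The script below assumes the XTract assembly and its dependencies are already referenced. In an F# script that usually means a #r directive pointing at the restored package; the path shown here is only an example and depends on where NuGet placed the library:

// Adjust this path to your local package layout.
#r @"packages/XTract/lib/net45/XTract.dll"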

open System
open System.IO
open XTract

// Describe the data model.
type Tweet =
    {
        avatar: string
        screenName: string
        account: string
        tweet: string
    }

// Define the data extractors.
// avatar field
let avatar =
    "div:nth-child(1).anchor > div:nth-child(2) > div.row.data-row > div.col-md-5 > div:nth-child(1).media > a:nth-child(1).media-left > img:nth-child(1).avatar.lazy"
    |> Extractor.New
    |> Extractor.WithAttributes ["data-original"]

// screenName and account fields
let screenName =
    "div > div > div.media-body.twitter-media-body > h4.media-heading > a"
    |> Extractor.New
    |> Extractor.WithAttributes ["text"; "href"]

// tweet text field
let tweet =
    "div > div > div > div.media-body.twitter-media-body > p"
    |> Extractor.New

// Initialize a scraper
let scraper = Scraper<Tweet> [avatar; screenName; tweet]

let url = "http://fsharp-hub.apphb.com/"

// Scrape a single item
let firstMatch = scraper.Scrape url

// Or scrape all the items
let allMatches = scraper.ScrapeAll url
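
// Scrape returns a Tweet option and ScrapeAll a Tweet list option, so a
// match makes the "nothing matched" case explicit. A minimal sketch; the
// printfn output is only illustrative.
match firstMatch with
| Some t -> printfn "%s: %s" t.screenName t.tweet
| None -> printfn "No tweet matched the extractors"

match allMatches with
| Some tweets -> printfn "Scraped %d tweets" (List.length tweets)
| None -> printfn "Nothing matched the extractors"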

// Scrape multiple pages and let the scraper handle storing
// the records, then get the data as an array or in JSON format.
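// A sketch of the multi-page step; the second URL is only a placeholder.
scraper.ThrottleScrape [url; "http://fsharp-hub.apphb.com/?page=2"]
|> Async.RunSynchronously
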
let data = scraper.Data

let jsonData = scraper.JsonData

// Save as CSV 
let desktop = Environment.GetFolderPath Environment.SpecialFolder.Desktop
let path = Path.Combine(desktop, "data.csv")
scraper.SaveCsv(path)

// Save an Excel workbook
let path' = Path.Combine(desktop, "data.xlsx")
scraper.SaveExcel path'

More information

Link: http://fssnip.net/p7
Posted: 9 years ago
Author: Taha Hachana
Tags: scraping, data