1 person likes it.
Like the snippet!
Screen Scraping with the XTract Package
XTract (https://github.com/TahaHachana/XTract) is a simple screen scraping package for F#. This sample shows how to describe the data model using a record, define the extractors that will collect the data using CSS selectors, scrape the data from the target URL(s) and save it.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
|
open System
open System.IO
open XTract
/// Data model for one scraped item. Every field is a plain string that is
/// filled in from the values produced by the extractors defined below.
type Tweet =
    { /// Value collected by the avatar extractor.
      avatar: string
      /// First value collected by the screenName extractor.
      screenName: string
      /// Second value collected by the screenName extractor.
      account: string
      /// Value collected by the tweet extractor.
      tweet: string }
// Define the data extractors.
// Extractor for the avatar field: selects the lazily loaded avatar <img>
// and reads its "data-original" attribute.
let avatar =
    let selector =
        "div:nth-child(1).anchor > div:nth-child(2) > div.row.data-row > div.col-md-5 > div:nth-child(1).media > a:nth-child(1).media-left > img:nth-child(1).avatar.lazy"
    Extractor.WithAttributes ["data-original"] (Extractor.New selector)
// Extractor for the screenName and account fields: selects the anchor in
// the media heading and reads two attributes from it, "text" and "href".
let screenName =
    let selector =
        "div > div > div.media-body.twitter-media-body > h4.media-heading > a"
    Extractor.WithAttributes ["text"; "href"] (Extractor.New selector)
// Extractor for the tweet text field: selects the paragraph inside the
// media body. No attribute list is given, so the extractor's defaults apply
// (presumably the element text - confirm against the XTract docs).
let tweet =
    let selector = "div > div > div > div.media-body.twitter-media-body > p"
    Extractor.New selector
// Create a scraper that builds Tweet records from the three extractors.
let scraper = Scraper<Tweet>([ avatar; screenName; tweet ])
// Page the sample scrapes.
let url = "http://fsharp-hub.apphb.com/"
// Scrape a single item from the page (result is a Tweet option).
let firstMatch = scraper.Scrape(url)
// Scrape every item on the page (result is a Tweet list option).
let allMatches = scraper.ScrapeAll(url)
// After scraping one or more pages, the scraper handles storing the
// records; read them back as an array or as a JSON string.
// Data and JsonData are properties (Tweet [] and string respectively),
// not methods, so they must be accessed without parentheses.
let data = scraper.Data
let jsonData = scraper.JsonData
// Export the stored records to the current user's desktop folder.
let desktop = Environment.GetFolderPath(Environment.SpecialFolder.Desktop)
// CSV export
let path = Path.Combine(desktop, "data.csv")
scraper.SaveCsv path
// Excel workbook export
let path' = Path.Combine(desktop, "data.xlsx")
scraper.SaveExcel(path')
|
namespace System
namespace System.IO
namespace XTract
type Tweet =
{avatar: string;
screenName: string;
account: string;
tweet: string;}
Full name: Script.Tweet
Tweet.avatar: string
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = String
Full name: Microsoft.FSharp.Core.string
Tweet.screenName: string
Tweet.account: string
Tweet.tweet: string
val avatar : Extractor
Full name: Script.avatar
type Extractor =
{selector: Selector;
pattern: string;
attributes: string list;
many: bool;
groupBy: GroupBy option;}
static member New : selector:Selector -> Extractor
static member WithAttributes : attributes:string list -> property:Extractor -> Extractor
static member WithMany : many:bool -> groupBy:GroupBy -> property:Extractor -> Extractor
static member WithPattern : pattern:string -> property:Extractor -> Extractor
Full name: XTract.Extraction.Extractor
static member Extractor.New : selector:Selector -> Extractor
static member Extractor.WithAttributes : attributes:string list -> property:Extractor -> Extractor
val screenName : Extractor
Full name: Script.screenName
val tweet : Extractor
Full name: Script.tweet
val scraper : Scraper<Tweet>
Full name: Script.scraper
Multiple items
module Scraper
from XTract
--------------------
type Scraper<'T (requires equality)> =
new : extractors:Extractor list -> Scraper<'T>
member Log : msg:string -> unit
member SaveCsv : path:string -> unit
member SaveExcel : path:string -> unit
member Scrape : url:string -> 'T option
member ScrapeAll : url:string -> 'T list option
member ScrapeAllHtml : html:string -> url:string -> 'T list option
member ScrapeHtml : html:string -> url:string -> 'T option
member StoreFailedRequest : url:string -> unit
member ThrottleScrape : urls:seq<string> -> Async<unit>
...
Full name: XTract.Scraper.Scraper<_>
--------------------
new : extractors:Extractor list -> Scraper<'T>
val url : string
Full name: Script.url
val firstMatch : Tweet option
Full name: Script.firstMatch
member Scraper.Scrape : url:string -> 'T option
val allMatches : Tweet list option
Full name: Script.allMatches
member Scraper.ScrapeAll : url:string -> 'T list option
val data : obj
Full name: Script.data
property Scraper.Data: Tweet []
val jsonData : obj
Full name: Script.jsonData
property Scraper.JsonData: string
val desktop : string
Full name: Script.desktop
type Environment =
static member CommandLine : string
static member CurrentDirectory : string with get, set
static member Exit : exitCode:int -> unit
static member ExitCode : int with get, set
static member ExpandEnvironmentVariables : name:string -> string
static member FailFast : message:string -> unit + 1 overload
static member GetCommandLineArgs : unit -> string[]
static member GetEnvironmentVariable : variable:string -> string + 1 overload
static member GetEnvironmentVariables : unit -> IDictionary + 1 overload
static member GetFolderPath : folder:SpecialFolder -> string + 1 overload
...
nested type SpecialFolder
nested type SpecialFolderOption
Full name: System.Environment
Environment.GetFolderPath(folder: Environment.SpecialFolder) : string
Environment.GetFolderPath(folder: Environment.SpecialFolder, option: Environment.SpecialFolderOption) : string
type SpecialFolder =
| ApplicationData = 26
| CommonApplicationData = 35
| LocalApplicationData = 28
| Cookies = 33
| Desktop = 0
| Favorites = 6
| History = 34
| InternetCache = 32
| Programs = 2
| MyComputer = 17
...
Full name: System.Environment.SpecialFolder
field Environment.SpecialFolder.Desktop = 0
val path : string
Full name: Script.path
type Path =
static val DirectorySeparatorChar : char
static val AltDirectorySeparatorChar : char
static val VolumeSeparatorChar : char
static val InvalidPathChars : char[]
static val PathSeparator : char
static member ChangeExtension : path:string * extension:string -> string
static member Combine : [<ParamArray>] paths:string[] -> string + 3 overloads
static member GetDirectoryName : path:string -> string
static member GetExtension : path:string -> string
static member GetFileName : path:string -> string
...
Full name: System.IO.Path
Path.Combine([<ParamArray>] paths: string []) : string
Path.Combine(path1: string, path2: string) : string
Path.Combine(path1: string, path2: string, path3: string) : string
Path.Combine(path1: string, path2: string, path3: string, path4: string) : string
member Scraper.SaveCsv : path:string -> unit
val path' : string
Full name: Script.path'
member Scraper.SaveExcel : path:string -> unit
More information