3 people like it.

Poor man's HTML renderer

Takes some HTML, outputs some plaintext. Not tail recursive, not efficient, but got the job done for my use case (had a product feed where the descriptions were HTML, but I needed plaintext).

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
open HtmlAgilityPack
open System.Text

/// Renders an HtmlAgilityPack node and all of its descendants as plain text.
///
/// Output is appended to `sb`, and that same builder is returned, which lets
/// the function be used directly as the folder in `Seq.fold` over `ChildNodes`.
///
/// Formatting rules:
///   * text nodes    - appended after HTML entities are decoded
///   * `p`           - blank line before, line break after
///   * `li`          - line break after
///   * `div`         - line break before
///   * `ul` / `ol`   - line break before; each child element is prefixed with
///                     "• " or its 1-based position ("1. ", "2. ", ...)
///   * other elements and documents - children rendered with no extra markup
///   * comments      - ignored
///
/// The recursion is not tail recursive, so stack depth grows with the nesting
/// depth of the document.
let rec getText (sb:StringBuilder) (node:HtmlNode) : StringBuilder =
    // Renders every child of `parent`, in document order, into `target`.
    let renderChildren (target:StringBuilder) (parent:HtmlNode) : StringBuilder =
        parent.ChildNodes |> Seq.fold getText target

    // Renders the children of a list element into `target`. Only element
    // children receive a marker and advance the item index; text and comment
    // nodes sitting between the items (typically whitespace from the source
    // formatting) are rendered as-is so they neither get a stray bullet nor
    // consume a number.
    let renderList (marker:int -> string) (target:StringBuilder) : StringBuilder =
        node.ChildNodes
        |> Seq.fold (fun ((acc:StringBuilder), index) (child:HtmlNode) ->
            match child.NodeType with
            | HtmlNodeType.Element ->
                getText (acc.Append(marker index)) child, index + 1
            | _ ->
                getText acc child, index
        ) (target, 1)
        |> fst

    match node.NodeType with
    | HtmlNodeType.Document ->
        renderChildren sb node
    | HtmlNodeType.Text ->
        let textNode = (node :?> HtmlTextNode)
        // Decode entities such as &amp; or &nbsp; into their characters.
        let text = textNode.Text |> HtmlEntity.DeEntitize
        sb.Append text
    | HtmlNodeType.Element ->
        match node.Name with
        | "p" ->
            let sb = renderChildren (sb.AppendLine()) node
            sb.AppendLine()
        | "li" ->
            let sb = renderChildren sb node
            sb.AppendLine()
        | "div" ->
            renderChildren (sb.AppendLine()) node
        | "ul" ->
            renderList (fun _ -> "• ") (sb.AppendLine())
        | "ol" ->
            // The index must be applied to the format string; without it
            // sprintf yields a function rather than the "N. " prefix.
            renderList (fun index -> sprintf "%i. " index) (sb.AppendLine())
        | _ -> renderChildren sb node
    | HtmlNodeType.Comment -> sb
    // HtmlNodeType is an enum, so out-of-range values are still possible.
    | unknown -> printfn "Unknown value: %A" unknown; sb
namespace System
namespace System.Text
val getText : sb:StringBuilder -> node:'a -> StringBuilder

Full name: Script.getText
val sb : StringBuilder
Multiple items
type StringBuilder =
  new : unit -> StringBuilder + 5 overloads
  member Append : value:string -> StringBuilder + 18 overloads
  member AppendFormat : format:string * arg0:obj -> StringBuilder + 4 overloads
  member AppendLine : unit -> StringBuilder + 1 overload
  member Capacity : int with get, set
  member Chars : int -> char with get, set
  member Clear : unit -> StringBuilder
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EnsureCapacity : capacity:int -> int
  member Equals : sb:StringBuilder -> bool
  ...

Full name: System.Text.StringBuilder

--------------------
StringBuilder() : unit
StringBuilder(capacity: int) : unit
StringBuilder(value: string) : unit
StringBuilder(value: string, capacity: int) : unit
StringBuilder(capacity: int, maxCapacity: int) : unit
StringBuilder(value: string, startIndex: int, length: int, capacity: int) : unit
val node : 'a
module Seq

from Microsoft.FSharp.Collections
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State

Full name: Microsoft.FSharp.Collections.Seq.fold
StringBuilder.Append(value: char []) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: obj) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint64) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint32) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint16) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: decimal) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: float) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: float32) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: int64) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: int) : StringBuilder
   (+0 other overloads)
StringBuilder.AppendLine() : StringBuilder
StringBuilder.AppendLine(value: string) : StringBuilder
val ignore : value:'T -> unit

Full name: Microsoft.FSharp.Core.Operators.ignore
val sprintf : format:Printf.StringFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.sprintf
val fst : tuple:('T1 * 'T2) -> 'T1

Full name: Microsoft.FSharp.Core.Operators.fst
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
Raw view Test code New version

More information

Link:http://fssnip.net/7Vw
Posted:6 years ago
Author:Nat Elkins
Tags: html , htmlagilitypack html