11 people like it.

Lazy Xml

A lazy XML structure for processing large XML documents.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
56: 
57: 
58: 
59: 
60: 
61: 
62: 
63: 
64: 
65: 
66: 
67: 
68: 
69: 
70: 
71: 
72: 
73: 
74: 
75: 
76: 
77: 
78: 
79: 
80: 
81: 
82: 
83: 
84: 
85: 
86: 
87: 
88: 
#r "FSharp.Powerpack.dll"

open System
open System.Xml
open Microsoft.FSharp.Collections

// Path of the XML file parsed by this script.
// Written as a verbatim string (@"...") so that backslashes are always literal: the
// previous regular literal only worked because `\e` is not an escape sequence, and would
// silently change meaning for a directory starting with e.g. `n` or `t`.
let wiki = @"c:\enwiki-20120307-pages-articles\enwiki-20120307-pages-articles.xml"

/// Qualified name of an XML element, as reported by the reader.
type name = string
/// Attributes of an element as (attribute name, attribute value) pairs.
type attributes = (string * string) list 
/// A lazily materialised XML tree.
type LazyXml =
    /// An element: its name, its attributes and a lazy list of its child nodes.
    /// The payload is a single tuple (note the parentheses), not three separate fields.
    | Element of (name * attributes * LazyList<LazyXml>)
    /// A text node carrying its character data.
    | Text of string


/// Opens the XML document at `xmlUri` and returns its root node as a lazy tree.
///
/// Children of every `Element` are produced on demand from the underlying forward-only
/// `XmlReader`. Consequences visible in the code below:
///   * The children of an element must be consumed before the next sibling is requested;
///     requesting the sibling closes the element's sub-reader, after which unread
///     children can no longer be produced.
///   * The root reader is not closed by this function, because the lazy children still
///     need it after the function returns.
///
/// Whitespace, the XML declaration, comments, processing instructions and the DOCTYPE
/// are skipped. Text, CDATA and significant whitespace become `Text` nodes.
/// Any other node type raises an exception via `failwithf`, as does `LazyList.head`
/// when no node at all could be produced.
let readLazyXml (xmlUri : string) : LazyXml = 
    // Collects the attributes of the element the reader is positioned on.
    // Leaves the reader on the last attribute (callers must MoveToElement afterwards).
    let readAttributes (reader : XmlReader) = 
        if reader.HasAttributes then
            [ while reader.MoveToNextAttribute() do yield (reader.Name, reader.Value) ]
        else []
    // Produces the nodes found from the reader's current position up to (and excluding)
    // the next end tag at this nesting level, or the end of input.
    let rec read (reader : XmlReader) : seq<LazyXml> = 
        seq {
            if reader.Read() then
                match reader.NodeType with
                | XmlNodeType.Element ->
                    // Capture name and attributes BEFORE opening the sub-reader: the
                    // parent reader should not be touched while a subtree reader is open.
                    let elementName = reader.Name
                    let attrs = readAttributes reader
                    // ReadSubtree requires the reader to be positioned on the element.
                    reader.MoveToElement() |> ignore
                    let subtree = reader.ReadSubtree()
                    // First Read positions the sub-reader on the element itself, so the
                    // recursive `read` starts with the element's first child.
                    subtree.Read() |> ignore
                    yield Element (elementName, attrs, subtree |> read |> LazyList.ofSeq)
                    // Closing the sub-reader leaves the parent on the element's end tag
                    // (or on the element itself when it is empty). The recursive call's
                    // Read() then advances to the following node, so no extra Skip() is
                    // needed - an additional Skip() would drop the node that directly
                    // follows the element (e.g. `<a/><b/>` would lose `b`).
                    subtree.Close()
                    yield! read reader
                | XmlNodeType.EndElement ->
                    // End of the current element: stop producing nodes at this level.
                    ()
                | XmlNodeType.Text
                | XmlNodeType.CDATA
                | XmlNodeType.SignificantWhitespace ->
                    yield Text reader.Value
                    yield! read reader
                | XmlNodeType.Whitespace
                | XmlNodeType.XmlDeclaration
                | XmlNodeType.Comment
                | XmlNodeType.ProcessingInstruction
                | XmlNodeType.DocumentType ->
                    // Carries no tree content: ignore and continue with the next node.
                    yield! read reader
                | _ -> failwithf "Not supported XmlNodeType: %s" <| reader.NodeType.ToString()
            else
                ()
        }
    XmlReader.Create(xmlUri) |> read |> LazyList.ofSeq |> LazyList.head

// Parse the sample document and pretty-print the resulting lazy tree.
readLazyXml wiki |> printfn "%A"

//Element
//  ("mediawiki",
//   [("xmlns", "http://www.mediawiki.org/xml/export-0.6/");
//    ("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
//    ("xsi:schemaLocation",
//     "http://www.mediawiki.org/xml/export-0.6/ http://www.mediawiki.org/xml/expo
//rt-0.6.xsd");
//    ("version", "0.6"); ("xml:lang", "en")],
//   seq
//     [Element
//        ("siteinfo", [],
//         seq
//           [Element ("sitename", [], seq [Text "Wikipedia"]);
//            Element
//              ("base", [], seq [Text "http://en.wikipedia.org/wiki/Main_Page"]);
//
//            Element ("generator", [], seq [Text "MediaWiki 1.19wmf1"]);
//            Element ("case", [], seq [Text "first-letter"]); ...]);
//      Element
//        ("page", [],
//         seq
//           [Element ("title", [], seq [Text "AccessibleComputing"]);
//            Element ("ns", [], seq [Text "0"]);
//            Element ("id", [], seq [Text "10"]);
//            Element ("redirect", [("title", "Computer accessibility")], seq []);
//
//            ...]);
//      Element
//        ("page", [],
//         seq
//           [Element ("title", [], seq [Text "Anarchism"]);
//            Element ("ns", [], seq [Text "0"]);
//            Element ("id", [], seq [Text "12"]); Element ("sha1", [], seq []);
//            ...]);
//      Element
//        ("page", [],
//         seq
//           [Element ("title", [], seq [Text "AfghanistanHistory"]);
//            Element ("ns", [], seq [Text "0"]);
//            Element ("id", [], seq [Text "13"]);
//            Element ("redirect", [("title", "History of Afghanistan")], seq []);
//
//            ...]); ...])
namespace System
namespace System.Xml
namespace Microsoft
namespace Microsoft.FSharp
namespace Microsoft.FSharp.Collections
val wiki : string

Full name: Script.wiki
type name = string

Full name: Script.name
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
type attributes = (string * string) list

Full name: Script.attributes
type 'T list = List<'T>

Full name: Microsoft.FSharp.Collections.list<_>
type LazyXml =
  | Element of (name * attributes * obj)
  | Text of string

Full name: Script.LazyXml
union case LazyXml.Element: (name * attributes * obj) -> LazyXml
Multiple items
union case LazyXml.Text: string -> LazyXml

--------------------
namespace System.Text
val readLazyXml : xmlUri:string -> LazyXml

Full name: Script.readLazyXml
val xmlUri : string
val readAttributes : (XmlReader -> (string * string) list)
val reader : XmlReader
type XmlReader =
  member AttributeCount : int
  member BaseURI : string
  member CanReadBinaryContent : bool
  member CanReadValueChunk : bool
  member CanResolveEntity : bool
  member Close : unit -> unit
  member Depth : int
  member EOF : bool
  member GetAttribute : name:string -> string + 2 overloads
  member HasAttributes : bool
  ...

Full name: System.Xml.XmlReader
property XmlReader.HasAttributes: bool
XmlReader.MoveToNextAttribute() : bool
property XmlReader.Name: string
property XmlReader.Value: string
val read : (XmlReader -> seq<LazyXml>)
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
XmlReader.Read() : bool
property XmlReader.NodeType: XmlNodeType
type XmlNodeType =
  | None = 0
  | Element = 1
  | Attribute = 2
  | Text = 3
  | CDATA = 4
  | EntityReference = 5
  | Entity = 6
  | ProcessingInstruction = 7
  | Comment = 8
  | Document = 9
  ...

Full name: System.Xml.XmlNodeType
field XmlNodeType.Element = 1
val reader' : XmlReader
XmlReader.ReadSubtree() : XmlReader
val ignore : value:'T -> unit

Full name: Microsoft.FSharp.Core.Operators.ignore
XmlReader.Close() : unit
XmlReader.Skip() : unit
field XmlNodeType.EndElement = 15
field XmlNodeType.Whitespace = 13
field XmlNodeType.Text = 3
val failwithf : format:Printf.StringFormat<'T,'Result> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.failwithf
Enum.ToString() : string
Enum.ToString(format: string) : string
XmlReader.Create(input: IO.TextReader) : XmlReader
   (+0 other overloads)
XmlReader.Create(input: IO.Stream) : XmlReader
   (+0 other overloads)
XmlReader.Create(inputUri: string) : XmlReader
   (+0 other overloads)
XmlReader.Create(reader: XmlReader, settings: XmlReaderSettings) : XmlReader
   (+0 other overloads)
XmlReader.Create(input: IO.TextReader, settings: XmlReaderSettings) : XmlReader
   (+0 other overloads)
XmlReader.Create(input: IO.Stream, settings: XmlReaderSettings) : XmlReader
   (+0 other overloads)
XmlReader.Create(inputUri: string, settings: XmlReaderSettings) : XmlReader
   (+0 other overloads)
XmlReader.Create(input: IO.TextReader, settings: XmlReaderSettings, inputContext: XmlParserContext) : XmlReader
   (+0 other overloads)
XmlReader.Create(input: IO.TextReader, settings: XmlReaderSettings, baseUri: string) : XmlReader
   (+0 other overloads)
XmlReader.Create(input: IO.Stream, settings: XmlReaderSettings, inputContext: XmlParserContext) : XmlReader
   (+0 other overloads)
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
Raw view Test code New version

More information

Link:http://fssnip.net/bd
Posted:12 years ago
Author:Nick Palladinos
Tags: lazy , xml