11 people like it.

A minimalist XML Parser

A minimalist XML Parser

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
182: 
183: 
184: 
185: 
186: 
187: 
188: 
189: 
190: 
191: 
192: 
193: 
194: 
195: 
196: 
197: 
198: 
199: 
200: 
201: 
202: 
203: 
204: 
205: 
206: 
207: 
208: 
209: 
210: 
211: 
212: 
213: 
214: 
215: 
216: 
217: 
218: 
219: 
220: 
221: 
222: 
223: 
224: 
225: 
226: 
227: 
228: 
229: 
230: 
231: 
232: 
233: 
234: 
235: 
236: 
237: 
238: 
239: 
240: 
241: 
242: 
243: 
244: 
245: 
246: 
247: 
248: 
249: 
250: 
251: 
252: 
253: 
254: 
255: 
256: 
257: 
258: 
259: 
260: 
261: 
262: 
263: 
264: 
265: 
266: 
267: 
268: 
269: 
270: 
271: 
//BNF-like for XML:
//XmlRoot         := XmlElement
//XmlElement      := '<' Name ' ' Attribute* '>' (XmlElement* | XmlValueContent) '<\' Name '>' | 
//                   '<' Name ' ' Attribute* '\>'
//Attribute       := Name '=' '"' Value '"'
//Value           := any char except "
//XmlValueContent := any char except <
//Name            := A-Za-z_[A-Za-z0-9_-]*

open System

[<CustomEquality; NoComparison>]
type XmlAttribute = 
    {Name : string; Value : string}
    override this.Equals(that) =
        match that with
        | :? XmlAttribute as other -> this.Name = other.Name
        | _ -> false
    override this.GetHashCode() = hash this.Name

type XmlElementBody =
    | StrValue of string
    | XmlValue of XmlElement array
and
    XmlElement   = {Name : string; Attributes : XmlAttribute array; Body : XmlElementBody}

let NullXmlAttribute = {Name = null; Value = null}

let NullXmlElement = {Name = null; Attributes = null; Body = XmlValue null}

let readWhitespace (s : string, idx) =
    let mutable idx1 = idx
    while idx1 < s.Length && (s.[idx1] = ' ' || s.[idx1] = '\r' || s.[idx1] = '\n' || s.[idx1] = '\t') do
        idx1 <- idx1 + 1
    (s, idx1)

let subString idxDelta (s : string, idx) =
    if idxDelta > idx
        then (s.Substring(idx, idxDelta - idx), (s, idxDelta))
        else (null, (s, idx))

let readName (name : string byref) (s : string, idx) =
    let checkValidChars (idx1) = 
        ('_' = s.[idx1]) || ('A' <= s.[idx1] && s.[idx1] <= 'Z') || ('a' <= s.[idx1] && s.[idx1] <= 'z')
    let mutable idx1 = idx
    if idx1 < s.Length && checkValidChars idx1
        then idx1 <- idx1 + 1
        else failwith (sprintf "Invalid Name at %d" idx1) 
    while idx1 < s.Length && (checkValidChars idx1 || ('0' <= s.[idx1] && s.[idx1] <= '9') || s.[idx1] = '-')
                do idx1 <- idx1 + 1
    let (name_, retVal) = subString idx1 (s, idx)
    name <- name_; retVal

let readXmlValueContent (content : string byref) (s : string, idx) =
    let mutable idx1 = idx
    while idx1 < s.Length && '<' <> s.[idx1] do idx1 <- idx1 + 1
    let (content_, retVal) = subString idx1 (s, idx)
    content <- content_; retVal

let readValue (value : string byref) (s : string, idx) =
    let mutable idx1 = idx
    while idx1 < s.Length && '\"' <> s.[idx1] do idx1 <- idx1 + 1
    let (value_, retVal) = subString idx1 (s, idx)
    value <- value_; retVal

let readFixed ch (s : string, idx) =
    if s.[idx] <> ch then failwith (sprintf "Invalid token at %d" idx)
    (s, idx + 1)

let checkFixed ch (s : string, idx) = s.[idx] = ch

let readAttribute (attr : XmlAttribute byref) (s : string, idx) =
    let mutable name : string = null
    let (_, idx1) = (s, idx) |> readName &name
    if name <> null then
        let mutable value : string = null
        let (_, idx2) = (s, idx1) |> readWhitespace |> readFixed '=' |> readWhitespace |>
                                     readFixed '\"' |> readValue &value |> readFixed '\"'
        attr <- {XmlAttribute.Name = name; Value = value}; (s, idx2)
    else
        attr <- NullXmlAttribute; (s, idx1)

let readAttributeList (attrs : XmlAttribute array byref) (s : string, idx) =
    let (_, idx1) = (s, idx) |> readWhitespace
    let mutable a : XmlAttribute list = []
    let mutable idx2 = idx1
    while (not (checkFixed '>' (s, idx2))) && (not (checkFixed '/' (s, idx2))) do
        let mutable attr = NullXmlAttribute
        let (_, idx3) = (s, idx2) |> readAttribute &attr |> readWhitespace
        if attr.Name <> null then a <- attr :: a
        if idx2 = idx3 then failwith (sprintf "Malformed XML at %d" idx2)
        idx2 <- idx3
    if a |> List.ofSeq |> Seq.distinct |> Seq.length <> (a |> List.length) then
        failwith "Attribute names must be unique"
    attrs <- a |> List.rev |> List.toArray; (s, idx2)

let readClosingElement (name : string byref) (s : string, idx) =
    let (_, idx1) = (s, idx) |> readFixed '<' |> readFixed '/' |> readWhitespace |>
                                readName &name |> readWhitespace |> readFixed '>' |> readWhitespace
    (s, idx1)

let rec readElement (elem : XmlElement byref) (s : string, idx) =
    let mutable name : string = null
    let mutable attrs : XmlAttribute array = null
    let (_, idx1) = (s, idx) |> readWhitespace |> readFixed '<' |> readName &name |>
                                readWhitespace |> readAttributeList &attrs |> readWhitespace
    if checkFixed '/' (s, idx1) then //'<' Name ' ' Attribute* '\>'
        let (_, idx2) = (s, idx1) |> readFixed '/' |> readFixed '>'
        elem <- {XmlElement.Name = name; Attributes = attrs; Body = StrValue String.Empty}; (s, idx2) //return
    else //'<' Name ' ' Attribute* '>' (XmlElement* | XmlValueContent) '<\' Name '>'
        let (_, idx2) = (s, idx1) |> readFixed '>' |> readWhitespace
        let (e, idx3) =
            if checkFixed '<' (s, idx2) then //XmlElement*
                if checkFixed '/' (s, idx2 + 1) then //no xml value content
                    (XmlValue [||], idx2)
                else //element list
                    let mutable elems : XmlElement array = null
                    let (_, idx3) = (s, idx2) |> readElementList &elems
                    (XmlValue elems, idx3)
            else //XmlValueContent
                let mutable content : string = null
                let (_, idx3) = (s, idx2) |> readXmlValueContent &content
                (StrValue content, idx3)
        let mutable name_ : string = null
        let (_, idx4) = (s, idx3) |> readClosingElement &name_
        if name <> name_ then failwith (sprintf "Closing element <%s> is missing" name)
        elem <- {XmlElement.Name = name; Attributes = attrs; Body = e}; (s, idx4) //return
and
    readElementList (elems : XmlElement array byref) (s : string, idx) =
        let mutable a : XmlElement list = []
        let mutable idx1 = idx
        while checkFixed '<' (s, idx1) && (not (checkFixed '/' (s, idx1 + 1))) do
            let mutable e : XmlElement = NullXmlElement
            let (_, idx2) = (s, idx1) |> readElement &e |> readWhitespace
            idx1 <- idx2
            a <-  e :: a
        elems <- a |> List.rev |> List.toArray; (s, idx1)

let readXmlFragment (s: string) =
    let mutable idx = 0
    let mutable a = []
    while (idx < s.Length) do
        let mutable e : XmlElement = NullXmlElement
        let (_, idx1) = (s, idx) |> readElement &e
        idx <- idx1
        a <- e :: a
    a |> List.rev |> List.toArray

let readXmlRoot (s: string) = readXmlFragment(s).[0]

//helper:
let print (node: XmlElement) =
    let printAttributes elem =
        for a in elem.Attributes do printf " @%s: %s" a.Name a.Value
    let rec rec_print (node: XmlElement, tab : int) =
        for i = 1 to tab do printf "  "
        printf "{%s: " node.Name
        match node.Body with
        | StrValue x -> 
            printf "%s" x
        | XmlValue xs -> 
            for x in xs do printfn ""; rec_print(x, tab + 1)
        printAttributes node
        printf "}"
    rec_print (node, 0); printfn ""

let printn node = print node; printfn ""

//test cases:
let test1 = "<note>
             <to>Tove</to>
             <from>Jani</from>
             <heading>Reminder</heading>
             <body>Don't forget me this weekend!</body>
            </note>"
let result1 = readXmlRoot(test1)
printn(result1)

let test2 = "<A>
                <B/>
                <C/>
                <D>
                    <E/>
                    <F/>
                    <G>
                        <H><I></I></H>
                    </G>
                    <J/>
                </D>
            </A>"
let result2 = readXmlRoot(test2)
printn(result2)

let test3 = "<bookstore>
                  <book category=\"COOKING\">
                    <title lang=\"en\">Everyday Italian</title>
                    <author>Giada De Laurentiis</author>
                    <year>2005</year>
                    <price>30.00</price>
                  </book>
                  <book category=\"CHILDREN\">
                    <title lang=\"en\">Harry Potter</title>
                    <author>J K. Rowling</author>
                    <year>2005</year>
                    <price>29.99</price>
                  </book>
                  <book category=\"WEB\">
                    <title lang=\"en\">Learning XML</title>
                    <author>Erik T. Ray</author>
                    <year>2003</year>
                    <price>39.95</price>
                  </book>
             </bookstore>"
let result3 = readXmlRoot(test3)
printn(result3)

let test4 = "<Signature xmlns=\"http://www.w3.org/2000/09/xmldsig#\">
                <SignedInfo>
                  <CanonicalizationMethod Algorithm=\"http://www.w3.org/TR/2001/REC-xml-c14n-20010315\" />
                  <SignatureMethod Algorithm=\"http://www.w3.org/2000/09/xmldsig#rsa-sha1\" />
                  <Reference URI=\"#object\">
                    <DigestMethod Algorithm=\"http://www.w3.org/2000/09/xmldsig#sha1\" />
                    <DigestValue>OPnpF/ZNLDxJ/I+1F3iHhlmSwgo=</DigestValue>
                  </Reference>
                </SignedInfo>
                <SignatureValue>nihUFQg4mDhLgecvhIcKb9Gz8VRTOlw+adiZOBBXgK4JodEe5aFfCqm8WcRIT8GLLXSk8PsUP4//SsKqUBQkpotcAqQAhtz2v9kCWdoUDnAOtFZkd/CnsZ1sge0ndha40wWDV+nOWyJxkYgicvB8POYtSmldLLepPGMz+J7/Uws=</SignatureValue>
                <KeyInfo>
                  <KeyValue>
                    <RSAKeyValue>
                      <Modulus>4IlzOY3Y9fXoh3Y5f06wBbtTg94Pt6vcfcd1KQ0FLm0S36aGJtTSb6pYKfyX7PqCUQ8wgL6xUJ5GRPEsu9gyz8ZobwfZsGCsvu40CWoT9fcFBZPfXro1Vtlh/xl/yYHm+Gzqh0Bw76xtLHSfLfpVOrmZdwKmSFKMTvNXOFd0V18=</Modulus>
                      <Exponent>AQAB</Exponent>
                    </RSAKeyValue>
                  </KeyValue>
                </KeyInfo>
                <Object Id=\"object\">some text
                  with spaces and CR-LF.</Object>
                </Signature>"
let result4 = readXmlRoot(test4)
printn(result4)

let test5 = "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">
             <head>
             <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />
             <title>XHTML 1.0 Strict Example</title>
             <script type=\"text/javascript\">             
             function loadpdf() {
                document.getElementById(\"pdf-object\").src=\"http://www.w3.org/TR/xhtml1/xhtml1.pdf\";
             }
             </script>
             </head>
             <body onload=\"loadpdf()\">
             <p>
             <p>This is an example of an Extensible HyperText Markup Language</p>
             <br />
             <img id=\"validation-icon\"
                src=\"http://www.w3.org/Icons/valid-xhtml10\"
                alt=\"Valid XHTML 1.0 Strict\" /><br />
             <object id=\"pdf-object\"
                name=\"pdf-object\"
                type=\"application/pdf\"
                data=\"http://www.w3.org/TR/xhtml1/xhtml1.pdf\"
                width=\"100%\"
                height=\"500\">
             </object>
             </p>
             </body>
            </html>"
let result5 = readXmlRoot(test5)
printn(result5)

Console.ReadLine() |> ignore
namespace System
Multiple items
type CustomEqualityAttribute =
  inherit Attribute
  new : unit -> CustomEqualityAttribute

Full name: Microsoft.FSharp.Core.CustomEqualityAttribute

--------------------
new : unit -> CustomEqualityAttribute
Multiple items
type NoComparisonAttribute =
  inherit Attribute
  new : unit -> NoComparisonAttribute

Full name: Microsoft.FSharp.Core.NoComparisonAttribute

--------------------
new : unit -> NoComparisonAttribute
type XmlAttribute =
  {Name: string;
   Value: string;}
  override Equals : that:obj -> bool
  override GetHashCode : unit -> int

Full name: Script.XmlAttribute
XmlAttribute.Name: string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
XmlAttribute.Value: string
val this : XmlAttribute
override XmlAttribute.Equals : that:obj -> bool

Full name: Script.XmlAttribute.Equals
val that : obj
val other : XmlAttribute
override XmlAttribute.GetHashCode : unit -> int

Full name: Script.XmlAttribute.GetHashCode
val hash : obj:'T -> int (requires equality)

Full name: Microsoft.FSharp.Core.Operators.hash
type XmlElementBody =
  | StrValue of string
  | XmlValue of XmlElement array

Full name: Script.XmlElementBody
union case XmlElementBody.StrValue: string -> XmlElementBody
union case XmlElementBody.XmlValue: XmlElement array -> XmlElementBody
type XmlElement =
  {Name: string;
   Attributes: XmlAttribute array;
   Body: XmlElementBody;}

Full name: Script.XmlElement
type 'T array = 'T []

Full name: Microsoft.FSharp.Core.array<_>
XmlElement.Name: string
XmlElement.Attributes: XmlAttribute array
XmlElement.Body: XmlElementBody
val NullXmlAttribute : XmlAttribute

Full name: Script.NullXmlAttribute
val NullXmlElement : XmlElement

Full name: Script.NullXmlElement
val readWhitespace : s:string * idx:int -> string * int

Full name: Script.readWhitespace
val s : string
val idx : int
val mutable idx1 : int
property String.Length: int
val subString : idxDelta:int -> s:string * idx:int -> string * (string * int)

Full name: Script.subString
val idxDelta : int
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
val readName : name:byref<string> -> s:string * idx:int -> string * int

Full name: Script.readName
val name : byref<string>
type byref<'T> = (# "<Common IL Type Omitted>" #)

Full name: Microsoft.FSharp.Core.byref<_>
val checkValidChars : (int -> bool)
val idx1 : int
val failwith : message:string -> 'T

Full name: Microsoft.FSharp.Core.Operators.failwith
val sprintf : format:Printf.StringFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.sprintf
val name_ : string
val retVal : string * int
val readXmlValueContent : content:byref<string> -> s:string * idx:int -> string * int

Full name: Script.readXmlValueContent
val content : byref<string>
val content_ : string
val readValue : value:byref<string> -> s:string * idx:int -> string * int

Full name: Script.readValue
val value : byref<string>
val value_ : string
val readFixed : ch:char -> s:string * idx:int -> string * int

Full name: Script.readFixed
val ch : char
val checkFixed : ch:char -> s:string * idx:int -> bool

Full name: Script.checkFixed
val readAttribute : attr:byref<XmlAttribute> -> s:string * idx:int -> string * int

Full name: Script.readAttribute
val attr : byref<XmlAttribute>
val mutable name : string
val mutable value : string
val idx2 : int
val readAttributeList : attrs:byref<XmlAttribute array> -> s:string * idx:int -> string * int

Full name: Script.readAttributeList
val attrs : byref<XmlAttribute array>
val mutable a : XmlAttribute list
type 'T list = List<'T>

Full name: Microsoft.FSharp.Collections.list<_>
val mutable idx2 : int
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val mutable attr : XmlAttribute
val idx3 : int
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val ofSeq : source:seq<'T> -> 'T list

Full name: Microsoft.FSharp.Collections.List.ofSeq
module Seq

from Microsoft.FSharp.Collections
val distinct : source:seq<'T> -> seq<'T> (requires equality)

Full name: Microsoft.FSharp.Collections.Seq.distinct
val length : source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.length
val length : list:'T list -> int

Full name: Microsoft.FSharp.Collections.List.length
val rev : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.rev
val toArray : list:'T list -> 'T []

Full name: Microsoft.FSharp.Collections.List.toArray
val readClosingElement : name:byref<string> -> s:string * idx:int -> string * int

Full name: Script.readClosingElement
val readElement : elem:byref<XmlElement> -> s:string * idx:int -> string * int

Full name: Script.readElement
val elem : byref<XmlElement>
val mutable attrs : XmlAttribute array
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
field string.Empty
val e : XmlElementBody
val mutable elems : XmlElement array
val readElementList : elems:byref<XmlElement array> -> s:string * idx:int -> string * int

Full name: Script.readElementList
val mutable content : string
val mutable name_ : string
val idx4 : int
val elems : byref<XmlElement array>
val mutable a : XmlElement list
val mutable e : XmlElement
val readXmlFragment : s:string -> XmlElement []

Full name: Script.readXmlFragment
val mutable idx : int
val readXmlRoot : s:string -> XmlElement

Full name: Script.readXmlRoot
val print : node:XmlElement -> unit

Full name: Script.print
val node : XmlElement
val printAttributes : (XmlElement -> unit)
val elem : XmlElement
val a : XmlAttribute
val printf : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printf
val rec_print : (XmlElement * int -> unit)
val tab : int
Multiple items
val int : value:'T -> int (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.int

--------------------
type int = int32

Full name: Microsoft.FSharp.Core.int

--------------------
type int<'Measure> = int

Full name: Microsoft.FSharp.Core.int<_>
val i : int
val x : string
val xs : XmlElement array
val x : XmlElement
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
val printn : node:XmlElement -> unit

Full name: Script.printn
val test1 : string

Full name: Script.test1
val result1 : XmlElement

Full name: Script.result1
val test2 : string

Full name: Script.test2
val result2 : XmlElement

Full name: Script.result2
val test3 : string

Full name: Script.test3
val result3 : XmlElement

Full name: Script.result3
val test4 : string

Full name: Script.test4
val result4 : XmlElement

Full name: Script.result4
val test5 : string

Full name: Script.test5
val result5 : XmlElement

Full name: Script.result5
type Console =
  static member BackgroundColor : ConsoleColor with get, set
  static member Beep : unit -> unit + 1 overload
  static member BufferHeight : int with get, set
  static member BufferWidth : int with get, set
  static member CapsLock : bool
  static member Clear : unit -> unit
  static member CursorLeft : int with get, set
  static member CursorSize : int with get, set
  static member CursorTop : int with get, set
  static member CursorVisible : bool with get, set
  ...

Full name: System.Console
Console.ReadLine() : string
val ignore : value:'T -> unit

Full name: Microsoft.FSharp.Core.Operators.ignore
Raw view Test code New version

More information

Link:http://fssnip.net/jD
Posted:10 years ago
Author:Fabio Galuppo
Tags: xml , parsing