8 people like it.

URL Canonicalization

This function produces safe URLs for Web requests or URI construction. It forces the pattern: "http://" + ("www." or an existing subdomain) + domain + absolute path. When used in an application that takes URLs as input, the user can type "example.com" instead of "http://example.com" or "http://www.example.com". It also supports two-part country-code domains such as google.co.uk or google.com.au.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
open System
open System.Text.RegularExpressions

/// Normalizes a user-supplied URL to the form
/// "http://" + host + absolute path, prepending "www." when the host is a
/// bare registrable domain (e.g. "example.com" or "example.co.uk").
///
/// url: an absolute URL, or a host with an optional path and no scheme
///      (e.g. "example.com/page").
/// Returns the canonical URL, or "" when the input cannot be parsed.
///
/// Notes:
///  * The scheme is always rewritten to "http"; port, query string and
///    fragment are not part of the result.
///  * Hosts that already carry a subdomain (including "www." and multi-level
///    subdomains), IP literals, and names with no recognizable domain
///    (e.g. "localhost") are left unprefixed.
let canonicalize (url : string) =
    // Heuristic for a registrable domain: name + 2-3 letter TLD, optionally
    // followed by a 2-letter country code ("co.uk", "com.au").
    let domPat = @"[^\.]+\.\w{2,3}(\.\w{2})?"

    // Accept a parse only when it actually has a host. Input such as
    // "example.com:8080/x" can parse as an absolute URI whose *scheme* is
    // "example.com" and whose host is empty; rejecting it here lets the
    // "http://" retry below handle it properly.
    let tryParse (s : string) =
        match Uri.TryCreate(s, UriKind.Absolute) with
        | true, u when not (String.IsNullOrEmpty u.Host) -> Some u
        | _ -> None

    let uri =
        match tryParse url with
        | Some u -> Some u
        | None -> tryParse ("http://" + url)

    match uri with
    | Some x ->
        let host = x.Host
        let path = x.AbsolutePath
        // Right-to-left so the match is anchored on the end of the host.
        let dom = Regex(domPat, RegexOptions.RightToLeft).Match(host)
        // "www." is added only when the domain match starts at the very
        // beginning of a DNS host name, i.e. there is no label in front of it.
        // The explicit "www." test covers short names such as "www.bit.ly",
        // which domPat would otherwise take for a "name.tld.cc" domain.
        let needsWww =
            x.HostNameType = UriHostNameType.Dns
            && not (host.StartsWith("www.", StringComparison.OrdinalIgnoreCase))
            && dom.Success
            && dom.Index = 0
        if needsWww then "http://www." + host + path
        else "http://" + host + path
    | None -> ""

// Sample inputs: bare hosts, a host with a path, a full URL, a host that
// already has a subdomain, and two-part country-code domains.
let a = "microsoft.com/web" |> canonicalize
let b = "www.bing.com" |> canonicalize
let c = "http://fssnip.net/tags/seq" |> canonicalize
let d = "fsharp-code.blogspot.com" |> canonicalize
let e = "google.co.uk" |> canonicalize
let f = "google.com.au" |> canonicalize

// Output:
// val a : string = "http://www.microsoft.com/web"
// val b : string = "http://www.bing.com/"
// val c : string = "http://www.fssnip.net/tags/seq"
// val d : string = "http://fsharp-code.blogspot.com/"
// val e : string = "http://www.google.co.uk/"
// val f : string = "http://www.google.com.au/"
namespace System
namespace System.Text
namespace System.Text.RegularExpressions
val canonicalize : url:string -> string

Full name: Script.canonicalize
val url : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
val domPat : string
val url' : bool * Uri
Multiple items
type Uri =
  new : uriString:string -> Uri + 5 overloads
  member AbsolutePath : string
  member AbsoluteUri : string
  member Authority : string
  member DnsSafeHost : string
  member Equals : comparand:obj -> bool
  member Fragment : string
  member GetComponents : components:UriComponents * format:UriFormat -> string
  member GetHashCode : unit -> int
  member GetLeftPart : part:UriPartial -> string
  ...

Full name: System.Uri

--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
Uri.TryCreate(baseUri: Uri, relativeUri: Uri, result: byref<Uri>) : bool
Uri.TryCreate(baseUri: Uri, relativeUri: string, result: byref<Uri>) : bool
Uri.TryCreate(uriString: string, uriKind: UriKind, result: byref<Uri>) : bool
type UriKind =
  | RelativeOrAbsolute = 0
  | Absolute = 1
  | Relative = 2

Full name: System.UriKind
field UriKind.Absolute = 1
val uri : Uri option
val str : Uri
union case Option.Some: Value: 'T -> Option<'T>
val url'' : bool * Uri
union case Option.None: Option<'T>
val x : Uri
val host : string
property Uri.Host: string
val path : string
property Uri.AbsolutePath: string
val host' : string
Multiple items
type Regex =
  new : pattern:string -> Regex + 1 overload
  member GetGroupNames : unit -> string[]
  member GetGroupNumbers : unit -> int[]
  member GroupNameFromNumber : i:int -> string
  member GroupNumberFromName : name:string -> int
  member IsMatch : input:string -> bool + 1 overload
  member Match : input:string -> Match + 2 overloads
  member Matches : input:string -> MatchCollection + 1 overload
  member Options : RegexOptions
  member Replace : input:string * replacement:string -> string + 5 overloads
  ...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
type RegexOptions =
  | None = 0
  | IgnoreCase = 1
  | Multiline = 2
  | ExplicitCapture = 4
  | Compiled = 8
  | Singleline = 16
  | IgnorePatternWhitespace = 32
  | RightToLeft = 64
  | ECMAScript = 256
  | CultureInvariant = 512

Full name: System.Text.RegularExpressions.RegexOptions
field RegexOptions.RightToLeft = 64
type Match =
  inherit Group
  member Groups : GroupCollection
  member NextMatch : unit -> Match
  member Result : replacement:string -> string
  static member Empty : Match
  static member Synchronized : inner:Match -> Match

Full name: System.Text.RegularExpressions.Match
val pattern : string
Regex.Escape(str: string) : string
val m : bool
val a : string

Full name: Script.a
val b : string

Full name: Script.b
val c : string

Full name: Script.c
val d : string

Full name: Script.d
val e : string

Full name: Script.e
val f : string

Full name: Script.f

More information

Link:http://fssnip.net/22
Posted:14 years ago
Author:Taha Hachana
Tags: url , uri , regex