29 people like it.

Jaro-Winkler in F#

Jaro-Winkler is a fast and effective name matching algorithm. For more info see "A Comparison of String Distance Metrics for Name-Matching Tasks" http://www.isi.edu/info-agents/workshops/ijcai03/papers/Cohen-p.pdf or the Wikipedia article http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
let windowAt str offset radius =
    let startAt = max 0 (offset - radius)
    let endAt = min (offset + radius) (String.length str - 1)  
    [ for i in startAt .. endAt -> str.[i] ]

let jaro s1 s2 = 
    let matchRadius = 
        let s1_l, s2_l = String.length s1, String.length s2 in
            (min s1_l s2_l) / 2 +
            (min s1_l s2_l) % 2

    let commonChars chars1 chars2 =
        [
            for i = 0 to String.length chars1 - 1 do
                let matchChar = chars1.[i]
                let windowChars = windowAt chars2 i matchRadius
                if windowChars |> List.exists (fun c -> c = matchChar) then
                    yield matchChar 
        ]
            
    let common1 = commonChars s1 s2
    let common2 = commonChars s2 s1

    let totalTranspositions = 
        List.zip common1 common2
        |> List.fold (fun s (c1,c2) -> if c1 <> c2 then s + 1.0 else s) 0.0

    let transpositions = totalTranspositions / 2.0
    let s1length = float (String.length s1)
    let s2length = float (String.length s2)
    let c1length = float (List.length common1)    
    let c2length = float (List.length common2)

    ((c1length / s1length) + 
     (c2length / s2length) + 
     ((c1length - transpositions) / c1length)) 
     / 3.0


let jaroWinkler s1 s2 = 
    let jaroScore = jaro s1 s2
    let p = 0.1
    let maxLength = (min s1.Length s2.Length) - 1
    let rec calcL i acc =
        if i > maxLength || s1.[i] <> s2.[i] then acc
        else calcL (i + 1) (acc + 1.0)
    let l = calcL 0 0.0
    jaroScore + (l * p * (1.0 - jaroScore))
val windowAt : str:string -> offset:int -> radius:int -> char list

Full name: Script.windowAt
val str : string
val offset : int
val radius : int
val startAt : int
val max : e1:'T -> e2:'T -> 'T (requires comparison)

Full name: Microsoft.FSharp.Core.Operators.max
val endAt : int
val min : e1:'T -> e2:'T -> 'T (requires comparison)

Full name: Microsoft.FSharp.Core.Operators.min
module String

from Microsoft.FSharp.Core
val length : str:string -> int

Full name: Microsoft.FSharp.Core.String.length
val i : int
val jaro : s1:string -> s2:string -> float

Full name: Script.jaro
val s1 : string
val s2 : string
val matchRadius : int
val s1_l : int
val s2_l : int
val commonChars : (string -> string -> char list)
val chars1 : string
val chars2 : string
val matchChar : char
val windowChars : char list
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val exists : predicate:('T -> bool) -> list:'T list -> bool

Full name: Microsoft.FSharp.Collections.List.exists
val c : char
val common1 : char list
val common2 : char list
val totalTranspositions : float
val zip : list1:'T1 list -> list2:'T2 list -> ('T1 * 'T2) list

Full name: Microsoft.FSharp.Collections.List.zip
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State

Full name: Microsoft.FSharp.Collections.List.fold
val s : float
val c1 : char
val c2 : char
val transpositions : float
val s1length : float
Multiple items
val float : value:'T -> float (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.float

--------------------
type float = System.Double

Full name: Microsoft.FSharp.Core.float

--------------------
type float<'Measure> = float

Full name: Microsoft.FSharp.Core.float<_>
val s2length : float
val c1length : float
val length : list:'T list -> int

Full name: Microsoft.FSharp.Collections.List.length
val c2length : float
val jaroWinkler : s1:string -> s2:string -> float

Full name: Script.jaroWinkler
val jaroScore : float
val p : float
val maxLength : int
property System.String.Length: int
val calcL : (int -> float -> float)
val acc : float
val l : float

More information

Link:http://fssnip.net/2M
Posted:13 years ago
Author:Rick Minerich
Tags: text , matching