4 people like it.
Like the snippet!
Soundex Algorithm
Algorithms for generating US Census and Daitch-Mokotoff soundex string(s) based on a text input. Soundex is a phonetic algorithm for indexing names by sound, as pronounced in English. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
161:
162:
163:
164:
165:
166:
167:
168:
169:
170:
171:
172:
173:
174:
175:
176:
177:
178:
179:
180:
181:
182:
183:
184:
185:
186:
187:
188:
189:
190:
191:
192:
193:
194:
195:
|
open System
open System.Linq
open System.Text
module Soundex =
/// The Soundex code for a name consists of a letter followed by three numerical digits: the letter
/// is the first letter of the name, and the digits encode the remaining consonants.
/// The National Archives and Records Administration (NARA) maintains the current rule set for the
/// official implementation of Soundex used by the U.S. Government. The American Soundex is a variant
/// of the original Russell Soundex algorithm.
/// Reference: http://www.archives.gov/research/census/soundex.html
let American(text : string) =
let chooser c =
match Char.ToLowerInvariant(c) with
| 'b' | 'f' | 'p' | 'v' -> '1'
| 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' -> '2'
| 'd' | 't' -> '3'
| 'l' -> '4'
| 'm' | 'n' -> '5'
| 'r' -> '6'
| 'h' | 'w' -> '-'
| _ -> if Char.IsDigit(c) then c else '.'
let folder (state : char list) (c) =
match chooser(Seq.head state), chooser(c) with
| p, i when (p <> i && i <> '-') || i = '0' -> i :: state
| _, _ -> state
let value = text.Trim() + "000"
let soundex = Seq.toList (value.Substring(1))
|> List.fold folder [ Char.ToUpperInvariant(value.[0]) ]
|> List.filter (fun c -> c <> '.')
|> List.rev
|> Seq.truncate 4
|> Seq.toArray
String(soundex)
/// Daitch–Mokotoff Soundex (D–M Soundex) was developed in 1985 by genealogist Gary Mokotoff and later
/// improved by genealogist Randy Daitch because of problems they encountered while trying to apply the
/// Russell Soundex to Jews with Germanic or Slavic surnames.
/// References:
/// <list type="bullet">
/// <description>http://www.avotaynu.com/soundex.htm</description>
/// <description>http://www.jewishgen.org/InfoFiles/soundex.html</description>
/// </list>
let DaitchMokotoff(text : string) =
let isVowel(value : string) =
let first = if String.IsNullOrEmpty(value) then String.Empty else value.Substring(0, 1)
match first with
| "A" | "E" | "I" | "O" | "U" | "Y" -> true
| _ -> false
let (|Match|_|) (values : seq<string>) (codes : seq<string>) (s : string) =
let result = Seq.tryFind (fun value -> s.StartsWith(value)) values
if result.IsNone then None else Some( codes, isVowel(result.Value), s.Substring(result.Value.Length))
let (|Group|_|) (s : string) =
match s with
| Match ["AI";"AJ";"AY"] ["0"; "1"; ""] result -> Some(result)
| Match ["AU"] ["0"; "7"; ""] result -> Some(result)
| Match ["A"] ["0"; ""; ""] result -> Some(result)
| Match ["B"] ["7"; "7"; "7"] result -> Some(result)
| Match ["CHS"] ["5"; "54"; "54"] result -> Some(result)
| Match ["CH"] ["KH"; "TCH"] result -> Some(result)
| Match ["CK"] ["K"; "TSK"] result -> Some(result)
| Match ["CZS";"CZ";"CSZ";"CS"] ["4"; "4"; "4"] result -> Some(result)
| Match ["C"] ["K"; "TZ"] result -> Some(result)
| Match ["DRZ";"DRS"] ["4"; "4"; "4"] result -> Some(result)
| Match ["DSZ";"DSH";"DS"] ["4"; "4"; "4"] result -> Some(result)
| Match ["DZH";"DZS";"DZ"] ["4"; "4"; "4"] result -> Some(result)
| Match ["DT"] ["3"; "3"; "3"] result -> Some(result)
| Match ["EI";"EJ";"EY"] ["0"; "1"; ""] result -> Some(result)
| Match ["EU"] ["1"; "1"; ""] result -> Some(result)
| Match ["E"] ["0"; ""; ""] result -> Some(result)
| Match ["FB"; "F"] ["7"; "7"; "7"] result -> Some(result)
| Match ["G"] ["5"; "5"; "5"] result -> Some(result)
| Match ["H"] ["5"; "5"; ""] result -> Some(result)
| Match ["IA";"IE";"IO";"IU"] ["1"; ""; ""] result -> Some(result)
| Match ["I"] ["0"; ""; ""] result -> Some(result)
| Match ["J2Y"] ["1"; "1"; "1"] result -> Some(result)
| Match ["J"] ["J2Y"; "DZH"] result -> Some(result)
| Match ["KS"] ["5"; "54"; "54"] result -> Some(result)
| Match ["KH"; "K"] ["5"; "5"; "5"] result -> Some(result)
| Match ["L"] ["8"; "8"; "8"] result -> Some(result)
| Match ["MN";"NM"] ["66"; "66"; "66"] result -> Some(result)
| Match ["M"; "N"] ["6"; "6"; "6"] result -> Some(result)
| Match ["OI";"OJ";"OY"] ["0"; "1"; ""] result -> Some(result)
| Match ["O"] ["0"; ""; ""] result -> Some(result)
| Match ["PF";"PH"; "P"] ["7"; "7"; "7"] result -> Some(result)
| Match ["RTZ"] ["94"; "94"; "94"] result -> Some(result)
| Match ["RS";"RZ"] ["RTZ"; "ZH"] result -> Some(result)
| Match ["R"] ["9"; "9"; "9"] result -> Some(result)
| Match ["SCHTSCH";"SCHTSH";"SCHTCH"] ["2"; "4"; "4"] result -> Some(result)
| Match ["SCH"] ["4"; "4"; "4"] result -> Some(result)
| Match ["SHTCH";"SHCH";"SHTSH"] ["2"; "4"; "4"] result -> Some(result)
| Match ["SHT";"SCHT";"SCHD"] ["2"; "43"; "43"] result -> Some(result)
| Match ["SH"] ["4"; "4"; "4"] result -> Some(result)
| Match ["STCH";"STSCH";"SC"] ["2"; "4"; "4"] result -> Some(result)
| Match ["STRZ";"STRS";"STSH"] ["2"; "4"; "4"] result -> Some(result)
| Match ["ST"] ["2"; "43"; "43"] result -> Some(result)
| Match ["SZCZ";"SZCS"] ["2"; "4"; "4"] result -> Some(result)
| Match ["SZT";"SHD";"SZD";"SD"] ["2"; "43"; "43"] result -> Some(result)
| Match ["SZ";"S"] ["4"; "4"; "4"] result -> Some(result)
| Match ["TCH";"TTCH";"TTSCH";"THS"] ["4";"4";"4"] result -> Some(result)
| Match ["TH"] ["3";"3";"3"] result -> Some(result)
| Match ["TRZ";"TRS"] ["4";"4";"4"] result -> Some(result)
| Match ["TSCH";"TSH"] ["4";"4";"4"] result -> Some(result)
| Match ["TSK"] ["45";"45";"45"] result -> Some(result)
| Match ["TTSZ";"TTS";"TC"] ["4";"4";"4"] result -> Some(result)
| Match ["TZS";"TTZ";"TZ";"TSZ";"TS"] ["4";"4";"4"] result -> Some(result)
| Match ["T"] ["3";"3";"3"] result -> Some(result)
| Match ["UI";"UJ";"UY"] ["0";"1";""] result -> Some(result)
| Match ["UE";"U"] ["0";"";""] result -> Some(result)
| Match ["V"] ["7";"7";"7"] result -> Some(result)
| Match ["W"] ["7";"7";"7"] result -> Some(result)
| Match ["X"] ["5";"54";"54"] result -> Some(result)
| Match ["Y"] ["1";"";""] result -> Some(result)
| Match ["ZHDZH";"ZDZH";"ZDZ"] ["2";"4";"4"] result -> Some(result)
| Match ["ZD";"ZHD"] ["2";"43";"43"] result -> Some(result)
| Match ["ZSCH";"ZSH";"ZH";"ZS"] ["4";"4";"4"] result -> Some(result)
| Match ["Z"] ["4";"4";"4"] result -> Some(result)
| _ -> None
let search(value : string) =
match value with
| Group result -> (*printfn "%s -> %A" value result;*) result
| _ -> seq[ "" ], false, value.Substring(1)
let rec decompose (value : string) =
let head, isVowel, next = search(value)
if String.IsNullOrEmpty(next) then
seq [ head, isVowel ]
else
Seq.append [ head, isVowel ] (decompose(next))
let rec encode (codes : seq<string>, start : bool, beforeVowel : bool) =
let first = Seq.nth 0 codes
if String.IsNullOrEmpty(first) then
[ ]
else
if String.IsNullOrEmpty(first) || Char.IsDigit(Seq.nth 0 first) then
if start then
[ (Seq.nth 0 codes) ]
else if beforeVowel then
[ (Seq.nth 1 codes) ]
else
[ (Seq.nth 2 codes) ]
else
let head, isVowel, next = search(first)
if Seq.length codes = 1 then
encode(head, start, beforeVowel)
else
List.append (encode(head, start, beforeVowel)) (encode(Seq.skip 1 codes, start, beforeVowel))
let reduce (results : string list) (values : string list) =
let appender(value : string) =
fun (s : string) ->
let result = (s + value)
if result.Length >= 6 then result.Substring(0, 6) else result
seq { for value in values do yield List.map (appender(value)) results }
|> Seq.concat
|> Seq.toList
let first, second = decompose ( text.ToUpperInvariant() )
|> Seq.pairwise
|> Seq.filter (fun (a, b) -> a <> b)
|> Seq.toList
|> List.unzip
let items, vowels = List.unzip (List.head first :: second)
List.zip items ( List.append (List.tail vowels) [ false ] )
|> List.mapi (fun (i) (codes, isVowel) -> encode(codes, i = 0, isVowel))
|> List.filter (fun c -> not(List.isEmpty c) && (List.head c) <> "")
|> List.reduce reduce
|> List.toSeq
|> Seq.distinct
|> Seq.map (fun s -> if s.Length >= 6 then s else String.Concat(s, String('0', 6 - s.Length)))
(*
Soundex.American("Ashcraft") // "A261"
Soundex.American("jackson") // "J250"
Soundex.American("miller") // "M460"
Soundex.American("Wilson") // "W425"
Soundex.American("Schmit") // "S530"
Soundex.American("Lloyd") // "L300"
Soundex.DaitchMokotoff("Peters") // [ "739400"; "734000" ];
Soundex.DaitchMokotoff("Peterson") // [ "739460"; "734600" ];
Soundex.DaitchMokotoff("Moskowitz") // [ "645740" ];
Soundex.DaitchMokotoff("Moskovitz") // [ "645740" ];
Soundex.DaitchMokotoff("Auerbach") // [ "097500"; "097400" ];
Soundex.DaitchMokotoff("Uhrbach") // [ "097500"; "097400" ];
Soundex.DaitchMokotoff("Jackson") // [ "154600"; "454600"; "145460"; "445460" ];
*)
|
namespace System
namespace System.Linq
namespace System.Text
module Soundex
from Script
val American : text:string -> String
Full name: Script.Soundex.American
The Soundex code for a name consists of a letter followed by three numerical digits: the letter
is the first letter of the name, and the digits encode the remaining consonants.
The National Archives and Records Administration (NARA) maintains the current rule set for the
official implementation of Soundex used by the U.S. Government. The American Soundex is a variant
of the original Russell Soundex algorithm.
Reference: http://www.archives.gov/research/census/soundex.html
val text : string
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = String
Full name: Microsoft.FSharp.Core.string
val chooser : (char -> char)
val c : char
type Char =
struct
member CompareTo : value:obj -> int + 1 overload
member Equals : obj:obj -> bool + 1 overload
member GetHashCode : unit -> int
member GetTypeCode : unit -> TypeCode
member ToString : unit -> string + 1 overload
static val MaxValue : char
static val MinValue : char
static member ConvertFromUtf32 : utf32:int -> string
static member ConvertToUtf32 : highSurrogate:char * lowSurrogate:char -> int + 1 overload
static member GetNumericValue : c:char -> float + 1 overload
...
end
Full name: System.Char
Char.ToLowerInvariant(c: char) : char
Char.IsDigit(c: char) : bool
Char.IsDigit(s: string, index: int) : bool
val folder : (char list -> char -> char list)
val state : char list
Multiple items
val char : value:'T -> char (requires member op_Explicit)
Full name: Microsoft.FSharp.Core.Operators.char
--------------------
type char = Char
Full name: Microsoft.FSharp.Core.char
type 'T list = List<'T>
Full name: Microsoft.FSharp.Collections.list<_>
module Seq
from Microsoft.FSharp.Collections
val head : source:seq<'T> -> 'T
Full name: Microsoft.FSharp.Collections.Seq.head
val p : char
val i : char
val value : string
String.Trim() : string
String.Trim([<ParamArray>] trimChars: char []) : string
val soundex : char []
val toList : source:seq<'T> -> 'T list
Full name: Microsoft.FSharp.Collections.Seq.toList
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
Multiple items
module List
from Microsoft.FSharp.Collections
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State
Full name: Microsoft.FSharp.Collections.List.fold
Char.ToUpperInvariant(c: char) : char
val filter : predicate:('T -> bool) -> list:'T list -> 'T list
Full name: Microsoft.FSharp.Collections.List.filter
val rev : list:'T list -> 'T list
Full name: Microsoft.FSharp.Collections.List.rev
val truncate : count:int -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.truncate
val toArray : source:seq<'T> -> 'T []
Full name: Microsoft.FSharp.Collections.Seq.toArray
Multiple items
type String =
new : value:char -> string + 7 overloads
member Chars : int -> char
member Clone : unit -> obj
member CompareTo : value:obj -> int + 1 overload
member Contains : value:string -> bool
member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
member EndsWith : value:string -> bool + 2 overloads
member Equals : obj:obj -> bool + 2 overloads
member GetEnumerator : unit -> CharEnumerator
member GetHashCode : unit -> int
...
Full name: System.String
--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Encoding) : unit
val DaitchMokotoff : text:string -> seq<string>
Full name: Script.Soundex.DaitchMokotoff
Daitch–Mokotoff Soundex (D–M Soundex) was developed in 1985 by genealogist Gary Mokotoff and later
improved by genealogist Randy Daitch because of problems they encountered while trying to apply the
Russell Soundex to Jews with Germanic or Slavic surnames.
References:
<list type="bullet">
<description>http://www.avotaynu.com/soundex.htm</description>
<description>http://www.jewishgen.org/InfoFiles/soundex.html</description>
</list>
val isVowel : (string -> bool)
val first : string
String.IsNullOrEmpty(value: string) : bool
field string.Empty
val values : seq<string>
Multiple items
val seq : sequence:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Core.Operators.seq
--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>
Full name: Microsoft.FSharp.Collections.seq<_>
val codes : seq<string>
val s : string
val result : string option
val tryFind : predicate:('T -> bool) -> source:seq<'T> -> 'T option
Full name: Microsoft.FSharp.Collections.Seq.tryFind
String.StartsWith(value: string) : bool
String.StartsWith(value: string, comparisonType: StringComparison) : bool
String.StartsWith(value: string, ignoreCase: bool, culture: Globalization.CultureInfo) : bool
property Option.IsNone: bool
union case Option.None: Option<'T>
union case Option.Some: Value: 'T -> Option<'T>
property Option.Value: string
property String.Length: int
active recognizer Match: seq<string> -> seq<string> -> string -> (seq<string> * bool * string) option
val result : seq<string> * bool * string
val search : (string -> seq<string> * bool * string)
active recognizer Group: string -> (seq<string> * bool * string) option
val decompose : (string -> seq<seq<string> * bool>)
val head : seq<string>
val isVowel : bool
val next : string
val append : source1:seq<'T> -> source2:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.append
val encode : (seq<string> * bool * bool -> string list)
val start : bool
type bool = Boolean
Full name: Microsoft.FSharp.Core.bool
val beforeVowel : bool
val nth : index:int -> source:seq<'T> -> 'T
Full name: Microsoft.FSharp.Collections.Seq.nth
val length : source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.length
val append : list1:'T list -> list2:'T list -> 'T list
Full name: Microsoft.FSharp.Collections.List.append
val skip : count:int -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.skip
val reduce : (string list -> string list -> string list)
val results : string list
val values : string list
val appender : (string -> string -> string)
val result : string
val map : mapping:('T -> 'U) -> list:'T list -> 'U list
Full name: Microsoft.FSharp.Collections.List.map
val concat : sources:seq<#seq<'T>> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.concat
val first : (seq<string> * bool) list
val second : (seq<string> * bool) list
String.ToUpperInvariant() : string
val pairwise : source:seq<'T> -> seq<'T * 'T>
Full name: Microsoft.FSharp.Collections.Seq.pairwise
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.filter
val a : seq<string> * bool
val b : seq<string> * bool
val unzip : list:('T1 * 'T2) list -> 'T1 list * 'T2 list
Full name: Microsoft.FSharp.Collections.List.unzip
val items : seq<string> list
val vowels : bool list
val head : list:'T list -> 'T
Full name: Microsoft.FSharp.Collections.List.head
val zip : list1:'T1 list -> list2:'T2 list -> ('T1 * 'T2) list
Full name: Microsoft.FSharp.Collections.List.zip
val tail : list:'T list -> 'T list
Full name: Microsoft.FSharp.Collections.List.tail
val mapi : mapping:(int -> 'T -> 'U) -> list:'T list -> 'U list
Full name: Microsoft.FSharp.Collections.List.mapi
val i : int
val c : string list
val not : value:bool -> bool
Full name: Microsoft.FSharp.Core.Operators.not
val isEmpty : list:'T list -> bool
Full name: Microsoft.FSharp.Collections.List.isEmpty
val reduce : reduction:('T -> 'T -> 'T) -> list:'T list -> 'T
Full name: Microsoft.FSharp.Collections.List.reduce
val toSeq : list:'T list -> seq<'T>
Full name: Microsoft.FSharp.Collections.List.toSeq
val distinct : source:seq<'T> -> seq<'T> (requires equality)
Full name: Microsoft.FSharp.Collections.Seq.distinct
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
Full name: Microsoft.FSharp.Collections.Seq.map
String.Concat([<ParamArray>] values: string []) : string
(+0 other overloads)
String.Concat(values: Collections.Generic.IEnumerable<string>) : string
(+0 other overloads)
String.Concat<'T>(values: Collections.Generic.IEnumerable<'T>) : string
(+0 other overloads)
String.Concat([<ParamArray>] args: obj []) : string
(+0 other overloads)
String.Concat(arg0: obj) : string
(+0 other overloads)
String.Concat(str0: string, str1: string) : string
(+0 other overloads)
String.Concat(arg0: obj, arg1: obj) : string
(+0 other overloads)
String.Concat(str0: string, str1: string, str2: string) : string
(+0 other overloads)
String.Concat(arg0: obj, arg1: obj, arg2: obj) : string
(+0 other overloads)
String.Concat(str0: string, str1: string, str2: string, str3: string) : string
(+0 other overloads)
More information