open System
open System.Linq
open System.Text
module Soundex =
/// The Soundex code for a name consists of a letter followed by three numerical digits: the letter
/// is the first letter of the name, and the digits encode the remaining consonants.
/// The National Archives and Records Administration (NARA) maintains the current rule set for the
/// official implementation of Soundex used by the U.S. Government. The American Soundex is a variant
/// of the original Russell Soundex algorithm.
/// Reference: http://www.archives.gov/research/census/soundex.html
let American(text : string) =
let chooser c =
match Char.ToLowerInvariant(c) with
| 'b' | 'f' | 'p' | 'v' -> '1'
| 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' -> '2'
| 'd' | 't' -> '3'
| 'l' -> '4'
| 'm' | 'n' -> '5'
| 'r' -> '6'
| 'h' | 'w' -> '-'
| _ -> if Char.IsDigit(c) then c else '.'
let folder (state : char list) (c) =
match chooser(Seq.head state), chooser(c) with
| p, i when (p <> i && i <> '-') || i = '0' -> i :: state
| _, _ -> state
let value = text.Trim() + "000"
let soundex = Seq.toList (value.Substring(1))
|> List.fold folder [ Char.ToUpperInvariant(value.[0]) ]
|> List.filter (fun c -> c <> '.')
|> List.rev
|> Seq.truncate 4
|> Seq.toArray
String(soundex)
/// Daitch–Mokotoff Soundex (D–M Soundex) was developed in 1985 by genealogist Gary Mokotoff and later
/// improved by genealogist Randy Daitch because of problems they encountered while trying to apply the
/// Russell Soundex to Jews with Germanic or Slavic surnames.
/// References:
///
/// http://www.avotaynu.com/soundex.htm
/// http://www.jewishgen.org/InfoFiles/soundex.html
///
let DaitchMokotoff(text : string) =
let isVowel(value : string) =
let first = if String.IsNullOrEmpty(value) then String.Empty else value.Substring(0, 1)
match first with
| "A" | "E" | "I" | "O" | "U" | "Y" -> true
| _ -> false
let (|Match|_|) (values : seq) (codes : seq) (s : string) =
let result = Seq.tryFind (fun value -> s.StartsWith(value)) values
if result.IsNone then None else Some( codes, isVowel(result.Value), s.Substring(result.Value.Length))
let (|Group|_|) (s : string) =
match s with
| Match ["AI";"AJ";"AY"] ["0"; "1"; ""] result -> Some(result)
| Match ["AU"] ["0"; "7"; ""] result -> Some(result)
| Match ["A"] ["0"; ""; ""] result -> Some(result)
| Match ["B"] ["7"; "7"; "7"] result -> Some(result)
| Match ["CHS"] ["5"; "54"; "54"] result -> Some(result)
| Match ["CH"] ["KH"; "TCH"] result -> Some(result)
| Match ["CK"] ["K"; "TSK"] result -> Some(result)
| Match ["CZS";"CZ";"CSZ";"CS"] ["4"; "4"; "4"] result -> Some(result)
| Match ["C"] ["K"; "TZ"] result -> Some(result)
| Match ["DRZ";"DRS"] ["4"; "4"; "4"] result -> Some(result)
| Match ["DSZ";"DSH";"DS"] ["4"; "4"; "4"] result -> Some(result)
| Match ["DZH";"DZS";"DZ"] ["4"; "4"; "4"] result -> Some(result)
| Match ["DT"] ["3"; "3"; "3"] result -> Some(result)
| Match ["EI";"EJ";"EY"] ["0"; "1"; ""] result -> Some(result)
| Match ["EU"] ["1"; "1"; ""] result -> Some(result)
| Match ["E"] ["0"; ""; ""] result -> Some(result)
| Match ["FB"; "F"] ["7"; "7"; "7"] result -> Some(result)
| Match ["G"] ["5"; "5"; "5"] result -> Some(result)
| Match ["H"] ["5"; "5"; ""] result -> Some(result)
| Match ["IA";"IE";"IO";"IU"] ["1"; ""; ""] result -> Some(result)
| Match ["I"] ["0"; ""; ""] result -> Some(result)
| Match ["J2Y"] ["1"; "1"; "1"] result -> Some(result)
| Match ["J"] ["J2Y"; "DZH"] result -> Some(result)
| Match ["KS"] ["5"; "54"; "54"] result -> Some(result)
| Match ["KH"; "K"] ["5"; "5"; "5"] result -> Some(result)
| Match ["L"] ["8"; "8"; "8"] result -> Some(result)
| Match ["MN";"NM"] ["66"; "66"; "66"] result -> Some(result)
| Match ["M"; "N"] ["6"; "6"; "6"] result -> Some(result)
| Match ["OI";"OJ";"OY"] ["0"; "1"; ""] result -> Some(result)
| Match ["O"] ["0"; ""; ""] result -> Some(result)
| Match ["PF";"PH"; "P"] ["7"; "7"; "7"] result -> Some(result)
| Match ["RTZ"] ["94"; "94"; "94"] result -> Some(result)
| Match ["RS";"RZ"] ["RTZ"; "ZH"] result -> Some(result)
| Match ["R"] ["9"; "9"; "9"] result -> Some(result)
| Match ["SCHTSCH";"SCHTSH";"SCHTCH"] ["2"; "4"; "4"] result -> Some(result)
| Match ["SCH"] ["4"; "4"; "4"] result -> Some(result)
| Match ["SHTCH";"SHCH";"SHTSH"] ["2"; "4"; "4"] result -> Some(result)
| Match ["SHT";"SCHT";"SCHD"] ["2"; "43"; "43"] result -> Some(result)
| Match ["SH"] ["4"; "4"; "4"] result -> Some(result)
| Match ["STCH";"STSCH";"SC"] ["2"; "4"; "4"] result -> Some(result)
| Match ["STRZ";"STRS";"STSH"] ["2"; "4"; "4"] result -> Some(result)
| Match ["ST"] ["2"; "43"; "43"] result -> Some(result)
| Match ["SZCZ";"SZCS"] ["2"; "4"; "4"] result -> Some(result)
| Match ["SZT";"SHD";"SZD";"SD"] ["2"; "43"; "43"] result -> Some(result)
| Match ["SZ";"S"] ["4"; "4"; "4"] result -> Some(result)
| Match ["TCH";"TTCH";"TTSCH";"THS"] ["4";"4";"4"] result -> Some(result)
| Match ["TH"] ["3";"3";"3"] result -> Some(result)
| Match ["TRZ";"TRS"] ["4";"4";"4"] result -> Some(result)
| Match ["TSCH";"TSH"] ["4";"4";"4"] result -> Some(result)
| Match ["TSK"] ["45";"45";"45"] result -> Some(result)
| Match ["TTSZ";"TTS";"TC"] ["4";"4";"4"] result -> Some(result)
| Match ["TZS";"TTZ";"TZ";"TSZ";"TS"] ["4";"4";"4"] result -> Some(result)
| Match ["T"] ["3";"3";"3"] result -> Some(result)
| Match ["UI";"UJ";"UY"] ["0";"1";""] result -> Some(result)
| Match ["UE";"U"] ["0";"";""] result -> Some(result)
| Match ["V"] ["7";"7";"7"] result -> Some(result)
| Match ["W"] ["7";"7";"7"] result -> Some(result)
| Match ["X"] ["5";"54";"54"] result -> Some(result)
| Match ["Y"] ["1";"";""] result -> Some(result)
| Match ["ZHDZH";"ZDZH";"ZDZ"] ["2";"4";"4"] result -> Some(result)
| Match ["ZD";"ZHD"] ["2";"43";"43"] result -> Some(result)
| Match ["ZSCH";"ZSH";"ZH";"ZS"] ["4";"4";"4"] result -> Some(result)
| Match ["Z"] ["4";"4";"4"] result -> Some(result)
| _ -> None
let search(value : string) =
match value with
| Group result -> (*printfn "%s -> %A" value result;*) result
| _ -> seq[ "" ], false, value.Substring(1)
let rec decompose (value : string) =
let head, isVowel, next = search(value)
if String.IsNullOrEmpty(next) then
seq [ head, isVowel ]
else
Seq.append [ head, isVowel ] (decompose(next))
let rec encode (codes : seq, start : bool, beforeVowel : bool) =
let first = Seq.nth 0 codes
if String.IsNullOrEmpty(first) then
[ ]
else
if String.IsNullOrEmpty(first) || Char.IsDigit(Seq.nth 0 first) then
if start then
[ (Seq.nth 0 codes) ]
else if beforeVowel then
[ (Seq.nth 1 codes) ]
else
[ (Seq.nth 2 codes) ]
else
let head, isVowel, next = search(first)
if Seq.length codes = 1 then
encode(head, start, beforeVowel)
else
List.append (encode(head, start, beforeVowel)) (encode(Seq.skip 1 codes, start, beforeVowel))
let reduce (results : string list) (values : string list) =
let appender(value : string) =
fun (s : string) ->
let result = (s + value)
if result.Length >= 6 then result.Substring(0, 6) else result
seq { for value in values do yield List.map (appender(value)) results }
|> Seq.concat
|> Seq.toList
let first, second = decompose ( text.ToUpperInvariant() )
|> Seq.pairwise
|> Seq.filter (fun (a, b) -> a <> b)
|> Seq.toList
|> List.unzip
let items, vowels = List.unzip (List.head first :: second)
List.zip items ( List.append (List.tail vowels) [ false ] )
|> List.mapi (fun (i) (codes, isVowel) -> encode(codes, i = 0, isVowel))
|> List.filter (fun c -> not(List.isEmpty c) && (List.head c) <> "")
|> List.reduce reduce
|> List.toSeq
|> Seq.distinct
|> Seq.map (fun s -> if s.Length >= 6 then s else String.Concat(s, String('0', 6 - s.Length)))
(*
Soundex.American("Ashcraft") // "A261"
Soundex.American("jackson") // "J250"
Soundex.American("miller") // "M460"
Soundex.American("Wilson") // "W425"
Soundex.American("Schmit") // "S530"
Soundex.American("Lloyd") // "L300"
Soundex.DaitchMokotoff("Peters") // [ "739400"; "734000" ];
Soundex.DaitchMokotoff("Peterson") // [ "739460"; "734600" ];
Soundex.DaitchMokotoff("Moskowitz") // [ "645740" ];
Soundex.DaitchMokotoff("Moskovitz") // [ "645740" ];
Soundex.DaitchMokotoff("Auerbach") // [ "097500"; "097400" ];
Soundex.DaitchMokotoff("Uhrbach") // [ "097500"; "097400" ];
Soundex.DaitchMokotoff("Jackson") // [ "154600"; "454600"; "145460"; "445460" ];
*)