3 people like it.

What word is most like the word "turtle"?

A while ago I posted a snippet to calculate the 'Discrete Fréchet Distance' between two curves. If we treat a word as a 'curve' by giving each letter an index (with similar-sounding letters having closer indices) we can compare words by the Fréchet distance between them! An alternative to edit-distance...

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
open System
open System.Net

// Discrete Frechet Distance: 
// (Based on the 1994 algorithm from Thomas Eiter and Heikki Mannila.)
let frechet (P : array<float*float>) (Q : array<float*float>) =
    let sq (x : float) = x * x
    let min3 x y z = [x; y; z] |> List.min
    let d (a : float*float) (b: float*float) =
        let ab_x = fst(a) - fst(b)
        let ab_y = snd(a) - snd(b)
        sqrt(sq ab_x + sq ab_y)

    let p, q = Array.length P, Array.length Q
    let ca = Array2D.init p q (fun _ _ -> -1.0)

    let rec c i j =
        if ca.[i, j] > -1.0 then
            ca.[i, j]
        else
            if i = 0 && j = 0 then
                ca.[i, j] <- d (P.[0]) (Q.[0])
            elif i > 0 && j = 0 then 
                ca.[i, j] <- Math.Max((c (i-1) 0), (d P.[i] Q.[0]))
            elif i = 0 && j > 0 then 
                ca.[i, j] <- Math.Max((c 0 (j-1)), (d P.[0] Q.[j]))
            elif i > 0 && j > 0 then
                ca.[i, j] <- Math.Max(min3 (c (i-1) j) (c (i-1) (j-1)) (c i (j-1)), (d P.[i] Q.[j]))
            else
                ca.[i, j] <- nan
            ca.[i, j]
    c (p-1) (q-1)

// Use frechet as an operator:
let (-~~) a1 a2 = abs(frechet a1 a2)

// Make an array of letters in roughly sound-alike order, with an index to reflect roughly how
// similar each sounds to its neighbour in the list:
let letters = [|
                    'a', 1.0
                    'e', 2.0
                    'i', 3.0
                    'o', 4.0
                    'u', 5.0
                    'y', 6.0
                    'h', 7.0

                    'b', 17.0
                    'p', 18.0
                    't', 19.0
                    'd', 20.0
                    'j', 21.0

                    'r', 25.0

                    'c', 35.0
                    'k', 36.0
                    'q', 37.0
                    'x', 38.0
                    'g', 39.0

                    'l', 49.0
                    'm', 50.0
                    'n', 51.0

                    's', 61.0
                    'z', 62.0

                    'f', 72.0
                    'v', 73.0

                    'w', 83.0
              |]

// Get the 'similarity index' of a letter:
let letterValue letter = 
    let pair = try
                   letters 
                   |> Array.find (fun elem -> fst(elem) = letter)
               with
                | _ -> ' ', 99.9
    snd(pair)

// Treat a word as a curve of similarity indices:
let wordCurve (word : string) =
    word.ToLower().ToCharArray()
    |> Array.mapi (fun i letter -> float(i), letterValue letter)

// Work out the Frechet distance between two word curves:
let wordDistance word1 word2  =
    (wordCurve word1) -~~ (wordCurve word2)

// Make a funky operator for wordDistance:
let (<-->) = wordDistance

// Read a file from a url but as if it's local for performance:
let urlReader (url : string) =
    let req = WebRequest.Create(url, Timeout = 1000 * 60 * 20)
    try
        let resp = req.GetResponse()
        let stream = resp.GetResponseStream()
        let tempFileName = System.IO.Path.GetTempFileName()
        let tempFileStream = new System.IO.FileStream(tempFileName, System.IO.FileMode.Truncate)
        stream.CopyTo(tempFileStream)
        tempFileStream.Seek(int64(0), System.IO.SeekOrigin.Begin) |> ignore
        new System.IO.StreamReader(tempFileStream)
    with
        | _ as ex -> failwith ex.Message

// Read a word list and break it up into non-trivial, lowercase words:
let longWordList() =
    let goodLetters = [|'a'..'z'|]
    let goodWord (word : string) =
        let badIndices =
            word.ToCharArray()
            |> Array.map (fun letter -> Array.IndexOf(goodLetters, letter)) 
            |> Array.filter (fun index -> index < 0)
        (badIndices |> Array.length) = 0

    let reader = urlReader "http://unix-tree.huihoo.org/V7/usr/dict/words.html"
    seq {
            while not (reader.EndOfStream) do
            yield (reader.ReadLine().ToLower())
    } 
    |> Seq.skip 17 // Skip HTML
    |> Seq.filter (fun word -> word.Length > 2)
    |> Seq.filter (fun word -> goodWord word)
    |> Seq.cache

let mostLike word n =
    longWordList()
    |> Seq.sortBy (fun listWord -> listWord <--> word)
    |> Seq.truncate n

let leastLike word n =
    longWordList()
    |> Seq.sortBy (fun listWord -> -(listWord <--> word))
    |> Seq.truncate n

// Examples:

//    mostLike "turtle" 10 |> Seq.iter (fun item -> printfn "%s" item);;
//    turtle
//    purple
//    diddle
//    dudley
//    paddle
//    peddle
//    piddle
//    puddle
//    puddly
//    toddle

//    leastLike "turtle" 10 |> Seq.iter (fun item -> printfn "%s" item);;
//    bartholomew
//    counterflow
//    grandnephew
//    hereinbelow
//    marshmallow
//    longfellow
//    afterglow
//    bmw
//    bow
//    caw
namespace System
namespace System.Net
val frechet : P:(float * float) array -> Q:(float * float) array -> float

Full name: Script.frechet
val P : (float * float) array
type 'T array = 'T []

Full name: Microsoft.FSharp.Core.array<_>
Multiple items
val float : value:'T -> float (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.float

--------------------
type float = Double

Full name: Microsoft.FSharp.Core.float

--------------------
type float<'Measure> = float

Full name: Microsoft.FSharp.Core.float<_>
val Q : (float * float) array
val sq : (float -> float)
val x : float
val min3 : ('a -> 'a -> 'a -> 'a) (requires comparison)
val x : 'a (requires comparison)
val y : 'a (requires comparison)
val z : 'a (requires comparison)
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val min : list:'T list -> 'T (requires comparison)

Full name: Microsoft.FSharp.Collections.List.min
val d : (float * float -> float * float -> float)
val a : float * float
val b : float * float
val ab_x : float
val fst : tuple:('T1 * 'T2) -> 'T1

Full name: Microsoft.FSharp.Core.Operators.fst
val ab_y : float
val snd : tuple:('T1 * 'T2) -> 'T2

Full name: Microsoft.FSharp.Core.Operators.snd
val sqrt : value:'T -> 'U (requires member Sqrt)

Full name: Microsoft.FSharp.Core.Operators.sqrt
val p : int
val q : int
type Array =
  member Clone : unit -> obj
  member CopyTo : array:Array * index:int -> unit + 1 overload
  member GetEnumerator : unit -> IEnumerator
  member GetLength : dimension:int -> int
  member GetLongLength : dimension:int -> int64
  member GetLowerBound : dimension:int -> int
  member GetUpperBound : dimension:int -> int
  member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
  member Initialize : unit -> unit
  member IsFixedSize : bool
  ...

Full name: System.Array
val length : array:'T [] -> int

Full name: Microsoft.FSharp.Collections.Array.length
val ca : float [,]
module Array2D

from Microsoft.FSharp.Collections
val init : length1:int -> length2:int -> initializer:(int -> int -> 'T) -> 'T [,]

Full name: Microsoft.FSharp.Collections.Array2D.init
val c : (int -> int -> float)
val i : int
val j : int
type Math =
  static val PI : float
  static val E : float
  static member Abs : value:sbyte -> sbyte + 6 overloads
  static member Acos : d:float -> float
  static member Asin : d:float -> float
  static member Atan : d:float -> float
  static member Atan2 : y:float * x:float -> float
  static member BigMul : a:int * b:int -> int64
  static member Ceiling : d:decimal -> decimal + 1 overload
  static member Cos : d:float -> float
  ...

Full name: System.Math
Math.Max(val1: decimal, val2: decimal) : decimal
   (+0 other overloads)
Math.Max(val1: float, val2: float) : float
   (+0 other overloads)
Math.Max(val1: float32, val2: float32) : float32
   (+0 other overloads)
Math.Max(val1: uint64, val2: uint64) : uint64
   (+0 other overloads)
Math.Max(val1: int64, val2: int64) : int64
   (+0 other overloads)
Math.Max(val1: uint32, val2: uint32) : uint32
   (+0 other overloads)
Math.Max(val1: int, val2: int) : int
   (+0 other overloads)
Math.Max(val1: uint16, val2: uint16) : uint16
   (+0 other overloads)
Math.Max(val1: int16, val2: int16) : int16
   (+0 other overloads)
Math.Max(val1: byte, val2: byte) : byte
   (+0 other overloads)
val nan : float

Full name: Microsoft.FSharp.Core.Operators.nan
val a1 : (float * float) array
val a2 : (float * float) array
val abs : value:'T -> 'T (requires member Abs)

Full name: Microsoft.FSharp.Core.Operators.abs
val letters : (char * float) []

Full name: Script.letters
val letterValue : letter:char -> float

Full name: Script.letterValue
val letter : char
val pair : char * float
val find : predicate:('T -> bool) -> array:'T [] -> 'T

Full name: Microsoft.FSharp.Collections.Array.find
val elem : char * float
val wordCurve : word:string -> (float * float) []

Full name: Script.wordCurve
val word : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
String.ToLower() : string
String.ToLower(culture: Globalization.CultureInfo) : string
val mapi : mapping:(int -> 'T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.mapi
val wordDistance : word1:string -> word2:string -> float

Full name: Script.wordDistance
val word1 : string
val word2 : string
val urlReader : url:string -> IO.StreamReader

Full name: Script.urlReader
val url : string
val req : WebRequest
type WebRequest =
  inherit MarshalByRefObject
  member Abort : unit -> unit
  member AuthenticationLevel : AuthenticationLevel with get, set
  member BeginGetRequestStream : callback:AsyncCallback * state:obj -> IAsyncResult
  member BeginGetResponse : callback:AsyncCallback * state:obj -> IAsyncResult
  member CachePolicy : RequestCachePolicy with get, set
  member ConnectionGroupName : string with get, set
  member ContentLength : int64 with get, set
  member ContentType : string with get, set
  member Credentials : ICredentials with get, set
  member EndGetRequestStream : asyncResult:IAsyncResult -> Stream
  ...

Full name: System.Net.WebRequest
WebRequest.Create(requestUri: Uri) : WebRequest
WebRequest.Create(requestUriString: string) : WebRequest
val resp : WebResponse
WebRequest.GetResponse() : WebResponse
val stream : IO.Stream
WebResponse.GetResponseStream() : IO.Stream
val tempFileName : string
namespace System.IO
type Path =
  static val DirectorySeparatorChar : char
  static val AltDirectorySeparatorChar : char
  static val VolumeSeparatorChar : char
  static val InvalidPathChars : char[]
  static val PathSeparator : char
  static member ChangeExtension : path:string * extension:string -> string
  static member Combine : [<ParamArray>] paths:string[] -> string + 3 overloads
  static member GetDirectoryName : path:string -> string
  static member GetExtension : path:string -> string
  static member GetFileName : path:string -> string
  ...

Full name: System.IO.Path
IO.Path.GetTempFileName() : string
val tempFileStream : IO.FileStream
Multiple items
type FileStream =
  inherit Stream
  new : path:string * mode:FileMode -> FileStream + 14 overloads
  member BeginRead : array:byte[] * offset:int * numBytes:int * userCallback:AsyncCallback * stateObject:obj -> IAsyncResult
  member BeginWrite : array:byte[] * offset:int * numBytes:int * userCallback:AsyncCallback * stateObject:obj -> IAsyncResult
  member CanRead : bool
  member CanSeek : bool
  member CanWrite : bool
  member EndRead : asyncResult:IAsyncResult -> int
  member EndWrite : asyncResult:IAsyncResult -> unit
  member Flush : unit -> unit + 1 overload
  member GetAccessControl : unit -> FileSecurity
  ...

Full name: System.IO.FileStream

--------------------
IO.FileStream(path: string, mode: IO.FileMode) : unit
   (+0 other overloads)
IO.FileStream(handle: Win32.SafeHandles.SafeFileHandle, access: IO.FileAccess) : unit
   (+0 other overloads)
IO.FileStream(path: string, mode: IO.FileMode, access: IO.FileAccess) : unit
   (+0 other overloads)
IO.FileStream(handle: Win32.SafeHandles.SafeFileHandle, access: IO.FileAccess, bufferSize: int) : unit
   (+0 other overloads)
IO.FileStream(path: string, mode: IO.FileMode, access: IO.FileAccess, share: IO.FileShare) : unit
   (+0 other overloads)
IO.FileStream(handle: Win32.SafeHandles.SafeFileHandle, access: IO.FileAccess, bufferSize: int, isAsync: bool) : unit
   (+0 other overloads)
IO.FileStream(path: string, mode: IO.FileMode, access: IO.FileAccess, share: IO.FileShare, bufferSize: int) : unit
   (+0 other overloads)
IO.FileStream(path: string, mode: IO.FileMode, access: IO.FileAccess, share: IO.FileShare, bufferSize: int, options: IO.FileOptions) : unit
   (+0 other overloads)
IO.FileStream(path: string, mode: IO.FileMode, access: IO.FileAccess, share: IO.FileShare, bufferSize: int, useAsync: bool) : unit
   (+0 other overloads)
IO.FileStream(path: string, mode: IO.FileMode, rights: Security.AccessControl.FileSystemRights, share: IO.FileShare, bufferSize: int, options: IO.FileOptions) : unit
   (+0 other overloads)
type FileMode =
  | CreateNew = 1
  | Create = 2
  | Open = 3
  | OpenOrCreate = 4
  | Truncate = 5
  | Append = 6

Full name: System.IO.FileMode
field IO.FileMode.Truncate = 5
IO.Stream.CopyTo(destination: IO.Stream) : unit
IO.Stream.CopyTo(destination: IO.Stream, bufferSize: int) : unit
IO.FileStream.Seek(offset: int64, origin: IO.SeekOrigin) : int64
Multiple items
val int64 : value:'T -> int64 (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.int64

--------------------
type int64 = Int64

Full name: Microsoft.FSharp.Core.int64

--------------------
type int64<'Measure> = int64

Full name: Microsoft.FSharp.Core.int64<_>
type SeekOrigin =
  | Begin = 0
  | Current = 1
  | End = 2

Full name: System.IO.SeekOrigin
field IO.SeekOrigin.Begin = 0
val ignore : value:'T -> unit

Full name: Microsoft.FSharp.Core.Operators.ignore
Multiple items
type StreamReader =
  inherit TextReader
  new : stream:Stream -> StreamReader + 9 overloads
  member BaseStream : Stream
  member Close : unit -> unit
  member CurrentEncoding : Encoding
  member DiscardBufferedData : unit -> unit
  member EndOfStream : bool
  member Peek : unit -> int
  member Read : unit -> int + 1 overload
  member ReadLine : unit -> string
  member ReadToEnd : unit -> string
  ...

Full name: System.IO.StreamReader

--------------------
IO.StreamReader(stream: IO.Stream) : unit
IO.StreamReader(path: string) : unit
IO.StreamReader(stream: IO.Stream, detectEncodingFromByteOrderMarks: bool) : unit
IO.StreamReader(stream: IO.Stream, encoding: Text.Encoding) : unit
IO.StreamReader(path: string, detectEncodingFromByteOrderMarks: bool) : unit
IO.StreamReader(path: string, encoding: Text.Encoding) : unit
IO.StreamReader(stream: IO.Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
IO.StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
IO.StreamReader(stream: IO.Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
IO.StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
val ex : exn
val failwith : message:string -> 'T

Full name: Microsoft.FSharp.Core.Operators.failwith
property Exception.Message: string
val longWordList : unit -> seq<string>

Full name: Script.longWordList
val goodLetters : char []
val goodWord : (string -> bool)
val badIndices : int []
String.ToCharArray() : char []
String.ToCharArray(startIndex: int, length: int) : char []
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.map
Array.IndexOf<'T>(array: 'T [], value: 'T) : int
Array.IndexOf(array: Array, value: obj) : int
Array.IndexOf<'T>(array: 'T [], value: 'T, startIndex: int) : int
Array.IndexOf(array: Array, value: obj, startIndex: int) : int
Array.IndexOf<'T>(array: 'T [], value: 'T, startIndex: int, count: int) : int
Array.IndexOf(array: Array, value: obj, startIndex: int, count: int) : int
val filter : predicate:('T -> bool) -> array:'T [] -> 'T []

Full name: Microsoft.FSharp.Collections.Array.filter
val index : int
val reader : IO.StreamReader
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
property IO.StreamReader.EndOfStream: bool
IO.StreamReader.ReadLine() : string
module Seq

from Microsoft.FSharp.Collections
val skip : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.skip
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.filter
property String.Length: int
val cache : source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.cache
val mostLike : word:string -> n:int -> seq<string>

Full name: Script.mostLike
val n : int
val sortBy : projection:('T -> 'Key) -> source:seq<'T> -> seq<'T> (requires comparison)

Full name: Microsoft.FSharp.Collections.Seq.sortBy
val listWord : string
val truncate : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.truncate
val leastLike : word:string -> n:int -> seq<string>

Full name: Script.leastLike
Raw view Test code New version

More information

Link:http://fssnip.net/bg
Posted:12 years ago
Author:Kit Eason
Tags: learning f# , text processing