2 people like it.
Like the snippet!
Detecting fraud with Benford's law
Many datasets, particularly those which span several orders of magnitude, have a special property. About 30% of the initial digits of all the data items will be the digit '1'. This can be used to detect fraud, for instance in expenses claims, as people tend to concoct figures which don't have this property. These functions implement one possible test for matching Benford's law. (Credits in the comments.)
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
|
module ChoAndGaines
open System
// Cho WKT, Gaines BJ (2007) Breaking the (Benford) law: Statistical fraud detection in campaign finance.
// Amer Stat 61, 218–223
/// Observed frequency in a 'bin': the fraction of the items in `s` whose first
/// significant digit is `i`.
///
/// The first significant digit is the first character in '1'..'9' of the item's
/// ToString() text, so a leading sign, leading zeros and a decimal separator are
/// skipped (-42 counts as 4, 0.05 counts as 5). Items with no such digit (e.g. 0)
/// fall in no bin but still count towards the total.
///
/// Returns NaN for an empty `s` (0/0), as before.
let e i s =
    // Enumerate the input exactly once so that the total and the bin count are
    // taken from the same items even when `s` is lazy or not repeatable.
    let items = Array.ofSeq s
    // First character in '1'..'9' of the item's text, as an int, if there is one.
    let firstSignificantDigit x =
        (box x).ToString()
        |> Seq.tryFind (fun c -> c >= '1' && c <= '9')
        |> Option.map (fun c -> int c - int '0')
    let matching =
        items
        |> Array.filter (fun x -> firstSignificantDigit x = Some i)
        |> Array.length
    float matching / float (Array.length items)
/// Benford's law probability that the leading digit equals `i`:
/// log10(1 + 1/i).
let b i =
    let digit = float i
    log10 (1.0 + 1.0 / digit)
/// Cho and Gaines distance: the square root of the summed squared differences
/// between the Benford probability `b i` and the observed frequency `e i s`
/// over the digits 1..9.
let d s =
    // Materialise once. `e` is called for each of the nine digits and walks its
    // input every time; a lazy or non-repeatable sequence (e.g. random draws
    // produced through Seq.map) would otherwise present different items per digit.
    let data = Array.ofSeq s
    [1..9]
    |> Seq.map (fun i -> (b i - (e i data)) ** 2.)
    |> Seq.sum
    |> Math.Sqrt
/// Cho and Gaines distance times square root of data count. (d*)
///
/// The input is enumerated once and the same snapshot is used both for the count
/// and for the distance, so lazy or non-repeatable sequences give a consistent result.
let d' s =
    let data = Array.ofSeq s
    let n2 = data |> Array.length |> float |> sqrt
    n2 * d data
/// Interpret the d* statistic based on critical values established by Morrow.
///
/// From http://www.johnmorrow.info/projects/benford/benfordMain.pdf
///
/// Returns a pair of strings. The first starts with "Below: " and lists the
/// significance levels whose critical value is greater than `d'`; the second
/// starts with "Above: " and lists the remaining levels. Level names are
/// separated by single spaces, e.g. ("Below: α 0.01", "Above: α 0.1 α 0.05").
/// A NaN statistic compares false against every critical value, so all levels
/// are reported under "Above: ".
let interpret d' =
    let limits =
        [
            "α 0.1", 1.212
            "α 0.05", 1.330
            "α 0.01", 1.569
        ]
    let below, above =
        limits |> List.partition (fun (_, threshold) -> d' < threshold)
    // Join names with single spaces; the label already carries its trailing
    // space, so no extra separator is inserted before the first name.
    let describe label entries =
        label + (entries |> List.map fst |> String.concat " ")
    describe "Below: " below, describe "Above: " above
// Usage examples:
// ("Below: ", "Above: α 0.1 α 0.05 α 0.01")
/// Applies the d* test to 1000 values drawn from Random.Next().
let randomTest =
    let r = new Random()
    // Draw the sample eagerly. A lazy Seq.map over r.Next() would produce fresh
    // random numbers on every traversal, so each pass made by the statistic
    // would see a different data set.
    let sample = [1..1000] |> List.map (fun _ -> r.Next())
    sample
    |> d'
    |> interpret
// ("Below: α 0.1 α 0.05 α 0.01", "Above: ")
/// Applies the d* test to the byte lengths of the files found directly in
/// c:\windows (top directory only).
let fileSizeTest =
    // Length in bytes of the file at the given path.
    let sizeOf (path: string) =
        let info = new System.IO.FileInfo(path)
        info.Length
    let files =
        System.IO.Directory.EnumerateFiles(@"c:\windows", "*.*", System.IO.SearchOption.TopDirectoryOnly)
    interpret (d' (Seq.map sizeOf files))
// ("Below: α 0.01", "Above: α 0.1 α 0.05")
/// Applies the d* test to a fixed list of 22 figures (per the binding's name,
/// populations of Welsh local authorities).
let welshLocalAuthorityPopulationsTest =
    let populations =
        [ 69700; 121900; 115200; 93700; 152500; 134800
          133000; 75900; 122400; 183800; 239000; 139800
          139200; 126300; 346100; 234400; 58800; 178800
          69800; 91100; 91300; 145700 ]
    interpret (d' populations)
|
module ChoAndGaines
namespace System
val e : i:int -> s:seq<'a> -> float
Full name: ChoAndGaines.e
Observed frequency in a 'bin'.
val i : int
val s : seq<'a>
val n : float
module Seq
from Microsoft.FSharp.Collections
val length : source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.length
Multiple items
val float : value:'T -> float (requires member op_Explicit)
Full name: Microsoft.FSharp.Core.Operators.float
--------------------
type float = Double
Full name: Microsoft.FSharp.Core.float
--------------------
type float<'Measure> = float
Full name: Microsoft.FSharp.Core.float<_>
val startsi : float
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.filter
val x : 'a
val digit : int
type Int32 =
struct
member CompareTo : value:obj -> int + 1 overload
member Equals : obj:obj -> bool + 1 overload
member GetHashCode : unit -> int
member GetTypeCode : unit -> TypeCode
member ToString : unit -> string + 3 overloads
static val MaxValue : int
static val MinValue : int
static member Parse : s:string -> int + 3 overloads
static member TryParse : s:string * result:int -> bool + 1 overload
end
Full name: System.Int32
Int32.Parse(s: string) : int
Int32.Parse(s: string, provider: IFormatProvider) : int
Int32.Parse(s: string, style: Globalization.NumberStyles) : int
Int32.Parse(s: string, style: Globalization.NumberStyles, provider: IFormatProvider) : int
Object.ToString() : string
val b : i:int -> float
Full name: ChoAndGaines.b
Probability of appearing in a 'bin' according to Benford's law.
type Math =
static val PI : float
static val E : float
static member Abs : value:sbyte -> sbyte + 6 overloads
static member Acos : d:float -> float
static member Asin : d:float -> float
static member Atan : d:float -> float
static member Atan2 : y:float * x:float -> float
static member BigMul : a:int * b:int -> int64
static member Ceiling : d:decimal -> decimal + 1 overload
static member Cos : d:float -> float
...
Full name: System.Math
Math.Log10(d: float) : float
val d : s:seq<'a> -> float
Full name: ChoAndGaines.d
Cho and Gaines distance.
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
Full name: Microsoft.FSharp.Collections.Seq.map
val sum : source:seq<'T> -> 'T (requires member ( + ) and member get_Zero)
Full name: Microsoft.FSharp.Collections.Seq.sum
Math.Sqrt(d: float) : float
val d' : s:seq<'a> -> float
Full name: ChoAndGaines.d'
Cho and Gaines distance times square root of data count. (d*)
val n2 : float
val sqrt : value:'T -> 'U (requires member Sqrt)
Full name: Microsoft.FSharp.Core.Operators.sqrt
val interpret : d':float -> string * string
Full name: ChoAndGaines.interpret
Interpret the d* statistic based on critical values established by Morrow.
From http://www.johnmorrow.info/projects/benford/benfordMain.pdf
val d' : float
val limits : (string * float) list
val below : (string * float) []
val above : (string * float) []
type Array =
member Clone : unit -> obj
member CopyTo : array:Array * index:int -> unit + 1 overload
member GetEnumerator : unit -> IEnumerator
member GetLength : dimension:int -> int
member GetLongLength : dimension:int -> int64
member GetLowerBound : dimension:int -> int
member GetUpperBound : dimension:int -> int
member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
member Initialize : unit -> unit
member IsFixedSize : bool
...
Full name: System.Array
val ofSeq : source:seq<'T> -> 'T []
Full name: Microsoft.FSharp.Collections.Array.ofSeq
val partition : predicate:('T -> bool) -> array:'T [] -> 'T [] * 'T []
Full name: Microsoft.FSharp.Collections.Array.partition
val t : float
val belowStr : string
val aboveStr : string
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []
Full name: Microsoft.FSharp.Collections.Array.map
val name : string
val fold : folder:('State -> 'T -> 'State) -> state:'State -> array:'T [] -> 'State
Full name: Microsoft.FSharp.Collections.Array.fold
val acc : string
val sprintf : format:Printf.StringFormat<'T> -> 'T
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.sprintf
val randomTest : string * string
Full name: ChoAndGaines.randomTest
val r : Random
Multiple items
type Random =
new : unit -> Random + 1 overload
member Next : unit -> int + 2 overloads
member NextBytes : buffer:byte[] -> unit
member NextDouble : unit -> float
Full name: System.Random
--------------------
Random() : unit
Random(Seed: int) : unit
Random.Next() : int
Random.Next(maxValue: int) : int
Random.Next(minValue: int, maxValue: int) : int
val fileSizeTest : string * string
Full name: ChoAndGaines.fileSizeTest
namespace System.IO
type Directory =
static member CreateDirectory : path:string -> DirectoryInfo + 1 overload
static member Delete : path:string -> unit + 1 overload
static member EnumerateDirectories : path:string -> IEnumerable<string> + 2 overloads
static member EnumerateFileSystemEntries : path:string -> IEnumerable<string> + 2 overloads
static member EnumerateFiles : path:string -> IEnumerable<string> + 2 overloads
static member Exists : path:string -> bool
static member GetAccessControl : path:string -> DirectorySecurity + 1 overload
static member GetCreationTime : path:string -> DateTime
static member GetCreationTimeUtc : path:string -> DateTime
static member GetCurrentDirectory : unit -> string
...
Full name: System.IO.Directory
IO.Directory.EnumerateFiles(path: string) : Collections.Generic.IEnumerable<string>
IO.Directory.EnumerateFiles(path: string, searchPattern: string) : Collections.Generic.IEnumerable<string>
IO.Directory.EnumerateFiles(path: string, searchPattern: string, searchOption: IO.SearchOption) : Collections.Generic.IEnumerable<string>
type SearchOption =
| TopDirectoryOnly = 0
| AllDirectories = 1
Full name: System.IO.SearchOption
field IO.SearchOption.TopDirectoryOnly = 0
val f : IO.FileInfo
Multiple items
type FileInfo =
inherit FileSystemInfo
new : fileName:string -> FileInfo
member AppendText : unit -> StreamWriter
member CopyTo : destFileName:string -> FileInfo + 1 overload
member Create : unit -> FileStream
member CreateText : unit -> StreamWriter
member Decrypt : unit -> unit
member Delete : unit -> unit
member Directory : DirectoryInfo
member DirectoryName : string
member Encrypt : unit -> unit
...
Full name: System.IO.FileInfo
--------------------
IO.FileInfo(fileName: string) : unit
property IO.FileInfo.Length: int64
val welshLocalAuthorityPopulationsTest : string * string
Full name: ChoAndGaines.welshLocalAuthorityPopulationsTest
More information