1 people like it.
Like the snippet!
n-gram algorithm
Simple n-gram algorithm implementation
)
( simple implementation of n-grams algorithm )
( using dictionaries based on basic binary trees
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
|
open System.IO
open System.Text.RegularExpressions
let flip f x y = f y x
let curry f x y = f (x, y)
let uncurry f (x, y) = f x y
type 'a Tree when 'a : comparison =
| Nil
| Node of 'a * 'a Tree * 'a Tree
let Leaf x = Node(x, Nil, Nil)
let insert x T =
let rec insert' x T cont =
match T with
| Nil -> cont (Leaf x)
| Node(a, L, R) ->
if (x > a) then
insert' x R (fun e -> cont <| Node(a, L, e))
else
insert' x L (fun e -> cont <| Node(a, e, R))
insert' x T id
let fold_infix f acc T =
let rec fold_infix' f acc T cont =
match T with
| Nil -> cont acc
| Node(a, L, R) ->
fold_infix' f acc L <|
fun e -> fold_infix' f (f e a) R cont
fold_infix' f acc T id
let listToTree L = List.fold (flip insert) Nil L
let treeToList T = fold_infix (fun l x -> x::l) [] T
let treeSort L = treeToList <| listToTree L
let emptyDict = Nil
(* let insert_word word count dict = insert (count, word) dict *)
let update_word x T =
let rec update_word' x T cont =
match T with
| Nil -> cont <| Leaf (x, 1)
| Node((w, c), L, R) ->
if (w = x) then
cont <| Node((w, c+1), L, R)
elif (x < w) then
update_word' x L (fun t -> cont <| Node((w, c), t, R))
else
update_word' x R (fun t -> cont <| Node((w, c), L, t))
update_word' x T id
let rec lookup_word x = function
| Nil -> None
| Node((w, c), L, R) ->
if (w = x) then Some(c)
elif (x < w) then
lookup_word x L
else lookup_word x R
(*let insert_dict word T =
if (None = lookup_word word T) then
insert (word, 1) T
else update_word word T*)
let walk_dict f = fold_infix (fun acc x -> f x) 0
(* Seq.take/Seq.truncate не работают, т.к.
требуется потом приводить их к list,
что, наверное, медленее *)
let take n L =
let rec take' n L cont =
match L with
| [] -> cont []
| (x::xs) when n=1 -> (cont [x])
| (x::xs) -> take' (n-1) xs (fun l -> cont(x::l))
if (n > 0)
then take' n L id
else []
let ngrams len items =
let rec ngrams' len items acc =
if (List.length items) < len then
acc
else
let el = take len items
ngrams' len (List.tail items) (el::acc)
ngrams' len items []
(* не получается записать без аргументов? *)
let bigrams W = ngrams 2 W
let trigrams W = ngrams 3 W
let monograms W = ngrams 1 W
let generateDict words dict =
List.fold (flip update_word) dict words
let rnd_num a b =
let r = new System.Random()
let n = r.Next(a, b+1)
n
let predict word db =
let collect_words = fold_infix (fun acc x -> x::acc)
let rec lookup x T acc cont =
match T with
| Nil -> cont acc
| Node((w, c), L, R) ->
if (x = List.head w) then
lookup x L ((w,c)::acc)
<| fun e -> lookup x R (e@acc) cont
elif (x < List.head w) then
lookup x L acc cont
else lookup x R acc cont
let grams = lookup word db [] id
let rpt a n = [for i in [1..n] do yield a]
let flatgrams = List.collect (uncurry rpt) grams
if List.isEmpty flatgrams then [""]
else List.nth flatgrams ((rnd_num 1 (List.length flatgrams))-1)
let cleanup_string (s : string) =
let banned = ["the"; "a"; "of"]
(* list of the banned words *)
let s = s.Trim().ToLower()
let s' = Regex.Replace(s, @"[^a-z]", "")
if List.exists (fun x -> x = s') banned
then "" else s'
///let path = "C:\\Users\\Dan\\Documents\\Study\\FP\\test1.txt"
let path = "/home/d/Dropbox/Fp/test1.txt"
let text = File.ReadAllText(path)
let words =
Array.foldBack (fun e acc ->
let e' = (cleanup_string e)
if e' = "" then acc else e'::acc)
(text.Split([|' ';'\n'|])) []
let dictTree = generateDict (bigrams words) Nil
|> generateDict (trigrams words)
let say = List.reduce (fun a x -> a + " " + x) <<
(flip predict) dictTree
say "cathedral"
|
namespace System
namespace System.IO
namespace System.Text
namespace System.Text.RegularExpressions
val flip : f:('a -> 'b -> 'c) -> x:'b -> y:'a -> 'c
Full name: Script.flip
val f : ('a -> 'b -> 'c)
val x : 'b
val y : 'a
val curry : f:('a * 'b -> 'c) -> x:'a -> y:'b -> 'c
Full name: Script.curry
val f : ('a * 'b -> 'c)
val x : 'a
val y : 'b
val uncurry : f:('a -> 'b -> 'c) -> x:'a * y:'b -> 'c
Full name: Script.uncurry
type Tree<'a (requires comparison)> =
| Nil
| Node of 'a * 'a Tree * 'a Tree
Full name: Script.Tree<_>
union case Tree.Nil: 'a Tree
union case Tree.Node: 'a * 'a Tree * 'a Tree -> 'a Tree
val Leaf : x:'a -> 'a Tree (requires comparison)
Full name: Script.Leaf
val x : 'a (requires comparison)
val insert : x:'a -> T:'a Tree -> 'a Tree (requires comparison)
Full name: Script.insert
val T : 'a Tree (requires comparison)
val insert' : ('b -> 'b Tree -> ('b Tree -> 'c) -> 'c) (requires comparison)
val x : 'b (requires comparison)
val T : 'b Tree (requires comparison)
val cont : ('b Tree -> 'c) (requires comparison)
val a : 'b (requires comparison)
val L : 'b Tree (requires comparison)
val R : 'b Tree (requires comparison)
val e : 'b Tree (requires comparison)
val id : x:'T -> 'T
Full name: Microsoft.FSharp.Core.Operators.id
val fold_infix : f:('a -> 'b -> 'a) -> acc:'a -> T:'b Tree -> 'a (requires comparison)
Full name: Script.fold_infix
val f : ('a -> 'b -> 'a) (requires comparison)
val acc : 'a
val fold_infix' : (('c -> 'd -> 'c) -> 'c -> 'd Tree -> ('c -> 'e) -> 'e) (requires comparison)
val f : ('c -> 'd -> 'c) (requires comparison)
val acc : 'c
val T : 'd Tree (requires comparison)
val cont : ('c -> 'e)
val a : 'd (requires comparison)
val L : 'd Tree (requires comparison)
val R : 'd Tree (requires comparison)
val e : 'c
val listToTree : L:'a list -> 'a Tree (requires comparison)
Full name: Script.listToTree
val L : 'a list (requires comparison)
Multiple items
module List
from Microsoft.FSharp.Collections
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State
Full name: Microsoft.FSharp.Collections.List.fold
val treeToList : T:'a Tree -> 'a list (requires comparison)
Full name: Script.treeToList
val l : 'a list (requires comparison)
val treeSort : L:'a list -> 'a list (requires comparison)
Full name: Script.treeSort
val emptyDict : 'a Tree (requires comparison)
Full name: Script.emptyDict
val update_word : x:'a -> T:('a * int) Tree -> ('a * int) Tree (requires comparison)
Full name: Script.update_word
val T : ('a * int) Tree (requires comparison)
val update_word' : ('b -> ('b * int) Tree -> (('b * int) Tree -> 'c) -> 'c) (requires comparison)
val T : ('b * int) Tree (requires comparison)
val cont : (('b * int) Tree -> 'c) (requires comparison)
val w : 'b (requires comparison)
val c : int
val L : ('b * int) Tree (requires comparison)
val R : ('b * int) Tree (requires comparison)
val t : ('b * int) Tree (requires comparison)
val lookup_word : x:'a -> _arg1:('a * 'b) Tree -> 'b option (requires comparison and comparison)
Full name: Script.lookup_word
union case Option.None: Option<'T>
val w : 'a (requires comparison)
val c : 'b (requires comparison)
val L : ('a * 'b) Tree (requires comparison and comparison)
val R : ('a * 'b) Tree (requires comparison and comparison)
union case Option.Some: Value: 'T -> Option<'T>
val walk_dict : f:('a -> int) -> ('a Tree -> int) (requires comparison)
Full name: Script.walk_dict
val f : ('a -> int) (requires comparison)
val acc : int
val take : n:int -> L:'a list -> 'a list
Full name: Script.take
val n : int
val L : 'a list
val take' : (int -> 'b list -> ('b list -> 'c) -> 'c)
val L : 'b list
val cont : ('b list -> 'c)
val xs : 'b list
val l : 'b list
val ngrams : len:int -> items:'a list -> 'a list list
Full name: Script.ngrams
val len : int
val items : 'a list
val ngrams' : (int -> 'b list -> 'b list list -> 'b list list)
val items : 'b list
val acc : 'b list list
val length : list:'T list -> int
Full name: Microsoft.FSharp.Collections.List.length
val el : 'b list
val tail : list:'T list -> 'T list
Full name: Microsoft.FSharp.Collections.List.tail
val bigrams : W:'a list -> 'a list list
Full name: Script.bigrams
val W : 'a list
val trigrams : W:'a list -> 'a list list
Full name: Script.trigrams
val monograms : W:'a list -> 'a list list
Full name: Script.monograms
val generateDict : words:'a list -> dict:('a * int) Tree -> ('a * int) Tree (requires comparison)
Full name: Script.generateDict
val words : 'a list (requires comparison)
val dict : ('a * int) Tree (requires comparison)
val rnd_num : a:int -> b:int -> int
Full name: Script.rnd_num
val a : int
val b : int
val r : System.Random
Multiple items
type Random =
new : unit -> Random + 1 overload
member Next : unit -> int + 2 overloads
member NextBytes : buffer:byte[] -> unit
member NextDouble : unit -> float
Full name: System.Random
--------------------
System.Random() : unit
System.Random(Seed: int) : unit
System.Random.Next() : int
System.Random.Next(maxValue: int) : int
System.Random.Next(minValue: int, maxValue: int) : int
val predict : word:string -> db:(string list * int) Tree -> string list
Full name: Script.predict
val word : string
val db : (string list * int) Tree
val collect_words : (System.IComparable list -> System.IComparable Tree -> System.IComparable list)
val acc : System.IComparable list
val x : System.IComparable
val lookup : ('a -> ('a list * 'b) Tree -> ('a list * 'b) list -> (('a list * 'b) list -> 'c) -> 'c) (requires comparison and comparison)
val T : ('a list * 'b) Tree (requires comparison and comparison)
val acc : ('a list * 'b) list (requires comparison and comparison)
val cont : (('a list * 'b) list -> 'c) (requires comparison and comparison)
val w : 'a list (requires comparison)
val L : ('a list * 'b) Tree (requires comparison and comparison)
val R : ('a list * 'b) Tree (requires comparison and comparison)
val head : list:'T list -> 'T
Full name: Microsoft.FSharp.Collections.List.head
val e : ('a list * 'b) list (requires comparison and comparison)
val grams : (string list * int) list
val rpt : ('a -> int -> 'a list)
val a : 'a
val i : int
val flatgrams : string list list
val collect : mapping:('T -> 'U list) -> list:'T list -> 'U list
Full name: Microsoft.FSharp.Collections.List.collect
val isEmpty : list:'T list -> bool
Full name: Microsoft.FSharp.Collections.List.isEmpty
val nth : list:'T list -> index:int -> 'T
Full name: Microsoft.FSharp.Collections.List.nth
val cleanup_string : s:string -> string
Full name: Script.cleanup_string
val s : string
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = System.String
Full name: Microsoft.FSharp.Core.string
val banned : string list
System.String.Trim() : string
System.String.Trim([<System.ParamArray>] trimChars: char []) : string
val s' : string
Multiple items
type Regex =
new : pattern:string -> Regex + 1 overload
member GetGroupNames : unit -> string[]
member GetGroupNumbers : unit -> int[]
member GroupNameFromNumber : i:int -> string
member GroupNumberFromName : name:string -> int
member IsMatch : input:string -> bool + 1 overload
member Match : input:string -> Match + 2 overloads
member Matches : input:string -> MatchCollection + 1 overload
member Options : RegexOptions
member Replace : input:string * replacement:string -> string + 5 overloads
...
Full name: System.Text.RegularExpressions.Regex
--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator) : string
Regex.Replace(input: string, pattern: string, replacement: string) : string
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator, options: RegexOptions) : string
Regex.Replace(input: string, pattern: string, replacement: string, options: RegexOptions) : string
val exists : predicate:('T -> bool) -> list:'T list -> bool
Full name: Microsoft.FSharp.Collections.List.exists
val x : string
val path : string
Full name: Script.path
let path = "C:\\Users\\Dan\\Documents\\Study\\FP\\test1.txt"
val text : string
Full name: Script.text
type File =
static member AppendAllLines : path:string * contents:IEnumerable<string> -> unit + 1 overload
static member AppendAllText : path:string * contents:string -> unit + 1 overload
static member AppendText : path:string -> StreamWriter
static member Copy : sourceFileName:string * destFileName:string -> unit + 1 overload
static member Create : path:string -> FileStream + 3 overloads
static member CreateText : path:string -> StreamWriter
static member Decrypt : path:string -> unit
static member Delete : path:string -> unit
static member Encrypt : path:string -> unit
static member Exists : path:string -> bool
...
Full name: System.IO.File
File.ReadAllText(path: string) : string
File.ReadAllText(path: string, encoding: System.Text.Encoding) : string
val words : string list
Full name: Script.words
module Array
from Microsoft.FSharp.Collections
val foldBack : folder:('T -> 'State -> 'State) -> array:'T [] -> state:'State -> 'State
Full name: Microsoft.FSharp.Collections.Array.foldBack
val e : string
val acc : string list
val e' : string
System.String.Split([<System.ParamArray>] separator: char []) : string []
System.String.Split(separator: string [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int) : string []
System.String.Split(separator: string [], count: int, options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int, options: System.StringSplitOptions) : string []
val dictTree : (string list * int) Tree
Full name: Script.dictTree
val say : (string -> string)
Full name: Script.say
val reduce : reduction:('T -> 'T -> 'T) -> list:'T list -> 'T
Full name: Microsoft.FSharp.Collections.List.reduce
val a : string
More information