1 people like it.

# n-gram algorithm

Simple n-gram algorithm implementation

) ( simple implementation of n-grams algorithm ) ( using dictionaries based on basic binary trees

 ``` 1: 2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33: 34: 35: 36: 37: 38: 39: 40: 41: 42: 43: 44: 45: 46: 47: 48: 49: 50: 51: 52: 53: 54: 55: 56: 57: 58: 59: 60: 61: 62: 63: 64: 65: 66: 67: 68: 69: 70: 71: 72: 73: 74: 75: 76: 77: 78: 79: 80: 81: 82: 83: 84: 85: 86: 87: 88: 89: 90: 91: 92: 93: 94: 95: 96: 97: 98: 99: 100: 101: 102: 103: 104: 105: 106: 107: 108: 109: 110: 111: 112: 113: 114: 115: 116: 117: 118: 119: 120: 121: 122: 123: 124: 125: 126: 127: 128: 129: 130: 131: 132: 133: 134: 135: 136: 137: 138: 139: 140: 141: 142: 143: 144: 145: 146: 147: 148: ``` ``````open System.IO open System.Text.RegularExpressions let flip f x y = f y x let curry f x y = f (x, y) let uncurry f (x, y) = f x y type 'a Tree when 'a : comparison = | Nil | Node of 'a * 'a Tree * 'a Tree let Leaf x = Node(x, Nil, Nil) let insert x T = let rec insert' x T cont = match T with | Nil -> cont (Leaf x) | Node(a, L, R) -> if (x > a) then insert' x R (fun e -> cont <| Node(a, L, e)) else insert' x L (fun e -> cont <| Node(a, e, R)) insert' x T id let fold_infix f acc T = let rec fold_infix' f acc T cont = match T with | Nil -> cont acc | Node(a, L, R) -> fold_infix' f acc L <| fun e -> fold_infix' f (f e a) R cont fold_infix' f acc T id let listToTree L = List.fold (flip insert) Nil L let treeToList T = fold_infix (fun l x -> x::l) [] T let treeSort L = treeToList <| listToTree L let emptyDict = Nil (* let insert_word word count dict = insert (count, word) dict *) let update_word x T = let rec update_word' x T cont = match T with | Nil -> cont <| Leaf (x, 1) | Node((w, c), L, R) -> if (w = x) then cont <| Node((w, c+1), L, R) elif (x < w) then update_word' x L (fun t -> cont <| Node((w, c), t, R)) else update_word' x R (fun t -> cont <| Node((w, c), L, t)) update_word' x T id let rec lookup_word x = function | Nil -> None | Node((w, c), L, R) -> if (w = x) then Some(c) elif (x < w) then lookup_word x L else lookup_word x R (*let insert_dict word T = if (None = lookup_word word T) then insert (word, 1) T else update_word word T*) let walk_dict f = fold_infix (fun acc x -> f x) 0 (* Seq.take/Seq.truncate не работают, т.к. требуется потом приводить их к list, что, наверное, медленее *) let take n L = let rec take' n L cont = match L with | [] -> cont [] | (x::xs) when n=1 -> (cont [x]) | (x::xs) -> take' (n-1) xs (fun l -> cont(x::l)) if (n > 0) then take' n L id else [] let ngrams len items = let rec ngrams' len items acc = if (List.length items) < len then acc else let el = take len items ngrams' len (List.tail items) (el::acc) ngrams' len items [] (* не получается записать без аргументов? *) let bigrams W = ngrams 2 W let trigrams W = ngrams 3 W let monograms W = ngrams 1 W let generateDict words dict = List.fold (flip update_word) dict words let rnd_num a b = let r = new System.Random() let n = r.Next(a, b+1) n let predict word db = let collect_words = fold_infix (fun acc x -> x::acc) let rec lookup x T acc cont = match T with | Nil -> cont acc | Node((w, c), L, R) -> if (x = List.head w) then lookup x L ((w,c)::acc) <| fun e -> lookup x R (e@acc) cont elif (x < List.head w) then lookup x L acc cont else lookup x R acc cont let grams = lookup word db [] id let rpt a n = [for i in [1..n] do yield a] let flatgrams = List.collect (uncurry rpt) grams if List.isEmpty flatgrams then [""] else List.nth flatgrams ((rnd_num 1 (List.length flatgrams))-1) let cleanup_string (s : string) = let banned = ["the"; "a"; "of"] (* list of the banned words *) let s = s.Trim().ToLower() let s' = Regex.Replace(s, @"[^a-z]", "") if List.exists (fun x -> x = s') banned then "" else s' ///let path = "C:\\Users\\Dan\\Documents\\Study\\FP\\test1.txt" let path = "/home/d/Dropbox/Fp/test1.txt" let text = File.ReadAllText(path) let words = Array.foldBack (fun e acc -> let e' = (cleanup_string e) if e' = "" then acc else e'::acc) (text.Split([|' ';'\n'|])) [] let dictTree = generateDict (bigrams words) Nil |> generateDict (trigrams words) let say = List.reduce (fun a x -> a + " " + x) << (flip predict) dictTree say "cathedral" ``````
namespace System
namespace System.IO
namespace System.Text
namespace System.Text.RegularExpressions
val flip : f:('a -> 'b -> 'c) -> x:'b -> y:'a -> 'c

Full name: Script.flip
val f : ('a -> 'b -> 'c)
val x : 'b
val y : 'a
val curry : f:('a * 'b -> 'c) -> x:'a -> y:'b -> 'c

Full name: Script.curry
val f : ('a * 'b -> 'c)
val x : 'a
val y : 'b
val uncurry : f:('a -> 'b -> 'c) -> x:'a * y:'b -> 'c

Full name: Script.uncurry
type Tree<'a (requires comparison)> =
| Nil
| Node of 'a * 'a Tree * 'a Tree

Full name: Script.Tree<_>
union case Tree.Nil: 'a Tree
union case Tree.Node: 'a * 'a Tree * 'a Tree -> 'a Tree
val Leaf : x:'a -> 'a Tree (requires comparison)

Full name: Script.Leaf
val x : 'a (requires comparison)
val insert : x:'a -> T:'a Tree -> 'a Tree (requires comparison)

Full name: Script.insert
val T : 'a Tree (requires comparison)
val insert' : ('b -> 'b Tree -> ('b Tree -> 'c) -> 'c) (requires comparison)
val x : 'b (requires comparison)
val T : 'b Tree (requires comparison)
val cont : ('b Tree -> 'c) (requires comparison)
val a : 'b (requires comparison)
val L : 'b Tree (requires comparison)
val R : 'b Tree (requires comparison)
val e : 'b Tree (requires comparison)
val id : x:'T -> 'T

Full name: Microsoft.FSharp.Core.Operators.id
val fold_infix : f:('a -> 'b -> 'a) -> acc:'a -> T:'b Tree -> 'a (requires comparison)

Full name: Script.fold_infix
val f : ('a -> 'b -> 'a) (requires comparison)
val acc : 'a
val fold_infix' : (('c -> 'd -> 'c) -> 'c -> 'd Tree -> ('c -> 'e) -> 'e) (requires comparison)
val f : ('c -> 'd -> 'c) (requires comparison)
val acc : 'c
val T : 'd Tree (requires comparison)
val cont : ('c -> 'e)
val a : 'd (requires comparison)
val L : 'd Tree (requires comparison)
val R : 'd Tree (requires comparison)
val e : 'c
val listToTree : L:'a list -> 'a Tree (requires comparison)

Full name: Script.listToTree
val L : 'a list (requires comparison)
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State

Full name: Microsoft.FSharp.Collections.List.fold
val treeToList : T:'a Tree -> 'a list (requires comparison)

Full name: Script.treeToList
val l : 'a list (requires comparison)
val treeSort : L:'a list -> 'a list (requires comparison)

Full name: Script.treeSort
val emptyDict : 'a Tree (requires comparison)

Full name: Script.emptyDict
val update_word : x:'a -> T:('a * int) Tree -> ('a * int) Tree (requires comparison)

Full name: Script.update_word
val T : ('a * int) Tree (requires comparison)
val update_word' : ('b -> ('b * int) Tree -> (('b * int) Tree -> 'c) -> 'c) (requires comparison)
val T : ('b * int) Tree (requires comparison)
val cont : (('b * int) Tree -> 'c) (requires comparison)
val w : 'b (requires comparison)
val c : int
val L : ('b * int) Tree (requires comparison)
val R : ('b * int) Tree (requires comparison)
val t : ('b * int) Tree (requires comparison)
val lookup_word : x:'a -> _arg1:('a * 'b) Tree -> 'b option (requires comparison and comparison)

Full name: Script.lookup_word
union case Option.None: Option<'T>
val w : 'a (requires comparison)
val c : 'b (requires comparison)
val L : ('a * 'b) Tree (requires comparison and comparison)
val R : ('a * 'b) Tree (requires comparison and comparison)
union case Option.Some: Value: 'T -> Option<'T>
val walk_dict : f:('a -> int) -> ('a Tree -> int) (requires comparison)

Full name: Script.walk_dict
val f : ('a -> int) (requires comparison)
val acc : int
val take : n:int -> L:'a list -> 'a list

Full name: Script.take
val n : int
val L : 'a list
val take' : (int -> 'b list -> ('b list -> 'c) -> 'c)
val L : 'b list
val cont : ('b list -> 'c)
val xs : 'b list
val l : 'b list
val ngrams : len:int -> items:'a list -> 'a list list

Full name: Script.ngrams
val len : int
val items : 'a list
val ngrams' : (int -> 'b list -> 'b list list -> 'b list list)
val items : 'b list
val acc : 'b list list
val length : list:'T list -> int

Full name: Microsoft.FSharp.Collections.List.length
val el : 'b list
val tail : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.tail
val bigrams : W:'a list -> 'a list list

Full name: Script.bigrams
val W : 'a list
val trigrams : W:'a list -> 'a list list

Full name: Script.trigrams
val monograms : W:'a list -> 'a list list

Full name: Script.monograms
val generateDict : words:'a list -> dict:('a * int) Tree -> ('a * int) Tree (requires comparison)

Full name: Script.generateDict
val words : 'a list (requires comparison)
val dict : ('a * int) Tree (requires comparison)
val rnd_num : a:int -> b:int -> int

Full name: Script.rnd_num
val a : int
val b : int
val r : System.Random
Multiple items
type Random =
new : unit -> Random + 1 overload
member Next : unit -> int + 2 overloads
member NextBytes : buffer:byte[] -> unit
member NextDouble : unit -> float

Full name: System.Random

--------------------
System.Random() : unit
System.Random(Seed: int) : unit
System.Random.Next() : int
System.Random.Next(maxValue: int) : int
System.Random.Next(minValue: int, maxValue: int) : int
val predict : word:string -> db:(string list * int) Tree -> string list

Full name: Script.predict
val word : string
val db : (string list * int) Tree
val collect_words : (System.IComparable list -> System.IComparable Tree -> System.IComparable list)
val acc : System.IComparable list
val x : System.IComparable
val lookup : ('a -> ('a list * 'b) Tree -> ('a list * 'b) list -> (('a list * 'b) list -> 'c) -> 'c) (requires comparison and comparison)
val T : ('a list * 'b) Tree (requires comparison and comparison)
val acc : ('a list * 'b) list (requires comparison and comparison)
val cont : (('a list * 'b) list -> 'c) (requires comparison and comparison)
val w : 'a list (requires comparison)
val L : ('a list * 'b) Tree (requires comparison and comparison)
val R : ('a list * 'b) Tree (requires comparison and comparison)
val head : list:'T list -> 'T

val e : ('a list * 'b) list (requires comparison and comparison)
val grams : (string list * int) list
val rpt : ('a -> int -> 'a list)
val a : 'a
val i : int
val flatgrams : string list list
val collect : mapping:('T -> 'U list) -> list:'T list -> 'U list

Full name: Microsoft.FSharp.Collections.List.collect
val isEmpty : list:'T list -> bool

Full name: Microsoft.FSharp.Collections.List.isEmpty
val nth : list:'T list -> index:int -> 'T

Full name: Microsoft.FSharp.Collections.List.nth
val cleanup_string : s:string -> string

Full name: Script.cleanup_string
val s : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = System.String

Full name: Microsoft.FSharp.Core.string
val banned : string list
System.String.Trim() : string
System.String.Trim([<System.ParamArray>] trimChars: char []) : string
val s' : string
Multiple items
type Regex =
new : pattern:string -> Regex + 1 overload
member GetGroupNames : unit -> string[]
member GetGroupNumbers : unit -> int[]
member GroupNameFromNumber : i:int -> string
member GroupNumberFromName : name:string -> int
member IsMatch : input:string -> bool + 1 overload
member Match : input:string -> Match + 2 overloads
member Matches : input:string -> MatchCollection + 1 overload
member Options : RegexOptions
member Replace : input:string * replacement:string -> string + 5 overloads
...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator) : string
Regex.Replace(input: string, pattern: string, replacement: string) : string
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator, options: RegexOptions) : string
Regex.Replace(input: string, pattern: string, replacement: string, options: RegexOptions) : string
val exists : predicate:('T -> bool) -> list:'T list -> bool

Full name: Microsoft.FSharp.Collections.List.exists
val x : string
val path : string

Full name: Script.path

let path = "C:\\Users\\Dan\\Documents\\Study\\FP\\test1.txt"
val text : string

Full name: Script.text
type File =
static member AppendAllLines : path:string * contents:IEnumerable<string> -> unit + 1 overload
static member AppendAllText : path:string * contents:string -> unit + 1 overload
static member AppendText : path:string -> StreamWriter
static member Copy : sourceFileName:string * destFileName:string -> unit + 1 overload
static member Create : path:string -> FileStream + 3 overloads
static member CreateText : path:string -> StreamWriter
static member Decrypt : path:string -> unit
static member Delete : path:string -> unit
static member Encrypt : path:string -> unit
static member Exists : path:string -> bool
...

Full name: System.IO.File
File.ReadAllText(path: string, encoding: System.Text.Encoding) : string
val words : string list

Full name: Script.words
module Array

from Microsoft.FSharp.Collections
val foldBack : folder:('T -> 'State -> 'State) -> array:'T [] -> state:'State -> 'State

Full name: Microsoft.FSharp.Collections.Array.foldBack
val e : string
val acc : string list
val e' : string
System.String.Split([<System.ParamArray>] separator: char []) : string []
System.String.Split(separator: string [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int) : string []
System.String.Split(separator: string [], count: int, options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int, options: System.StringSplitOptions) : string []
val dictTree : (string list * int) Tree

Full name: Script.dictTree
val say : (string -> string)

Full name: Script.say
val reduce : reduction:('T -> 'T -> 'T) -> list:'T list -> 'T

Full name: Microsoft.FSharp.Collections.List.reduce
val a : string