1 people like it.

n-gram algorithm

Simple n-gram algorithm implementation

) ( simple implementation of n-grams algorithm ) ( using dictionaries based on basic binary trees


  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
open System.IO
open System.Text.RegularExpressions
let flip f x y = f y x
let curry f x y = f (x, y)
let uncurry f (x, y) = f x y

type 'a Tree when 'a : comparison = 
     | Nil 
     | Node of 'a * 'a Tree * 'a Tree

let Leaf x = Node(x, Nil, Nil)

let insert x T =
    let rec insert' x T cont = 
        match T with
        | Nil           -> cont (Leaf x)
        | Node(a, L, R) ->
          if (x > a) then
              insert' x R (fun e -> cont <| Node(a, L, e))
          else
              insert' x L (fun e -> cont <| Node(a, e, R))
    insert' x T id

let fold_infix f acc T =
    let rec fold_infix' f acc T cont = 
        match T with
        | Nil -> cont acc
        | Node(a, L, R) ->
          fold_infix' f acc L <| 
          fun e -> fold_infix' f (f e a) R cont
    fold_infix' f acc T id
        

let listToTree L = List.fold (flip insert) Nil L
let treeToList T = fold_infix (fun l x -> x::l) [] T
let treeSort L = treeToList <| listToTree L

let emptyDict = Nil
(* let insert_word word count dict = insert (count, word) dict *)

let update_word x T =
    let rec update_word' x T cont =
        match T with
        | Nil                -> cont <| Leaf (x, 1)
        | Node((w, c), L, R) ->
        if (w = x) then
            cont <| Node((w, c+1), L, R)
        elif (x < w) then
            update_word' x L (fun t -> cont <| Node((w, c), t, R))
        else
            update_word' x R (fun t -> cont <| Node((w, c), L, t))
    update_word' x T id
        
let rec lookup_word x = function 
    | Nil -> None
    | Node((w, c), L, R) ->
      if (w = x) then Some(c)
      elif (x < w) then
          lookup_word x L
      else lookup_word x R
       
(*let insert_dict word T = 
    if (None = lookup_word word T) then
        insert (word, 1) T
    else update_word word T*)

let walk_dict f = fold_infix (fun acc x -> f x) 0


(* Seq.take/Seq.truncate не работают, т.к.
требуется потом приводить их к list, 
что, наверное, медленее *)
let take n L =
    let rec take' n L cont = 
        match L with
        | []               -> cont []
        | (x::xs) when n=1 -> (cont [x])
        | (x::xs)          -> take' (n-1) xs (fun l -> cont(x::l))
    if (n > 0) 
    then take' n L id 
    else []

let ngrams len items =
    let rec ngrams' len items acc = 
        if (List.length items) < len then
            acc
        else 
            let el = take len items
            ngrams' len (List.tail items) (el::acc)
    ngrams' len items []

(* не получается записать без аргументов? *)
let bigrams W = ngrams 2 W
let trigrams W = ngrams 3 W
let monograms W = ngrams 1 W

let generateDict words dict = 
    List.fold (flip update_word) dict words


let rnd_num a b =
    let r = new System.Random()
    let n = r.Next(a, b+1)
    n

let predict word db = 
    let collect_words = fold_infix (fun acc x -> x::acc) 
    let rec lookup x T acc cont  =
        match T with
        | Nil -> cont acc
        | Node((w, c), L, R) ->
            if (x = List.head w) then
               lookup x L ((w,c)::acc) 
               <| fun e -> lookup x R (e@acc) cont
            elif (x < List.head w) then
               lookup x L acc cont
            else lookup x R acc cont
    let grams = lookup word db [] id
    let rpt a n = [for i in [1..n] do yield a]
    let flatgrams = List.collect (uncurry rpt) grams
    if List.isEmpty flatgrams then [""]
    else List.nth flatgrams ((rnd_num 1 (List.length flatgrams))-1)


let cleanup_string (s : string) = 
    let banned = ["the"; "a"; "of"] 
    (* list of the banned words *)
    let s = s.Trim().ToLower()
    let s' = Regex.Replace(s, @"[^a-z]", "")
    if List.exists (fun x -> x = s') banned
    then "" else s'

///let path = "C:\\Users\\Dan\\Documents\\Study\\FP\\test1.txt"
let path = "/home/d/Dropbox/Fp/test1.txt"
let text = File.ReadAllText(path)
let words = 
    Array.foldBack (fun e acc -> 
                    let e' = (cleanup_string e)
                    if e' = "" then acc else e'::acc)
                   (text.Split([|' ';'\n'|])) []

let dictTree = generateDict (bigrams words) Nil 
               |> generateDict (trigrams words)

let say = List.reduce (fun a x -> a + " " + x) <<
          (flip predict) dictTree 

say "cathedral"
namespace System
namespace System.IO
namespace System.Text
namespace System.Text.RegularExpressions
val flip : f:('a -> 'b -> 'c) -> x:'b -> y:'a -> 'c

Full name: Script.flip
val f : ('a -> 'b -> 'c)
val x : 'b
val y : 'a
val curry : f:('a * 'b -> 'c) -> x:'a -> y:'b -> 'c

Full name: Script.curry
val f : ('a * 'b -> 'c)
val x : 'a
val y : 'b
val uncurry : f:('a -> 'b -> 'c) -> x:'a * y:'b -> 'c

Full name: Script.uncurry
type Tree<'a (requires comparison)> =
  | Nil
  | Node of 'a * 'a Tree * 'a Tree

Full name: Script.Tree<_>
union case Tree.Nil: 'a Tree
union case Tree.Node: 'a * 'a Tree * 'a Tree -> 'a Tree
val Leaf : x:'a -> 'a Tree (requires comparison)

Full name: Script.Leaf
val x : 'a (requires comparison)
val insert : x:'a -> T:'a Tree -> 'a Tree (requires comparison)

Full name: Script.insert
val T : 'a Tree (requires comparison)
val insert' : ('b -> 'b Tree -> ('b Tree -> 'c) -> 'c) (requires comparison)
val x : 'b (requires comparison)
val T : 'b Tree (requires comparison)
val cont : ('b Tree -> 'c) (requires comparison)
val a : 'b (requires comparison)
val L : 'b Tree (requires comparison)
val R : 'b Tree (requires comparison)
val e : 'b Tree (requires comparison)
val id : x:'T -> 'T

Full name: Microsoft.FSharp.Core.Operators.id
val fold_infix : f:('a -> 'b -> 'a) -> acc:'a -> T:'b Tree -> 'a (requires comparison)

Full name: Script.fold_infix
val f : ('a -> 'b -> 'a) (requires comparison)
val acc : 'a
val fold_infix' : (('c -> 'd -> 'c) -> 'c -> 'd Tree -> ('c -> 'e) -> 'e) (requires comparison)
val f : ('c -> 'd -> 'c) (requires comparison)
val acc : 'c
val T : 'd Tree (requires comparison)
val cont : ('c -> 'e)
val a : 'd (requires comparison)
val L : 'd Tree (requires comparison)
val R : 'd Tree (requires comparison)
val e : 'c
val listToTree : L:'a list -> 'a Tree (requires comparison)

Full name: Script.listToTree
val L : 'a list (requires comparison)
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State

Full name: Microsoft.FSharp.Collections.List.fold
val treeToList : T:'a Tree -> 'a list (requires comparison)

Full name: Script.treeToList
val l : 'a list (requires comparison)
val treeSort : L:'a list -> 'a list (requires comparison)

Full name: Script.treeSort
val emptyDict : 'a Tree (requires comparison)

Full name: Script.emptyDict
val update_word : x:'a -> T:('a * int) Tree -> ('a * int) Tree (requires comparison)

Full name: Script.update_word
val T : ('a * int) Tree (requires comparison)
val update_word' : ('b -> ('b * int) Tree -> (('b * int) Tree -> 'c) -> 'c) (requires comparison)
val T : ('b * int) Tree (requires comparison)
val cont : (('b * int) Tree -> 'c) (requires comparison)
val w : 'b (requires comparison)
val c : int
val L : ('b * int) Tree (requires comparison)
val R : ('b * int) Tree (requires comparison)
val t : ('b * int) Tree (requires comparison)
val lookup_word : x:'a -> _arg1:('a * 'b) Tree -> 'b option (requires comparison and comparison)

Full name: Script.lookup_word
union case Option.None: Option<'T>
val w : 'a (requires comparison)
val c : 'b (requires comparison)
val L : ('a * 'b) Tree (requires comparison and comparison)
val R : ('a * 'b) Tree (requires comparison and comparison)
union case Option.Some: Value: 'T -> Option<'T>
val walk_dict : f:('a -> int) -> ('a Tree -> int) (requires comparison)

Full name: Script.walk_dict
val f : ('a -> int) (requires comparison)
val acc : int
val take : n:int -> L:'a list -> 'a list

Full name: Script.take
val n : int
val L : 'a list
val take' : (int -> 'b list -> ('b list -> 'c) -> 'c)
val L : 'b list
val cont : ('b list -> 'c)
val xs : 'b list
val l : 'b list
val ngrams : len:int -> items:'a list -> 'a list list

Full name: Script.ngrams
val len : int
val items : 'a list
val ngrams' : (int -> 'b list -> 'b list list -> 'b list list)
val items : 'b list
val acc : 'b list list
val length : list:'T list -> int

Full name: Microsoft.FSharp.Collections.List.length
val el : 'b list
val tail : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.tail
val bigrams : W:'a list -> 'a list list

Full name: Script.bigrams
val W : 'a list
val trigrams : W:'a list -> 'a list list

Full name: Script.trigrams
val monograms : W:'a list -> 'a list list

Full name: Script.monograms
val generateDict : words:'a list -> dict:('a * int) Tree -> ('a * int) Tree (requires comparison)

Full name: Script.generateDict
val words : 'a list (requires comparison)
val dict : ('a * int) Tree (requires comparison)
val rnd_num : a:int -> b:int -> int

Full name: Script.rnd_num
val a : int
val b : int
val r : System.Random
Multiple items
type Random =
  new : unit -> Random + 1 overload
  member Next : unit -> int + 2 overloads
  member NextBytes : buffer:byte[] -> unit
  member NextDouble : unit -> float

Full name: System.Random

--------------------
System.Random() : unit
System.Random(Seed: int) : unit
System.Random.Next() : int
System.Random.Next(maxValue: int) : int
System.Random.Next(minValue: int, maxValue: int) : int
val predict : word:string -> db:(string list * int) Tree -> string list

Full name: Script.predict
val word : string
val db : (string list * int) Tree
val collect_words : (System.IComparable list -> System.IComparable Tree -> System.IComparable list)
val acc : System.IComparable list
val x : System.IComparable
val lookup : ('a -> ('a list * 'b) Tree -> ('a list * 'b) list -> (('a list * 'b) list -> 'c) -> 'c) (requires comparison and comparison)
val T : ('a list * 'b) Tree (requires comparison and comparison)
val acc : ('a list * 'b) list (requires comparison and comparison)
val cont : (('a list * 'b) list -> 'c) (requires comparison and comparison)
val w : 'a list (requires comparison)
val L : ('a list * 'b) Tree (requires comparison and comparison)
val R : ('a list * 'b) Tree (requires comparison and comparison)
val head : list:'T list -> 'T

Full name: Microsoft.FSharp.Collections.List.head
val e : ('a list * 'b) list (requires comparison and comparison)
val grams : (string list * int) list
val rpt : ('a -> int -> 'a list)
val a : 'a
val i : int
val flatgrams : string list list
val collect : mapping:('T -> 'U list) -> list:'T list -> 'U list

Full name: Microsoft.FSharp.Collections.List.collect
val isEmpty : list:'T list -> bool

Full name: Microsoft.FSharp.Collections.List.isEmpty
val nth : list:'T list -> index:int -> 'T

Full name: Microsoft.FSharp.Collections.List.nth
val cleanup_string : s:string -> string

Full name: Script.cleanup_string
val s : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = System.String

Full name: Microsoft.FSharp.Core.string
val banned : string list
System.String.Trim() : string
System.String.Trim([<System.ParamArray>] trimChars: char []) : string
val s' : string
Multiple items
type Regex =
  new : pattern:string -> Regex + 1 overload
  member GetGroupNames : unit -> string[]
  member GetGroupNumbers : unit -> int[]
  member GroupNameFromNumber : i:int -> string
  member GroupNumberFromName : name:string -> int
  member IsMatch : input:string -> bool + 1 overload
  member Match : input:string -> Match + 2 overloads
  member Matches : input:string -> MatchCollection + 1 overload
  member Options : RegexOptions
  member Replace : input:string * replacement:string -> string + 5 overloads
  ...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator) : string
Regex.Replace(input: string, pattern: string, replacement: string) : string
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator, options: RegexOptions) : string
Regex.Replace(input: string, pattern: string, replacement: string, options: RegexOptions) : string
val exists : predicate:('T -> bool) -> list:'T list -> bool

Full name: Microsoft.FSharp.Collections.List.exists
val x : string
val path : string

Full name: Script.path


let path = "C:\\Users\\Dan\\Documents\\Study\\FP\\test1.txt"
val text : string

Full name: Script.text
type File =
  static member AppendAllLines : path:string * contents:IEnumerable<string> -> unit + 1 overload
  static member AppendAllText : path:string * contents:string -> unit + 1 overload
  static member AppendText : path:string -> StreamWriter
  static member Copy : sourceFileName:string * destFileName:string -> unit + 1 overload
  static member Create : path:string -> FileStream + 3 overloads
  static member CreateText : path:string -> StreamWriter
  static member Decrypt : path:string -> unit
  static member Delete : path:string -> unit
  static member Encrypt : path:string -> unit
  static member Exists : path:string -> bool
  ...

Full name: System.IO.File
File.ReadAllText(path: string) : string
File.ReadAllText(path: string, encoding: System.Text.Encoding) : string
val words : string list

Full name: Script.words
module Array

from Microsoft.FSharp.Collections
val foldBack : folder:('T -> 'State -> 'State) -> array:'T [] -> state:'State -> 'State

Full name: Microsoft.FSharp.Collections.Array.foldBack
val e : string
val acc : string list
val e' : string
System.String.Split([<System.ParamArray>] separator: char []) : string []
System.String.Split(separator: string [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int) : string []
System.String.Split(separator: string [], count: int, options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int, options: System.StringSplitOptions) : string []
val dictTree : (string list * int) Tree

Full name: Script.dictTree
val say : (string -> string)

Full name: Script.say
val reduce : reduction:('T -> 'T -> 'T) -> list:'T list -> 'T

Full name: Microsoft.FSharp.Collections.List.reduce
val a : string
Raw view Test code New version

More information

Link:http://fssnip.net/aR
Posted:12 years ago
Author:Daniil Frumin
Tags: trees , binary trees , n-gram