3 people like it.

Porter Stemmer

An Implementation of the Porter Stemming Algorithm in F# for text analysis. Please see: http://tartarus.org/martin/PorterStemmer/

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
182: 
183: 
184: 
185: 
186: 
187: 
188: 
189: 
190: 
191: 
192: 
193: 
194: 
195: 
196: 
197: 
198: 
199: 
200: 
201: 
202: 
203: 
204: 
205: 
206: 
207: 
208: 
209: 
210: 
211: 
212: 
213: 
214: 
215: 
216: 
217: 
218: 
219: 
220: 
221: 
222: 
223: 
224: 
225: 
226: 
227: 
228: 
229: 
230: 
231: 
232: 
233: 
234: 
235: 
236: 
237: 
238: 
239: 
240: 
241: 
242: 
243: 
244: 
245: 
246: 
247: 
248: 
249: 
250: 
251: 
252: 
253: 
254: 
255: 
256: 
257: 
258: 
259: 
260: 
261: 
262: 
263: 
264: 
265: 
266: 
267: 
268: 
269: 
270: 
271: 
272: 
273: 
274: 
275: 
276: 
277: 
278: 
module Stemmer
open System

let (|BaseVowel|_|) a = match Char.ToLower(a) with 'a' | 'e' | 'i' | 'o' | 'u' -> Some a | _ -> None

let rec (|Consonant|_|) xs =
    match xs with
    | [] -> None
    | a::_ when Char.IsWhiteSpace(a) -> None
    | BaseVowel a::_ -> None
    | 'y'::Consonant _ | 'Y'::Consonant _-> None
    | x::rest -> Some(x,rest)
and (|Vowel|_|) xs = 
    match xs with
    | [] -> None 
    | a::_ when Char.IsWhiteSpace(a) -> None
    | Consonant _ -> None 
    | x::rest -> Some(x,rest)

let (|C|_|) xs = 
    let rec loop xs acc =
        match xs with
        | Consonant (x,xs) -> loop xs (x::acc) 
        | _ -> match acc with [] -> None | _ -> Some(acc|>List.rev,xs)
    loop xs []

let (|V|_|) xs  = 
    let rec loop xs acc =
        match xs with
        | Vowel (x,xs) -> loop xs (x::acc) 
        | _ -> match acc with [] -> None | _ -> Some(acc|>List.rev,xs)
    loop xs []

let (|VC|_|) xs  =
    let rec loop xs acc =
        match xs with 
        | [] -> match acc with [] -> None | _ -> Some(acc,xs)
        |  a::_ when Char.IsWhiteSpace(a) -> match acc with [] -> None | _ -> Some(acc,xs)
        | C (cs, V (vs,xs)) -> loop xs ((vs,cs)::acc)
        | C (cs, xs) -> loop xs (([],cs)::acc)
        | V (vs, xs) -> loop xs ((vs,[])::acc)
        | _ -> None
    loop xs []

let calcMeasure vs = (0,vs) ||> List.fold (fun c (vs,cs) -> match vs,cs with [],_ | _,[] -> c | _ -> c+1)

let (|Ends|_|) stem xs  = 
    let rec loop xs ys =
        match xs,ys with
        | [],[] -> Some(xs)
        | [],_ -> None
        | _, [] -> Some(xs)
        | a::xs,b::ys when a = b || Char.ToLower(a) = Char.ToLower(b) -> loop xs ys
        | _ -> None
    loop xs stem

let ContainsVowel = List.exists ((|BaseVowel|_|) >> Option.isSome)

let (|EndsWithDoubleC|_|) xs =
    match xs with
    | Consonant (a, Consonant (b,xs)) when a = b -> Some (a,xs)
    | _ -> None

let (|EndsWithCVC|_|) xs =
    match xs with
    | Consonant (a, Vowel (_, Consonant (_,xs))) ->
        match a with 
        | 'w' | 'x' | 'y' | 'W' | 'X' | 'Y' -> None
        | _ -> Some(xs)
    | _ -> None
let (|NotEndsWithCVC|_|) xs = match xs with EndsWithCVC (_) -> None | _ -> Some(xs)

let (|Measure|_|) xs = match xs with VC (cvs,rest) -> calcMeasure cvs |> Some | _ -> None

let Measure xs = match xs with Measure (m) -> m | _  -> 0

let step1a xs =
    match xs with
    | Ends ['s';'e';'s';'s'] (rest) -> 's'::'s'::rest
    | Ends ['s';'e';'i'] (rest) -> 'i'::rest
    | 's'::'s'::rest -> xs
    | 's'::rest -> rest
    | _ -> xs

let contains y xs  = xs |> List.exists (fun x -> x=y)

let step1bTx xs =
    match xs with
    | Ends ['t';'a'] _ -> 'e'::xs
    | Ends ['l'; 'b'] _ -> 'e'::xs
    | Ends ['z'; 'i'] _ -> 'e'::xs
    | EndsWithDoubleC (x,rest) when ['l';'s';'z'] |> contains x |> not -> x::rest
    | EndsWithCVC (_) & Measure (m) when m = 1 -> 'e'::xs
    | _ -> xs

let step1b xs =
    match xs with
    | Ends ['d';'e';'e'] rest -> if Measure rest > 0 then 'e'::'e':: rest else xs
    | Ends ['d';'e'] rest when ContainsVowel rest -> rest |> step1bTx 
    | Ends ['g';'n';'i'] rest when ContainsVowel rest -> rest |> step1bTx 
    | _ -> xs

let step1c xs = 
    match xs with 
    | Ends ['y'] rest when ContainsVowel rest -> 'i'::rest
    | _ -> xs 

let step2 xs =
    match xs with
    | Ends ['l';'a';'n';'o';'i';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest 
    | Ends ['l';'a';'n';'o';'i';'t'] rest when Measure rest > 0 -> 'n'::'o'::'i'::'t'::rest 
    | Ends ['i';'c';'n';'e'] rest when Measure rest > 0 -> 'e'::'c'::'n'::'e'::rest 
    | Ends ['i';'c';'n';'a'] rest when Measure rest > 0 -> 'e'::'c'::'n'::'a'::rest 
    | Ends ['r';'e';'z';'i'] rest when Measure rest > 0 -> 'e'::'z'::'i'::rest 
    | Ends ['i';'l';'b'] rest when Measure rest > 0 -> 'e'::'l'::'b'::rest 
    | Ends ['i';'l';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['i';'l';'t';'n';'e'] rest when Measure rest > 0 -> 't'::'n'::'e'::rest 
    | Ends ['i';'l';'e'] rest when Measure rest > 0 -> 'e'::rest 
    | Ends ['i';'l';'s';'u';'o'] rest when Measure rest > 0 -> 's'::'u'::'o'::rest 
    | Ends ['n';'o';'i';'t';'a';'z';'i'] rest when Measure rest > 0 -> 'e'::'z'::'i'::rest 
    | Ends ['n';'o';'i';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest 
    | Ends ['r';'o';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest 
    | Ends ['m';'z';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['s';'s';'e';'n';'e';'v';'i'] rest when Measure rest > 0 -> 'e'::'v'::'i'::rest 
    | Ends ['s';'s';'e';'n';'l';'u';'f'] rest when Measure rest > 0 -> 'l'::'u'::'f'::rest 
    | Ends ['s';'s';'e';'n';'s';'u';'o'] rest when Measure rest > 0 -> 's'::'u'::'o'::rest 
    | Ends ['i';'t';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['i';'t';'i';'v';'i'] rest when Measure rest > 0 -> 'e'::'v'::'i'::rest 
    | Ends ['i';'t';'i';'l';'i';'b'] rest when Measure rest > 0 -> 'e'::'l'::'b'::rest 
    | Ends ['i';'g';'o';'l'] rest when Measure rest > 0 -> 'g'::'o'::'l'::rest 
    | _ -> xs

let step3 xs =
    match xs with
    | Ends ['e';'t';'a';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest 
    | Ends ['e';'v';'i';'t';'a'] rest when Measure rest > 0 -> rest 
    | Ends ['e';'z';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['i';'t';'i';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest 
    | Ends ['l';'a';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest 
    | Ends ['l';'u';'f'] rest when Measure rest > 0 -> rest 
    | Ends ['s';'s';'e';'n'] rest when Measure rest > 0 -> rest 
    | _ -> xs

let step4 xs =
    match xs with
    | Ends ['l';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'c';'n';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'c';'n';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['r';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['c';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'l';'b';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'l';'b';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'e';'m';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'e';'m'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['n';'o';'i'] rest when 
        match rest with 
        | Measure (m) & (Ends ['s'] (_) | Ends ['t'] (_)) when m > 1 -> true
        | _ -> false
        -> rest 
    | Ends ['u';'o'] rest when Measure rest > 1 -> rest 
    | Ends ['m';'s';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'t';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['i';'t';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['s';'u';'o'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'v';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'z';'i'] rest when Measure rest > 1 -> rest 
    | _ -> xs

let step5a xs =
    match xs with
    | Ends ['e'] rest when Measure rest > 1 -> rest
    | Ends ['e'] rest when 
        match rest with
        | NotEndsWithCVC (_) & Measure (m) when m = 1 -> true
        | _ -> false
        -> rest
    | _ -> xs

let step5b xs =
    match xs with
    | EndsWithDoubleC ('l',rest) & Measure (m) when m > 1 -> 'l'::rest
    | _ -> xs

let stemCsRev = step1a >> step1b >> step1c >> step2 >> step3 >> step4 >> step5a >> step5b
let stemCs = List.rev >> stemCsRev >> List.rev
let stem (s:string) = System.String(s.ToLower().ToCharArray() |> Array.toList |> stemCs |> List.toArray)

(* TEST SCRIPT *)
(*
#load "Stemmer.fs"
open Stemmer

let test a b = if  a |> stem  = b then () else failwithf "%s %s" a b

//verified against NLTK implementation http://text-processing.com/demo/stem/
let testWords =
    [
        "caresses","caress"
        "ponies","poni"
        "ties","ti"
        "caress","caress"
        "cats","cat"
        "feed","feed"
        "agreed","agre"
        "plastered","plaster"
        "bled","bled"
        "motoring","motor"
        "sing","sing"
        "conflated","conflat" //"conflate"
        "troubled","troubl"
        "sized","size"
        "hopping","hop"
        "tanned","tan"
        "falling","fall"
        "hissing","hiss"
        "fizzed","fizz"
        "failing","fail"
        "filing","file"
        "relational","relat"
        "conditional","condit"
        "rational","ration"
        "valenci","valenc"
        "hesitanci","hesit"
        "digitizer","digit"
        "conformabli","conform"
        "radicalli","radic"
        "differentli","differ"
        "vileli","vile"
        "analogousli","analog"
        "vietnamizati","vietnamizati"
        "predication","predic"
        "operator","oper"
        "feudalism","feudal"
        "decisiveness","decis"
        "hopefulness","hope"
        "callousness","callous"
        "formaliti","formal"
        "sensitiviti","sensit"
        "sensibiliti","sensibl"
        "triplicate","triplic"
        "formative","form"
        "formalize","formal"
        "electriciti","electr"
        "electrical","electr"
        "hopeful","hope"
        "goodness","good"
        "revival","reviv"
        "allowance","allow"
        "inference","infer"
        "airliner","airlin"
        "gyroscopic","gyroscop"
        "adjustable","adjust"
        "defensible","defens"
        "irritant","irrit"
        "replacement","replac"
        "adjustment","adjust"
        "dependent","depend"
        "adoption","adopt"
        "homologou","homolog"
        "communism","commun"
        "activate","activ"
        "angulariti","angular"
        "homologous","homolog"
        "effective","effect"
        "bowdlerize","bowdler"
        "probate","probat"
        "rate","rate"
        "cease","ceas"
        "controll","control"
        "roll","roll"
    ]

let runTest() = 
    testWords |> List.iter (fun (a,b) -> test a b; printfn "%s -> %s" a b)
    printfn "done"
*)
module Stemmer
namespace System
val a : char
type Char =
  struct
    member CompareTo : value:obj -> int + 1 overload
    member Equals : obj:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member GetTypeCode : unit -> TypeCode
    member ToString : unit -> string + 1 overload
    static val MaxValue : char
    static val MinValue : char
    static member ConvertFromUtf32 : utf32:int -> string
    static member ConvertToUtf32 : highSurrogate:char * lowSurrogate:char -> int + 1 overload
    static member GetNumericValue : c:char -> float + 1 overload
    ...
  end

Full name: System.Char
Char.ToLower(c: char) : char
Char.ToLower(c: char, culture: Globalization.CultureInfo) : char
union case Option.Some: Value: 'T -> Option<'T>
union case Option.None: Option<'T>
val xs : char list
Char.IsWhiteSpace(c: char) : bool
Char.IsWhiteSpace(s: string, index: int) : bool
active recognizer BaseVowel: char -> char option

Full name: Stemmer.( |BaseVowel|_| )
active recognizer Consonant: char list -> (char * char list) option

Full name: Stemmer.( |Consonant|_| )
val x : char
val rest : char list
active pattern result C: unit
val loop : (char list -> char list -> (char list * char list) option)
val acc : char list
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val rev : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.rev
active pattern result V: unit
active recognizer Vowel: char list -> (char * char list) option

Full name: Stemmer.( |Vowel|_| )
val loop : (char list -> (char list * char list) list -> ((char list * char list) list * char list) option)
val acc : (char list * char list) list
active recognizer C: char list -> (char list * char list) option

Full name: Stemmer.( |C|_| )
val cs : char list
active recognizer V: char list -> (char list * char list) option

Full name: Stemmer.( |V|_| )
val vs : char list
val calcMeasure : vs:('a list * 'b list) list -> int

Full name: Stemmer.calcMeasure
val vs : ('a list * 'b list) list
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State

Full name: Microsoft.FSharp.Collections.List.fold
val c : int
val vs : 'a list
val cs : 'b list
val stem : char list
val loop : (char list -> char list -> char list option)
val ys : char list
val b : char
val ContainsVowel : (char list -> bool)

Full name: Stemmer.ContainsVowel
val exists : predicate:('T -> bool) -> list:'T list -> bool

Full name: Microsoft.FSharp.Collections.List.exists
module Option

from Microsoft.FSharp.Core
val isSome : option:'T option -> bool

Full name: Microsoft.FSharp.Core.Option.isSome
active recognizer EndsWithCVC: char list -> char list option

Full name: Stemmer.( |EndsWithCVC|_| )
Multiple items
type MeasureAttribute =
  inherit Attribute
  new : unit -> MeasureAttribute

Full name: Microsoft.FSharp.Core.MeasureAttribute

--------------------
new : unit -> MeasureAttribute
active recognizer VC: char list -> ((char list * char list) list * char list) option

Full name: Stemmer.( |VC|_| )
val cvs : (char list * char list) list
Multiple items
val Measure : xs:char list -> int

Full name: Stemmer.Measure

--------------------
active recognizer Measure: char list -> int option

Full name: Stemmer.( |Measure|_| )

--------------------
type MeasureAttribute =
  inherit Attribute
  new : unit -> MeasureAttribute

Full name: Microsoft.FSharp.Core.MeasureAttribute

--------------------
new : unit -> MeasureAttribute
Multiple items
active recognizer Measure: char list -> int option

Full name: Stemmer.( |Measure|_| )

--------------------
type MeasureAttribute =
  inherit Attribute
  new : unit -> MeasureAttribute

Full name: Microsoft.FSharp.Core.MeasureAttribute

--------------------
new : unit -> MeasureAttribute
val m : int
val step1a : xs:char list -> char list

Full name: Stemmer.step1a
active recognizer Ends: char list -> char list -> char list option

Full name: Stemmer.( |Ends|_| )
val contains : y:'a -> xs:'a list -> bool (requires equality)

Full name: Stemmer.contains
val y : 'a (requires equality)
val xs : 'a list (requires equality)
val x : 'a (requires equality)
val step1bTx : xs:char list -> char list

Full name: Stemmer.step1bTx
active recognizer EndsWithDoubleC: char list -> (char * char list) option

Full name: Stemmer.( |EndsWithDoubleC|_| )
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val step1b : xs:char list -> char list

Full name: Stemmer.step1b
val step1c : xs:char list -> char list

Full name: Stemmer.step1c
val step2 : xs:char list -> char list

Full name: Stemmer.step2
val step3 : xs:char list -> char list

Full name: Stemmer.step3
val step4 : xs:char list -> char list

Full name: Stemmer.step4
val step5a : xs:char list -> char list

Full name: Stemmer.step5a
active recognizer NotEndsWithCVC: char list -> char list option

Full name: Stemmer.( |NotEndsWithCVC|_| )
val step5b : xs:char list -> char list

Full name: Stemmer.step5b
val stemCsRev : (char list -> char list)

Full name: Stemmer.stemCsRev
val stemCs : (char list -> char list)

Full name: Stemmer.stemCs
val stem : s:string -> String

Full name: Stemmer.stem
val s : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
String.ToLower() : string
String.ToLower(culture: Globalization.CultureInfo) : string
type Array =
  member Clone : unit -> obj
  member CopyTo : array:Array * index:int -> unit + 1 overload
  member GetEnumerator : unit -> IEnumerator
  member GetLength : dimension:int -> int
  member GetLongLength : dimension:int -> int64
  member GetLowerBound : dimension:int -> int
  member GetUpperBound : dimension:int -> int
  member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
  member Initialize : unit -> unit
  member IsFixedSize : bool
  ...

Full name: System.Array
val toList : array:'T [] -> 'T list

Full name: Microsoft.FSharp.Collections.Array.toList
val toArray : list:'T list -> 'T []

Full name: Microsoft.FSharp.Collections.List.toArray
Raw view Test code New version

More information

Link:http://fssnip.net/eR
Posted:12 years ago
Author:Faisal Waris
Tags: nlp