3 people like it.
Like the snippet!
Porter Stemmer
An Implementation of the Porter Stemming Algorithm in F# for text analysis.
Please see: http://tartarus.org/martin/PorterStemmer/
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
161:
162:
163:
164:
165:
166:
167:
168:
169:
170:
171:
172:
173:
174:
175:
176:
177:
178:
179:
180:
181:
182:
183:
184:
185:
186:
187:
188:
189:
190:
191:
192:
193:
194:
195:
196:
197:
198:
199:
200:
201:
202:
203:
204:
205:
206:
207:
208:
209:
210:
211:
212:
213:
214:
215:
216:
217:
218:
219:
220:
221:
222:
223:
224:
225:
226:
227:
228:
229:
230:
231:
232:
233:
234:
235:
236:
237:
238:
239:
240:
241:
242:
243:
244:
245:
246:
247:
248:
249:
250:
251:
252:
253:
254:
255:
256:
257:
258:
259:
260:
261:
262:
263:
264:
265:
266:
267:
268:
269:
270:
271:
272:
273:
274:
275:
276:
277:
278:
|
module Stemmer
open System
let (|BaseVowel|_|) a = match Char.ToLower(a) with 'a' | 'e' | 'i' | 'o' | 'u' -> Some a | _ -> None
let rec (|Consonant|_|) xs =
match xs with
| [] -> None
| a::_ when Char.IsWhiteSpace(a) -> None
| BaseVowel a::_ -> None
| 'y'::Consonant _ | 'Y'::Consonant _-> None
| x::rest -> Some(x,rest)
and (|Vowel|_|) xs =
match xs with
| [] -> None
| a::_ when Char.IsWhiteSpace(a) -> None
| Consonant _ -> None
| x::rest -> Some(x,rest)
let (|C|_|) xs =
let rec loop xs acc =
match xs with
| Consonant (x,xs) -> loop xs (x::acc)
| _ -> match acc with [] -> None | _ -> Some(acc|>List.rev,xs)
loop xs []
let (|V|_|) xs =
let rec loop xs acc =
match xs with
| Vowel (x,xs) -> loop xs (x::acc)
| _ -> match acc with [] -> None | _ -> Some(acc|>List.rev,xs)
loop xs []
let (|VC|_|) xs =
let rec loop xs acc =
match xs with
| [] -> match acc with [] -> None | _ -> Some(acc,xs)
| a::_ when Char.IsWhiteSpace(a) -> match acc with [] -> None | _ -> Some(acc,xs)
| C (cs, V (vs,xs)) -> loop xs ((vs,cs)::acc)
| C (cs, xs) -> loop xs (([],cs)::acc)
| V (vs, xs) -> loop xs ((vs,[])::acc)
| _ -> None
loop xs []
let calcMeasure vs = (0,vs) ||> List.fold (fun c (vs,cs) -> match vs,cs with [],_ | _,[] -> c | _ -> c+1)
let (|Ends|_|) stem xs =
let rec loop xs ys =
match xs,ys with
| [],[] -> Some(xs)
| [],_ -> None
| _, [] -> Some(xs)
| a::xs,b::ys when a = b || Char.ToLower(a) = Char.ToLower(b) -> loop xs ys
| _ -> None
loop xs stem
let ContainsVowel = List.exists ((|BaseVowel|_|) >> Option.isSome)
let (|EndsWithDoubleC|_|) xs =
match xs with
| Consonant (a, Consonant (b,xs)) when a = b -> Some (a,xs)
| _ -> None
let (|EndsWithCVC|_|) xs =
match xs with
| Consonant (a, Vowel (_, Consonant (_,xs))) ->
match a with
| 'w' | 'x' | 'y' | 'W' | 'X' | 'Y' -> None
| _ -> Some(xs)
| _ -> None
let (|NotEndsWithCVC|_|) xs = match xs with EndsWithCVC (_) -> None | _ -> Some(xs)
let (|Measure|_|) xs = match xs with VC (cvs,rest) -> calcMeasure cvs |> Some | _ -> None
let Measure xs = match xs with Measure (m) -> m | _ -> 0
let step1a xs =
match xs with
| Ends ['s';'e';'s';'s'] (rest) -> 's'::'s'::rest
| Ends ['s';'e';'i'] (rest) -> 'i'::rest
| 's'::'s'::rest -> xs
| 's'::rest -> rest
| _ -> xs
let contains y xs = xs |> List.exists (fun x -> x=y)
let step1bTx xs =
match xs with
| Ends ['t';'a'] _ -> 'e'::xs
| Ends ['l'; 'b'] _ -> 'e'::xs
| Ends ['z'; 'i'] _ -> 'e'::xs
| EndsWithDoubleC (x,rest) when ['l';'s';'z'] |> contains x |> not -> x::rest
| EndsWithCVC (_) & Measure (m) when m = 1 -> 'e'::xs
| _ -> xs
let step1b xs =
match xs with
| Ends ['d';'e';'e'] rest -> if Measure rest > 0 then 'e'::'e':: rest else xs
| Ends ['d';'e'] rest when ContainsVowel rest -> rest |> step1bTx
| Ends ['g';'n';'i'] rest when ContainsVowel rest -> rest |> step1bTx
| _ -> xs
let step1c xs =
match xs with
| Ends ['y'] rest when ContainsVowel rest -> 'i'::rest
| _ -> xs
let step2 xs =
match xs with
| Ends ['l';'a';'n';'o';'i';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest
| Ends ['l';'a';'n';'o';'i';'t'] rest when Measure rest > 0 -> 'n'::'o'::'i'::'t'::rest
| Ends ['i';'c';'n';'e'] rest when Measure rest > 0 -> 'e'::'c'::'n'::'e'::rest
| Ends ['i';'c';'n';'a'] rest when Measure rest > 0 -> 'e'::'c'::'n'::'a'::rest
| Ends ['r';'e';'z';'i'] rest when Measure rest > 0 -> 'e'::'z'::'i'::rest
| Ends ['i';'l';'b'] rest when Measure rest > 0 -> 'e'::'l'::'b'::rest
| Ends ['i';'l';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest
| Ends ['i';'l';'t';'n';'e'] rest when Measure rest > 0 -> 't'::'n'::'e'::rest
| Ends ['i';'l';'e'] rest when Measure rest > 0 -> 'e'::rest
| Ends ['i';'l';'s';'u';'o'] rest when Measure rest > 0 -> 's'::'u'::'o'::rest
| Ends ['n';'o';'i';'t';'a';'z';'i'] rest when Measure rest > 0 -> 'e'::'z'::'i'::rest
| Ends ['n';'o';'i';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest
| Ends ['r';'o';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest
| Ends ['m';'z';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest
| Ends ['s';'s';'e';'n';'e';'v';'i'] rest when Measure rest > 0 -> 'e'::'v'::'i'::rest
| Ends ['s';'s';'e';'n';'l';'u';'f'] rest when Measure rest > 0 -> 'l'::'u'::'f'::rest
| Ends ['s';'s';'e';'n';'s';'u';'o'] rest when Measure rest > 0 -> 's'::'u'::'o'::rest
| Ends ['i';'t';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest
| Ends ['i';'t';'i';'v';'i'] rest when Measure rest > 0 -> 'e'::'v'::'i'::rest
| Ends ['i';'t';'i';'l';'i';'b'] rest when Measure rest > 0 -> 'e'::'l'::'b'::rest
| Ends ['i';'g';'o';'l'] rest when Measure rest > 0 -> 'g'::'o'::'l'::rest
| _ -> xs
let step3 xs =
match xs with
| Ends ['e';'t';'a';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest
| Ends ['e';'v';'i';'t';'a'] rest when Measure rest > 0 -> rest
| Ends ['e';'z';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest
| Ends ['i';'t';'i';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest
| Ends ['l';'a';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest
| Ends ['l';'u';'f'] rest when Measure rest > 0 -> rest
| Ends ['s';'s';'e';'n'] rest when Measure rest > 0 -> rest
| _ -> xs
let step4 xs =
match xs with
| Ends ['l';'a'] rest when Measure rest > 1 -> rest
| Ends ['e';'c';'n';'a'] rest when Measure rest > 1 -> rest
| Ends ['e';'c';'n';'e'] rest when Measure rest > 1 -> rest
| Ends ['r';'e'] rest when Measure rest > 1 -> rest
| Ends ['c';'i'] rest when Measure rest > 1 -> rest
| Ends ['e';'l';'b';'a'] rest when Measure rest > 1 -> rest
| Ends ['e';'l';'b';'i'] rest when Measure rest > 1 -> rest
| Ends ['t';'n';'a'] rest when Measure rest > 1 -> rest
| Ends ['t';'n';'e';'m';'e'] rest when Measure rest > 1 -> rest
| Ends ['t';'n';'e';'m'] rest when Measure rest > 1 -> rest
| Ends ['t';'n';'e'] rest when Measure rest > 1 -> rest
| Ends ['n';'o';'i'] rest when
match rest with
| Measure (m) & (Ends ['s'] (_) | Ends ['t'] (_)) when m > 1 -> true
| _ -> false
-> rest
| Ends ['u';'o'] rest when Measure rest > 1 -> rest
| Ends ['m';'s';'i'] rest when Measure rest > 1 -> rest
| Ends ['e';'t';'a'] rest when Measure rest > 1 -> rest
| Ends ['i';'t';'i'] rest when Measure rest > 1 -> rest
| Ends ['s';'u';'o'] rest when Measure rest > 1 -> rest
| Ends ['e';'v';'i'] rest when Measure rest > 1 -> rest
| Ends ['e';'z';'i'] rest when Measure rest > 1 -> rest
| _ -> xs
let step5a xs =
match xs with
| Ends ['e'] rest when Measure rest > 1 -> rest
| Ends ['e'] rest when
match rest with
| NotEndsWithCVC (_) & Measure (m) when m = 1 -> true
| _ -> false
-> rest
| _ -> xs
let step5b xs =
match xs with
| EndsWithDoubleC ('l',rest) & Measure (m) when m > 1 -> 'l'::rest
| _ -> xs
let stemCsRev = step1a >> step1b >> step1c >> step2 >> step3 >> step4 >> step5a >> step5b
let stemCs = List.rev >> stemCsRev >> List.rev
let stem (s:string) = System.String(s.ToLower().ToCharArray() |> Array.toList |> stemCs |> List.toArray)
(* TEST SCRIPT *)
(*
#load "Stemmer.fs"
open Stemmer
let test a b = if a |> stem = b then () else failwithf "%s %s" a b
//verified against NLTK implementation http://text-processing.com/demo/stem/
let testWords =
[
"caresses","caress"
"ponies","poni"
"ties","ti"
"caress","caress"
"cats","cat"
"feed","feed"
"agreed","agre"
"plastered","plaster"
"bled","bled"
"motoring","motor"
"sing","sing"
"conflated","conflat" //"conflate"
"troubled","troubl"
"sized","size"
"hopping","hop"
"tanned","tan"
"falling","fall"
"hissing","hiss"
"fizzed","fizz"
"failing","fail"
"filing","file"
"relational","relat"
"conditional","condit"
"rational","ration"
"valenci","valenc"
"hesitanci","hesit"
"digitizer","digit"
"conformabli","conform"
"radicalli","radic"
"differentli","differ"
"vileli","vile"
"analogousli","analog"
"vietnamizati","vietnamizati"
"predication","predic"
"operator","oper"
"feudalism","feudal"
"decisiveness","decis"
"hopefulness","hope"
"callousness","callous"
"formaliti","formal"
"sensitiviti","sensit"
"sensibiliti","sensibl"
"triplicate","triplic"
"formative","form"
"formalize","formal"
"electriciti","electr"
"electrical","electr"
"hopeful","hope"
"goodness","good"
"revival","reviv"
"allowance","allow"
"inference","infer"
"airliner","airlin"
"gyroscopic","gyroscop"
"adjustable","adjust"
"defensible","defens"
"irritant","irrit"
"replacement","replac"
"adjustment","adjust"
"dependent","depend"
"adoption","adopt"
"homologou","homolog"
"communism","commun"
"activate","activ"
"angulariti","angular"
"homologous","homolog"
"effective","effect"
"bowdlerize","bowdler"
"probate","probat"
"rate","rate"
"cease","ceas"
"controll","control"
"roll","roll"
]
let runTest() =
testWords |> List.iter (fun (a,b) -> test a b; printfn "%s -> %s" a b)
printfn "done"
*)
|
module Stemmer
namespace System
val a : char
type Char =
struct
member CompareTo : value:obj -> int + 1 overload
member Equals : obj:obj -> bool + 1 overload
member GetHashCode : unit -> int
member GetTypeCode : unit -> TypeCode
member ToString : unit -> string + 1 overload
static val MaxValue : char
static val MinValue : char
static member ConvertFromUtf32 : utf32:int -> string
static member ConvertToUtf32 : highSurrogate:char * lowSurrogate:char -> int + 1 overload
static member GetNumericValue : c:char -> float + 1 overload
...
end
Full name: System.Char
Char.ToLower(c: char) : char
Char.ToLower(c: char, culture: Globalization.CultureInfo) : char
union case Option.Some: Value: 'T -> Option<'T>
union case Option.None: Option<'T>
val xs : char list
Char.IsWhiteSpace(c: char) : bool
Char.IsWhiteSpace(s: string, index: int) : bool
active recognizer BaseVowel: char -> char option
Full name: Stemmer.( |BaseVowel|_| )
active recognizer Consonant: char list -> (char * char list) option
Full name: Stemmer.( |Consonant|_| )
val x : char
val rest : char list
active pattern result C: unit
val loop : (char list -> char list -> (char list * char list) option)
val acc : char list
Multiple items
module List
from Microsoft.FSharp.Collections
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
val rev : list:'T list -> 'T list
Full name: Microsoft.FSharp.Collections.List.rev
active pattern result V: unit
active recognizer Vowel: char list -> (char * char list) option
Full name: Stemmer.( |Vowel|_| )
val loop : (char list -> (char list * char list) list -> ((char list * char list) list * char list) option)
val acc : (char list * char list) list
active recognizer C: char list -> (char list * char list) option
Full name: Stemmer.( |C|_| )
val cs : char list
active recognizer V: char list -> (char list * char list) option
Full name: Stemmer.( |V|_| )
val vs : char list
val calcMeasure : vs:('a list * 'b list) list -> int
Full name: Stemmer.calcMeasure
val vs : ('a list * 'b list) list
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State
Full name: Microsoft.FSharp.Collections.List.fold
val c : int
val vs : 'a list
val cs : 'b list
val stem : char list
val loop : (char list -> char list -> char list option)
val ys : char list
val b : char
val ContainsVowel : (char list -> bool)
Full name: Stemmer.ContainsVowel
val exists : predicate:('T -> bool) -> list:'T list -> bool
Full name: Microsoft.FSharp.Collections.List.exists
module Option
from Microsoft.FSharp.Core
val isSome : option:'T option -> bool
Full name: Microsoft.FSharp.Core.Option.isSome
active recognizer EndsWithCVC: char list -> char list option
Full name: Stemmer.( |EndsWithCVC|_| )
Multiple items
type MeasureAttribute =
inherit Attribute
new : unit -> MeasureAttribute
Full name: Microsoft.FSharp.Core.MeasureAttribute
--------------------
new : unit -> MeasureAttribute
active recognizer VC: char list -> ((char list * char list) list * char list) option
Full name: Stemmer.( |VC|_| )
val cvs : (char list * char list) list
Multiple items
val Measure : xs:char list -> int
Full name: Stemmer.Measure
--------------------
active recognizer Measure: char list -> int option
Full name: Stemmer.( |Measure|_| )
--------------------
type MeasureAttribute =
inherit Attribute
new : unit -> MeasureAttribute
Full name: Microsoft.FSharp.Core.MeasureAttribute
--------------------
new : unit -> MeasureAttribute
Multiple items
active recognizer Measure: char list -> int option
Full name: Stemmer.( |Measure|_| )
--------------------
type MeasureAttribute =
inherit Attribute
new : unit -> MeasureAttribute
Full name: Microsoft.FSharp.Core.MeasureAttribute
--------------------
new : unit -> MeasureAttribute
val m : int
val step1a : xs:char list -> char list
Full name: Stemmer.step1a
active recognizer Ends: char list -> char list -> char list option
Full name: Stemmer.( |Ends|_| )
val contains : y:'a -> xs:'a list -> bool (requires equality)
Full name: Stemmer.contains
val y : 'a (requires equality)
val xs : 'a list (requires equality)
val x : 'a (requires equality)
val step1bTx : xs:char list -> char list
Full name: Stemmer.step1bTx
active recognizer EndsWithDoubleC: char list -> (char * char list) option
Full name: Stemmer.( |EndsWithDoubleC|_| )
val not : value:bool -> bool
Full name: Microsoft.FSharp.Core.Operators.not
val step1b : xs:char list -> char list
Full name: Stemmer.step1b
val step1c : xs:char list -> char list
Full name: Stemmer.step1c
val step2 : xs:char list -> char list
Full name: Stemmer.step2
val step3 : xs:char list -> char list
Full name: Stemmer.step3
val step4 : xs:char list -> char list
Full name: Stemmer.step4
val step5a : xs:char list -> char list
Full name: Stemmer.step5a
active recognizer NotEndsWithCVC: char list -> char list option
Full name: Stemmer.( |NotEndsWithCVC|_| )
val step5b : xs:char list -> char list
Full name: Stemmer.step5b
val stemCsRev : (char list -> char list)
Full name: Stemmer.stemCsRev
val stemCs : (char list -> char list)
Full name: Stemmer.stemCs
val stem : s:string -> String
Full name: Stemmer.stem
val s : string
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = String
Full name: Microsoft.FSharp.Core.string
Multiple items
type String =
new : value:char -> string + 7 overloads
member Chars : int -> char
member Clone : unit -> obj
member CompareTo : value:obj -> int + 1 overload
member Contains : value:string -> bool
member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
member EndsWith : value:string -> bool + 2 overloads
member Equals : obj:obj -> bool + 2 overloads
member GetEnumerator : unit -> CharEnumerator
member GetHashCode : unit -> int
...
Full name: System.String
--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
String.ToLower() : string
String.ToLower(culture: Globalization.CultureInfo) : string
type Array =
member Clone : unit -> obj
member CopyTo : array:Array * index:int -> unit + 1 overload
member GetEnumerator : unit -> IEnumerator
member GetLength : dimension:int -> int
member GetLongLength : dimension:int -> int64
member GetLowerBound : dimension:int -> int
member GetUpperBound : dimension:int -> int
member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
member Initialize : unit -> unit
member IsFixedSize : bool
...
Full name: System.Array
val toList : array:'T [] -> 'T list
Full name: Microsoft.FSharp.Collections.Array.toList
val toArray : list:'T list -> 'T []
Full name: Microsoft.FSharp.Collections.List.toArray
More information