1 people like it.
Like the snippet!
Porter Stemmer
Porter Stemmer, please contribute fixes, suggestions, this is merely a syntactic rewrite, I believe though there are some slight mistakes. http://tartarus.org/~martin/PorterStemmer/ ,fixed up endswithdoubleconsonant and endswithocondition
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
161:
162:
163:
164:
165:
166:
167:
168:
169:
170:
171:
172:
173:
174:
175:
176:
177:
178:
179:
180:
181:
182:
183:
184:
185:
186:
187:
188:
189:
190:
191:
192:
193:
194:
195:
196:
197:
198:
199:
200:
201:
202:
203:
204:
205:
206:
207:
208:
209:
210:
211:
212:
213:
214:
215:
216:
217:
218:
219:
220:
221:
222:
223:
224:
225:
226:
227:
228:
229:
230:
231:
232:
233:
234:
235:
236:
237:
238:
239:
240:
241:
242:
243:
244:
245:
246:
247:
248:
249:
250:
251:
252:
253:
254:
255:
256:
257:
|
(* Origionally from https://blogs.msdn.com/b/christianb/archive/2011/06/24/a-porter-stemmer-in-f.aspx
*)
open System
type step = string -> string
type Letter =
Empty
| Vowel of char
| Conso of char
let isVowel v = Seq.exists ((=) v) "aeiouAEIOU"
let isVowelOnly s = Seq.forall isVowel s
let categoriseWord w =
let (|C|V|) s = if isVowel s then V s else C s
let idx p = Seq.findIndex ((=) p)
let nth s = Seq.nth (idx s w - 1) w
let cat = function
V s -> Vowel s
| s ->
if idx s w = 0 then Conso s
elif s = 'y' || s = 'Y' then
match nth s with
V _ -> Conso s
| s -> Vowel s
else Conso s
w |> Seq.map cat
(* Fix below 3 functions at some stage, they work just seems there is 5 times more code than necess. *)
let measureWord w =
if Seq.length w < 2 then 0
else
let rec measureWord' (curWord:Letter seq) (previousLetter:Letter) curScore =
if (Seq.isEmpty curWord) then curScore
else
let curLetter = Seq.head curWord
let skipLetter = Seq.skip 1 curWord
match previousLetter, curLetter with
Vowel _, Conso _ -> measureWord' skipLetter curLetter (curScore + 1)
| _, _ -> measureWord' skipLetter curLetter curScore
measureWord' (categoriseWord w) Empty 0
let endsWithDoubleConsonant word =
if word = "" then false
else
let categorised = categoriseWord word
let sLen = Seq.length categorised
let preLast = Seq.nth (sLen - 2) categorised
match preLast with
| Conso c ->
let last = Seq.nth ( sLen - 1) categorised
match last with
| Conso d when c = d -> true
| _ -> false
| _ -> false
let endsWithOCondition word =
if word = "" then false
else
let categorised = categoriseWord word
let len = Seq.length categorised
if len < 3 then false
else
let seqMaxIndex = len - 1
let prePrelast = Seq.nth (seqMaxIndex - 2) categorised
match prePrelast with
| Conso c ->
let preLast = Seq.nth (seqMaxIndex - 1) categorised
match preLast with
| Vowel _->
let last = Seq.nth seqMaxIndex categorised
match last with
| Conso d -> ( d <> 'w' && d <> 'x' && d <> 'y' )
| _ -> false
| _ -> false
| _ -> false
let step1A_subs =
[ "sses", "ss"; "ies" , "i"
"ss" , "ss"; "s" , "" ]
let step1B_subs =
[ "at", "ate"; "bl","ble"
"iz", "ize"]
let step1B2_subs =
[ "eed", "ee" ]
let step2_subs =
[ "ational", "ate" ; "tional" , "tion"
"enci" , "ence"; "anci" , "ance"
"izer" , "ize" ; "bli" , "ble"
"alli" , "al" ; "entli" , "ent"
"eli" , "e" ; "ousli" , "ous"
"ization", "ize" ; "ation" , "ate"
"ator" , "ate" ; "alism" , "al"
"iveness", "ive" ; "fulness", "ful"
"ousness", "ous" ; "aliti" , "al"
"iviti" , "ive" ; "biliti" , "ble"
"logi" , "log" ]
let step3_subs =
[ "icate", "ic"; "ative", ""
"alize", "al"; "iciti", "ic"
"ical" , "ic"; "ful" , ""
"ness" , "" ]
let step4_subs =
[ "al" , ""; "ance", ""
"ence" , ""; "er" , ""
"ic" , ""; "able", ""
"ible" , ""; "ant" , ""
"ement", ""; "ment", ""
"ion" , ""; "ism" , ""
"ou" , ""; "iti" , ""
"ate" , ""; "ive" , ""
"ous" , ""; "ize" , ""
"ise" , "" ]
let stem (winput : string) =
let len (s : string) = s.Length
let substr (w : string) i = w.Substring(0, len w - i)
let containsVowel s =
Seq.exists isVowel s
let replaceLast (w : string) (s : string) r =
let place = w.LastIndexOf s in
if place >= 0 then
w.Remove(place, len s).Insert(place,r)
else w
let findFirstReplace (word : string) n pList =
let cand =
pList
|> Seq.tryFind (fun (e,b) -> word.EndsWith e)
if cand <> None then
cand |> Option.get |> fun (e,r) ->
if measureWord (substr word e.Length) > n
then replaceLast word e r
else word
else word
let step1A (word : string) =
findFirstReplace word 0 step1A_subs
let step1B (word : string) =
let secondLast (w : string) =
let endIfO w =
if endsWithOCondition w && measureWord w = 1
then w ^ "e"
else w
let endIfD w = endsWithDoubleConsonant w
match Seq.nth (len w - 1) w with
| 'l' | 's' | 'z' -> w
| _ ->
if endIfD w
then substr w 1
else w
let step1Bx (w : string) =
findFirstReplace w -1 step1B_subs
let sub w n =
if containsVowel (substr w n)
then substr w n
else w
match word with
| w when w.EndsWith "eed" -> findFirstReplace word 0 step1B2_subs
| w when w.EndsWith "ed" -> sub word 2
| w when w.EndsWith "ing" -> sub word 3
| w -> w
let step1C : step = function
| w when containsVowel (substr w 0) && w.EndsWith "y" -> replaceLast w "y" "i"
| w -> w
let step2 (word : string) =
findFirstReplace word 0 step2_subs
let step3 (word : string) =
findFirstReplace word 0 step3_subs
let step4 (word : string) =
findFirstReplace word 1 step4_subs
let step5 (word : string) =
if word.EndsWith "e" then
let s = substr word 1
if word.EndsWith "l" && measureWord s > 1
then s
else
let o = endsWithOCondition s
if measureWord s > 1 || measureWord s = 1 && not o
then replaceLast word "e" ""
else word
else word
let c word len' f =
if len word < len' then word
else
let ret = f word
if len ret > 0 then ret else word
let rec step w n =
let s b f = step (c w b f) (n+1)
if n < 7 then
match n with
| 0 -> s 3 step1A
| 1 -> s 0 step1B
| 2 -> s 4 step1C
| 3 -> s 4 step2
| 4 -> s 3 step3
| 5 -> s 3 step4
| 6 -> s 2 step5
| _ -> ""
else w
step winput 0
let ps : string -> string option = function
| w when w.Length < 2 -> Some w
| w when isVowelOnly w -> Some w
| w ->
match w.ToLower() with
wl when wl.Length = 2 && wl.[1] = 'y' && isVowel wl.[0] -> Some wl
| w -> Some (stem w)
let filter (str : string) =
str |> Seq.filter (fun c -> Char.IsLetter c || c = ' ')
|> Seq.map string
|> String.concat ""
let test =
"Do you really think it is weakness that yields to temptation? \
I tell you that there are terrible temptations which it requires strength, \
strength and courage to yield to"
(test |> filter).Split ' '
|> Array.filter ((<>) "")
|> Array.map ps
|> Array.iter (fun x -> printf "%s " x.Value)
//"do you realli think it is weak that yield to temptat I tell you that there ar terribl temptat which it requir strength strength and courag to yield to"
// test below taken from http://qaa.ath.cx/porter_js_demo.html, for closer accuracy we should strip non a-z
//"Do you realli think it is weak that yield to temptat I tell you that there ar terribl temptat which it requir strength strength and courag to yield to"
|
namespace System
type step = string -> string
Full name: Script.step
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = String
Full name: Microsoft.FSharp.Core.string
type Letter =
| Empty
| Vowel of char
| Conso of char
Full name: Script.Letter
union case Letter.Empty: Letter
union case Letter.Vowel: char -> Letter
Multiple items
val char : value:'T -> char (requires member op_Explicit)
Full name: Microsoft.FSharp.Core.Operators.char
--------------------
type char = Char
Full name: Microsoft.FSharp.Core.char
union case Letter.Conso: char -> Letter
val isVowel : v:char -> bool
Full name: Script.isVowel
val v : char
module Seq
from Microsoft.FSharp.Collections
val exists : predicate:('T -> bool) -> source:seq<'T> -> bool
Full name: Microsoft.FSharp.Collections.Seq.exists
val isVowelOnly : s:seq<char> -> bool
Full name: Script.isVowelOnly
val s : seq<char>
val forall : predicate:('T -> bool) -> source:seq<'T> -> bool
Full name: Microsoft.FSharp.Collections.Seq.forall
val categoriseWord : w:seq<char> -> seq<Letter>
Full name: Script.categoriseWord
val w : seq<char>
active pattern result C: unit
active pattern result V: unit
val s : char
active pattern result V: char -> Choice<char,char>
active pattern result C: char -> Choice<char,char>
val idx : ('a -> seq<'a> -> int) (requires equality)
val p : 'a (requires equality)
val findIndex : predicate:('T -> bool) -> source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.findIndex
val nth : (char -> char)
val nth : index:int -> source:seq<'T> -> 'T
Full name: Microsoft.FSharp.Collections.Seq.nth
val cat : (char -> Letter)
active recognizer V: char -> Choice<char,char>
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
Full name: Microsoft.FSharp.Collections.Seq.map
val measureWord : w:seq<char> -> int
Full name: Script.measureWord
val length : source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.length
val measureWord' : (seq<Letter> -> Letter -> int -> int)
val curWord : seq<Letter>
Multiple items
val seq : sequence:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Core.Operators.seq
--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>
Full name: Microsoft.FSharp.Collections.seq<_>
val previousLetter : Letter
val curScore : int
val isEmpty : source:seq<'T> -> bool
Full name: Microsoft.FSharp.Collections.Seq.isEmpty
val curLetter : Letter
val head : source:seq<'T> -> 'T
Full name: Microsoft.FSharp.Collections.Seq.head
val skipLetter : seq<Letter>
val skip : count:int -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.skip
val endsWithDoubleConsonant : word:string -> bool
Full name: Script.endsWithDoubleConsonant
val word : string
val categorised : seq<Letter>
val sLen : int
val preLast : Letter
val c : char
val last : Letter
val d : char
val endsWithOCondition : word:string -> bool
Full name: Script.endsWithOCondition
val len : int
val seqMaxIndex : int
val prePrelast : Letter
val step1A_subs : (string * string) list
Full name: Script.step1A_subs
val step1B_subs : (string * string) list
Full name: Script.step1B_subs
val step1B2_subs : (string * string) list
Full name: Script.step1B2_subs
val step2_subs : (string * string) list
Full name: Script.step2_subs
val step3_subs : (string * string) list
Full name: Script.step3_subs
val step4_subs : (string * string) list
Full name: Script.step4_subs
val stem : winput:string -> string
Full name: Script.stem
val winput : string
val len : (string -> int)
val s : string
property String.Length: int
val substr : (string -> int -> string)
val w : string
val i : int
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
val containsVowel : (seq<char> -> bool)
val replaceLast : (string -> string -> string -> string)
val r : string
val place : int
String.LastIndexOf(value: string) : int
String.LastIndexOf(value: char) : int
String.LastIndexOf(value: string, comparisonType: StringComparison) : int
String.LastIndexOf(value: string, startIndex: int) : int
String.LastIndexOf(value: char, startIndex: int) : int
String.LastIndexOf(value: string, startIndex: int, comparisonType: StringComparison) : int
String.LastIndexOf(value: string, startIndex: int, count: int) : int
String.LastIndexOf(value: char, startIndex: int, count: int) : int
String.LastIndexOf(value: string, startIndex: int, count: int, comparisonType: StringComparison) : int
String.Remove(startIndex: int) : string
String.Remove(startIndex: int, count: int) : string
val findFirstReplace : (string -> int -> seq<string * string> -> string)
val n : int
val pList : seq<string * string>
val cand : (string * string) option
val tryFind : predicate:('T -> bool) -> source:seq<'T> -> 'T option
Full name: Microsoft.FSharp.Collections.Seq.tryFind
val e : string
val b : string
String.EndsWith(value: string) : bool
String.EndsWith(value: string, comparisonType: StringComparison) : bool
String.EndsWith(value: string, ignoreCase: bool, culture: Globalization.CultureInfo) : bool
union case Option.None: Option<'T>
module Option
from Microsoft.FSharp.Core
val get : option:'T option -> 'T
Full name: Microsoft.FSharp.Core.Option.get
val step1A : (string -> string)
val step1B : (string -> string)
val secondLast : (string -> string)
val endIfO : (string -> string)
val endIfD : (string -> bool)
val step1Bx : (string -> string)
val sub : (string -> int -> string)
val step1C : step
val step2 : (string -> string)
val step3 : (string -> string)
val step4 : (string -> string)
val step5 : (string -> string)
val o : bool
val not : value:bool -> bool
Full name: Microsoft.FSharp.Core.Operators.not
val c : (string -> int -> (string -> string) -> string)
val len' : int
val f : (string -> string)
val ret : string
Multiple items
val step : (string -> int -> string)
--------------------
type step = string -> string
Full name: Script.step
val s : (int -> (string -> string) -> string)
val b : int
val ps : _arg1:string -> string option
Full name: Script.ps
type 'T option = Option<'T>
Full name: Microsoft.FSharp.Core.option<_>
union case Option.Some: Value: 'T -> Option<'T>
String.ToLower() : string
String.ToLower(culture: Globalization.CultureInfo) : string
val wl : string
val filter : str:string -> string
Full name: Script.filter
val str : string
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.filter
type Char =
struct
member CompareTo : value:obj -> int + 1 overload
member Equals : obj:obj -> bool + 1 overload
member GetHashCode : unit -> int
member GetTypeCode : unit -> TypeCode
member ToString : unit -> string + 1 overload
static val MaxValue : char
static val MinValue : char
static member ConvertFromUtf32 : utf32:int -> string
static member ConvertToUtf32 : highSurrogate:char * lowSurrogate:char -> int + 1 overload
static member GetNumericValue : c:char -> float + 1 overload
...
end
Full name: System.Char
Char.IsLetter(c: char) : bool
Char.IsLetter(s: string, index: int) : bool
Multiple items
type String =
new : value:char -> string + 7 overloads
member Chars : int -> char
member Clone : unit -> obj
member CompareTo : value:obj -> int + 1 overload
member Contains : value:string -> bool
member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
member EndsWith : value:string -> bool + 2 overloads
member Equals : obj:obj -> bool + 2 overloads
member GetEnumerator : unit -> CharEnumerator
member GetHashCode : unit -> int
...
Full name: System.String
--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
val concat : sep:string -> strings:seq<string> -> string
Full name: Microsoft.FSharp.Core.String.concat
val test : string
Full name: Script.test
type Array =
member Clone : unit -> obj
member CopyTo : array:Array * index:int -> unit + 1 overload
member GetEnumerator : unit -> IEnumerator
member GetLength : dimension:int -> int
member GetLongLength : dimension:int -> int64
member GetLowerBound : dimension:int -> int
member GetUpperBound : dimension:int -> int
member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
member Initialize : unit -> unit
member IsFixedSize : bool
...
Full name: System.Array
val filter : predicate:('T -> bool) -> array:'T [] -> 'T []
Full name: Microsoft.FSharp.Collections.Array.filter
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []
Full name: Microsoft.FSharp.Collections.Array.map
val iter : action:('T -> unit) -> array:'T [] -> unit
Full name: Microsoft.FSharp.Collections.Array.iter
val x : string option
val printf : format:Printf.TextWriterFormat<'T> -> 'T
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printf
property Option.Value: string
More information