1 people like it.
Like the snippet!
Porter Stemmer
Porter Stemmer, please contribute fixes, suggestions, this is merely a syntactic rewrite, I believe though there are some slight mistakes. http://tartarus.org/~martin/PorterStemmer/ ,fixed up endswithdoubleconsonant and endswithocondition
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
161:
162:
163:
164:
165:
166:
167:
168:
169:
170:
171:
172:
173:
174:
175:
176:
177:
178:
179:
180:
181:
182:
183:
184:
185:
186:
187:
188:
189:
190:
191:
192:
193:
194:
195:
196:
197:
198:
199:
200:
201:
202:
203:
204:
205:
206:
207:
208:
209:
210:
211:
212:
213:
214:
215:
216:
217:
218:
219:
220:
221:
|
(* Origionally from https://blogs.msdn.com/b/christianb/archive/2011/06/24/a-porter-stemmer-in-f.aspx
*)
type step = string -> string
type Letter =
Empty
| Vowel of char
| Conso of char
let letterToString = function
Empty -> ""
| Vowel v -> string v
| Conso c -> string c
let isVowel v = Seq.exists ((=) v) "aeiouAEIOU"
let isVowelOnly s = s |> Seq.forall isVowel
let (|C|V|) s = if isVowel s then V s else C s
let categoriseWord w =
let idx p s = Seq.findIndex ((=) p) s
let nth s = Seq.nth (idx s w - 1) w
Seq.map(
function
V s -> Vowel s
| s ->
if idx s w = 0 then Conso s
elif s = 'y' || s = 'Y' then
match nth s with
V _ -> Conso s
| s -> Vowel s
else Conso s) w
let containsVowel s =
categoriseWord s
|> Seq.exists (function Vowel _ -> true | _ -> false)
(* Fix below 3 functions at some stage, they work just seems there is 5 times more code than necess. *)
let measureWord w =
if (Seq.length w < 2) then 0
else
let rec measureWord' (curWord:Letter seq) (previousLetter:Letter) curScore =
if (Seq.isEmpty curWord) then curScore
else
let curLetter = Seq.head curWord
match previousLetter, curLetter with
Vowel _, Conso _ -> measureWord' (Seq.skip 1 curWord) curLetter (curScore + 1)
| _, _ -> measureWord' (Seq.skip 1 curWord) curLetter curScore
measureWord' (categoriseWord w) Empty 0
let endsWithDoubleConsonant word =
if word = "" then false
else
let categorised = categoriseWord word
let sLen = Seq.length categorised
let preLast = Seq.nth (sLen - 2) categorised
match preLast with
| Conso c ->
let last = Seq.nth ( sLen - 1) categorised
match last with
| Conso d when c = d -> true
| _ -> false
| _ -> false
let endsWithOCondition word =
if word = "" then false
else
let categorised = categoriseWord word
let len = Seq.length categorised
if len < 3 then false
else
let seqMaxIndex = len - 1
let prePrelast = Seq.nth (seqMaxIndex - 2) categorised
match prePrelast with
| Conso c ->
let preLast = Seq.nth (seqMaxIndex - 1) categorised
match preLast with
| Vowel _->
let last = Seq.nth seqMaxIndex categorised
match last with
| Conso d -> ( d <> 'w' && d <> 'x' && d <> 'y' )
| _ -> false
| _ -> false
| _ -> false
let porterStem (word : string) =
let len (s : string) = s.Length
let substr (w : string) i = w.Substring(0, len w - i)
let (|E|_|) (n,r) (h : string) =
if h.EndsWith n then Some(h,n,r) else None
let (|LT|_|) i (w : string) =
if len w < i then Some()
else None
let replaceLast (w : string) (s : string) r =
let place = w.LastIndexOf s
w.Remove(place, len s).Insert(place,r)
let replaceStr (word,(a : string),b) k =
if measureWord (substr word a.Length) > k
then replaceLast word a b
else word
let step1A : step = function
LT 3 as w -> w
| E ("sses", "ss") r | E ("ies" , "i" ) r
| E ("ss" , "ss") r | E ("s" , "" ) r
-> replaceStr r 0
| e -> e
(* parameters everywhere *)
let step1B word =
let secondLast (w : string) =
let endIfO w =
if endsWithOCondition w && measureWord w = 1
then w ^ "e"
else w
let endIfD w = endsWithDoubleConsonant w
match Seq.nth (len w - 1) w with
| 'l' | 's' | 'z' -> w
| _ -> if endIfD w then substr w 1
else w
let step1Bx : step = function
LT 4 as w -> w
| E ("at" , "ate") r
| E ("bl" , "ble") r
| E ("iz" , "ize") r -> replaceStr r -1
| w -> secondLast w
let c w =
let (word,_,_) = w
let (_,s,_) = w
if containsVowel (substr word (len s) )
then substr word (len s)
else word
match word with
E ("eed" , "ee") r -> replaceStr r 0
| E ("ed" , "") r -> c r |> step1Bx
| E ("ing" , "") r -> c r |> step1Bx
| w -> w
let step1C : step = function
LT 4 as w -> w
| w when containsVowel (substr w 0) && w.EndsWith "y" -> replaceLast w "y" "i"
| w -> w
let step2 : step = function
LT 4 as w -> w
| E ("ational", "ate" ) r | E ("tional" , "tion") r
| E ("enci" , "ence") r | E ("anci" , "ance") r
| E ("izer" , "ize" ) r | E ("bli" , "ble" ) r
| E ("alli" , "al" ) r | E ("entli" , "ent" ) r
| E ("eli" , "e" ) r | E ("ousli" , "ous" ) r
| E ("ization", "ize" ) r | E ("ation" , "ate" ) r
| E ("ator" , "ate" ) r | E ("alism" , "al" ) r
| E ("iveness", "ive" ) r | E ("fulness", "ful" ) r
| E ("ousness", "ous" ) r | E ("aliti" , "al" ) r
| E ("iviti" , "ive" ) r | E ("biliti" , "ble" ) r
| E ("logi" , "log" ) r -> replaceStr r 0
| w -> w
let step3 : step = function
LT 3 as w -> w
| E ("icate", "ic") r | E ("ative", "" ) r
| E ("alize", "al") r | E ("iciti", "ic") r
| E ("ical" , "ic") r | E ("ful" , "" ) r
| E ("ness" , "" ) r -> replaceStr r 0
| w -> w
let step4 : step = function
LT 3 as w -> w
| E ("al" , "") r | E ("ance", "") r
| E ("ence" , "") r | E ("er" , "") r
| E ("ic" , "") r | E ("able", "") r
| E ("ible" , "") r | E ("ant" , "") r
| E ("ement", "") r | E ("ment", "") r
| E ("ion" , "") r | E ("ism" , "") r
| E ("ou" , "") r | E ("iti" , "") r
| E ("ate" , "") r | E ("ive" , "") r
| E ("ous" , "") r | E ("ize" , "") r
| E ("ise" , "") r -> replaceStr r 1
| w -> w
let step5 : step = function
LT 2 as w -> w
| E ("e", "e") r as w ->
let s = substr w 1
let o = endsWithOCondition s
if measureWord s > 1 || measureWord s = 1 && not o
then replaceLast w "e" ""
else w
| w -> if w.EndsWith "l" && measureWord (substr w 1) > 1
then substr w 1
else w
word |> step1A |> step1B |> step1C |> step2 |> step3 |> step4 |> step5
let ps word =
match word with
| "" -> None
| w when w.Length < 2 -> Some w
| w when isVowelOnly word -> Some w
| w ->
match (w.ToLower()) with
wl when wl.Length = 2 && wl.[1] = 'y' && isVowel wl.[0] -> Some wl
| w -> Some (porterStem w)
let test = "Dumplings are cooked balls of dough. They are based on flour, potatoes or bread, and may include meat, fish, vegetables, or sweets. They may be cooked by boiling, steaming, simmering, frying, or baking. They may have a filling, or there may be other ingredients mixed into the dough. Dumplings may be sweet or spicy. They can be eaten by themselves, in soups or stews, with gravy, or in any other way. While some dumplings resemble solid water boiled doughs, such as gnocchi, others such as wontons resemble meatballs with a thin dough covering."
test.Replace(",","").Replace(".","").Split ' ' |> Seq.map ps |> Seq.iter (fun x -> printf "%s " x.Value)
|
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = System.String
Full name: Microsoft.FSharp.Core.string
type Letter =
| Empty
| Vowel of char
| Conso of char
Full name: Script.Letter
union case Letter.Empty: Letter
union case Letter.Vowel: char -> Letter
Multiple items
val char : value:'T -> char (requires member op_Explicit)
Full name: Microsoft.FSharp.Core.Operators.char
--------------------
type char = System.Char
Full name: Microsoft.FSharp.Core.char
union case Letter.Conso: char -> Letter
val letterToString : _arg1:Letter -> string
Full name: Script.letterToString
val v : char
val c : char
val isVowel : v:char -> bool
Full name: Script.isVowel
module Seq
from Microsoft.FSharp.Collections
val exists : predicate:('T -> bool) -> source:seq<'T> -> bool
Full name: Microsoft.FSharp.Collections.Seq.exists
val isVowelOnly : s:seq<char> -> bool
Full name: Script.isVowelOnly
val s : seq<char>
val forall : predicate:('T -> bool) -> source:seq<'T> -> bool
Full name: Microsoft.FSharp.Collections.Seq.forall
active pattern result C: unit
active pattern result V: unit
val s : char
active pattern result V: char -> Choice<char,char>
active pattern result C: char -> Choice<char,char>
val categoriseWord : w:seq<char> -> seq<Letter>
Full name: Script.categoriseWord
val w : seq<char>
val idx : ('a -> seq<'a> -> int) (requires equality)
val p : 'a (requires equality)
val s : seq<'a> (requires equality)
val findIndex : predicate:('T -> bool) -> source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.findIndex
val nth : (char -> char)
val nth : index:int -> source:seq<'T> -> 'T
Full name: Microsoft.FSharp.Collections.Seq.nth
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
Full name: Microsoft.FSharp.Collections.Seq.map
active recognizer V: char -> Choice<char,char>
Full name: Script.( |C|V| )
val containsVowel : s:seq<char> -> bool
Full name: Script.containsVowel
val measureWord : w:seq<char> -> int
Full name: Script.measureWord
val length : source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.length
val measureWord' : (seq<Letter> -> Letter -> int -> int)
val curWord : seq<Letter>
Multiple items
val seq : sequence:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Core.Operators.seq
--------------------
type seq<'T> = System.Collections.Generic.IEnumerable<'T>
Full name: Microsoft.FSharp.Collections.seq<_>
val previousLetter : Letter
val curScore : int
val isEmpty : source:seq<'T> -> bool
Full name: Microsoft.FSharp.Collections.Seq.isEmpty
val curLetter : Letter
val head : source:seq<'T> -> 'T
Full name: Microsoft.FSharp.Collections.Seq.head
val skip : count:int -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.skip
val endsWithDoubleConsonant : word:string -> bool
Full name: Script.endsWithDoubleConsonant
val word : string
val categorised : seq<Letter>
val sLen : int
val preLast : Letter
val last : Letter
val d : char
val endsWithOCondition : word:string -> bool
Full name: Script.endsWithOCondition
val len : int
val seqMaxIndex : int
val prePrelast : Letter
val porterStem : word:string -> string
Full name: Script.porterStem
val len : (string -> int)
val s : string
property System.String.Length: int
val substr : (string -> int -> string)
val w : string
val i : int
System.String.Substring(startIndex: int) : string
System.String.Substring(startIndex: int, length: int) : string
active pattern result E: unit
val n : string
val r : 'a
val h : string
System.String.EndsWith(value: string) : bool
System.String.EndsWith(value: string, comparisonType: System.StringComparison) : bool
System.String.EndsWith(value: string, ignoreCase: bool, culture: System.Globalization.CultureInfo) : bool
union case Option.Some: Value: 'T -> Option<'T>
union case Option.None: Option<'T>
val replaceLast : (string -> string -> string -> string)
val r : string
val place : int
System.String.LastIndexOf(value: string) : int
System.String.LastIndexOf(value: char) : int
System.String.LastIndexOf(value: string, comparisonType: System.StringComparison) : int
System.String.LastIndexOf(value: string, startIndex: int) : int
System.String.LastIndexOf(value: char, startIndex: int) : int
System.String.LastIndexOf(value: string, startIndex: int, comparisonType: System.StringComparison) : int
System.String.LastIndexOf(value: string, startIndex: int, count: int) : int
System.String.LastIndexOf(value: char, startIndex: int, count: int) : int
System.String.LastIndexOf(value: string, startIndex: int, count: int, comparisonType: System.StringComparison) : int
System.String.Remove(startIndex: int) : string
System.String.Remove(startIndex: int, count: int) : string
val replaceStr : (string * string * string -> int -> string)
val a : string
val b : string
val k : int
val step1A : step
type step = string -> string
Full name: Script.step
active recognizer LT: int -> string -> unit option
active recognizer E: string * 'a -> string -> (string * string * 'a) option
val r : string * string * string
val e : string
val step1B : (string -> string)
val secondLast : (string -> string)
val endIfO : (string -> string)
val endIfD : (string -> bool)
val step1Bx : step
val c : (string * string * 'a -> string)
val w : string * string * 'a
val step1C : step
val step2 : step
val step3 : step
val step4 : step
val step5 : step
val o : bool
val not : value:bool -> bool
Full name: Microsoft.FSharp.Core.Operators.not
val ps : word:string -> string option
Full name: Script.ps
System.String.ToLower() : string
System.String.ToLower(culture: System.Globalization.CultureInfo) : string
val wl : string
val test : string
Full name: Script.test
System.String.Replace(oldValue: string, newValue: string) : string
System.String.Replace(oldChar: char, newChar: char) : string
val iter : action:('T -> unit) -> source:seq<'T> -> unit
Full name: Microsoft.FSharp.Collections.Seq.iter
val x : string option
val printf : format:Printf.TextWriterFormat<'T> -> 'T
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printf
property Option.Value: string
More information