1 people like it.

Porter Stemmer

Porter Stemmer, please contribute fixes, suggestions, this is merely a syntactic rewrite, I believe though there are some slight mistakes. http://tartarus.org/~martin/PorterStemmer/ ,fixed up endswithdoubleconsonant and endswithocondition

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
182: 
183: 
184: 
185: 
186: 
187: 
188: 
189: 
190: 
191: 
192: 
193: 
194: 
195: 
196: 
197: 
198: 
199: 
200: 
201: 
202: 
203: 
204: 
205: 
206: 
207: 
208: 
209: 
210: 
211: 
212: 
213: 
214: 
215: 
216: 
217: 
218: 
219: 
220: 
221: 
(* Origionally from https://blogs.msdn.com/b/christianb/archive/2011/06/24/a-porter-stemmer-in-f.aspx
*)

type step = string -> string

type Letter = 
    Empty
  | Vowel of char
  | Conso of char

let letterToString = function
    Empty        -> ""
  | Vowel v      -> string v  
  | Conso c      -> string c
    
let isVowel v = Seq.exists ((=) v) "aeiouAEIOU"
let isVowelOnly s = s |> Seq.forall isVowel
let (|C|V|) s = if isVowel s then V s else C s

let categoriseWord w = 
 
  let idx p s = Seq.findIndex ((=) p) s
  let nth s = Seq.nth (idx s w - 1) w
  
  Seq.map(
    function  
      V s -> Vowel s 
    | s   -> 
      if idx s w = 0 then Conso s
      elif s = 'y' || s = 'Y' then 
        match nth s with
          V _ -> Conso s
        | s   -> Vowel s
      else Conso s) w

let containsVowel s = 
   categoriseWord s
   |> Seq.exists (function Vowel _ -> true | _ -> false)
   
(* Fix below 3 functions at some stage, they work just seems there is 5 times more code than necess. *)
let measureWord w = 
  if (Seq.length w < 2) then 0
  else 
    let rec measureWord' (curWord:Letter seq) (previousLetter:Letter) curScore = 
      if (Seq.isEmpty curWord) then curScore
      else
        let curLetter = Seq.head curWord
        match previousLetter, curLetter with
          Vowel _, Conso _ -> measureWord' (Seq.skip 1 curWord) curLetter (curScore + 1)
        | _, _ -> measureWord' (Seq.skip 1 curWord) curLetter curScore
    measureWord' (categoriseWord w) Empty 0

let endsWithDoubleConsonant word = 
  if word = "" then false
  else
    let categorised = categoriseWord word
    let sLen = Seq.length categorised
    let preLast = Seq.nth (sLen - 2) categorised

    match preLast with
    | Conso c -> 
        let last = Seq.nth ( sLen - 1) categorised
        match last with
        | Conso d when c = d -> true
        | _ -> false
    | _ -> false
 
let endsWithOCondition word = 
  if word = "" then false
  else
    let categorised = categoriseWord word
    let len = Seq.length categorised
    if len < 3 then false
    else
      let seqMaxIndex = len - 1
      let prePrelast = Seq.nth (seqMaxIndex - 2)  categorised
      match prePrelast with
      | Conso c ->
        let preLast = Seq.nth (seqMaxIndex - 1) categorised
        match preLast with
        | Vowel _->
          let last = Seq.nth seqMaxIndex categorised
          match last with
          | Conso d -> ( d <> 'w' && d <> 'x' && d <> 'y' )
          | _ -> false
        | _ -> false
      | _ -> false


let porterStem (word : string) =
  
  let len (s : string) = s.Length
  
  let substr (w : string) i = w.Substring(0, len w - i)

  let (|E|_|) (n,r) (h : string) = 
    if h.EndsWith n then Some(h,n,r) else None
    
  let (|LT|_|) i (w : string)  =
    if len w < i then Some()
    else None
    
  let replaceLast (w : string) (s : string) r = 
    let place = w.LastIndexOf s
    w.Remove(place, len s).Insert(place,r)
    
  let replaceStr (word,(a : string),b) k =
    if measureWord (substr word a.Length) > k
      then replaceLast word a b
    else word 
      
  let step1A : step = function
      LT 3 as w -> w
    | E ("sses", "ss") r | E ("ies" , "i" ) r
    | E ("ss"  , "ss") r | E ("s"   ,  "" ) r 
        -> replaceStr r 0
    | e -> e
          
  (* parameters everywhere *)
  let step1B word =

    let secondLast (w : string) =
      let endIfO w = 
        if endsWithOCondition w && measureWord w = 1 
          then w ^ "e" 
          else w
      
      let endIfD w = endsWithDoubleConsonant w
      match Seq.nth (len w - 1) w with
      | 'l' | 's' | 'z' -> w
      | _ -> if endIfD w then substr w 1
                         else w

    let step1Bx : step = function
        LT 4 as w -> w
      | E ("at" , "ate") r 
      | E ("bl" , "ble") r 
      | E ("iz" , "ize") r -> replaceStr r -1
      | w                  -> secondLast w

    let c w =
      let (word,_,_) = w
      let (_,s,_) = w
      if containsVowel (substr word (len s) )
        then substr word (len s) 
      else word
      
    match word with
      E ("eed" , "ee") r -> replaceStr r 0
    | E ("ed"  , "")   r -> c r |> step1Bx
    | E ("ing" , "")   r -> c r |> step1Bx
    | w -> w
    
  let step1C : step = function
      LT 4 as w -> w 
    | w when containsVowel (substr w 0) && w.EndsWith "y" -> replaceLast w "y" "i"
    | w ->   w
    
  let step2 : step = function 
      LT 4 as w -> w
    | E  ("ational", "ate" ) r | E  ("tional" , "tion") r
    | E  ("enci"   , "ence") r | E  ("anci"   , "ance") r
    | E  ("izer"   , "ize" ) r | E  ("bli"    , "ble" ) r
    | E  ("alli"   , "al"  ) r | E  ("entli"  , "ent" ) r
    | E  ("eli"    , "e"   ) r | E  ("ousli"  , "ous" ) r
    | E  ("ization", "ize" ) r | E  ("ation"  , "ate" ) r
    | E  ("ator"   , "ate" ) r | E  ("alism"  , "al"  ) r
    | E  ("iveness", "ive" ) r | E  ("fulness", "ful" ) r
    | E  ("ousness", "ous" ) r | E  ("aliti"  , "al"  ) r
    | E  ("iviti"  , "ive" ) r | E  ("biliti" , "ble" ) r
    | E  ("logi"   , "log" ) r -> replaceStr r 0
    | w                        -> w
    
  let step3 : step = function
      LT 3 as w -> w
    | E ("icate", "ic") r | E ("ative", ""  ) r
    | E ("alize", "al") r | E ("iciti", "ic") r
    | E ("ical" , "ic") r | E ("ful"  , ""  ) r
    | E ("ness" , ""  ) r -> replaceStr r 0
    | w                   -> w
        
  let step4 : step = function
      LT 3 as w -> w
    | E ("al"   , "") r | E ("ance", "") r  
    | E ("ence" , "") r | E ("er"  , "") r 
    | E ("ic"   , "") r | E ("able", "") r  
    | E ("ible" , "") r | E ("ant" , "") r  
    | E ("ement", "") r | E ("ment", "") r  
    | E ("ion"  , "") r | E ("ism" , "") r 
    | E ("ou"   , "") r | E ("iti" , "") r  
    | E ("ate"  , "") r | E ("ive" , "") r 
    | E ("ous"  , "") r | E ("ize" , "") r 
    | E ("ise"  , "") r -> replaceStr r 1
    | w                 -> w

  let step5 : step = function
      LT 2 as w -> w
    | E ("e", "e") r as w -> 
      let s = substr w 1
      let o = endsWithOCondition s
      if measureWord s > 1 || measureWord s = 1 && not o
        then replaceLast w "e" "" 
      else w
    | w -> if w.EndsWith "l" && measureWord (substr w 1) > 1 
           then substr w 1 
           else w
      
  word |> step1A |> step1B |> step1C |> step2 |> step3 |> step4 |> step5

let ps word = 
  match word with
  | "" -> None
  | w when w.Length < 2  -> Some w
  | w when isVowelOnly word -> Some w
  | w -> 
    match (w.ToLower()) with
      wl when wl.Length = 2 && wl.[1] = 'y' && isVowel wl.[0] -> Some wl
    | w -> Some (porterStem w)
    
let test = "Dumplings are cooked balls of dough. They are based on flour, potatoes or bread, and may include meat, fish, vegetables, or sweets. They may be cooked by boiling, steaming, simmering, frying, or baking. They may have a filling, or there may be other ingredients mixed into the dough. Dumplings may be sweet or spicy. They can be eaten by themselves, in soups or stews, with gravy, or in any other way. While some dumplings resemble solid water boiled doughs, such as gnocchi, others such as wontons resemble meatballs with a thin dough covering."    
test.Replace(",","").Replace(".","").Split ' ' |> Seq.map ps |> Seq.iter (fun x -> printf "%s " x.Value)
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = System.String

Full name: Microsoft.FSharp.Core.string
type Letter =
  | Empty
  | Vowel of char
  | Conso of char

Full name: Script.Letter
union case Letter.Empty: Letter
union case Letter.Vowel: char -> Letter
Multiple items
val char : value:'T -> char (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.char

--------------------
type char = System.Char

Full name: Microsoft.FSharp.Core.char
union case Letter.Conso: char -> Letter
val letterToString : _arg1:Letter -> string

Full name: Script.letterToString
val v : char
val c : char
val isVowel : v:char -> bool

Full name: Script.isVowel
module Seq

from Microsoft.FSharp.Collections
val exists : predicate:('T -> bool) -> source:seq<'T> -> bool

Full name: Microsoft.FSharp.Collections.Seq.exists
val isVowelOnly : s:seq<char> -> bool

Full name: Script.isVowelOnly
val s : seq<char>
val forall : predicate:('T -> bool) -> source:seq<'T> -> bool

Full name: Microsoft.FSharp.Collections.Seq.forall
active pattern result C: unit
active pattern result V: unit
val s : char
active pattern result V: char -> Choice<char,char>
active pattern result C: char -> Choice<char,char>
val categoriseWord : w:seq<char> -> seq<Letter>

Full name: Script.categoriseWord
val w : seq<char>
val idx : ('a -> seq<'a> -> int) (requires equality)
val p : 'a (requires equality)
val s : seq<'a> (requires equality)
val findIndex : predicate:('T -> bool) -> source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.findIndex
val nth : (char -> char)
val nth : index:int -> source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.nth
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
active recognizer V: char -> Choice<char,char>

Full name: Script.( |C|V| )
val containsVowel : s:seq<char> -> bool

Full name: Script.containsVowel
val measureWord : w:seq<char> -> int

Full name: Script.measureWord
val length : source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.length
val measureWord' : (seq<Letter> -> Letter -> int -> int)
val curWord : seq<Letter>
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = System.Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
val previousLetter : Letter
val curScore : int
val isEmpty : source:seq<'T> -> bool

Full name: Microsoft.FSharp.Collections.Seq.isEmpty
val curLetter : Letter
val head : source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.head
val skip : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.skip
val endsWithDoubleConsonant : word:string -> bool

Full name: Script.endsWithDoubleConsonant
val word : string
val categorised : seq<Letter>
val sLen : int
val preLast : Letter
val last : Letter
val d : char
val endsWithOCondition : word:string -> bool

Full name: Script.endsWithOCondition
val len : int
val seqMaxIndex : int
val prePrelast : Letter
val porterStem : word:string -> string

Full name: Script.porterStem
val len : (string -> int)
val s : string
property System.String.Length: int
val substr : (string -> int -> string)
val w : string
val i : int
System.String.Substring(startIndex: int) : string
System.String.Substring(startIndex: int, length: int) : string
active pattern result E: unit
val n : string
val r : 'a
val h : string
System.String.EndsWith(value: string) : bool
System.String.EndsWith(value: string, comparisonType: System.StringComparison) : bool
System.String.EndsWith(value: string, ignoreCase: bool, culture: System.Globalization.CultureInfo) : bool
union case Option.Some: Value: 'T -> Option<'T>
union case Option.None: Option<'T>
val replaceLast : (string -> string -> string -> string)
val r : string
val place : int
System.String.LastIndexOf(value: string) : int
System.String.LastIndexOf(value: char) : int
System.String.LastIndexOf(value: string, comparisonType: System.StringComparison) : int
System.String.LastIndexOf(value: string, startIndex: int) : int
System.String.LastIndexOf(value: char, startIndex: int) : int
System.String.LastIndexOf(value: string, startIndex: int, comparisonType: System.StringComparison) : int
System.String.LastIndexOf(value: string, startIndex: int, count: int) : int
System.String.LastIndexOf(value: char, startIndex: int, count: int) : int
System.String.LastIndexOf(value: string, startIndex: int, count: int, comparisonType: System.StringComparison) : int
System.String.Remove(startIndex: int) : string
System.String.Remove(startIndex: int, count: int) : string
val replaceStr : (string * string * string -> int -> string)
val a : string
val b : string
val k : int
val step1A : step
type step = string -> string

Full name: Script.step
active recognizer LT: int -> string -> unit option
active recognizer E: string * 'a -> string -> (string * string * 'a) option
val r : string * string * string
val e : string
val step1B : (string -> string)
val secondLast : (string -> string)
val endIfO : (string -> string)
val endIfD : (string -> bool)
val step1Bx : step
val c : (string * string * 'a -> string)
val w : string * string * 'a
val step1C : step
val step2 : step
val step3 : step
val step4 : step
val step5 : step
val o : bool
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val ps : word:string -> string option

Full name: Script.ps
System.String.ToLower() : string
System.String.ToLower(culture: System.Globalization.CultureInfo) : string
val wl : string
val test : string

Full name: Script.test
System.String.Replace(oldValue: string, newValue: string) : string
System.String.Replace(oldChar: char, newChar: char) : string
val iter : action:('T -> unit) -> source:seq<'T> -> unit

Full name: Microsoft.FSharp.Collections.Seq.iter
val x : string option
val printf : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printf
property Option.Value: string
Next Version Raw view Test code New version

More information

Link:http://fssnip.net/bl
Posted:5 years ago
Author:David Klein
Tags: stemming , word stem , porter stemmer