7 people like it.

Porter Stemmer

Porter Stemmer, please contribute fixes, suggestions, this is merely a syntactic rewrite, I believe though there are some slight mistakes. http://tartarus.org/~martin/PorterStemmer/ ,fixed up endswithdoubleconsonant and endswithocondition

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
182: 
183: 
184: 
185: 
186: 
187: 
188: 
189: 
190: 
191: 
192: 
193: 
194: 
195: 
196: 
197: 
198: 
199: 
200: 
201: 
202: 
203: 
204: 
205: 
206: 
207: 
208: 
209: 
210: 
211: 
212: 
213: 
214: 
215: 
216: 
217: 
218: 
219: 
220: 
221: 
222: 
(* Origionally from https://blogs.msdn.com/b/christianb/archive/2011/06/24/a-porter-stemmer-in-f.aspx*)
open System

type Letter = 
    Empty
  | Vowel of char
  | Conso of char
    
let isVowel v             = Seq.exists ((=) v) "aeiouAEIOU"
let isVowelOnly s         = Seq.forall isVowel s
let isConsonant c         = not (isVowel c)

let length (s : string)   = s.Length
let containsVowel s       = Seq.exists isVowel s
let substr (w : string) i = w.Substring(0, length w - i)

let categoriseWord w = 

  let (|C|V|) s = if isVowel s then V s else C s

  let idx p = Seq.findIndex ((=) p)
  let nth s = Seq.nth (idx s w - 1) w

  let cat s =
    if isVowel s then Vowel s 
    else
      if idx s w = 0 then Conso s
      elif s = 'y' || s = 'Y' then 
        match nth s with
          V _ -> Conso s
        | s   -> Vowel s
      else Conso s

  w |> Seq.map cat 

(* Fix below 3 functions at some stage, they work just seems there is 5 times more code than necess. *)
let measureWord w = 
  if Seq.length w < 2 then 0
  else 
    let rec measureWord' (curWord:Letter seq) (previousLetter:Letter) curScore = 
      if (Seq.isEmpty curWord) then curScore
      else
        let curLetter  = Seq.head curWord
        let skipLetter = Seq.skip 1 curWord
        match previousLetter, curLetter with
          Vowel _, Conso _ -> measureWord' skipLetter curLetter (curScore + 1)
        | _, _ -> measureWord' skipLetter curLetter curScore
    measureWord' (categoriseWord w) Empty 0

let endsWithDoubleConsonant (word : string) = 
  let length = length word
  let first,second = word.[length - 2], word.[length - 1]

  length > 2 && isConsonant first && first = second

let endsWithOCondition (word : string) = 
  let length = length word
  let lastLength = length - 1

  if length < 3 then false else
      
    isConsonant word.[lastLength]     &&
    isVowel     word.[lastLength - 1] &&
    isConsonant word.[lastLength - 2] &&
    Seq.exists ((<>) word.[lastLength]) "wxy"

let step1A_subs = 
  [ "sses", "ss"; "ies" , "i"
    "ss"  , "ss"; "s"   , "" ] 

let step1B_subs =
  [ "at", "ate"; "bl","ble"
    "iz", "ize"]

let step1B2_subs = 
  [ "eed", "ee" ]
  
let step2_subs =
  [ "ational", "ate" ;  "tional" , "tion" 
    "enci"   , "ence";  "anci"   , "ance" 
    "izer"   , "ize" ;  "bli"    , "ble"  
    "alli"   , "al"  ;  "entli"  , "ent"  
    "eli"    , "e"   ;  "ousli"  , "ous"  
    "ization", "ize" ;  "ation"  , "ate"  
    "ator"   , "ate" ;  "alism"  , "al"   
    "iveness", "ive" ;  "fulness", "ful"  
    "ousness", "ous" ;  "aliti"  , "al"  
    "iviti"  , "ive" ;  "biliti" , "ble" 
    "logi"   , "log" ]

let step3_subs = 
  [ "icate", "ic"; "ative", ""  
    "alize", "al"; "iciti", "ic"
    "ical" , "ic"; "ful"  , "" 
    "ness" , "" ]

let step4_subs =
  [ "al"   , ""; "ance", "" 
    "ence" , ""; "er"  , ""
    "ic"   , ""; "able", "" 
    "ible" , ""; "ant" , ""
    "ement", ""; "ment", "" 
    "ion"  , ""; "ism" , ""
    "ou"   , ""; "iti" , "" 
    "ate"  , ""; "ive" , ""
    "ous"  , ""; "ize" , ""
    "ise"  , "" ]

let stem (winput : string) =

  let replaceLast (w : string) (s : string) r = 
    let place = w.LastIndexOf s in
      if place >= 0 then 
        w.Remove(place, length s).Insert(place,r)
      else w

  let findFirstReplace (word : string) n pList =
    let cand = 
      pList 
      |> Seq.tryFind (fun (e,b) -> word.EndsWith e)
    if cand <> None then
       cand |> Option.get 
            |> fun (e,r) -> 
               if measureWord (substr word e.Length) > n
                 then replaceLast word e r
               else word
    else word

  let step1A (word : string) =
    findFirstReplace word 0 step1A_subs

  let step1B (word : string) =

    let step1Bx (w : string) =
      findFirstReplace w -1 step1B_subs

    let sub w n =
      if containsVowel (substr w n)
      then substr w n else w

    match word with
    | w when w.EndsWith "eed" -> findFirstReplace word 0 step1B2_subs
    | w when w.EndsWith "ed"  -> sub word 2
    | w when w.EndsWith "ing" -> sub word 3
    | w ->   w
  
  let step1C (w : string) =
    if containsVowel (substr w 0) && w.EndsWith "y" then
      replaceLast w "y" "i"
    else w

  let step2 (word : string) =
    findFirstReplace word 0 step2_subs

  let step3 (word : string) = 
    findFirstReplace word 0 step3_subs

  let step4 (word : string) =
    findFirstReplace word 1 step4_subs

  let step5 (word : string) = 

    if word.EndsWith "e" then
      let s = substr word 1
      if word.EndsWith "l" && measureWord s > 1 
        then s
      else
        let o = endsWithOCondition s
        if measureWord s > 1 || measureWord s = 1 && not o
          then replaceLast word "e" ""
        else word
    else word

  let c word len' f = 
    if length word < len' then word 
    else 
      let ret = f word
      if length ret > 0 then ret else word
  
  let rec step w n = 
    let s b f = step (c w b f) (n+1)
    if n < 7 then
      match n with
      | 0 -> s 3 step1A 
      | 1 -> s 0 step1B
      | 2 -> s 4 step1C 
      | 3 -> s 4 step2 
      | 4 -> s 3 step3
      | 5 -> s 3 step4 
      | 6 -> s 2 step5
      | _ -> ""
    else w
  step winput 0

let ps (winput : string) =
  if length winput < 2 || isVowelOnly winput then 
    Some winput
  else 
    let wl = winput.ToLower()
    if wl.[1] = 'y' && isVowel wl.[0] then 
      Some wl
    else Some (stem wl)

let filter (str : string) = 
  str |> Seq.filter (fun c -> Char.IsLetter c || c = ' ')
      |> Seq.map string 
      |> String.concat ""

let test = 
  "Do you really think it is weakness that yields to temptation? \
  I tell you that there are terrible temptations which it requires strength, \ 
  strength and courage to yield to"

(test |> filter).Split ' '
|> Array.filter ((<>) "")
|> Array.map ps 
|> Array.iter (fun x -> printf "%s " x.Value)

//"do you realli think it is weak that yield to temptat I tell you that there ar terribl temptat which it requir strength strength and courag to yield to"
// test below taken from http://qaa.ath.cx/porter_js_demo.html, for closer accuracy we should strip non a-z

//"Do you realli think it is weak that yield to temptat I tell you that there ar terribl temptat which it requir strength strength and courag to yield to"
namespace System
type Letter =
  | Empty
  | Vowel of char
  | Conso of char

Full name: Script.Letter
union case Letter.Empty: Letter
union case Letter.Vowel: char -> Letter
Multiple items
val char : value:'T -> char (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.char

--------------------
type char = Char

Full name: Microsoft.FSharp.Core.char
union case Letter.Conso: char -> Letter
val isVowel : v:char -> bool

Full name: Script.isVowel
val v : char
module Seq

from Microsoft.FSharp.Collections
val exists : predicate:('T -> bool) -> source:seq<'T> -> bool

Full name: Microsoft.FSharp.Collections.Seq.exists
val isVowelOnly : s:seq<char> -> bool

Full name: Script.isVowelOnly
val s : seq<char>
val forall : predicate:('T -> bool) -> source:seq<'T> -> bool

Full name: Microsoft.FSharp.Collections.Seq.forall
val isConsonant : c:char -> bool

Full name: Script.isConsonant
val c : char
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val length : s:string -> int

Full name: Script.length
val s : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
property String.Length: int
val containsVowel : s:seq<char> -> bool

Full name: Script.containsVowel
val substr : w:string -> i:int -> string

Full name: Script.substr
val w : string
val i : int
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
val categoriseWord : w:seq<char> -> seq<Letter>

Full name: Script.categoriseWord
val w : seq<char>
active pattern result C: unit
active pattern result V: unit
val s : char
active pattern result V: char -> Choice<char,char>
active pattern result C: char -> Choice<char,char>
val idx : ('a -> seq<'a> -> int) (requires equality)
val p : 'a (requires equality)
val findIndex : predicate:('T -> bool) -> source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.findIndex
val nth : (char -> char)
val nth : index:int -> source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.nth
val cat : (char -> Letter)
active recognizer V: char -> Choice<char,char>
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
val measureWord : w:seq<char> -> int

Full name: Script.measureWord
val length : source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.length
val measureWord' : (seq<Letter> -> Letter -> int -> int)
val curWord : seq<Letter>
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
val previousLetter : Letter
val curScore : int
val isEmpty : source:seq<'T> -> bool

Full name: Microsoft.FSharp.Collections.Seq.isEmpty
val curLetter : Letter
val head : source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.head
val skipLetter : seq<Letter>
val skip : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.skip
val endsWithDoubleConsonant : word:string -> bool

Full name: Script.endsWithDoubleConsonant
val word : string
val length : int
val first : char
val second : char
val endsWithOCondition : word:string -> bool

Full name: Script.endsWithOCondition
val lastLength : int
val step1A_subs : (string * string) list

Full name: Script.step1A_subs
val step1B_subs : (string * string) list

Full name: Script.step1B_subs
val step1B2_subs : (string * string) list

Full name: Script.step1B2_subs
val step2_subs : (string * string) list

Full name: Script.step2_subs
val step3_subs : (string * string) list

Full name: Script.step3_subs
val step4_subs : (string * string) list

Full name: Script.step4_subs
val stem : winput:string -> string

Full name: Script.stem
val winput : string
val replaceLast : (string -> string -> string -> string)
val r : string
val place : int
String.LastIndexOf(value: string) : int
String.LastIndexOf(value: char) : int
String.LastIndexOf(value: string, comparisonType: StringComparison) : int
String.LastIndexOf(value: string, startIndex: int) : int
String.LastIndexOf(value: char, startIndex: int) : int
String.LastIndexOf(value: string, startIndex: int, comparisonType: StringComparison) : int
String.LastIndexOf(value: string, startIndex: int, count: int) : int
String.LastIndexOf(value: char, startIndex: int, count: int) : int
String.LastIndexOf(value: string, startIndex: int, count: int, comparisonType: StringComparison) : int
String.Remove(startIndex: int) : string
String.Remove(startIndex: int, count: int) : string
val findFirstReplace : (string -> int -> seq<string * string> -> string)
val n : int
val pList : seq<string * string>
val cand : (string * string) option
val tryFind : predicate:('T -> bool) -> source:seq<'T> -> 'T option

Full name: Microsoft.FSharp.Collections.Seq.tryFind
val e : string
val b : string
String.EndsWith(value: string) : bool
String.EndsWith(value: string, comparisonType: StringComparison) : bool
String.EndsWith(value: string, ignoreCase: bool, culture: Globalization.CultureInfo) : bool
union case Option.None: Option<'T>
module Option

from Microsoft.FSharp.Core
val get : option:'T option -> 'T

Full name: Microsoft.FSharp.Core.Option.get
val step1A : (string -> string)
val step1B : (string -> string)
val step1Bx : (string -> string)
val sub : (string -> int -> string)
val step1C : (string -> string)
val step2 : (string -> string)
val step3 : (string -> string)
val step4 : (string -> string)
val step5 : (string -> string)
val o : bool
val c : (string -> int -> (string -> string) -> string)
val len' : int
val f : (string -> string)
val ret : string
val step : (string -> int -> string)
val s : (int -> (string -> string) -> string)
val b : int
val ps : winput:string -> string option

Full name: Script.ps
union case Option.Some: Value: 'T -> Option<'T>
val wl : string
String.ToLower() : string
String.ToLower(culture: Globalization.CultureInfo) : string
val filter : str:string -> string

Full name: Script.filter
val str : string
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.filter
type Char =
  struct
    member CompareTo : value:obj -> int + 1 overload
    member Equals : obj:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member GetTypeCode : unit -> TypeCode
    member ToString : unit -> string + 1 overload
    static val MaxValue : char
    static val MinValue : char
    static member ConvertFromUtf32 : utf32:int -> string
    static member ConvertToUtf32 : highSurrogate:char * lowSurrogate:char -> int + 1 overload
    static member GetNumericValue : c:char -> float + 1 overload
    ...
  end

Full name: System.Char
Char.IsLetter(c: char) : bool
Char.IsLetter(s: string, index: int) : bool
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
val concat : sep:string -> strings:seq<string> -> string

Full name: Microsoft.FSharp.Core.String.concat
val test : string

Full name: Script.test
type Array =
  member Clone : unit -> obj
  member CopyTo : array:Array * index:int -> unit + 1 overload
  member GetEnumerator : unit -> IEnumerator
  member GetLength : dimension:int -> int
  member GetLongLength : dimension:int -> int64
  member GetLowerBound : dimension:int -> int
  member GetUpperBound : dimension:int -> int
  member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
  member Initialize : unit -> unit
  member IsFixedSize : bool
  ...

Full name: System.Array
val filter : predicate:('T -> bool) -> array:'T [] -> 'T []

Full name: Microsoft.FSharp.Collections.Array.filter
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.map
val iter : action:('T -> unit) -> array:'T [] -> unit

Full name: Microsoft.FSharp.Collections.Array.iter
val x : string option
val printf : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printf
property Option.Value: string

More information

Link:http://fssnip.net/bl
Posted:5 years ago
Author:David Klein
Tags: stemming , word stem , porter stemmer