module Stemmer
open System

let (|BaseVowel|_|) a = match Char.ToLower(a) with 'a' | 'e' | 'i' | 'o' | 'u' -> Some a | _ -> None

let rec (|Consonant|_|) xs =
    match xs with
    | [] -> None
    | a::_ when Char.IsWhiteSpace(a) -> None
    | BaseVowel a::_ -> None
    | 'y'::Consonant _ | 'Y'::Consonant _-> None
    | x::rest -> Some(x,rest)
and (|Vowel|_|) xs = 
    match xs with
    | [] -> None 
    | a::_ when Char.IsWhiteSpace(a) -> None
    | Consonant _ -> None 
    | x::rest -> Some(x,rest)

let (|C|_|) xs = 
    let rec loop xs acc =
        match xs with
        | Consonant (x,xs) -> loop xs (x::acc) 
        | _ -> match acc with [] -> None | _ -> Some(acc|>List.rev,xs)
    loop xs []

let (|V|_|) xs  = 
    let rec loop xs acc =
        match xs with
        | Vowel (x,xs) -> loop xs (x::acc) 
        | _ -> match acc with [] -> None | _ -> Some(acc|>List.rev,xs)
    loop xs []

let (|VC|_|) xs  =
    let rec loop xs acc =
        match xs with 
        | [] -> match acc with [] -> None | _ -> Some(acc,xs)
        |  a::_ when Char.IsWhiteSpace(a) -> match acc with [] -> None | _ -> Some(acc,xs)
        | C (cs, V (vs,xs)) -> loop xs ((vs,cs)::acc)
        | C (cs, xs) -> loop xs (([],cs)::acc)
        | V (vs, xs) -> loop xs ((vs,[])::acc)
        | _ -> None
    loop xs []

let calcMeasure vs = (0,vs) ||> List.fold (fun c (vs,cs) -> match vs,cs with [],_ | _,[] -> c | _ -> c+1)

let (|Ends|_|) stem xs  = 
    let rec loop xs ys =
        match xs,ys with
        | [],[] -> Some(xs)
        | [],_ -> None
        | _, [] -> Some(xs)
        | a::xs,b::ys when a = b || Char.ToLower(a) = Char.ToLower(b) -> loop xs ys
        | _ -> None
    loop xs stem

let ContainsVowel = List.exists ((|BaseVowel|_|) >> Option.isSome)

let (|EndsWithDoubleC|_|) xs =
    match xs with
    | Consonant (a, Consonant (b,xs)) when a = b -> Some (a,xs)
    | _ -> None

let (|EndsWithCVC|_|) xs =
    match xs with
    | Consonant (a, Vowel (_, Consonant (_,xs))) ->
        match a with 
        | 'w' | 'x' | 'y' | 'W' | 'X' | 'Y' -> None
        | _ -> Some(xs)
    | _ -> None
let (|NotEndsWithCVC|_|) xs = match xs with EndsWithCVC (_) -> None | _ -> Some(xs)

let (|Measure|_|) xs = match xs with VC (cvs,rest) -> calcMeasure cvs |> Some | _ -> None

let Measure xs = match xs with Measure (m) -> m | _  -> 0

let step1a xs =
    match xs with
    | Ends ['s';'e';'s';'s'] (rest) -> 's'::'s'::rest
    | Ends ['s';'e';'i'] (rest) -> 'i'::rest
    | 's'::'s'::rest -> xs
    | 's'::rest -> rest
    | _ -> xs

let contains y xs  = xs |> List.exists (fun x -> x=y)

let step1bTx xs =
    match xs with
    | Ends ['t';'a'] _ -> 'e'::xs
    | Ends ['l'; 'b'] _ -> 'e'::xs
    | Ends ['z'; 'i'] _ -> 'e'::xs
    | EndsWithDoubleC (x,rest) when ['l';'s';'z'] |> contains x |> not -> x::rest
    | EndsWithCVC (_) & Measure (m) when m = 1 -> 'e'::xs
    | _ -> xs

let step1b xs =
    match xs with
    | Ends ['d';'e';'e'] rest -> if Measure rest > 0 then 'e'::'e':: rest else xs
    | Ends ['d';'e'] rest when ContainsVowel rest -> rest |> step1bTx 
    | Ends ['g';'n';'i'] rest when ContainsVowel rest -> rest |> step1bTx 
    | _ -> xs

let step1c xs = 
    match xs with 
    | Ends ['y'] rest when ContainsVowel rest -> 'i'::rest
    | _ -> xs 

let step2 xs =
    match xs with
    | Ends ['l';'a';'n';'o';'i';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest 
    | Ends ['l';'a';'n';'o';'i';'t'] rest when Measure rest > 0 -> 'n'::'o'::'i'::'t'::rest 
    | Ends ['i';'c';'n';'e'] rest when Measure rest > 0 -> 'e'::'c'::'n'::'e'::rest 
    | Ends ['i';'c';'n';'a'] rest when Measure rest > 0 -> 'e'::'c'::'n'::'a'::rest 
    | Ends ['r';'e';'z';'i'] rest when Measure rest > 0 -> 'e'::'z'::'i'::rest 
    | Ends ['i';'l';'b'] rest when Measure rest > 0 -> 'e'::'l'::'b'::rest 
    | Ends ['i';'l';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['i';'l';'t';'n';'e'] rest when Measure rest > 0 -> 't'::'n'::'e'::rest 
    | Ends ['i';'l';'e'] rest when Measure rest > 0 -> 'e'::rest 
    | Ends ['i';'l';'s';'u';'o'] rest when Measure rest > 0 -> 's'::'u'::'o'::rest 
    | Ends ['n';'o';'i';'t';'a';'z';'i'] rest when Measure rest > 0 -> 'e'::'z'::'i'::rest 
    | Ends ['n';'o';'i';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest 
    | Ends ['r';'o';'t';'a'] rest when Measure rest > 0 -> 'e'::'t'::'a'::rest 
    | Ends ['m';'z';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['s';'s';'e';'n';'e';'v';'i'] rest when Measure rest > 0 -> 'e'::'v'::'i'::rest 
    | Ends ['s';'s';'e';'n';'l';'u';'f'] rest when Measure rest > 0 -> 'l'::'u'::'f'::rest 
    | Ends ['s';'s';'e';'n';'s';'u';'o'] rest when Measure rest > 0 -> 's'::'u'::'o'::rest 
    | Ends ['i';'t';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['i';'t';'i';'v';'i'] rest when Measure rest > 0 -> 'e'::'v'::'i'::rest 
    | Ends ['i';'t';'i';'l';'i';'b'] rest when Measure rest > 0 -> 'e'::'l'::'b'::rest 
    | Ends ['i';'g';'o';'l'] rest when Measure rest > 0 -> 'g'::'o'::'l'::rest 
    | _ -> xs

let step3 xs =
    match xs with
    | Ends ['e';'t';'a';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest 
    | Ends ['e';'v';'i';'t';'a'] rest when Measure rest > 0 -> rest 
    | Ends ['e';'z';'i';'l';'a'] rest when Measure rest > 0 -> 'l'::'a'::rest 
    | Ends ['i';'t';'i';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest 
    | Ends ['l';'a';'c';'i'] rest when Measure rest > 0 -> 'c'::'i'::rest 
    | Ends ['l';'u';'f'] rest when Measure rest > 0 -> rest 
    | Ends ['s';'s';'e';'n'] rest when Measure rest > 0 -> rest 
    | _ -> xs

let step4 xs =
    match xs with
    | Ends ['l';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'c';'n';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'c';'n';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['r';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['c';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'l';'b';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'l';'b';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'e';'m';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'e';'m'] rest when Measure rest > 1 -> rest 
    | Ends ['t';'n';'e'] rest when Measure rest > 1 -> rest 
    | Ends ['n';'o';'i'] rest when 
        match rest with 
        | Measure (m) & (Ends ['s'] (_) | Ends ['t'] (_)) when m > 1 -> true
        | _ -> false
        -> rest 
    | Ends ['u';'o'] rest when Measure rest > 1 -> rest 
    | Ends ['m';'s';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'t';'a'] rest when Measure rest > 1 -> rest 
    | Ends ['i';'t';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['s';'u';'o'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'v';'i'] rest when Measure rest > 1 -> rest 
    | Ends ['e';'z';'i'] rest when Measure rest > 1 -> rest 
    | _ -> xs

let step5a xs =
    match xs with
    | Ends ['e'] rest when Measure rest > 1 -> rest
    | Ends ['e'] rest when 
        match rest with
        | NotEndsWithCVC (_) & Measure (m) when m = 1 -> true
        | _ -> false
        -> rest
    | _ -> xs

let step5b xs =
    match xs with
    | EndsWithDoubleC ('l',rest) & Measure (m) when m > 1 -> 'l'::rest
    | _ -> xs

let stemCsRev = step1a >> step1b >> step1c >> step2 >> step3 >> step4 >> step5a >> step5b
let stemCs = List.rev >> stemCsRev >> List.rev
let stem (s:string) = System.String(s.ToLower().ToCharArray() |> Array.toList |> stemCs |> List.toArray)

(* TEST SCRIPT *)
(*
#load "Stemmer.fs"
open Stemmer

let test a b = if  a |> stem  = b then () else failwithf "%s %s" a b

//verified against NLTK implementation http://text-processing.com/demo/stem/
let testWords =
    [
        "caresses","caress"
        "ponies","poni"
        "ties","ti"
        "caress","caress"
        "cats","cat"
        "feed","feed"
        "agreed","agre"
        "plastered","plaster"
        "bled","bled"
        "motoring","motor"
        "sing","sing"
        "conflated","conflat" //"conflate"
        "troubled","troubl"
        "sized","size"
        "hopping","hop"
        "tanned","tan"
        "falling","fall"
        "hissing","hiss"
        "fizzed","fizz"
        "failing","fail"
        "filing","file"
        "relational","relat"
        "conditional","condit"
        "rational","ration"
        "valenci","valenc"
        "hesitanci","hesit"
        "digitizer","digit"
        "conformabli","conform"
        "radicalli","radic"
        "differentli","differ"
        "vileli","vile"
        "analogousli","analog"
        "vietnamizati","vietnamizati"
        "predication","predic"
        "operator","oper"
        "feudalism","feudal"
        "decisiveness","decis"
        "hopefulness","hope"
        "callousness","callous"
        "formaliti","formal"
        "sensitiviti","sensit"
        "sensibiliti","sensibl"
        "triplicate","triplic"
        "formative","form"
        "formalize","formal"
        "electriciti","electr"
        "electrical","electr"
        "hopeful","hope"
        "goodness","good"
        "revival","reviv"
        "allowance","allow"
        "inference","infer"
        "airliner","airlin"
        "gyroscopic","gyroscop"
        "adjustable","adjust"
        "defensible","defens"
        "irritant","irrit"
        "replacement","replac"
        "adjustment","adjust"
        "dependent","depend"
        "adoption","adopt"
        "homologou","homolog"
        "communism","commun"
        "activate","activ"
        "angulariti","angular"
        "homologous","homolog"
        "effective","effect"
        "bowdlerize","bowdler"
        "probate","probat"
        "rate","rate"
        "cease","ceas"
        "controll","control"
        "roll","roll"
    ]

let runTest() = 
    testWords |> List.iter (fun (a,b) -> test a b; printfn "%s -> %s" a b)
    printfn "done"
*)