2 people like it.

Sentiment Classification and Cross-Validation with ML.Net - Sample - Using Anonymous Records

ML.Net sentiment classification and cross-validation example using gradient boosted trees. Needs to be compiled in a .NET Core F# project. Uses F# 4.6 anonymous records, which work well with the ML.Net static API.

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
module SampleModelP
open System
open Microsoft.ML
open Microsoft.ML.StaticPipe
open Microsoft.ML.Data.IO
open System.IO

(*
Contrast two snippets of code to highlight the usefulness of
F# anonymous records.

Equivalent code, based on F# 4.5 syntax, is here: http://fssnip.net/7VS

The marker '//***' is used to explain anonymous record usage
in the comments below.

*)

module Train =
    (* 
    Data:
        download test / train datasets from here:
        https://github.com/dotnet/machinelearning/blob/master/test/data/wikipedia-detox-250-line-data.tsv
    *)

    //*** change this for your run
    // Path of the training file that trainModel reads.
    // The loader in trainModel is configured for a tab-separated file with a
    // header row, a boolean label in column 0 and the text in column 1.
    let dataPath = @"C:\s\repodata\hpto\train.txt" 

    //Cross-validate a FastTree binary classifier built from the supplied
    //hyperparameters and print the mean AUC over the folds.
    //  hp      - anonymous record with the tree count, leaf count and learning rate
    //  returns - anonymous record {| Metric; Model; Data |} holding the mean AUC,
    //            the (not yet fitted) estimator pipeline and the loaded data
    let trainModel (hp:{|Trees:int; Leaves:int; LearningRate:float |}) = //*** anonymous record as function argument

        let mlContext = MLContext(Nullable 10)

        //*** the row shape of the loader is an anonymous record with labeled fields;
        //    column 0 is the boolean label, column 1 is the text
        let loader =
            TextLoaderStatic.CreateReader(
                mlContext,
                (fun (c:TextLoaderStatic.Context) ->
                    {| Label = c.LoadBool(0); Text = c.LoadText(1) |}),
                separator = '\t',
                hasHeader = true)

        let trainData = loader.Read(dataPath)

        let pipeline =
            (loader :> SchemaBearing<_>).MakeNewEstimator()     //*** the upcast is required: inference does not work without it
                .Append(fun row ->
                    //*** fields of the incoming row are reached by name (no tuple deconstruction)
                    let textFeatures = row.Text.FeaturizeText()
                    let prediction =
                        mlContext.BinaryClassification.Trainers.FastTree(
                            row.Label,
                            textFeatures,
                            numTrees = hp.Trees,
                            numLeaves = hp.Leaves,
                            learningRate = hp.LearningRate,
                            minDatapointsInLeaves = 20)
                    //*** copy-and-extend: the row gains Features and Score fields
                    {| row with Features = textFeatures; Score = prediction |})

        //5-fold cross-validation; the printed mean AUC is the gauge of model quality
        let foldResults =
            mlContext.BinaryClassification.CrossValidate(
                trainData, pipeline, (fun r -> r.Label), numFolds = 5)

        let meanAuc =
            foldResults
            |> Seq.averageBy (fun struct (foldMetrics, _, _) -> foldMetrics.Auc)

        printfn "trees=%d, leaves=%d, lr=%f -> %f" hp.Trees hp.Leaves hp.LearningRate meanAuc

        //*** everything the caller needs for prediction, returned as an anonymous record
        {| Metric = meanAuc; Model = pipeline; Data = trainData |}


    //Train with a fixed set of hyperparameters, fit the pipeline, score the
    //training data and write selected prediction columns to a text file.
    let pred() =

        //*** hyperparameters are passed as an anonymous record.
        //Vary #trees, #leaves and the learning rate to improve the results, or
        //use a hyperparameter optimization framework, e.g. https://github.com/fwaris/hpopt
        let trained = trainModel {|Trees=50; Leaves=50; LearningRate=0.1 |}

        //fit the pipeline to the data with the chosen hyperparameters
        let fitted = trained.Model.Fit(trained.Data)

        //score the same data for now; in practice predictions would more likely
        //be produced a record at a time as new data becomes available
        let scored = fitted.Transform(trained.Data)

        //text saver that writes both the header and the schema
        let saverContext = MLContext(Nullable 10)
        let saverArgs = TextSaver.Arguments()
        saverArgs.OutputHeader <- true
        saverArgs.OutputSchema <- true
        let saver = TextSaver(saverContext, saverArgs)
        use outFile = File.Create(@"C:\repodata\hpopt\t1.txt")

        //*** *very useful*: the field names of the anonymous records used in
        //    the pipeline are preserved in the schema of the data view
        let dynamicView = scored.AsDynamic
        let schema = dynamicView.Schema

        //print every column name with its type; output of the original run:
        //  "Label" - Bool
        //  "Text" - Text
        //  "Features" - Vec<R4, 9141>
        //  "PredictedLabel" - Bool
        //  "Score" - R4
        //  "Probability" - R4
        //  "Score.Item1" - R4
        //  "Score.Item2" - R4
        //  "Score.Item3" - Bool
        for idx = 0 to schema.Count - 1 do
            let column = schema.[idx]
            printfn "%A - %A" column.Name column.Type

        //save columns 0, 3 and 4 of the schema to the text file.
        //Note: (for now) this needs the dynamic data view and column indices
        saver.SaveData(outFile, dynamicView, 0, 3, 4)
module SampleModelP
namespace System
namespace Microsoft
Multiple items
namespace System.Data

--------------------
namespace Microsoft.FSharp.Data
namespace System.IO
module Train

from SampleModelP
val dataPath : string

Full name: SampleModelP.Train.dataPath
val trainModel : 'a -> 'b -> 'c

Full name: SampleModelP.Train.trainModel
Multiple items
val int : value:'T -> int (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.int

--------------------
type int = int32

Full name: Microsoft.FSharp.Core.int

--------------------
type int<'Measure> = int

Full name: Microsoft.FSharp.Core.int<_>
Multiple items
val float : value:'T -> float (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.float

--------------------
type float = Double

Full name: Microsoft.FSharp.Core.float

--------------------
type float<'Measure> = float

Full name: Microsoft.FSharp.Core.float<_>
Multiple items
type Nullable =
  static member Compare<'T> : n1:Nullable<'T> * n2:Nullable<'T> -> int
  static member Equals<'T> : n1:Nullable<'T> * n2:Nullable<'T> -> bool
  static member GetUnderlyingType : nullableType:Type -> Type

Full name: System.Nullable

--------------------
type Nullable<'T (requires default constructor and value type and 'T :> ValueType)> =
  struct
    new : value:'T -> Nullable<'T>
    member Equals : other:obj -> bool
    member GetHashCode : unit -> int
    member GetValueOrDefault : unit -> 'T + 1 overload
    member HasValue : bool
    member ToString : unit -> string
    member Value : 'T
  end

Full name: System.Nullable<_>

--------------------
Nullable()
Nullable(value: 'T) : unit
namespace System.Text
val trainData : obj

Full name: SampleModelP.Train.trainData
val pipeline : obj

Full name: SampleModelP.Train.pipeline
val metrics : seq<obj>

Full name: SampleModelP.Train.metrics
val m : float

Full name: SampleModelP.Train.m
module Seq

from Microsoft.FSharp.Collections
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
val average : source:seq<'T> -> 'T (requires member ( + ) and member DivideByInt and member get_Zero)

Full name: Microsoft.FSharp.Collections.Seq.average
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
type File =
  static member AppendAllLines : path:string * contents:IEnumerable<string> -> unit + 1 overload
  static member AppendAllText : path:string * contents:string -> unit + 1 overload
  static member AppendText : path:string -> StreamWriter
  static member Copy : sourceFileName:string * destFileName:string -> unit + 1 overload
  static member Create : path:string -> FileStream + 3 overloads
  static member CreateText : path:string -> StreamWriter
  static member Decrypt : path:string -> unit
  static member Delete : path:string -> unit
  static member Encrypt : path:string -> unit
  static member Exists : path:string -> bool
  ...

Full name: System.IO.File
File.Create(path: string) : FileStream
File.Create(path: string, bufferSize: int) : FileStream
File.Create(path: string, bufferSize: int, options: FileOptions) : FileStream
File.Create(path: string, bufferSize: int, options: FileOptions, fileSecurity: Security.AccessControl.FileSecurity) : FileStream
type Type =
  inherit MemberInfo
  member Assembly : Assembly
  member AssemblyQualifiedName : string
  member Attributes : TypeAttributes
  member BaseType : Type
  member ContainsGenericParameters : bool
  member DeclaringMethod : MethodBase
  member DeclaringType : Type
  member Equals : o:obj -> bool + 1 overload
  member FindInterfaces : filter:TypeFilter * filterCriteria:obj -> Type[]
  member FindMembers : memberType:MemberTypes * bindingAttr:BindingFlags * filter:MemberFilter * filterCriteria:obj -> MemberInfo[]
  ...

Full name: System.Type
Raw view Test code New version

More information

Link:http://fssnip.net/7VV
Posted:5 years ago
Author:Faisal Waris
Tags: machine learning , sentiment classification , anonymous records