7 people like it.

Course 4: Decision trees and Titanic

F# introduction course - Get and read the Titanic data set using CSV type provider, define the type of "feature" and use it to classify the data and then implement a simple decision tree that can be used for writing more complex classifiers. To be used in Try F#.

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
// Load type provider for CSV files
#r "Samples.Csv.dll"
open Samples.Csv

// Address of the Titanic passenger list (a CSV file pinned to one gist revision).
// It has to be a literal so that it can be passed to the type provider.
[<Literal>]
let DataUrl =
  "https://gist.github.com/tpetricek/263fb1bee9366170b2ef/raw/90d012bac3713e8618d3ae2f83f2f6535b6bebd9/titanic.csv"

// Typed view of the CSV file; the schema spells out the type of each of
// the twelve columns explicitly instead of leaving it to inference.
type Titanic =
  CsvFile<DataUrl, Schema="int,int,int,string,string,string,int,string,string,string,string,string">

// Load the data (the rows are available as 'data.Data')
let data = new Titanic()

// ------------------------------------------------------------------
// TUTORIAL: Classifying passengers with features
// ------------------------------------------------------------------

// What is a feature? A feature is something that classifies a person
// into two groups - the result of calculating a feature on a given
// row is true or false. In F#, we can represent it as a function
// that takes a row of the Titanic data set and returns a Boolean.
type Feature = Titanic.Row -> bool

// The ": Feature" part is a type annotation. It gives the compiler a
// hint, so that it knows what the lambda's argument is and can resolve
// members such as 'Name' on it.
let longName : Feature = fun passenger -> passenger.Name.Length > 25
let lowClass : Feature = fun passenger -> passenger.Pclass = 3
let ageOver20 : Feature =
  fun passenger ->
    // Age is read as a string; rows with an empty Age never count as over 20
    if passenger.Age = "" then false
    else float passenger.Age > 20.0

// Take the first passenger of the data set and try the features on it
let first = Seq.head data.Data

first |> longName
first |> lowClass
first |> ageOver20

// TASK #1: Write features that test the following conditions:
//  * Person has more than 2 siblings
//  * Person is over 9.5 years old
//  * Person is a male

/// Calculates how common the feature is in the data set
/// (returns the fraction of passengers, between 0.0 and 1.0, for which
/// 'feature' returns true). Returns 0.0 when the data set has no rows.
let frequency feature =
  // Count the matching rows and all rows in a single pass. This avoids
  // looking up the 'true' key in a dictionary, which throws
  // KeyNotFoundException when no passenger has the feature.
  let matching, total =
    data.Data
    |> Seq.fold (fun (m, t) row -> (if feature row then m + 1 else m), t + 1) (0, 0)
  if total = 0 then 0.0
  else float matching / float total

/// Survival rates split by a feature. Returns a pair of fractions:
/// first the share of survivors among passengers who have the feature,
/// then the share of survivors among passengers who do not have it.
let surviveRate feature = 
  // Fraction of passengers with 'Survived = 1' among those matching 'predicate'
  let rateWhere predicate =
    let group = data.Data |> Seq.filter predicate
    let survivors = group |> Seq.filter (fun p -> p.Survived = 1) |> Seq.length
    float survivors / float (Seq.length group)
  rateWhere feature, rateWhere (feature >> not)
  
  
// TASK #2: Find out which of the features best classifies the data.
// (It should be relatively common - otherwise it 'over-fits' the data -
// and the survival rate should be either pretty high or pretty low.)

longName |> frequency
longName |> surviveRate

lowClass |> frequency
lowClass |> surviveRate

// ------------------------------------------------------------------
// TUTORIAL: Classifying passengers with a decision tree
// ------------------------------------------------------------------

// A decision tree is a simple classifier - it either branches using
// a feature, or it produces a final result. For example, see:
// http://en.wikipedia.org/wiki/Decision_tree_learning
type DecisionTree =
  /// Leaf node: the final classification
  | Result of bool
  /// Inner node: the feature to test, the subtree to follow when the
  /// feature is true, and the subtree to follow when it is false
  | Condition of Feature * DecisionTree * DecisionTree

/// Walks the decision tree for the given row: at every condition the
/// feature is evaluated on the row to pick the subtree to descend into,
/// and the value stored in the leaf that is reached is returned.
let rec classify tree row =
  match tree with 
  | Condition(test, whenTrue, whenFalse) ->
      // Pick the subtree first, then recurse once
      let next = if test row then whenTrue else whenFalse
      classify next row
  | Result outcome -> outcome

// Very simple (and silly) decision tree - person survives only
// if he/she did not travel in class 3 and has a long name
// (left branch = the test is true, right branch = the test is false)
//
//     class=3?
//     /      \
//  false    name.Length>25?
//             /       \
//           true     false
//
let simpleTree = 
  // Subtree used for passengers who did not travel in class 3
  let nameTest = Condition(longName, Result true, Result false)
  Condition(lowClass, Result false, nameTest)

// Classify the first person with the simple tree
first |> classify simpleTree
// Compare this with the actual result (Survived is a number; the
// code above treats 1 as 'survived', while the tree returns a bool)
first.Survived
// Use the whole tree as a feature to see how well it classifies
simpleTree |> classify |> surviveRate

// TASK #3: Construct a decision tree based on the sample figure
// on Wikipedia: http://en.wikipedia.org/wiki/Decision_tree_learning
Multiple items
type LiteralAttribute =
  inherit Attribute
  new : unit -> LiteralAttribute

Full name: Microsoft.FSharp.Core.LiteralAttribute

--------------------
new : unit -> LiteralAttribute
val DataUrl : string

Full name: Script.DataUrl
type Titanic = obj

Full name: Script.Titanic
val data : Titanic

Full name: Script.data
type Feature = obj -> bool

Full name: Script.Feature
type bool = System.Boolean

Full name: Microsoft.FSharp.Core.bool
val longName : row:obj -> bool

Full name: Script.longName
val row : obj
val lowClass : row:obj -> bool

Full name: Script.lowClass
val ageOver20 : row:obj -> bool

Full name: Script.ageOver20
Multiple items
val float : value:'T -> float (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.float

--------------------
type float = System.Double

Full name: Microsoft.FSharp.Core.float

--------------------
type float<'Measure> = float

Full name: Microsoft.FSharp.Core.float<_>
val first : obj

Full name: Script.first
namespace Microsoft.FSharp.Data
module Seq

from Microsoft.FSharp.Collections
val head : source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.head
val frequency : feature:('a -> bool) -> float

Full name: Script.frequency


 Calculates how common the feature is in the data set
 (returns the percentage of 'true' cases among all passengers)
val feature : ('a -> bool)
val counts : System.Collections.Generic.IDictionary<bool,int>
val countBy : projection:('T -> 'Key) -> source:seq<'T> -> seq<'Key * int> (requires equality)

Full name: Microsoft.FSharp.Collections.Seq.countBy
val dict : keyValuePairs:seq<'Key * 'Value> -> System.Collections.Generic.IDictionary<'Key,'Value> (requires equality)

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.dict
val length : source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.length
val surviveRate : feature:('a -> bool) -> float * float

Full name: Script.surviveRate


 What is the percentage of people who survived
 and have the specified feature
val subset : seq<'a>
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.filter
val subsetNot : seq<'a>
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val survived : int
val r : 'a
val survivedNot : int
type DecisionTree =
  | Result of bool
  | Condition of Feature * DecisionTree * DecisionTree

Full name: Script.DecisionTree
union case DecisionTree.Result: bool -> DecisionTree
union case DecisionTree.Condition: Feature * DecisionTree * DecisionTree -> DecisionTree
val classify : tree:DecisionTree -> row:'a -> bool

Full name: Script.classify


 Classify a specified input using a specified decision tree
val tree : DecisionTree
val row : 'a
val value : bool
val feature : Feature
val left : DecisionTree
val right : DecisionTree
val simpleTree : DecisionTree

Full name: Script.simpleTree

More information

Link:http://fssnip.net/je
Posted:11 years ago
Author:Tomas Petricek
Tags: try f# , titanic , csv , decision tree , discriminated union