7 people like it.
Like the snippet!
Course 4: Decision trees and Titanic
F# introduction course - Get and read the Titanic data set using CSV type provider, define the type of "feature" and use it to classify the data and then implement a simple decision tree that can be used for writing more complex classifiers. To be used in Try F#.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
|
// Load type provider for CSV files
#r "Samples.Csv.dll"
open Samples.Csv
// Download data from the web with CSV provider
// Address of the raw Titanic CSV file. It is marked as a literal so that
// it can be passed as a static argument to the CSV type provider below.
[<Literal>]
let DataUrl = "https://gist.github.com/tpetricek/263fb1bee9366170b2ef/raw/90d012bac3713e8618d3ae2f83f2f6535b6bebd9/titanic.csv"
// Typed view of the CSV file produced by the CsvFile type provider.
// The Schema argument lists twelve type names (int or string),
// presumably one per CSV column in file order - TODO confirm against the file.
type Titanic = CsvFile<DataUrl, Schema="int,int,int,string,string,string,int,string,string,string,string,string">
// Instance of the provided type; the rows are accessed through 'data.Data'.
let data = new Titanic()
// ------------------------------------------------------------------
// TUTORIAL: Classifying passengers with features
// ------------------------------------------------------------------
// What is a feature? A feature is something that classifies a person
// into two groups - the result of calculating a feature on a given
// row is true or false. In F#, we can use a function.
/// A feature: a predicate evaluated on a single row of the Titanic data
type Feature = Titanic.Row -> bool
// The syntax ":" is a type annotation. We give the compiler a hint,
// so that it knows what 'row' is later in the code.
/// True when the passenger's name is longer than 25 characters
let longName : Feature =
    fun row -> row.Name.Length > 25

/// True when the passenger travelled in class 3
let lowClass : Feature =
    fun row -> row.Pclass = 3

/// True when an age is recorded (non-empty string) and it is above 20
let ageOver20 : Feature =
    fun row ->
        // The age column is a string, so skip rows where it is empty
        // before converting it to a number
        if row.Age = "" then false
        else float row.Age > 20.0
// Get the first person and do some experiments
let first = Seq.head data.Data
// Evaluate each of the sample features on that passenger
first |> longName
first |> lowClass
first |> ageOver20
// TASK #1: Write features that test the following conditions:
// * Person has more than 2 siblings
// * Person is over 9.5 years old
// * Person is a male
/// Calculates how common the feature is in the data set.
/// Returns the fraction (0.0 to 1.0) of passengers for which 'feature'
/// is true; returns 0.0 when the data set contains no rows.
let frequency feature =
    let total = Seq.length data.Data
    // Count the matching rows directly. Looking up the 'true' key in a
    // 'Seq.countBy' dictionary fails with KeyNotFoundException when no
    // passenger has the feature.
    let matching = data.Data |> Seq.filter feature |> Seq.length
    if total = 0 then 0.0 else float matching / float total
/// Compares survival between passengers that have the specified feature
/// and those that do not. Returns a pair of fractions:
///   (survivors among rows where 'feature' is true,
///    survivors among rows where 'feature' is false)
/// A row counts as a survivor when its 'Survived' value equals 1.
let surviveRate feature =
    // Fraction of survivors within one group of rows
    let rateOf (group : seq<Titanic.Row>) =
        let alive = group |> Seq.filter (fun p -> p.Survived = 1) |> Seq.length
        float alive / float (Seq.length group)
    let having = Seq.filter feature data.Data
    let lacking = Seq.filter (not << feature) data.Data
    rateOf having, rateOf lacking
// TASK #2: Find out which of the features best classifies the data.
// (It should be relatively common - otherwise it 'over-fits' the data -
// but the survival rate should be pretty high or low)
// How common each sample feature is, and the survival rates with/without it
longName |> frequency
longName |> surviveRate
lowClass |> frequency
lowClass |> surviveRate
// ------------------------------------------------------------------
// TUTORIAL: Qualifying passengers with decision tree
// ------------------------------------------------------------------
// Decision tree is a simple classifier - it either branches using
// a feature, or it produces a final result. For example, see:
// http://en.wikipedia.org/wiki/Decision_tree_learning
/// A decision tree over Titanic passengers (evaluated by 'classify')
type DecisionTree =
/// Leaf: the final classification result
| Result of bool
/// Branch: a feature to test and two subtrees; 'classify' follows the
/// first subtree when the feature is true and the second one otherwise
| Condition of Feature * DecisionTree * DecisionTree
/// Classify a specified input using a specified decision tree.
/// Walks from the root: at each 'Condition' the feature is evaluated on
/// 'row' to choose a subtree, until a 'Result' leaf yields the answer.
let rec classify tree row =
    match tree with
    | Condition(test, whenTrue, whenFalse) ->
        // Pick the subtree first, then continue with a single recursive call
        let next = if test row then whenTrue else whenFalse
        classify next row
    | Result(answer) -> answer
// Very simple (and silly) decision tree - a person survives if
// he/she did not travel in class 3 and has a long name
// (the left branch is taken when the feature is true)
//
//          class=3?
//          /      \
//      false    name.Length>25
//                 /      \
//              true      false
//
let simpleTree =
    // Subtree used for passengers that are not in class 3
    let nameBranch = Condition(longName, Result true, Result false)
    Condition(lowClass, Result false, nameBranch)
// Run the simple tree on the first person
first |> classify simpleTree
// Actual outcome recorded in the data, for comparison
first.Survived
// Survival rates among passengers the tree classifies as true vs. false
// (shows how well the tree classifies)
simpleTree |> classify |> surviveRate
// TASK #3: Construct a decision tree based on the sample figure
// on Wikipedia: http://en.wikipedia.org/wiki/Decision_tree_learning
|
Multiple items
type LiteralAttribute =
inherit Attribute
new : unit -> LiteralAttribute
Full name: Microsoft.FSharp.Core.LiteralAttribute
--------------------
new : unit -> LiteralAttribute
val DataUrl : string
Full name: Script.DataUrl
type Titanic = obj
Full name: Script.Titanic
val data : Titanic
Full name: Script.data
type Feature = obj -> bool
Full name: Script.Feature
type bool = System.Boolean
Full name: Microsoft.FSharp.Core.bool
val longName : row:obj -> bool
Full name: Script.longName
val row : obj
val lowClass : row:obj -> bool
Full name: Script.lowClass
val ageOver20 : row:obj -> bool
Full name: Script.ageOver20
Multiple items
val float : value:'T -> float (requires member op_Explicit)
Full name: Microsoft.FSharp.Core.Operators.float
--------------------
type float = System.Double
Full name: Microsoft.FSharp.Core.float
--------------------
type float<'Measure> = float
Full name: Microsoft.FSharp.Core.float<_>
val first : obj
Full name: Script.first
namespace Microsoft.FSharp.Data
module Seq
from Microsoft.FSharp.Collections
val head : source:seq<'T> -> 'T
Full name: Microsoft.FSharp.Collections.Seq.head
val frequency : feature:('a -> bool) -> float
Full name: Script.frequency
Calculates how common the feature is in the data set
(returns the percentage of 'true' cases among all passengers)
val feature : ('a -> bool)
val counts : System.Collections.Generic.IDictionary<bool,int>
val countBy : projection:('T -> 'Key) -> source:seq<'T> -> seq<'Key * int> (requires equality)
Full name: Microsoft.FSharp.Collections.Seq.countBy
val dict : keyValuePairs:seq<'Key * 'Value> -> System.Collections.Generic.IDictionary<'Key,'Value> (requires equality)
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.dict
val length : source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.length
val surviveRate : feature:('a -> bool) -> float * float
Full name: Script.surviveRate
What is the percentage of people who survived
and have the specified feature
val subset : seq<'a>
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.filter
val subsetNot : seq<'a>
val not : value:bool -> bool
Full name: Microsoft.FSharp.Core.Operators.not
val survived : int
val r : 'a
val survivedNot : int
type DecisionTree =
| Result of bool
| Condition of Feature * DecisionTree * DecisionTree
Full name: Script.DecisionTree
union case DecisionTree.Result: bool -> DecisionTree
union case DecisionTree.Condition: Feature * DecisionTree * DecisionTree -> DecisionTree
val classify : tree:DecisionTree -> row:'a -> bool
Full name: Script.classify
Classify a specified input using a specified decision tree
val tree : DecisionTree
val row : 'a
val value : bool
val feature : Feature
val left : DecisionTree
val right : DecisionTree
val simpleTree : DecisionTree
Full name: Script.simpleTree
More information