2 people like it.
Like the snippet!
Sample for train/test sets in Deedle Frames
Supervised learning algorithms need a training set and a test set. This snippet shows two ways to generate train/test sets.
The first is by count: it picks rows at random from the Frame and generates two new frames of the specified size (each capped at half the number of rows). This is useful if you have a very large dataset but only want to explore your learning algorithm on a small subset.
The second is by ratio: it breaks the frame into two parts based on the given ratio. For example, you might want to train on 25% of your data and then test on the other 75%.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
|
open System
open Deedle
#load "Deedle.fsx"
/// Randomly splits `data` into two disjoint lists of the same length.
///
/// Parameters:
///   data - the items to sample from; it is enumerated exactly once.
///   s    - the requested size of each of the two samples. It is capped at
///          half the number of items so that both samples can be filled
///          without sharing an element.
///
/// Returns a pair `(first, second)`, each holding `min s (length / 2)` items.
///
/// The shuffle uses `Random(0)`, so the same input always yields the same split.
///
/// Raises ArgumentException when `s` is negative.
let sampleByCnt data s =
    // Fail fast with a clear message instead of letting Seq.take reject the
    // negative count after the shuffle has already been done.
    if s < 0 then invalidArg "s" "The sample size must be non-negative."
    // Materialise the input once: a lazy sequence would otherwise be
    // re-evaluated for every length computation and again for the shuffle.
    let items = Seq.toList data
    let sampleSize = min s (List.length items / 2)
    let r = new Random(0)
    // Shuffle by tagging each item with a random key and sorting on that key
    // (Seq.sortBy is stable, so equal keys keep their input order).
    let rndData =
        items
        |> Seq.map (fun d -> (d, r.Next()))
        |> Seq.sortBy snd
        |> Seq.map fst
        |> Seq.toList
    (rndData |> Seq.take sampleSize |> Seq.toList),
    (rndData |> Seq.skip sampleSize |> Seq.take sampleSize |> Seq.toList)
/// Randomly splits `data` into two disjoint lists that together contain
/// every item.
///
/// Parameters:
///   data  - the items to split; it is enumerated exactly once.
///   ratio - the fraction of items (0.0 to 1.0 inclusive) that goes into the
///           first list; the remainder goes into the second list.
///
/// Returns a pair `(first, second)` where `first` holds
/// `Math.Round(total * ratio)` items and `second` holds the rest.
/// `Math.Round` rounds midpoints to the nearest even number, so 5 items at a
/// ratio of 0.5 give 2 + 3.
///
/// The shuffle uses `Random(0)`, so the same input always yields the same split.
///
/// Raises ArgumentException when `ratio` is NaN or outside [0.0, 1.0].
let sampleByRatio data ratio =
    // Written as a negated range test so that NaN (for which every comparison
    // is false) is rejected as well.
    if not (ratio >= 0.0 && ratio <= 1.0) then
        invalidArg "ratio" "The ratio must be between 0.0 and 1.0 (inclusive)."
    // Materialise the input once so a lazy sequence is not evaluated once for
    // the length and a second time for the shuffle.
    let items = Seq.toList data
    let total = List.length items
    let first = int (Math.Round(float total * ratio))
    let r = new Random(0)
    // Shuffle by tagging each item with a random key and sorting on that key.
    let rndData =
        items
        |> Seq.map (fun d -> (d, r.Next()))
        |> Seq.sortBy snd
        |> Seq.map fst
        |> Seq.toList
    // Everything after the first `first` items is exactly `total - first`
    // items, so no further truncation is needed.
    (rndData |> Seq.take first |> Seq.toList),
    (rndData |> Seq.skip first |> Seq.toList)
/// Splits a frame into two frames of `cnt` randomly chosen rows each.
///
/// Parameters:
///   df  - the frame to sample rows from.
///   cnt - the requested number of rows per resulting frame; the row keys are
///         picked by `sampleByCnt`, which decides the final size.
///
/// Returns a pair of frames built from the two key samples.
let sampleFrameByCnt (df:Frame<int,string>) (cnt:int) =
    let firstKeys, secondKeys = sampleByCnt df.RowKeys cnt
    // Frame.getRows takes only a frame and returns the series of all rows, so
    // it cannot select by key; Frame.sliceRows picks the rows with the given keys.
    (df |> Frame.sliceRows firstKeys), (df |> Frame.sliceRows secondKeys)
/// Splits a frame into two frames by randomly assigning its rows according
/// to `ratio`.
///
/// Parameters:
///   df    - the frame to split.
///   ratio - the fraction of rows intended for the first frame; the row keys
///           are divided by `sampleByRatio`.
///
/// Returns a pair of frames built from the two key samples.
let sampleFrameByRatio (df:Frame<int,string>) (ratio:float) =
    let firstKeys, secondKeys = sampleByRatio df.RowKeys ratio
    // Frame.getRows takes only a frame and returns the series of all rows, so
    // it cannot select by key; Frame.sliceRows picks the rows with the given keys.
    (df |> Frame.sliceRows firstKeys), (df |> Frame.sliceRows secondKeys)
|
namespace System
namespace Deedle
val sampleByCnt : data:seq<'a> -> s:int -> 'a list * 'a list
Full name: Script.sampleByCnt
val data : seq<'a>
val s : int
val r : Random
Multiple items
type Random =
new : unit -> Random + 1 overload
member Next : unit -> int + 2 overloads
member NextBytes : buffer:byte[] -> unit
member NextDouble : unit -> float
Full name: System.Random
--------------------
Random() : unit
Random(Seed: int) : unit
val sampleSize : int
module Seq
from Microsoft.FSharp.Collections
val length : source:seq<'T> -> int
Full name: Microsoft.FSharp.Collections.Seq.length
val rndData : 'a list
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
Full name: Microsoft.FSharp.Collections.Seq.map
val d : 'a
Random.Next() : int
Random.Next(maxValue: int) : int
Random.Next(minValue: int, maxValue: int) : int
val sortBy : projection:('T -> 'Key) -> source:seq<'T> -> seq<'T> (requires comparison)
Full name: Microsoft.FSharp.Collections.Seq.sortBy
val snd : tuple:('T1 * 'T2) -> 'T2
Full name: Microsoft.FSharp.Core.Operators.snd
val fst : tuple:('T1 * 'T2) -> 'T1
Full name: Microsoft.FSharp.Core.Operators.fst
val toList : source:seq<'T> -> 'T list
Full name: Microsoft.FSharp.Collections.Seq.toList
val take : count:int -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.take
val skip : count:int -> source:seq<'T> -> seq<'T>
Full name: Microsoft.FSharp.Collections.Seq.skip
val sampleByRatio : data:seq<'a> -> ratio:float -> 'a list * 'a list
Full name: Script.sampleByRatio
val ratio : float
val total : int
val first : int
Multiple items
val int : value:'T -> int (requires member op_Explicit)
Full name: Microsoft.FSharp.Core.Operators.int
--------------------
type int = int32
Full name: Microsoft.FSharp.Core.int
--------------------
type int<'Measure> = int
Full name: Microsoft.FSharp.Core.int<_>
type Math =
static val PI : float
static val E : float
static member Abs : value:sbyte -> sbyte + 6 overloads
static member Acos : d:float -> float
static member Asin : d:float -> float
static member Atan : d:float -> float
static member Atan2 : y:float * x:float -> float
static member BigMul : a:int * b:int -> int64
static member Ceiling : d:decimal -> decimal + 1 overload
static member Cos : d:float -> float
...
Full name: System.Math
Math.Round(d: decimal) : decimal
Math.Round(a: float) : float
Math.Round(d: decimal, mode: MidpointRounding) : decimal
Math.Round(d: decimal, decimals: int) : decimal
Math.Round(value: float, mode: MidpointRounding) : float
Math.Round(value: float, digits: int) : float
Math.Round(d: decimal, decimals: int, mode: MidpointRounding) : decimal
Math.Round(value: float, digits: int, mode: MidpointRounding) : float
Multiple items
val float : value:'T -> float (requires member op_Explicit)
Full name: Microsoft.FSharp.Core.Operators.float
--------------------
type float = Double
Full name: Microsoft.FSharp.Core.float
--------------------
type float<'Measure> = float
Full name: Microsoft.FSharp.Core.float<_>
val second : int
val sampleFrameByCnt : df:Frame<int,string> -> cnt:int -> 'a * 'b
Full name: Script.sampleFrameByCnt
val df : Frame<int,string>
Multiple items
module Frame
from Deedle
--------------------
type Frame =
static member ReadReader : reader:IDataReader -> Frame<int,string>
static member CustomExpanders : Dictionary<Type,Func<obj,seq<string * Type * obj>>>
static member NonExpandableInterfaces : List<Type>
static member NonExpandableTypes : HashSet<Type>
Full name: Deedle.Frame
--------------------
type Frame<'TRowKey,'TColumnKey (requires equality and equality)> =
interface IDynamicMetaObjectProvider
interface INotifyCollectionChanged
interface IFsiFormattable
interface IFrame
new : names:seq<'TColumnKey> * columns:seq<ISeries<'TRowKey>> -> Frame<'TRowKey,'TColumnKey>
new : rowIndex:IIndex<'TRowKey> * columnIndex:IIndex<'TColumnKey> * data:IVector<IVector> * indexBuilder:IIndexBuilder * vectorBuilder:IVectorBuilder -> Frame<'TRowKey,'TColumnKey>
member AddColumn : column:'TColumnKey * series:ISeries<'TRowKey> -> unit
member AddColumn : column:'TColumnKey * series:seq<'V> -> unit
member AddColumn : column:'TColumnKey * series:ISeries<'TRowKey> * lookup:Lookup -> unit
member AddColumn : column:'TColumnKey * series:seq<'V> * lookup:Lookup -> unit
...
Full name: Deedle.Frame<_,_>
--------------------
new : names:seq<'TColumnKey> * columns:seq<ISeries<'TRowKey>> -> Frame<'TRowKey,'TColumnKey>
new : rowIndex:Indices.IIndex<'TRowKey> * columnIndex:Indices.IIndex<'TColumnKey> * data:IVector<IVector> * indexBuilder:Indices.IIndexBuilder * vectorBuilder:Vectors.IVectorBuilder -> Frame<'TRowKey,'TColumnKey>
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = String
Full name: Microsoft.FSharp.Core.string
val cnt : int
val keySample : int list * int list
property Frame.RowKeys: seq<int>
val getRows : frame:Frame<'R,'C> -> Series<'R,Series<'C,'T>> (requires equality and equality)
Full name: Deedle.Frame.getRows
val sampleFrameByRatio : df:Frame<int,string> -> ratio:float -> 'a * 'b
Full name: Script.sampleFrameByRatio
More information