2 people like it.

Sample for train/test sets in Deedle Frames

Supervised learning algorithms need a training set and a test set. This snippet shows two ways to generate train/test sets. The first is by count: it randomly selects rows from the Frame and generates two new frames, each of the specified size (capped at half of the available rows). This is useful if you have a very large dataset but only want to explore your learning algorithm on a small subset. The second is by ratio: it splits the frame into two parts based on the given ratio. For example, you might want to train on 25% of your data and then test on the other 75%.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
open System
open Deedle

#load "Deedle.fsx"

/// Randomly splits `data` into two disjoint samples of equal size.
///
/// Each sample holds `s` items, capped at half the number of items in `data`
/// so that the two samples can never share an item. The shuffle is driven by
/// a fixed seed (0), so repeated calls on the same input give the same split.
///
/// Returns a pair of lists: (first sample, second sample).
/// Raises ArgumentException when `s` is negative.
let sampleByCnt data s = 
    // Fail fast with a clear message instead of failing later inside Seq.take.
    if s < 0 then invalidArg "s" "Sample size must not be negative."
    // Materialise the input once so that lazy or one-shot sequences are only
    // enumerated a single time.
    let items = Seq.toArray data
    // Cap the request so both samples fit without overlapping.
    let sampleSize = min s (items.Length / 2)
    let r = Random(0)
    // Pair every item with a random key and order by that key. Seq.sortBy is
    // a stable sort, so the result is deterministic for the fixed seed.
    let rndData =
        items
        |> Seq.map (fun d -> d, r.Next())
        |> Seq.sortBy snd
        |> Seq.map fst
        |> Seq.toList
    (rndData |> Seq.take sampleSize |> Seq.toList), 
    (rndData |> Seq.skip sampleSize |> Seq.take sampleSize |> Seq.toList)
         
/// Randomly splits `data` into two disjoint parts that together contain
/// every item of the input.
///
/// The first part receives round(count * ratio) items and the second part
/// receives the rest. The shuffle is driven by a fixed seed (0), so repeated
/// calls on the same input give the same split.
///
/// Returns a pair of lists: (first part, second part).
/// Raises ArgumentException when `ratio` is NaN or would select fewer than
/// zero or more than all of the items.
let sampleByRatio data ratio =      
    // Materialise the input once so that lazy or one-shot sequences are only
    // enumerated a single time.
    let items = Seq.toArray data
    let total = items.Length
    // Math.Round rounds midpoints to the nearest even number (2.5 becomes 2).
    let rounded = Math.Round(float total * ratio)
    // Validate while the value is still a float: converting NaN or an
    // out-of-range float to int would produce a meaningless count.
    if Double.IsNaN rounded || rounded < 0.0 || rounded > float total then
        invalidArg "ratio" "Ratio must select between zero and all of the items."
    let first = int rounded
    let r = Random(0)
    // Pair every item with a random key and order by that key. Seq.sortBy is
    // a stable sort, so the result is deterministic for the fixed seed.
    let rndData =
        items
        |> Seq.map (fun d -> d, r.Next())
        |> Seq.sortBy snd
        |> Seq.map fst
        |> Seq.toList
    // Everything after the first part belongs to the second part.
    (rndData |> Seq.take first |> Seq.toList), 
    (rndData |> Seq.skip first |> Seq.toList)
           


/// Splits the rows of `df` into two frames of randomly chosen rows.
///
/// The row keys are sampled with `sampleByCnt`, so each resulting frame has
/// `cnt` rows, capped at half the number of rows in `df`, and the two frames
/// share no row.
///
/// Returns a pair of frames: (first sample, second sample).
let sampleFrameByCnt (df:Frame<int,string>) (cnt:int) =
    let firstKeys, secondKeys = sampleByCnt df.RowKeys cnt
    // Frame.getRows takes only a frame and returns its row series; picking a
    // subset of rows by key is done with Frame.sliceRows.
    (df |> Frame.sliceRows firstKeys), (df |> Frame.sliceRows secondKeys)


/// Splits the rows of `df` into two frames according to `ratio`.
///
/// The row keys are split with `sampleByRatio`, so the first frame receives
/// round(rowCount * ratio) randomly chosen rows and the second frame receives
/// the remaining rows.
///
/// Returns a pair of frames: (first part, second part).
let sampleFrameByRatio (df:Frame<int,string>) (ratio:float) =
    let firstKeys, secondKeys = sampleByRatio df.RowKeys ratio
    // Frame.getRows takes only a frame and returns its row series; picking a
    // subset of rows by key is done with Frame.sliceRows.
    (df |> Frame.sliceRows firstKeys), (df |> Frame.sliceRows secondKeys)
namespace System
namespace Deedle
val sampleByCnt : data:seq<'a> -> s:int -> 'a list * 'a list

Full name: Script.sampleByCnt
val data : seq<'a>
val s : int
val r : Random
Multiple items
type Random =
  new : unit -> Random + 1 overload
  member Next : unit -> int + 2 overloads
  member NextBytes : buffer:byte[] -> unit
  member NextDouble : unit -> float

Full name: System.Random

--------------------
Random() : unit
Random(Seed: int) : unit
val sampleSize : int
module Seq

from Microsoft.FSharp.Collections
val length : source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.length
val rndData : 'a list
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
val d : 'a
Random.Next() : int
Random.Next(maxValue: int) : int
Random.Next(minValue: int, maxValue: int) : int
val sortBy : projection:('T -> 'Key) -> source:seq<'T> -> seq<'T> (requires comparison)

Full name: Microsoft.FSharp.Collections.Seq.sortBy
val snd : tuple:('T1 * 'T2) -> 'T2

Full name: Microsoft.FSharp.Core.Operators.snd
val fst : tuple:('T1 * 'T2) -> 'T1

Full name: Microsoft.FSharp.Core.Operators.fst
val toList : source:seq<'T> -> 'T list

Full name: Microsoft.FSharp.Collections.Seq.toList
val take : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.take
val skip : count:int -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.skip
val sampleByRatio : data:seq<'a> -> ratio:float -> 'a list * 'a list

Full name: Script.sampleByRatio
val ratio : float
val total : int
val first : int
Multiple items
val int : value:'T -> int (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.int

--------------------
type int = int32

Full name: Microsoft.FSharp.Core.int

--------------------
type int<'Measure> = int

Full name: Microsoft.FSharp.Core.int<_>
type Math =
  static val PI : float
  static val E : float
  static member Abs : value:sbyte -> sbyte + 6 overloads
  static member Acos : d:float -> float
  static member Asin : d:float -> float
  static member Atan : d:float -> float
  static member Atan2 : y:float * x:float -> float
  static member BigMul : a:int * b:int -> int64
  static member Ceiling : d:decimal -> decimal + 1 overload
  static member Cos : d:float -> float
  ...

Full name: System.Math
Math.Round(d: decimal) : decimal
Math.Round(a: float) : float
Math.Round(d: decimal, mode: MidpointRounding) : decimal
Math.Round(d: decimal, decimals: int) : decimal
Math.Round(value: float, mode: MidpointRounding) : float
Math.Round(value: float, digits: int) : float
Math.Round(d: decimal, decimals: int, mode: MidpointRounding) : decimal
Math.Round(value: float, digits: int, mode: MidpointRounding) : float
Multiple items
val float : value:'T -> float (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.float

--------------------
type float = Double

Full name: Microsoft.FSharp.Core.float

--------------------
type float<'Measure> = float

Full name: Microsoft.FSharp.Core.float<_>
val second : int
val sampleFrameByCnt : df:Frame<int,string> -> cnt:int -> 'a * 'b

Full name: Script.sampleFrameByCnt
val df : Frame<int,string>
Multiple items
module Frame

from Deedle

--------------------
type Frame =
  static member ReadReader : reader:IDataReader -> Frame<int,string>
  static member CustomExpanders : Dictionary<Type,Func<obj,seq<string * Type * obj>>>
  static member NonExpandableInterfaces : List<Type>
  static member NonExpandableTypes : HashSet<Type>

Full name: Deedle.Frame

--------------------
type Frame<'TRowKey,'TColumnKey (requires equality and equality)> =
  interface IDynamicMetaObjectProvider
  interface INotifyCollectionChanged
  interface IFsiFormattable
  interface IFrame
  new : names:seq<'TColumnKey> * columns:seq<ISeries<'TRowKey>> -> Frame<'TRowKey,'TColumnKey>
  new : rowIndex:IIndex<'TRowKey> * columnIndex:IIndex<'TColumnKey> * data:IVector<IVector> * indexBuilder:IIndexBuilder * vectorBuilder:IVectorBuilder -> Frame<'TRowKey,'TColumnKey>
  member AddColumn : column:'TColumnKey * series:ISeries<'TRowKey> -> unit
  member AddColumn : column:'TColumnKey * series:seq<'V> -> unit
  member AddColumn : column:'TColumnKey * series:ISeries<'TRowKey> * lookup:Lookup -> unit
  member AddColumn : column:'TColumnKey * series:seq<'V> * lookup:Lookup -> unit
  ...

Full name: Deedle.Frame<_,_>

--------------------
new : names:seq<'TColumnKey> * columns:seq<ISeries<'TRowKey>> -> Frame<'TRowKey,'TColumnKey>
new : rowIndex:Indices.IIndex<'TRowKey> * columnIndex:Indices.IIndex<'TColumnKey> * data:IVector<IVector> * indexBuilder:Indices.IIndexBuilder * vectorBuilder:Vectors.IVectorBuilder -> Frame<'TRowKey,'TColumnKey>
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
val cnt : int
val keySample : int list * int list
property Frame.RowKeys: seq<int>
val getRows : frame:Frame<'R,'C> -> Series<'R,Series<'C,'T>> (requires equality and equality)

Full name: Deedle.Frame.getRows
val sampleFrameByRatio : df:Frame<int,string> -> ratio:float -> 'a * 'b

Full name: Script.sampleFrameByRatio
Raw view Test code New version

More information

Link:http://fssnip.net/kS
Posted:11 years ago
Author:tonyabell
Tags: deedle