2 people like it.

Seq.splitBy, Seq.splitAt, Seq.tryHeadTail

These `Seq` functions split and process sequences in a single sequential pass, without the need to rescan the source or cache elements in memory.

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
182: 
module Seq =
    /// Selects what `splitBy` does with an element that satisfies the split predicate:
    ///   Exclude         - the element is dropped from the output
    ///   IncludeInFirst  - the element is emitted as the last item of the piece it ends
    ///   IncludeInSecond - the element is emitted as the first item of the next piece
    [<RequireQualifiedAccess>]
    type SplitByOption =
        | Exclude
        | IncludeInFirst
        | IncludeInSecond

    /// State threaded through the inner `Seq.unfold` that produces a single piece.
    [<RequireQualifiedAccess>]
    type private SplitSubUnfoldState<'T> =
        /// Emit the carried value, then continue with the wrapped state.
        | PostValue of 'T * SplitSubUnfoldState<'T>
        /// Enumeration has not begun; carries the piece number and its starting position.
        | Start of seqNo: int * start: int
        /// Enumeration is under way; carries the callbacks used to pull and to signal.
        | Started of tryNext: (unit -> 'T option) * bingo: ('T -> unit) * finish: (unit -> unit)
        /// Nothing further to emit.
        | Finish

    /// Mutable bookkeeping shared by all pieces produced during one enumeration of `splitBy`.
    [<RequireQualifiedAccess>]
    type private SplitUnfoldState<'T> =
        {
            /// Enumerator over the source sequence.
            enumerator: System.Collections.Generic.IEnumerator<'T>
            /// Count of elements pulled from `enumerator` so far.
            mutable currentPos: int
            /// Set once the source has been found to be exhausted.
            mutable isDone: bool
            /// The most recent element that matched the split predicate, if any.
            mutable splitterO: 'T option
            /// The piece most recently handed out that has not yet reached its end.
            mutable currentSeqO: 'T seq option
        }

    /// Lazily splits `input` into consecutive pieces, cutting wherever `f` returns true.
    ///
    /// Parameters:
    ///   f     - predicate marking the elements at which to cut.
    ///   opt   - whether a marking element is dropped, ends its piece, or starts the next one.
    ///   input - the source sequence.
    ///
    /// Returns a seq of pieces. Straight scan through is efficient: pieces consumed in order
    /// share one enumerator over `input`. Re-enumerating a piece, or enumerating it after the
    /// shared enumerator has moved past its start, opens a fresh enumerator and rescans `input`
    /// from the beginning. The shared enumerator is disposed only once it has been exhausted;
    /// a rescan enumerator is disposed when it is exhausted or when its piece ends at a splitter.
    let splitBy f opt (input: 'a seq) =
        let getEnumerator() = input.GetEnumerator()
        let startingState() =
            {
                SplitUnfoldState.enumerator   = getEnumerator()
                SplitUnfoldState.currentPos   = 0
                SplitUnfoldState.isDone       = false
                SplitUnfoldState.splitterO    = None
                SplitUnfoldState.currentSeqO  = None
            }, 0

        Seq.unfold(fun stateO ->
            // The state is created on the first step, so each enumeration of the result starts fresh.
            let state, currentSeqNo = stateO |> Option.defaultWith startingState

            // Pulls the next element from the shared enumerator and advances the position counter.
            let tryNextMain() : 'a option=
                if state.isDone then None else
                if state.enumerator.MoveNext() then
                    state.currentPos  <- state.currentPos + 1
                    Some state.enumerator.Current
                else
                    state.enumerator.Dispose()
                    None
            // bingo: a splitter was reached on the shared enumerator; finish: the source ran out.
            let bingo  v  = state.splitterO <- Some v ; state.currentSeqO <- None
            let finish () = state.splitterO <- None   ; state.currentSeqO <- None ; state.isDone <- true

            // Opens a private enumerator positioned at `start` for re-reading a piece.
            // Returns the pull function together with a callback that releases the enumerator;
            // the callback is used when the piece ends at a splitter, where the pull function
            // is never called again and would otherwise leave the enumerator undisposed.
            let tryNextSub start  =
                let en = getEnumerator()
                for i in 0..start - 1 do (en.MoveNext() |> ignore)
                let tryNext () =
                    if en.MoveNext() then
                        Some en.Current
                    else
                        en.Dispose()
                        None
                tryNext, (fun (_: 'a) -> en.Dispose())

            // One step of a piece: pull an element and decide whether it ends the piece.
            let subUnFold(tryNext: unit -> 'a option, bingo, finish) = 
                match tryNext() with
                | None      ->  finish()
                                None
                | Some v    ->
                    if f v then bingo v
                                if opt = SplitByOption.IncludeInFirst 
                                then Some( v, SplitSubUnfoldState.Finish                          )
                                else None
                    else             Some( v, SplitSubUnfoldState.Started(tryNext, bingo, finish) )

            // The previously handed-out piece has not reached its end on the shared enumerator:
            // drain it (discarding the elements) until bingo/finish clears currentSeqO.
            while state.currentSeqO |> Option.isSome do 
                subUnFold(tryNextMain, bingo , finish) |> ignore
            if state.isDone then None else
            // Only IncludeInSecond carries the splitter over into the next piece.
            if opt <> SplitByOption.IncludeInSecond then state.splitterO <- None
            let s0 =
                let start = SplitSubUnfoldState.Start(currentSeqNo, state.currentPos)
                // The initial state is fixed now, while splitterO still holds this piece's splitter.
                let first =
                    match state.splitterO, opt with
                    | Some v, SplitByOption.IncludeInSecond -> SplitSubUnfoldState.PostValue(v, start)
                    | _                                     -> start
                first
                |> Seq.unfold(function
                    | SplitSubUnfoldState.PostValue(v, next)              -> Some(v, next)
                    | SplitSubUnfoldState.Finish                          -> None
                    | SplitSubUnfoldState.Started(tryNext, bingo, finish) -> subUnFold(tryNext, bingo, finish)
                    | SplitSubUnfoldState.Start(_, myStart)               ->
                        // The shared enumerator is usable only if it still sits at this piece's start.
                        if state.currentPos = myStart
                        then subUnFold(tryNextMain, bingo, finish)
                        else
                            let tryNext, release = tryNextSub myStart
                            subUnFold(tryNext, release, ignore)
                )
            state.currentSeqO <- Some s0
            Some (s0, Some(state, currentSeqNo + 1) )
        ) None

    /// Cuts `s` at index `n`, yielding at most two pieces: the elements before index `n`
    /// and the elements from index `n` onwards.
    let splitAt n s =
        // The cut falls on the element whose index equals n; that element opens the second piece.
        let isCutIndex (index, _) = n = index
        let indexed = Seq.mapi (fun index item -> index, item) s
        indexed
        |> splitBy isCutIndex SplitByOption.IncludeInSecond
        |> Seq.map (Seq.map snd)
        |> Seq.truncate 2

    /// Applies `fhead` to the first element of `s`, then `ftail` to that result together with
    /// the remaining elements. Returns `Some` of what `ftail` produced, or `None` when `s` is empty.
    let tryHeadTail fhead ftail s =
        // Choice1Of3: nothing seen yet; Choice2Of3: head processed; Choice3Of3: tail processed.
        let step acc part =
            match acc with
            | Choice1Of3 ()           -> part |> Seq.tryHead |> Option.map fhead |> Choice2Of3
            | Choice2Of3 (Some headv) -> Choice3Of3 (ftail headv part)
            | settled                 -> settled
        match Seq.fold step (Choice1Of3 ()) (splitAt 1 s) with
        // A head was found but no tail piece followed: run ftail on an empty tail.
        | Choice2Of3 (Some headv) -> Some (ftail headv Seq.empty)
        | Choice3Of3 v            -> Some v
        | _                       -> None

/////////// Sample Usage

    // Demo 1: split a list of ints on -1, dropping the separators (Exclude).
    // The Seq.map stage prints every element as it is pulled from the source.
    // The first two pieces are skipped; each remaining piece is printed with its length.
    [ -1 ; 0 ; 1; 2; 3; -1; 3; 5; 7; -1; 2; 3; 9 ; -1] 
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> splitBy ((=) -1) SplitByOption.Exclude
    |> Seq.skip 2
    //|> Seq.take 2
    |> Seq.iter (fun s -> 
        // The piece is read twice below (toList and length), hence the cache.
        let s = Seq.cache s
        Seq.length s |> printfn " ==> %A %A" (Seq.toList s) ) 

    // Demo 2: split the characters of a string on ' ', keeping each space at the end
    // of the piece it terminates (IncludeInFirst). The first three pieces are skipped;
    // each remaining piece is printed as a string with its length.
    "Hello friend how are you? Good "
    |> Seq.toArray
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> splitBy ((=) ' ') SplitByOption.IncludeInFirst
    //|> Seq.iter (printfn "%A")
    |> Seq.skip 3
    //|> Seq.take 2
    |> Seq.iter (fun s -> 
        // The piece is read twice below (toArray and length), hence the cache.
        let s = Seq.cache s
        Seq.length s |> printfn " ==> %A %A" (System.String(Seq.toArray s)) ) 


    // Demo 3: cut a list at index 2 and print the resulting seq of pieces.
    [ 1 ; 2 ; 3 ; 4 ; 5 ; 6 ; 7 ; 8 ]
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> splitAt  2
    |> printfn "%A"

    // Demo 4: take the head unchanged (id) and print it together with the tail;
    // prints "No Head Tail" when tryHeadTail returns None.
    [ 1 ; 2 ; 3 ; 4 ; 5 ; 6 ; 7 ; 8 ]
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> tryHeadTail id     (fun head tail -> printfn "head = %A, tail = %A" head tail )
    |> Option.defaultWith (fun ()        -> printfn "No Head Tail"                   )


// The following example uses Seq.splitBy and Seq.tryHeadTail
// twice: first to split the input file in two, and then to split it into several pieces,
// collecting the output file names from the first section and the content from the subsequent pieces.
// It can split a huge 2.2GB file into several subfiles without having to rescan the file
// or cache in memory any portion of the file apart from the current line.

// The first Seq.tryHeadTail operates on a seq<seq<string>>. Because of the nested seqs it must use 2
// separate functions to process the head and then the tail without backtracking:
// the first function must finish processing the head seq<string> before the tail function runs,
// and the result of the first function is passed to the second:
//     |> Seq.tryHeadTail (fun head -> ...; filelist) (fun filelist tail -> ...)

// The second Seq.tryHeadTail operates on a seq<string>. Because the seqs are not nested, it can be called simply:
//     |> Seq.tryHeadTail id (fun head tail -> ...)

open System.IO

/// Reads `file` line by line (skipping the first line), takes the output names from the
/// leading section, then writes each following section to `path + name + ".rpt"`.
///
/// path - prefix prepended verbatim to each output name.
/// file - path of the input file to split.
let splitFile path file =
    // A line with this ending starts the second piece of the file (IncludeInSecond).
    let isHeaderEnd (line: string) = line.EndsWith "---*/"
    // A line matching this closes the section it belongs to (IncludeInFirst).
    let isSectionEnd (line: string) =
        line.EndsWith " rows affected)" || line = "(1 row affected)"

    // Each header line contributes the text after its last '.' as an output name.
    let readFileList (header: seq<string>) =
        let names =
            header
            |> Seq.map (fun entry -> entry.Split('.') |> Seq.last)
            |> Seq.toArray
        for name in names do printfn "file: %A" name
        printfn "--------"
        names

    // Writes the i-th section to the i-th name; sections beyond the name list are ignored.
    let writeSections (names: string []) (rest: seq<seq<string>>) =
        rest
        |> Seq.collect id
        |> Seq.splitBy isSectionEnd Seq.SplitByOption.IncludeInFirst
        |> Seq.map (Seq.skip 1)
        |> Seq.iteri (fun index section ->
            if index < names.Length then
                section
                |> Seq.tryHeadTail id (fun first remaining ->
                    printfn "%s => %s" names.[index] first
                    File.WriteAllLines(path + names.[index] + ".rpt", Seq.append [first] remaining))
                |> Option.defaultValue ())

    File.ReadLines file
    |> Seq.skip 1
    |> Seq.splitBy isHeaderEnd Seq.SplitByOption.IncludeInSecond
    |> Seq.tryHeadTail readFileList writeSections
    |> Option.defaultValue ()
module Seq

from Microsoft.FSharp.Collections
Multiple items
type RequireQualifiedAccessAttribute =
  inherit Attribute
  new : unit -> RequireQualifiedAccessAttribute

--------------------
new : unit -> RequireQualifiedAccessAttribute
type SplitByOption =
  | Exclude
  | IncludeInFirst
  | IncludeInSecond
union case SplitByOption.Exclude: SplitByOption
union case SplitByOption.IncludeInFirst: SplitByOption
union case SplitByOption.IncludeInSecond: SplitByOption
type private SplitSubUnfoldState<'T> =
  | PostValue of 'T * SplitSubUnfoldState<'T>
  | Start of seqNo: int * start: int
  | Started of tryNext: unit -> 'T option * bingo: 'T -> unit * finish: unit -> unit
  | Finish
union case SplitSubUnfoldState.PostValue: 'T * SplitSubUnfoldState<'T> -> SplitSubUnfoldState<'T>
union case SplitSubUnfoldState.Start: seqNo: int * start: int -> SplitSubUnfoldState<'T>
Multiple items
val int : value:'T -> int (requires member op_Explicit)

--------------------
type int = int32

--------------------
type int<'Measure> = int
union case SplitSubUnfoldState.Started: tryNext: unit -> 'T option * bingo: 'T -> unit * finish: unit -> unit -> SplitSubUnfoldState<'T>
type unit = Unit
type 'T option = Option<'T>
union case SplitSubUnfoldState.Finish: SplitSubUnfoldState<'T>
type private SplitUnfoldState<'T> =
  { enumerator: IEnumerator<'T>
    mutable currentPos: int
    mutable isDone: bool
    mutable splitterO: 'T option
    mutable currentSeqO: seq<'T> option }
SplitUnfoldState.enumerator: System.Collections.Generic.IEnumerator<'T>
namespace System
namespace System.Collections
namespace System.Collections.Generic
type IEnumerator<'T> =
  inherit IDisposable
  inherit IEnumerator
  member Current : 'T
SplitUnfoldState.currentPos: int
SplitUnfoldState.isDone: bool
type bool = System.Boolean
SplitUnfoldState.splitterO: 'T option
SplitUnfoldState.currentSeqO: seq<'T> option
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

--------------------
type seq<'T> = System.Collections.Generic.IEnumerable<'T>
val splitBy : f:('a -> bool) -> opt:SplitByOption -> input:seq<'a> -> seq<seq<'a>>


 Straight scan through is efficient, reusing seqs causes rescan from beginning
val f : ('a -> bool)
val opt : SplitByOption
val input : seq<'a>
val getEnumerator : (unit -> System.Collections.Generic.IEnumerator<'a>)
System.Collections.Generic.IEnumerable.GetEnumerator() : System.Collections.Generic.IEnumerator<'a>
val startingState : (unit -> SplitUnfoldState<'a> * int)
union case Option.None: Option<'T>
val unfold : generator:('State -> ('T * 'State) option) -> state:'State -> seq<'T>
val stateO : (SplitUnfoldState<'a> * int) option
val state : SplitUnfoldState<'a>
val currentSeqNo : int
module Option

from Microsoft.FSharp.Core
val defaultWith : defThunk:(unit -> 'T) -> option:'T option -> 'T
val tryNextMain : (unit -> 'a option)
SplitUnfoldState.enumerator: System.Collections.Generic.IEnumerator<'a>
System.Collections.IEnumerator.MoveNext() : bool
union case Option.Some: Value: 'T -> Option<'T>
property System.Collections.Generic.IEnumerator.Current: 'a with get
System.IDisposable.Dispose() : unit
val bingo : ('a -> unit)
val v : 'a
SplitUnfoldState.splitterO: 'a option
SplitUnfoldState.currentSeqO: seq<'a> option
val finish : (unit -> unit)
val tryNextSub : (int -> unit -> 'a option)
val start : int
val en : System.Collections.Generic.IEnumerator<'a>
val i : int32
val ignore : value:'T -> unit
val subUnFold : ((unit -> 'a option) * ('a -> unit) * (unit -> unit) -> ('a * SplitSubUnfoldState<'a>) option)
val tryNext : (unit -> 'a option)
val isSome : option:'T option -> bool
val s0 : seq<'a>
val start : SplitSubUnfoldState<'a>
val next : SplitSubUnfoldState<'a>
val seqNo : int
val myStart : int
val splitAt : n:int -> s:seq<'a> -> seq<seq<'a>>
val n : int
val s : seq<'a>
val mapi : mapping:(int -> 'T -> 'U) -> source:seq<'T> -> seq<'U>
val i : int
val fst : tuple:('T1 * 'T2) -> 'T1
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
val snd : tuple:('T1 * 'T2) -> 'T2
val truncate : count:int -> source:seq<'T> -> seq<'T>
val tryHeadTail : fhead:('a -> 'b) -> ftail:('b -> seq<'a> -> 'c) -> s:seq<'a> -> 'c option
val fhead : ('a -> 'b)
val ftail : ('b -> seq<'a> -> 'c)
union case Choice.Choice1Of3: 'T1 -> Choice<'T1,'T2,'T3>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State
val tryHead : source:seq<'T> -> 'T option
val map : mapping:('T -> 'U) -> option:'T option -> 'U option
union case Choice.Choice2Of3: 'T2 -> Choice<'T1,'T2,'T3>
val headv : 'b
union case Choice.Choice3Of3: 'T3 -> Choice<'T1,'T2,'T3>
val result : Choice<unit,'b option,'c>
val empty<'T> : seq<'T>
val v : 'c
val x : int
val printfn : format:Printf.TextWriterFormat<'T> -> 'T
val skip : count:int -> source:seq<'T> -> seq<'T>
val iter : action:('T -> unit) -> source:seq<'T> -> unit
val s : seq<int>
val cache : source:seq<'T> -> seq<'T>
val length : source:seq<'T> -> int
val toList : source:seq<'T> -> 'T list
val toArray : source:seq<'T> -> 'T []
val x : char
val s : seq<char>
Multiple items
type String =
  new : value:char[] -> string + 8 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool + 3 overloads
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 3 overloads
  member EnumerateRunes : unit -> StringRuneEnumerator
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  ...

--------------------
System.String(value: char []) : System.String
System.String(value: nativeptr<char>) : System.String
System.String(value: nativeptr<sbyte>) : System.String
System.String(value: System.ReadOnlySpan<char>) : System.String
System.String(c: char, count: int) : System.String
System.String(value: char [], startIndex: int, length: int) : System.String
System.String(value: nativeptr<char>, startIndex: int, length: int) : System.String
System.String(value: nativeptr<sbyte>, startIndex: int, length: int) : System.String
System.String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: System.Text.Encoding) : System.String
val id : x:'T -> 'T
val head : int
val tail : seq<int>
namespace System.IO
val splitFile : path:string -> file:string -> unit
val path : string
val file : string
type File =
  static member AppendAllLines : path:string * contents:IEnumerable<string> -> unit + 1 overload
  static member AppendAllLinesAsync : path:string * contents:IEnumerable<string> * ?cancellationToken:CancellationToken -> Task + 1 overload
  static member AppendAllText : path:string * contents:string -> unit + 1 overload
  static member AppendAllTextAsync : path:string * contents:string * ?cancellationToken:CancellationToken -> Task + 1 overload
  static member AppendText : path:string -> StreamWriter
  static member Copy : sourceFileName:string * destFileName:string -> unit + 1 overload
  static member Create : path:string -> FileStream + 2 overloads
  static member CreateText : path:string -> StreamWriter
  static member Decrypt : path:string -> unit
  static member Delete : path:string -> unit
  ...
File.ReadLines(path: string) : System.Collections.Generic.IEnumerable<string>
File.ReadLines(path: string, encoding: System.Text.Encoding) : System.Collections.Generic.IEnumerable<string>
Multiple items
module Seq

from Script

--------------------
module Seq

from Microsoft.FSharp.Collections
val splitBy : f:('a -> bool) -> opt:Seq.SplitByOption -> input:seq<'a> -> seq<seq<'a>>


 Straight scan through is efficient, reusing seqs causes rescan from beginning
val l : string
union case Seq.SplitByOption.IncludeInSecond: Seq.SplitByOption
val head : seq<string>
val filelist : string []
val s : string
val last : source:seq<'T> -> 'T
val tail : seq<seq<string>>
val collect : mapping:('T -> #seq<'U>) -> source:seq<'T> -> seq<'U>
union case Seq.SplitByOption.IncludeInFirst: Seq.SplitByOption
val iteri : action:(int -> 'T -> unit) -> source:seq<'T> -> unit
val s : seq<string>
val head : string
val tail : seq<string>
File.WriteAllLines(path: string, contents: System.Collections.Generic.IEnumerable<string>) : unit
File.WriteAllLines(path: string, contents: string []) : unit
File.WriteAllLines(path: string, contents: System.Collections.Generic.IEnumerable<string>, encoding: System.Text.Encoding) : unit
File.WriteAllLines(path: string, contents: string [], encoding: System.Text.Encoding) : unit
val append : source1:seq<'T> -> source2:seq<'T> -> seq<'T>
val defaultValue : value:'T -> option:'T option -> 'T
Next Version Raw view Test code New version

More information

Link:http://fssnip.net/84n
Posted:3 years ago
Author:Abelardo Mieres
Tags: sequences