5 people like it.

Seq.splitBy, Seq.splitAt, Seq.tryHeadTail

The example uses Seq.splitBy and Seq.tryHeadTail twice to split the input file in two and then into several pieces, collecting the output file names from the first section and the content from the subsequent pieces. It can efficiently split a huge 2.2GB file into several subfiles without having to rescan the file or cache in memory any portion of the file apart from the current line.

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
module Seq =
    /// Where the element that triggers a split ends up:
    /// dropped, kept at the end of the piece it closes, or kept at the start of the next piece.
    [<RequireQualifiedAccess>]
    type SplitByOption =
        | Exclude
        | IncludeInFirst
        | IncludeInSecond

    /// State threaded through the unfold that produces ONE sub-sequence.
    [<RequireQualifiedAccess>]
    type private SplitSubUnfoldState<'T> =
        /// Emit the carried value, then continue with the carried state.
        | PostValue of 'T * SplitSubUnfoldState<'T>
        /// Not started yet: sub-sequence number and its start position in the input.
        | Start of seqNo: int * start: int
        /// Running: how to fetch the next element, and what to call on a splitter / at end of input.
        | Started of tryNext: (unit -> 'T option) * bingo: ('T -> unit) * finish: (unit -> unit)
        /// Nothing more to emit.
        | Finish

    /// Mutable state shared by all sub-sequences of one enumeration of the split result.
    [<RequireQualifiedAccess>]
    type private SplitUnfoldState<'T> =
        { enumerator: System.Collections.Generic.IEnumerator<'T>
          mutable currentPos: int
          mutable isDone: bool
          mutable splitterO: 'T option
          mutable currentSeqO: 'T seq option }

    /// Lazily splits `input` into consecutive sub-sequences, closing the current piece every time
    /// `f` returns true for an element (the "splitter").
    ///
    /// `opt` decides what happens to the splitter: `Exclude` drops it, `IncludeInFirst` emits it as
    /// the last element of the piece it closes, `IncludeInSecond` emits it as the first element of
    /// the following piece.
    ///
    /// Straight scan through is efficient, reusing seqs causes rescan from beginning
    let splitBy f opt (input: 'a seq) =
        let getEnumerator() = input.GetEnumerator()
        // Fresh shared state (with its own enumerator) plus sub-sequence number 0.
        // Built lazily by the generator below when it is handed the initial `None` state.
        let startingState() = 
            {
                SplitUnfoldState.enumerator   = getEnumerator()
                SplitUnfoldState.currentPos   = 0
                SplitUnfoldState.isDone       = false
                SplitUnfoldState.splitterO    = None
                SplitUnfoldState.currentSeqO  = None
            }, 0

        // Outer unfold: each step yields one sub-sequence.
        Seq.unfold(fun stateO ->
            let state, currentSeqNo = stateO |> Option.defaultWith startingState

            // Pulls the next element from the shared enumerator and counts it in currentPos.
            // Disposes the enumerator once MoveNext reports the end of the input.
            let tryNextMain() : 'a option=
                if state.isDone then None else
                if state.enumerator.MoveNext() then
                    state.currentPos  <- state.currentPos + 1
                    Some state.enumerator.Current
                else
                    state.enumerator.Dispose()
                    None
            // Called when a splitter is hit: remember it and mark the current piece as closed.
            let bingo  v  = state.splitterO <- Some v ; state.currentSeqO <- None
            // Called when the input is exhausted: clear everything and flag the scan as done.
            let finish () = state.splitterO <- None   ; state.currentSeqO <- None ; state.isDone <- true   

            // Reader used when a piece is enumerated out of step with the shared enumerator:
            // opens a NEW enumerator on `input` and discards the first `start` elements
            // (this is the "rescan from beginning").
            // NOTE(review): `en` is disposed only when MoveNext returns false, so a piece that is
            // abandoned part-way appears to leave this enumerator undisposed.
            let tryNextSub start  =
                let en = getEnumerator()
                for i in 0..start - 1 do (en.MoveNext() |> ignore)
                fun () ->
                    if en.MoveNext()   then Some en.Current
                    else                    en.Dispose()
                                            None

            // One step of a sub-sequence: fetch an element and decide whether it belongs to the
            // piece (continue in Started), closes it (splitter), or the input has ended.
            let subUnFold(tryNext: unit -> 'a option, bingo, finish) = 
                match tryNext() with
                | None      ->  finish()
                                None
                | Some v    ->
                    // On a splitter the piece ends; with IncludeInFirst the splitter is still
                    // emitted as the piece's final element before Finish stops the unfold.
                    if f v then bingo v
                                if opt = SplitByOption.IncludeInFirst 
                                then Some( v, SplitSubUnfoldState.Finish                          )
                                else None
                    else             Some( v, SplitSubUnfoldState.Started(tryNext, bingo, finish) )

            //printfn "Unfold %A" currentSeqNo
            // The previously yielded piece is still registered as current, i.e. the shared
            // enumerator has not reached its splitter (or the end of input) yet: advance the shared
            // enumerator, discarding elements, until bingo or finish clears currentSeqO.
            while state.currentSeqO |> Option.isSome do 
                //printfn "skipping"
                subUnFold(tryNextMain, bingo , finish) |> ignore
            if state.isDone then None else
            // The remembered splitter is only needed when it has to open the next piece.
            if opt <> SplitByOption.IncludeInSecond then state.splitterO <- None
            // Build the next piece. It starts at the current shared position; with IncludeInSecond
            // a remembered splitter is emitted first via PostValue.
            let s0 =
                let start = SplitSubUnfoldState.Start(currentSeqNo, state.currentPos)
                match state.splitterO, opt with
                | Some v, SplitByOption.IncludeInSecond -> SplitSubUnfoldState.PostValue(v, start)
                | _                                   ->                                  start
                |> Seq.unfold(function
                    | SplitSubUnfoldState.PostValue(v, next)                 -> Some(v, next)
                    | SplitSubUnfoldState.Finish                             -> None
                    | SplitSubUnfoldState.Started(tryNext, bingo, finish)    -> subUnFold(tryNext, bingo, finish)
                    // First element requested: if the shared enumerator is still at this piece's
                    // start, read straight from it (updating the shared state); otherwise rescan
                    // with a private enumerator and no-op callbacks so shared state is untouched.
                    | SplitSubUnfoldState.Start(seqNo, myStart)              -> //printfn "Starting %d at %d = %d" seqNo myStart state.currentPos
                                                                                if state.currentPos = myStart // && not state.isDone 
                                                                                then subUnFold(tryNextMain       , bingo , finish)
                                                                                else subUnFold(tryNextSub myStart, ignore, ignore)
                )
            // Register the piece as current so the next outer step knows it may need draining.
            state.currentSeqO <- Some s0
            Some (s0, Some(state, currentSeqNo + 1) )
        ) None

    /// Splits `s` at index `n`: yields at most two sub-sequences, the elements before index `n`
    /// and the elements from index `n` onwards. Built on `splitBy`, so the same
    /// "scan straight through" caveat applies.
    let splitAt n s =
        // Pair every element with its position so the split point can be recognised by index.
        let tagged = s |> Seq.mapi (fun idx item -> idx, item)
        let atSplitIndex = fst >> ((=) n)
        let pieces = tagged |> splitBy atSplitIndex SplitByOption.IncludeInSecond
        // Drop the positions again and keep only the "before" and "from n" pieces.
        pieces
        |> Seq.map (fun piece -> piece |> Seq.map snd)
        |> Seq.truncate 2

    /// Processes the first element of `s` with `fhead`, then hands that result together with the
    /// remaining elements to `ftail`. Returns `Some` of `ftail`'s result, or `None` when `s` has
    /// no first element. When there is a head but no second piece, `ftail` receives an empty
    /// sequence.
    let tryHeadTail fhead ftail s =
        // Fold over the (at most two) pieces produced by `splitAt 1`:
        //   Choice1Of3 - nothing seen yet, Choice2Of3 - head processed, Choice3Of3 - tail processed.
        let step progress piece =
            match progress with
            | Choice1Of3 ()           -> piece |> Seq.tryHead |> Option.map fhead |> Choice2Of3
            | Choice2Of3 (Some headv) -> piece |> ftail headv |> Choice3Of3
            | settled                 -> settled
        match Seq.fold step (Choice1Of3 ()) (splitAt 1 s) with
        | Choice2Of3 (Some headv) -> Some (ftail headv Seq.empty)
        | Choice3Of3 v            -> Some v
        | _                       -> None

/////////// Sample Usage

    // Demo 1: split a list of ints on -1 (separator dropped), skip the first two pieces and
    // print each remaining piece with its length. The Seq.map stage traces every element that
    // is actually pulled from the source.
    [ -1 ; 0 ; 1; 2; 3; -1; 3; 5; 7; -1; 2; 3; 9 ; -1] 
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> splitBy ((=) -1) SplitByOption.Exclude
    |> Seq.skip 2
    //|> Seq.take 2
    |> Seq.iter (fun s -> 
        // Cache the piece so that Seq.toList and Seq.length share a single pass over it.
        let s = Seq.cache s
        Seq.length s |> printfn " ==> %A %A" (Seq.toList s) ) 

    // Demo 2: split a sentence on spaces (each space stays at the end of its word), skip the
    // first three pieces and print each remaining piece as a string with its length.
    "Hello friend how are you? Good "
    |> Seq.toArray
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> splitBy ((=) ' ') SplitByOption.IncludeInFirst
    //|> Seq.iter (printfn "%A")
    |> Seq.skip 3
    //|> Seq.take 2
    |> Seq.iter (fun s -> 
        let s = Seq.cache s
        Seq.length s |> printfn " ==> %A %A" (System.String(Seq.toArray s)) ) 


    // Demo 3: split at index 2 and print the resulting sequence of pieces.
    [ 1 ; 2 ; 3 ; 4 ; 5 ; 6 ; 7 ; 8 ]
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> splitAt  2
    |> printfn "%A"

    // Demo 4: take the head unchanged (id) and print it together with the tail;
    // the fallback message is printed when the input has no head.
    [ 1 ; 2 ; 3 ; 4 ; 5 ; 6 ; 7 ; 8 ]
    |> Seq.map  (fun x -> printfn "---> %A" x; x)
    |> tryHeadTail id     (fun head tail -> printfn "head = %A, tail = %A" head tail )
    |> Option.defaultWith (fun ()        -> printfn "No Head Tail"                   )


// The following example uses Seq.splitBy and Seq.tryHeadTail 
// twice to split the input file first in 2 and then into several pieces
// collecting the output file names in the first section and the content in the subsequent pieces
// it can split a huge 2.2GB file into several subfiles without having to rescan the file
// or cache in memory any portions of the file apart from the current line

// the first Seq.tryHeadTail operates on a seq<seq<string>>. Because of the nested seq it must use 2
// separate functions to process first the head and then the tail without backtracking,
// the result from the first function is passed to the second:
//     |> Seq.tryHeadTail (fun head -> ...; filelist) (fun filelist tail -> ...)

// the second Seq.tryHeadTail operates on a seq<string>, because it is not nested seqs it can be called simply:
//     |> Seq.tryHeadTail id (fun head tail -> ...)

open System.IO

/// Splits the report stored in `file` into one ".rpt" file per result set, each written to
/// `path + name + ".rpt"` (plain string concatenation, so `path` must already end with a
/// separator or prefix as desired).
///
/// How the input is read by the code below:
///   * the first line is ignored;
///   * the lines before the first line ending in "---*/" form the header; the text after the
///     last '.' of each header line is taken as an output name;
///   * the rest is cut after every line that ends in " rows affected)" or equals
///     "(1 row affected)"; the first line of each piece is dropped and piece i is written under
///     the i-th name. Pieces beyond the number of names are read but not written.
///
/// An empty input file, or a piece that turns out to be empty, is skipped instead of raising.
let splitFile path file =
    // Lazy, non-throwing replacement for `Seq.skip 1`: Seq.skip raises InvalidOperationException
    // when the sequence is empty, which happens for an empty file and for the empty trailing
    // piece splitBy yields when the data ends on a splitter line.
    let dropFirst (lines: seq<string>) : seq<string> = System.Linq.Enumerable.Skip(lines, 1)

    File.ReadLines file
    //|> Seq.mapi    (fun i v -> printfn "%d -> %s" i v; v)
    |> dropFirst
    |> Seq.splitBy    (fun l    -> l.EndsWith "---*/") Seq.SplitByOption.IncludeInSecond
    |> Seq.tryHeadTail(fun head ->
        // Header section: materialise the output names before the rest of the file is scanned.
        let filelist = head |> Seq.map (fun s-> s.Split('.') |> Seq.last) |> Seq.toArray
        for name in filelist do printfn "file: %A" name
        printfn "--------"
        filelist
    ) (fun filelist tail ->
        tail
        |> Seq.collect id
        |> Seq.splitBy (fun l -> l.EndsWith " rows affected)" || l = "(1 row affected)") Seq.SplitByOption.IncludeInFirst
        |> Seq.map dropFirst
        |> Seq.iteri(fun i s ->
            if i < filelist.Length then
                s 
                |> Seq.tryHeadTail id (fun head tail -> 
                    printfn "%s => %s" filelist.[i] head
                    // `tail` is streamed straight from the input while the file is being written.
                    File.WriteAllLines(path + filelist.[i] + ".rpt", Seq.append [head] tail) 
                )
                |> Option.defaultValue ()
        )
    )
    |> Option.defaultValue ()
module Seq

from Microsoft.FSharp.Collections
Multiple items
type RequireQualifiedAccessAttribute =
  inherit Attribute
  new : unit -> RequireQualifiedAccessAttribute

--------------------
new : unit -> RequireQualifiedAccessAttribute
type SplitByOption =
  | Exclude
  | IncludeInFirst
  | IncludeInSecond
union case SplitByOption.Exclude: SplitByOption
union case SplitByOption.IncludeInFirst: SplitByOption
union case SplitByOption.IncludeInSecond: SplitByOption
type private SplitSubUnfoldState<'T> =
  | PostValue of 'T * SplitSubUnfoldState<'T>
  | Start of seqNo: int * start: int
  | Started of tryNext: unit -> 'T option * bingo: 'T -> unit * finish: unit -> unit
  | Finish
union case SplitSubUnfoldState.PostValue: 'T * SplitSubUnfoldState<'T> -> SplitSubUnfoldState<'T>
union case SplitSubUnfoldState.Start: seqNo: int * start: int -> SplitSubUnfoldState<'T>
Multiple items
val int : value:'T -> int (requires member op_Explicit)

--------------------
[<Struct>]
type int = int32

--------------------
type int<'Measure> =
  int
union case SplitSubUnfoldState.Started: tryNext: unit -> 'T option * bingo: 'T -> unit * finish: unit -> unit -> SplitSubUnfoldState<'T>
type unit = Unit
type 'T option = Option<'T>
union case SplitSubUnfoldState.Finish: SplitSubUnfoldState<'T>
type private SplitUnfoldState<'T> =
  { enumerator: IEnumerator<'T>
    mutable currentPos: int
    mutable isDone: bool
    mutable splitterO: 'T option
    mutable currentSeqO: seq<'T> option }
SplitUnfoldState.enumerator: System.Collections.Generic.IEnumerator<'T>
namespace System
namespace System.Collections
namespace System.Collections.Generic
type IEnumerator<'T> =
  inherit IDisposable
  inherit IEnumerator
  member Current : 'T
SplitUnfoldState.currentPos: int
SplitUnfoldState.isDone: bool
[<Struct>]
type bool = System.Boolean
SplitUnfoldState.splitterO: 'T option
SplitUnfoldState.currentSeqO: seq<'T> option
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

--------------------
type seq<'T> = System.Collections.Generic.IEnumerable<'T>
val splitBy : f:('a -> bool) -> opt:SplitByOption -> input:seq<'a> -> seq<seq<'a>>
 Straight scan through is efficient, reusing seqs causes rescan from beginning
val f : ('a -> bool)
val opt : SplitByOption
val input : seq<'a>
val getEnumerator : (unit -> System.Collections.Generic.IEnumerator<'a>)
System.Collections.Generic.IEnumerable.GetEnumerator() : System.Collections.Generic.IEnumerator<'a>
val startingState : (unit -> SplitUnfoldState<'a> * int)
union case Option.None: Option<'T>
val unfold : generator:('State -> ('T * 'State) option) -> state:'State -> seq<'T>
val stateO : (SplitUnfoldState<'a> * int) option
val state : SplitUnfoldState<'a>
val currentSeqNo : int
module Option

from Microsoft.FSharp.Core
val defaultWith : defThunk:(unit -> 'T) -> option:'T option -> 'T
val tryNextMain : (unit -> 'a option)
SplitUnfoldState.enumerator: System.Collections.Generic.IEnumerator<'a>
System.Collections.IEnumerator.MoveNext() : bool
union case Option.Some: Value: 'T -> Option<'T>
property System.Collections.Generic.IEnumerator.Current: 'a with get
System.IDisposable.Dispose() : unit
val bingo : ('a -> unit)
val v : 'a
SplitUnfoldState.splitterO: 'a option
SplitUnfoldState.currentSeqO: seq<'a> option
val finish : (unit -> unit)
val tryNextSub : (int -> unit -> 'a option)
val start : int
val en : System.Collections.Generic.IEnumerator<'a>
val i : int32
val ignore : value:'T -> unit
val subUnFold : ((unit -> 'a option) * ('a -> unit) * (unit -> unit) -> ('a * SplitSubUnfoldState<'a>) option)
val tryNext : (unit -> 'a option)
val isSome : option:'T option -> bool
val s0 : seq<'a>
val start : SplitSubUnfoldState<'a>
val next : SplitSubUnfoldState<'a>
val seqNo : int
val myStart : int
val splitAt : n:int -> s:seq<'a> -> seq<seq<'a>>
val n : int
val s : seq<'a>
val mapi : mapping:(int -> 'T -> 'U) -> source:seq<'T> -> seq<'U>
val i : int
val fst : tuple:('T1 * 'T2) -> 'T1
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
val snd : tuple:('T1 * 'T2) -> 'T2
val truncate : count:int -> source:seq<'T> -> seq<'T>
val tryHeadTail : fhead:('a -> 'b) -> ftail:('b -> seq<'a> -> 'c) -> s:seq<'a> -> 'c option
val fhead : ('a -> 'b)
val ftail : ('b -> seq<'a> -> 'c)
union case Choice.Choice1Of3: 'T1 -> Choice<'T1,'T2,'T3>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State
val tryHead : source:seq<'T> -> 'T option
val map : mapping:('T -> 'U) -> option:'T option -> 'U option
union case Choice.Choice2Of3: 'T2 -> Choice<'T1,'T2,'T3>
val headv : 'b
union case Choice.Choice3Of3: 'T3 -> Choice<'T1,'T2,'T3>
val result : Choice<unit,'b option,'c>
val empty<'T> : seq<'T>
val v : 'c
val x : int
val printfn : format:Printf.TextWriterFormat<'T> -> 'T
val skip : count:int -> source:seq<'T> -> seq<'T>
val iter : action:('T -> unit) -> source:seq<'T> -> unit
val s : seq<int>
val cache : source:seq<'T> -> seq<'T>
val length : source:seq<'T> -> int
val toList : source:seq<'T> -> 'T list
val toArray : source:seq<'T> -> 'T []
val x : char
val s : seq<char>
Multiple items
type String =
  interface IComparable
  interface IEnumerable
  interface IConvertible
  interface IEnumerable<char>
  interface IComparable<string>
  interface IEquatable<string>
  interface ICloneable
  new : value: char [] -> unit + 8 overloads
  member Clone : unit -> obj
  member CompareTo : value: obj -> int + 1 overload
  ...

--------------------
System.String(value: char []) : System.String
System.String(value: nativeptr<char>) : System.String
System.String(value: nativeptr<sbyte>) : System.String
System.String(value: System.ReadOnlySpan<char>) : System.String
System.String(c: char, count: int) : System.String
System.String(value: char [], startIndex: int, length: int) : System.String
System.String(value: nativeptr<char>, startIndex: int, length: int) : System.String
System.String(value: nativeptr<sbyte>, startIndex: int, length: int) : System.String
System.String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: System.Text.Encoding) : System.String
val id : x:'T -> 'T
val head : int
val tail : seq<int>
namespace System.IO
val splitFile : path:string -> file:string -> unit
val path : string
val file : string
type File =
  static member AppendAllLines : path: string * contents: IEnumerable<string> -> unit + 1 overload
  static member AppendAllLinesAsync : path: string * contents: IEnumerable<string> *?cancellationToken: CancellationToken -> Task + 1 overload
  static member AppendAllText : path: string * contents: string -> unit + 1 overload
  static member AppendAllTextAsync : path: string * contents: string *?cancellationToken: CancellationToken -> Task + 1 overload
  static member AppendText : path: string -> StreamWriter
  static member AsyncStreamReader : path: string * encoding: Encoding -> StreamReader
  static member AsyncStreamWriter : path: string * encoding: Encoding * append: bool -> StreamWriter
  static member Copy : sourceFileName: string * destFileName: string -> unit + 1 overload
  static member Create : path: string -> FileStream + 2 overloads
  static member CreateText : path: string -> StreamWriter
  ...
File.ReadLines(path: string) : System.Collections.Generic.IEnumerable<string>
File.ReadLines(path: string, encoding: System.Text.Encoding) : System.Collections.Generic.IEnumerable<string>
Multiple items
module Seq

from Script

--------------------
module Seq

from Microsoft.FSharp.Collections
val splitBy : f:('a -> bool) -> opt:Seq.SplitByOption -> input:seq<'a> -> seq<seq<'a>>
 Straight scan through is efficient, reusing seqs causes rescan from beginning
val l : string
System.String.EndsWith(value: char) : bool
System.String.EndsWith(value: string) : bool
System.String.EndsWith(value: string, comparisonType: System.StringComparison) : bool
System.String.EndsWith(value: string, ignoreCase: bool, culture: System.Globalization.CultureInfo) : bool
union case Seq.SplitByOption.IncludeInSecond: Seq.SplitByOption
val head : seq<string>
val filelist : string []
val s : string
System.String.Split([<System.ParamArray>] separator: char []) : string []
System.String.Split(separator: string [], options: System.StringSplitOptions) : string []
System.String.Split(separator: string,?options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int) : string []
System.String.Split(separator: char,?options: System.StringSplitOptions) : string []
System.String.Split(separator: string [], count: int, options: System.StringSplitOptions) : string []
System.String.Split(separator: string, count: int,?options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int, options: System.StringSplitOptions) : string []
System.String.Split(separator: char, count: int,?options: System.StringSplitOptions) : string []
val last : source:seq<'T> -> 'T
val tail : seq<seq<string>>
val collect : mapping:('T -> #seq<'U>) -> source:seq<'T> -> seq<'U>
union case Seq.SplitByOption.IncludeInFirst: Seq.SplitByOption
val iteri : action:(int -> 'T -> unit) -> source:seq<'T> -> unit
val s : seq<string>
property System.Array.Length: int with get
val head : string
val tail : seq<string>
File.WriteAllLines(path: string, contents: System.Collections.Generic.IEnumerable<string>) : unit
File.WriteAllLines(path: string, contents: string []) : unit
File.WriteAllLines(path: string, contents: System.Collections.Generic.IEnumerable<string>, encoding: System.Text.Encoding) : unit
File.WriteAllLines(path: string, contents: string [], encoding: System.Text.Encoding) : unit
val append : source1:seq<'T> -> source2:seq<'T> -> seq<'T>
val defaultValue : value:'T -> option:'T option -> 'T
Raw view Test code New version

More information

Link:http://fssnip.net/8ac
Posted:11 months ago
Author:amieres
Tags: #seq