5 people like it.
Like the snippet!
Seq.splitBy, Seq.splitAt, Seq.tryHeadTail
The example uses Seq.splitBy and Seq.tryHeadTail twice: first to split the input file in two, and then to split the remainder into several pieces, collecting the output file names from the first section and the file contents from the subsequent pieces.
It can efficiently split a huge 2.2GB file into several subfiles without having to rescan the file or cache in memory any portions of the file apart from the current line.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
161:
162:
163:
164:
165:
166:
167:
168:
169:
170:
171:
172:
173:
174:
175:
176:
177:
178:
179:
180:
181:
|
module Seq =

    /// Selects what happens to an element for which the split predicate returns true.
    [<RequireQualifiedAccess>]
    type SplitByOption =
        /// The splitter element is dropped from the output.
        | Exclude
        /// The splitter element is emitted as the last element of the segment it terminates.
        | IncludeInFirst
        /// The splitter element is emitted as the first element of the segment that follows.
        | IncludeInSecond

    /// State of the inner `Seq.unfold` that produces one segment.
    [<RequireQualifiedAccess>]
    type private SplitSubUnfoldState<'T> =
        /// Emit a pending value (the retained splitter), then continue with the given state.
        | PostValue of 'T * SplitSubUnfoldState<'T>
        /// Segment not enumerated yet; `start` is the number of source elements
        /// that had been read from the shared enumerator when the segment was created.
        | Start of seqNo: int * start: int
        /// Enumeration in progress, pulling elements from `tryNext`.
        | Started of tryNext: (unit -> 'T option) * bingo: ('T -> unit) * finish: (unit -> unit)
        /// Nothing left to emit.
        | Finish

    /// Mutable state shared by every segment of one enumeration of the outer sequence.
    [<RequireQualifiedAccess>]
    type private SplitUnfoldState<'T> =
        {
            /// Single forward cursor over the input.
            enumerator: System.Collections.Generic.IEnumerator<'T>
            /// Number of elements read from `enumerator` so far.
            mutable currentPos: int
            /// Set once `enumerator` has been exhausted.
            mutable isDone: bool
            /// The most recent splitter element, if any.
            mutable splitterO: 'T option
            /// The segment currently being produced, until its splitter (or the end) is reached.
            mutable currentSeqO: 'T seq option
        }

    /// Splits `input` into consecutive segments, ending a segment at every element for which
    /// `f` returns true; `opt` decides whether that element is dropped, kept at the end of the
    /// segment it closes, or kept at the start of the next one.
    ///
    /// Straight scan through is efficient, reusing seqs causes rescan from beginning:
    /// a segment that is enumerated while the shared cursor sits at its start reads directly from
    /// that cursor; any other enumeration of a segment opens a fresh enumerator over `input`
    /// and skips forward to the segment's start.
    ///
    /// NOTE(review): the shared enumerator is disposed only when the input is exhausted; if the
    /// outer sequence is abandoned early it appears to stay open - confirm whether that matters
    /// for your source.
    let splitBy f opt (input: 'a seq) =
        let getEnumerator () = input.GetEnumerator()

        let startingState () =
            let fresh =
                {
                    SplitUnfoldState.enumerator = getEnumerator ()
                    SplitUnfoldState.currentPos = 0
                    SplitUnfoldState.isDone = false
                    SplitUnfoldState.splitterO = None
                    SplitUnfoldState.currentSeqO = None
                }
            fresh, 0

        Seq.unfold (fun stateO ->
            // The state is created lazily on the first step of each enumeration of the outer sequence.
            let state, currentSeqNo = stateO |> Option.defaultWith startingState

            // Reads the next element from the shared cursor, counting how many were consumed.
            let tryNextMain () : 'a option =
                if state.isDone then
                    None
                elif state.enumerator.MoveNext() then
                    state.currentPos <- state.currentPos + 1
                    Some state.enumerator.Current
                else
                    state.enumerator.Dispose()
                    None

            // A splitter was hit on the shared cursor: remember it and close the current segment.
            let bingo v =
                state.splitterO <- Some v
                state.currentSeqO <- None

            // The shared cursor ran out of elements.
            let finish () =
                state.splitterO <- None
                state.currentSeqO <- None
                state.isDone <- true

            // Opens a private enumerator positioned `start` elements into the input.
            // Returns the element reader together with an action that releases the enumerator,
            // so the caller can dispose it when the segment ends at a splitter rather than
            // at the end of the input.
            let tryNextSub start =
                let en = getEnumerator ()
                for _ in 0 .. start - 1 do
                    en.MoveNext() |> ignore
                let tryNext () =
                    if en.MoveNext() then
                        Some en.Current
                    else
                        en.Dispose()
                        None
                tryNext, (fun () -> en.Dispose())

            // One step of a segment: stop at the end of the input or at a splitter,
            // otherwise emit the element and stay in the `Started` state.
            let subUnFold (tryNext: unit -> 'a option, bingo, finish) =
                match tryNext () with
                | None ->
                    finish ()
                    None
                | Some v ->
                    if f v then
                        bingo v
                        if opt = SplitByOption.IncludeInFirst then
                            Some(v, SplitSubUnfoldState.Finish)
                        else
                            None
                    else
                        Some(v, SplitSubUnfoldState.Started(tryNext, bingo, finish))

            //printfn "Unfold %A" currentSeqNo
            // If the previous segment was not consumed up to its splitter, advance the shared
            // cursor past it before starting the next segment.
            while state.currentSeqO |> Option.isSome do
                //printfn "skipping"
                subUnFold (tryNextMain, bingo, finish) |> ignore

            if state.isDone then
                None
            else
                // Only IncludeInSecond needs the splitter carried over into the next segment.
                if opt <> SplitByOption.IncludeInSecond then state.splitterO <- None

                let s0 =
                    let start = SplitSubUnfoldState.Start(currentSeqNo, state.currentPos)
                    let first =
                        match state.splitterO, opt with
                        | Some v, SplitByOption.IncludeInSecond -> SplitSubUnfoldState.PostValue(v, start)
                        | _ -> start
                    first
                    |> Seq.unfold (function
                        | SplitSubUnfoldState.PostValue(v, next) -> Some(v, next)
                        | SplitSubUnfoldState.Finish -> None
                        | SplitSubUnfoldState.Started(tryNext, bingo, finish) -> subUnFold (tryNext, bingo, finish)
                        | SplitSubUnfoldState.Start(_, myStart) ->
                            //printfn "Starting %d at %d = %d" seqNo myStart state.currentPos
                            if state.currentPos = myStart then
                                // The shared cursor is exactly at this segment's start: stream from it.
                                subUnFold (tryNextMain, bingo, finish)
                            else
                                // The shared cursor has moved on: rescan with a private enumerator
                                // and release it as soon as the segment's splitter is reached.
                                let tryNext, release = tryNextSub myStart
                                subUnFold (tryNext, (fun _ -> release ()), ignore)
                    )

                state.currentSeqO <- Some s0
                Some(s0, Some(state, currentSeqNo + 1))
        ) None

    /// Splits `s` into at most two sequences: the elements before index `n`, and the
    /// elements from index `n` onwards.
    let splitAt n s =
        s
        |> Seq.mapi (fun i v -> i, v)
        |> splitBy (fst >> ((=) n)) SplitByOption.IncludeInSecond
        |> Seq.map (Seq.map snd)
        |> Seq.truncate 2

    /// Applies `fhead` to the first element of `s` and then `ftail` to that result and the
    /// remaining elements. Returns `None` when `s` is empty; when `s` has exactly one element
    /// `ftail` receives an empty sequence.
    let tryHeadTail fhead ftail s =
        ( Choice1Of3(), splitAt 1 s )
        ||> Seq.fold (function
            | Choice1Of3 ( ) -> Seq.tryHead >> Option.map fhead >> Choice2Of3
            | Choice2Of3 (Some headv) -> ftail headv >> Choice3Of3
            | result -> fun _ -> result
        )
        |> function
            | Choice2Of3 (Some headv) -> Seq.empty |> ftail headv |> Some
            | Choice3Of3 v -> Some v
            | _ -> None

    /////////// Sample Usage

    // Integers split on -1; the first two segments are skipped without being enumerated.
    [ -1 ; 0 ; 1; 2; 3; -1; 3; 5; 7; -1; 2; 3; 9 ; -1]
    |> Seq.map (fun x -> printfn "---> %A" x; x)
    |> splitBy ((=) -1) SplitByOption.Exclude
    |> Seq.skip 2
    //|> Seq.take 2
    |> Seq.iter (fun s ->
        let s = Seq.cache s
        Seq.length s |> printfn " ==> %A %A" (Seq.toList s) )

    // Characters split on spaces, keeping each space at the end of its word.
    "Hello friend how are you? Good "
    |> Seq.toArray
    |> Seq.map (fun x -> printfn "---> %A" x; x)
    |> splitBy ((=) ' ') SplitByOption.IncludeInFirst
    //|> Seq.iter (printfn "%A")
    |> Seq.skip 3
    //|> Seq.take 2
    |> Seq.iter (fun s ->
        let s = Seq.cache s
        Seq.length s |> printfn " ==> %A %A" (System.String(Seq.toArray s)) )

    // Split at a fixed index.
    [ 1 ; 2 ; 3 ; 4 ; 5 ; 6 ; 7 ; 8 ]
    |> Seq.map (fun x -> printfn "---> %A" x; x)
    |> splitAt 2
    |> printfn "%A"

    // Head/tail decomposition.
    [ 1 ; 2 ; 3 ; 4 ; 5 ; 6 ; 7 ; 8 ]
    |> Seq.map (fun x -> printfn "---> %A" x; x)
    |> tryHeadTail id (fun head tail -> printfn "head = %A, tail = %A" head tail )
    |> Option.defaultWith (fun () -> printfn "No Head Tail" )
// The following example uses Seq.splitBy and Seq.tryHeadTail twice:
// first to split the input file in two, and then to split the remainder into several pieces,
// collecting the output file names from the first section and the content from the subsequent pieces.
// It can split a huge 2.2GB file into several subfiles without having to rescan the file
// or cache in memory any portion of the file apart from the current line.
// The first Seq.tryHeadTail operates on a seq<seq<string>>. Because of the nested seq it must use 2
// separate functions to process first the head and then the tail without backtracking;
// the result of the first function is passed to the second:
//     |> Seq.tryHeadTail (fun head -> ...; filelist) (fun filelist tail -> ...)
// The second Seq.tryHeadTail operates on a seq<string>; because it is not a nested seq it can be called simply:
//     |> Seq.tryHeadTail id (fun head tail -> ...)
open System.IO
/// Splits the text file `file` into several ".rpt" files.
///
/// The first line of the input is ignored. The lines up to (not including) the first line
/// ending in "---*/" form the header; the text after the last '.' of each header line is
/// taken as an output name. The rest of the input is cut after every line that ends in
/// " rows affected)" or equals "(1 row affected)"; the first line of each piece is dropped,
/// and the i-th piece is written to `path + name_i + ".rpt"`. Pieces beyond the number of
/// header names are ignored, and a piece with no lines left after the drop writes nothing.
///
/// `path` is used as a raw string prefix (no directory separator is inserted).
let splitFile path file =
    // Lazy equivalent of Seq.skip that yields an empty sequence, instead of throwing,
    // when the source has fewer than `n` elements (empty input file, empty piece).
    let skipUpTo n source =
        source
        |> Seq.indexed
        |> Seq.filter (fun (i, _) -> i >= n)
        |> Seq.map snd
    // The markers are literal text, so compare ordinally rather than with the current culture.
    let endsWith (suffix: string) (line: string) =
        line.EndsWith(suffix, System.StringComparison.Ordinal)
    File.ReadLines file
    //|> Seq.mapi (fun i v -> printfn "%d -> %s" i v; v)
    |> skipUpTo 1
    |> Seq.splitBy (endsWith "---*/") Seq.SplitByOption.IncludeInSecond
    |> Seq.tryHeadTail (fun head ->
        // Header section: one output name per line.
        let filelist = head |> Seq.map (fun s -> s.Split('.') |> Seq.last) |> Seq.toArray
        for file in filelist do printfn "file: %A" file
        printfn "--------"
        filelist
    ) (fun filelist tail ->
        tail
        |> Seq.collect id
        |> Seq.splitBy (fun l -> endsWith " rows affected)" l || l = "(1 row affected)") Seq.SplitByOption.IncludeInFirst
        |> Seq.map (skipUpTo 1)
        |> Seq.iteri (fun i s ->
            if i < filelist.Length then
                s
                |> Seq.tryHeadTail id (fun head tail ->
                    printfn "%s => %s" filelist.[i] head
                    File.WriteAllLines(path + filelist.[i] + ".rpt", Seq.append [head] tail)
                )
                |> Option.defaultValue ()
        )
    )
    |> Option.defaultValue ()
|
module Seq
from Microsoft.FSharp.Collections
Multiple items
type RequireQualifiedAccessAttribute =
inherit Attribute
new : unit -> RequireQualifiedAccessAttribute
--------------------
new : unit -> RequireQualifiedAccessAttribute
type SplitByOption =
| Exclude
| IncludeInFirst
| IncludeInSecond
union case SplitByOption.Exclude: SplitByOption
union case SplitByOption.IncludeInFirst: SplitByOption
union case SplitByOption.IncludeInSecond: SplitByOption
type private SplitSubUnfoldState<'T> =
| PostValue of 'T * SplitSubUnfoldState<'T>
| Start of seqNo: int * start: int
| Started of tryNext: unit -> 'T option * bingo: 'T -> unit * finish: unit -> unit
| Finish
union case SplitSubUnfoldState.PostValue: 'T * SplitSubUnfoldState<'T> -> SplitSubUnfoldState<'T>
union case SplitSubUnfoldState.Start: seqNo: int * start: int -> SplitSubUnfoldState<'T>
Multiple items
val int : value:'T -> int (requires member op_Explicit)
--------------------
[<Struct>]
type int = int32
--------------------
type int<'Measure> =
int
union case SplitSubUnfoldState.Started: tryNext: unit -> 'T option * bingo: 'T -> unit * finish: unit -> unit -> SplitSubUnfoldState<'T>
type unit = Unit
type 'T option = Option<'T>
union case SplitSubUnfoldState.Finish: SplitSubUnfoldState<'T>
type private SplitUnfoldState<'T> =
{ enumerator: IEnumerator<'T>
mutable currentPos: int
mutable isDone: bool
mutable splitterO: 'T option
mutable currentSeqO: seq<'T> option }
SplitUnfoldState.enumerator: System.Collections.Generic.IEnumerator<'T>
namespace System
namespace System.Collections
namespace System.Collections.Generic
type IEnumerator<'T> =
inherit IDisposable
inherit IEnumerator
member Current : 'T
SplitUnfoldState.currentPos: int
SplitUnfoldState.isDone: bool
[<Struct>]
type bool = System.Boolean
SplitUnfoldState.splitterO: 'T option
SplitUnfoldState.currentSeqO: seq<'T> option
Multiple items
val seq : sequence:seq<'T> -> seq<'T>
--------------------
type seq<'T> = System.Collections.Generic.IEnumerable<'T>
val splitBy : f:('a -> bool) -> opt:SplitByOption -> input:seq<'a> -> seq<seq<'a>>
Straight scan through is efficient, reusing seqs causes rescan from beginning
val f : ('a -> bool)
val opt : SplitByOption
val input : seq<'a>
val getEnumerator : (unit -> System.Collections.Generic.IEnumerator<'a>)
System.Collections.Generic.IEnumerable.GetEnumerator() : System.Collections.Generic.IEnumerator<'a>
val startingState : (unit -> SplitUnfoldState<'a> * int)
union case Option.None: Option<'T>
val unfold : generator:('State -> ('T * 'State) option) -> state:'State -> seq<'T>
val stateO : (SplitUnfoldState<'a> * int) option
val state : SplitUnfoldState<'a>
val currentSeqNo : int
module Option
from Microsoft.FSharp.Core
val defaultWith : defThunk:(unit -> 'T) -> option:'T option -> 'T
val tryNextMain : (unit -> 'a option)
SplitUnfoldState.enumerator: System.Collections.Generic.IEnumerator<'a>
System.Collections.IEnumerator.MoveNext() : bool
union case Option.Some: Value: 'T -> Option<'T>
property System.Collections.Generic.IEnumerator.Current: 'a with get
System.IDisposable.Dispose() : unit
val bingo : ('a -> unit)
val v : 'a
SplitUnfoldState.splitterO: 'a option
SplitUnfoldState.currentSeqO: seq<'a> option
val finish : (unit -> unit)
val tryNextSub : (int -> unit -> 'a option)
val start : int
val en : System.Collections.Generic.IEnumerator<'a>
val i : int32
val ignore : value:'T -> unit
val subUnFold : ((unit -> 'a option) * ('a -> unit) * (unit -> unit) -> ('a * SplitSubUnfoldState<'a>) option)
val tryNext : (unit -> 'a option)
val isSome : option:'T option -> bool
val s0 : seq<'a>
val start : SplitSubUnfoldState<'a>
val next : SplitSubUnfoldState<'a>
val seqNo : int
val myStart : int
val splitAt : n:int -> s:seq<'a> -> seq<seq<'a>>
val n : int
val s : seq<'a>
val mapi : mapping:(int -> 'T -> 'U) -> source:seq<'T> -> seq<'U>
val i : int
val fst : tuple:('T1 * 'T2) -> 'T1
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>
val snd : tuple:('T1 * 'T2) -> 'T2
val truncate : count:int -> source:seq<'T> -> seq<'T>
val tryHeadTail : fhead:('a -> 'b) -> ftail:('b -> seq<'a> -> 'c) -> s:seq<'a> -> 'c option
val fhead : ('a -> 'b)
val ftail : ('b -> seq<'a> -> 'c)
union case Choice.Choice1Of3: 'T1 -> Choice<'T1,'T2,'T3>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State
val tryHead : source:seq<'T> -> 'T option
val map : mapping:('T -> 'U) -> option:'T option -> 'U option
union case Choice.Choice2Of3: 'T2 -> Choice<'T1,'T2,'T3>
val headv : 'b
union case Choice.Choice3Of3: 'T3 -> Choice<'T1,'T2,'T3>
val result : Choice<unit,'b option,'c>
val empty<'T> : seq<'T>
val v : 'c
val x : int
val printfn : format:Printf.TextWriterFormat<'T> -> 'T
val skip : count:int -> source:seq<'T> -> seq<'T>
val iter : action:('T -> unit) -> source:seq<'T> -> unit
val s : seq<int>
val cache : source:seq<'T> -> seq<'T>
val length : source:seq<'T> -> int
val toList : source:seq<'T> -> 'T list
val toArray : source:seq<'T> -> 'T []
val x : char
val s : seq<char>
Multiple items
type String =
interface IComparable
interface IEnumerable
interface IConvertible
interface IEnumerable<char>
interface IComparable<string>
interface IEquatable<string>
interface ICloneable
new : value: char [] -> unit + 8 overloads
member Clone : unit -> obj
member CompareTo : value: obj -> int + 1 overload
...
--------------------
System.String(value: char []) : System.String
System.String(value: nativeptr<char>) : System.String
System.String(value: nativeptr<sbyte>) : System.String
System.String(value: System.ReadOnlySpan<char>) : System.String
System.String(c: char, count: int) : System.String
System.String(value: char [], startIndex: int, length: int) : System.String
System.String(value: nativeptr<char>, startIndex: int, length: int) : System.String
System.String(value: nativeptr<sbyte>, startIndex: int, length: int) : System.String
System.String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: System.Text.Encoding) : System.String
val id : x:'T -> 'T
val head : int
val tail : seq<int>
namespace System.IO
val splitFile : path:string -> file:string -> unit
val path : string
val file : string
type File =
static member AppendAllLines : path: string * contents: IEnumerable<string> -> unit + 1 overload
static member AppendAllLinesAsync : path: string * contents: IEnumerable<string> *?cancellationToken: CancellationToken -> Task + 1 overload
static member AppendAllText : path: string * contents: string -> unit + 1 overload
static member AppendAllTextAsync : path: string * contents: string *?cancellationToken: CancellationToken -> Task + 1 overload
static member AppendText : path: string -> StreamWriter
static member AsyncStreamReader : path: string * encoding: Encoding -> StreamReader
static member AsyncStreamWriter : path: string * encoding: Encoding * append: bool -> StreamWriter
static member Copy : sourceFileName: string * destFileName: string -> unit + 1 overload
static member Create : path: string -> FileStream + 2 overloads
static member CreateText : path: string -> StreamWriter
...
File.ReadLines(path: string) : System.Collections.Generic.IEnumerable<string>
File.ReadLines(path: string, encoding: System.Text.Encoding) : System.Collections.Generic.IEnumerable<string>
Multiple items
module Seq
from Script
--------------------
module Seq
from Microsoft.FSharp.Collections
val splitBy : f:('a -> bool) -> opt:Seq.SplitByOption -> input:seq<'a> -> seq<seq<'a>>
Straight scan through is efficient, reusing seqs causes rescan from beginning
val l : string
System.String.EndsWith(value: char) : bool
System.String.EndsWith(value: string) : bool
System.String.EndsWith(value: string, comparisonType: System.StringComparison) : bool
System.String.EndsWith(value: string, ignoreCase: bool, culture: System.Globalization.CultureInfo) : bool
union case Seq.SplitByOption.IncludeInSecond: Seq.SplitByOption
val head : seq<string>
val filelist : string []
val s : string
System.String.Split([<System.ParamArray>] separator: char []) : string []
System.String.Split(separator: string [], options: System.StringSplitOptions) : string []
System.String.Split(separator: string,?options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int) : string []
System.String.Split(separator: char,?options: System.StringSplitOptions) : string []
System.String.Split(separator: string [], count: int, options: System.StringSplitOptions) : string []
System.String.Split(separator: string, count: int,?options: System.StringSplitOptions) : string []
System.String.Split(separator: char [], count: int, options: System.StringSplitOptions) : string []
System.String.Split(separator: char, count: int,?options: System.StringSplitOptions) : string []
val last : source:seq<'T> -> 'T
val tail : seq<seq<string>>
val collect : mapping:('T -> #seq<'U>) -> source:seq<'T> -> seq<'U>
union case Seq.SplitByOption.IncludeInFirst: Seq.SplitByOption
val iteri : action:(int -> 'T -> unit) -> source:seq<'T> -> unit
val s : seq<string>
property System.Array.Length: int with get
val head : string
val tail : seq<string>
File.WriteAllLines(path: string, contents: System.Collections.Generic.IEnumerable<string>) : unit
File.WriteAllLines(path: string, contents: string []) : unit
File.WriteAllLines(path: string, contents: System.Collections.Generic.IEnumerable<string>, encoding: System.Text.Encoding) : unit
File.WriteAllLines(path: string, contents: string [], encoding: System.Text.Encoding) : unit
val append : source1:seq<'T> -> source2:seq<'T> -> seq<'T>
val defaultValue : value:'T -> option:'T option -> 'T
More information