4 people like it.

CSV reader

I've modified the CSV sample from Expert F# to my needs. I don't want to be forced to use the csv schema as defined by the column order. Therefore I've made three major modifications: 1. removed the permutation, 2. added a new column name option to the ColumnAttribute, 3. added a mapping from column name to csv index. So basically you now have 3 options. 1. Don't annotate your record at all and use it as a POCO. The order of the record fields is mapped directly to the order in the csv. UPDATE: I don't recommend this any more. At the time of writing this snippet I wasn't aware of the fact that field order isn't guaranteed by the reflection mechanism. 2. Use the index option of the ColumnAttribute. Same as before. 3. Use the name option. This is what I was looking for. I have to deal with tons of csv files that have more columns than I'm interested in. Have a look at the sample usage below. I've moved the type conversion out of the CsvReader class so that it is easily extensible with custom type conversions (e.g. for combined column values - denormalized data).

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
module Csv

open System
open System.IO
open System.Reflection
open Microsoft.FSharp.Reflection

/// Marks a record field with the csv column it is read from.
/// Either `Index` (0-based column position) or `Name` (column title in the
/// header row) can be set; an unset `Index` reads back as -1 and an unset
/// `Name` reads back as the empty string.
type ColumnAttribute(index:int option,name:string option) =
    inherit Attribute()
    // Backing cells for the two named-argument properties.
    let mutable columnIndex = index
    let mutable columnName = name
    /// Creates an attribute with neither index nor name set.
    new () = ColumnAttribute (None, None)
    /// 0-based csv column index, or -1 when no index was given.
    member x.Index
        with get() = defaultArg columnIndex -1
        and set value = columnIndex <- Some value
    /// Header name of the csv column, or "" when no name was given.
    member x.Name
        with get() = defaultArg columnName ""
        and set value = columnName <- Some value

/// Reads delimiter-separated text into F# records of type 'a.
///
/// Each record field is mapped to a csv column as follows:
///   1. a `ColumnAttribute` with `Index >= 0` selects that 0-based column;
///   2. otherwise a `ColumnAttribute` with a non-empty `Name` selects the
///      column carrying that name in the header;
///   3. otherwise the position of the field in the record is used.
///
/// `typeConverter` is asked once per record field for a function that turns
/// the column text into the (boxed) field value.
type CsvReader<'a>(typeConverter:Type -> (string -> obj)) = 
    let recordType = typeof<'a>  
    let fields = FSharpType.GetRecordFields(recordType)
    let objectBuilder = FSharpValue.PreComputeRecordConstructor(recordType)

    // Header-independent part of the schema: field name, optional column
    // attribute and the converter for the field type. Reflection and the
    // typeConverter lookup are done once per reader instead of once per line.
    // Kept lazy so that a failing typeConverter still surfaces on first use,
    // not at construction time.
    let fieldInfos =
        lazy (
            fields |> Array.map (fun field ->
                let propertyInfo = recordType.GetProperty(field.Name)
                let column =
                    match propertyInfo.GetCustomAttributes(typeof<ColumnAttribute>,false) with
                    | [| (:? ColumnAttribute as col) |] -> Some col
                    | _ -> None
                (field.Name, column, typeConverter field.PropertyType)))

    // Splits a line at the delimiter and trims every cell.
    let split (delim:char) (line:string) = 
        line.Split([|delim|]) |> Array.map (fun s -> s.Trim())

    // Resolves a column name against the header; fails with a message that
    // names both the missing column and the record field asking for it.
    let indexFromHeader (header:Map<string,int>) (fieldName:string) (columnName:string) =
        match Map.tryFind columnName header with
        | Some i -> i
        | None ->
            failwithf "column '%s' (record field '%s') not found in csv header" columnName fieldName

    // Combines the header-independent field data with a concrete header into
    // (fieldName, columnIndex, converter) triples, in record-field order.
    let buildSchema (header:Map<string,int>) =
        fieldInfos.Force()
        |> Array.mapi (fun fieldIndex (fieldName, column, deserialize) ->
            let columnIndex =
                match column with
                | Some (col:ColumnAttribute) when col.Index >= 0 -> col.Index
                | Some col when col.Name <> String.Empty -> indexFromHeader header fieldName col.Name
                | _ -> fieldIndex
            (fieldName, columnIndex, deserialize))

    // Builds one record from one line using an already resolved schema.
    let recordFromSchema (schema:(string * int * (string -> obj))[]) (delim:char) (line:string) =
        let words = split delim line
        let values =
            schema |> Array.map (fun (fieldName, columnIndex, deserialize) ->
                // Report short lines with context instead of a bare index error.
                if columnIndex >= words.Length then
                    failwithf "line has %d column(s) but field '%s' needs column index %d: '%s'" words.Length fieldName columnIndex line
                let colText = words.[columnIndex]
                try deserialize colText
                with e ->
                    // Keep the original failure as inner exception.
                    let msg = sprintf "error converting '%s' to field '%s'" colText fieldName
                    raise (Exception(msg, e)))
        unbox<'a> (objectBuilder values)

    /// Creates a single record from `line`.
    /// `header` maps column names to 0-based column indices and is only
    /// consulted for fields annotated with `Column(Name=...)`.
    /// Fails with an exception when a named column is missing from `header`,
    /// when the line has too few columns, or when a value cannot be converted.
    member x.CreateRecord(header:Map<string,int>, delim, line) = 
        recordFromSchema (buildSchema header) delim line
    
    /// Lazily reads `file` and yields one record per non-empty line.
    /// When `firstLineHasHeader` is true the first line is consumed as the
    /// header (column name -> column index); otherwise an empty header is
    /// used. The file is opened when enumeration starts and closed when the
    /// enumeration ends or is disposed.
    member x.ReadFile(file, separator:char, firstLineHasHeader:bool) = 
        seq { 
            use textReader = File.OpenText(file)
            // The header is local to this enumeration, so a previous file's
            // header can never leak into a later read.
            let header =
                if firstLineHasHeader then
                    match textReader.ReadLine() with
                    | null -> Map.empty   // empty file: no header, no rows
                    | headerLine ->
                        headerLine
                        |> split separator
                        // Index first, filter afterwards: blank header cells
                        // must not shift the indices of the columns after them.
                        |> Array.mapi (fun i name -> (name, i))
                        |> Array.filter (fun (name, _) -> not (String.IsNullOrWhiteSpace name))
                        |> Map.ofArray
                else Map.empty
            // Resolved once per file, on the first data line, so that an
            // empty file yields an empty sequence instead of a schema error.
            let schema = lazy (buildSchema header)
            while not textReader.EndOfStream do
                let line = textReader.ReadLine()
                if not (String.IsNullOrEmpty line) then
                    yield recordFromSchema (schema.Force()) separator line
        }


//Examples:
//the csv-header is mandatory for this case!
/// Example of name-based mapping: every field is bound to the csv column
/// whose header title equals the `Name` given in its `Column` attribute.
type Substance = {
    [<Column(Name="subst id")>] Id : int 
    [<Column(Name="name")>] Name : string
    [<Column(Name="sequence")>] Sequence : string
}

// a one-to-one mapping to the column names
// UPDATE: Don't use this any more. At the time of writing this snippet I wasn't aware of the fact that field order isn't guaranteed by the reflection mechanism.
/// Example without any annotation: each field is read from the csv column
/// at the field's own position in the record.
type Probe = {
    Name : string
    Mismatches : int
    Feature : string
    HitLocation : string
    Strain : string
} 

//0 based index mapping 
/// Example of index-based mapping: `Strain` is read from csv column 4 and
/// `Name` from csv column 0, independent of the field order in the record.
type ProbeAlt = {
    [<Column(Index=4)>]Strain : string
    [<Column(Index=0)>]Name : string
}

//read the csv
/// Default converter factory handed to `CsvReader`: for a supported field
/// type (float, int, string, bool) it returns a function that parses the
/// column text and boxes the result. Any other type fails immediately with
/// "Unknown type ...".
let typeConverter _type =
    if _type = typeof<float> then
        fun (s:string) -> box (System.Double.Parse s)
    elif _type = typeof<int> then
        fun (s:string) -> box (System.Int32.Parse s)
    elif _type = typeof<string> then
        fun (s:string) -> box s
    elif _type = typeof<bool> then
        fun (s:string) -> box (System.Boolean.Parse s)
    else
        failwithf "Unknown type %A" _type

// Sample usage: read a tab-separated file whose first line is a header.
let path = "" //....
let hasHeader = true
let separator = '\t'
// The reader is bound to the target record type and the converter factory.
let reader = CsvReader<Probe>(typeConverter)
// ReadFile returns a lazy sequence; nothing is read from disk until it is enumerated.
let probes = reader.ReadFile(path, separator, hasHeader)
module Csv
namespace System
namespace System.IO
namespace System.Reflection
namespace Microsoft
namespace Microsoft.FSharp
namespace Microsoft.FSharp.Reflection
Multiple items
type ColumnAttribute =
  inherit Attribute
  new : unit -> ColumnAttribute
  new : index:int option * name:string option -> ColumnAttribute
  member Index : int
  member Name : string
  member Index : int with set
  member Name : string with set

Full name: Csv.ColumnAttribute

--------------------
new : unit -> ColumnAttribute
new : index:int option * name:string option -> ColumnAttribute
val index : int option
Multiple items
val int : value:'T -> int (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.int

--------------------
type int = int32

Full name: Microsoft.FSharp.Core.int

--------------------
type int<'Measure> = int

Full name: Microsoft.FSharp.Core.int<_>
type 'T option = Option<'T>

Full name: Microsoft.FSharp.Core.option<_>
val name : string option
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
Multiple items
type Attribute =
  member Equals : obj:obj -> bool
  member GetHashCode : unit -> int
  member IsDefaultAttribute : unit -> bool
  member Match : obj:obj -> bool
  member TypeId : obj
  static member GetCustomAttribute : element:MemberInfo * attributeType:Type -> Attribute + 7 overloads
  static member GetCustomAttributes : element:MemberInfo -> Attribute[] + 15 overloads
  static member IsDefined : element:MemberInfo * attributeType:Type -> bool + 7 overloads

Full name: System.Attribute

--------------------
Attribute() : unit
val mutable index : int option
val mutable name : string option
union case Option.None: Option<'T>
val x : ColumnAttribute
member ColumnAttribute.Index : int with set

Full name: Csv.ColumnAttribute.Index
union case Option.Some: Value: 'T -> Option<'T>
val i : int
val set : elements:seq<'T> -> Set<'T> (requires comparison)

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.set
val value : int
member ColumnAttribute.Name : string with set

Full name: Csv.ColumnAttribute.Name
val n : string
val value : string
Multiple items
type CsvReader<'a> =
  new : typeConverter:(Type -> string -> obj) -> CsvReader<'a>
  member CreateRecord : header:Map<string,int> * delim:char * line:string -> 'a
  member ReadFile : file:string * separator:char * firstLineHasHeader:bool -> seq<'a>

Full name: Csv.CsvReader<_>

--------------------
new : typeConverter:(Type -> string -> obj) -> CsvReader<'a>
val typeConverter : (Type -> string -> obj)
type Type =
  inherit MemberInfo
  member Assembly : Assembly
  member AssemblyQualifiedName : string
  member Attributes : TypeAttributes
  member BaseType : Type
  member ContainsGenericParameters : bool
  member DeclaringMethod : MethodBase
  member DeclaringType : Type
  member Equals : o:obj -> bool + 1 overload
  member FindInterfaces : filter:TypeFilter * filterCriteria:obj -> Type[]
  member FindMembers : memberType:MemberTypes * bindingAttr:BindingFlags * filter:MemberFilter * filterCriteria:obj -> MemberInfo[]
  ...

Full name: System.Type
type obj = Object

Full name: Microsoft.FSharp.Core.obj
val mutable header : Map<string,int>
Multiple items
module Map

from Microsoft.FSharp.Collections

--------------------
type Map<'Key,'Value (requires comparison)> =
  interface IEnumerable
  interface IComparable
  interface IEnumerable<KeyValuePair<'Key,'Value>>
  interface ICollection<KeyValuePair<'Key,'Value>>
  interface IDictionary<'Key,'Value>
  new : elements:seq<'Key * 'Value> -> Map<'Key,'Value>
  member Add : key:'Key * value:'Value -> Map<'Key,'Value>
  member ContainsKey : key:'Key -> bool
  override Equals : obj -> bool
  member Remove : key:'Key -> Map<'Key,'Value>
  ...

Full name: Microsoft.FSharp.Collections.Map<_,_>

--------------------
new : elements:seq<'Key * 'Value> -> Map<'Key,'Value>
val empty<'Key,'T (requires comparison)> : Map<'Key,'T> (requires comparison)

Full name: Microsoft.FSharp.Collections.Map.empty
val recordType : Type
val typeof<'T> : Type

Full name: Microsoft.FSharp.Core.Operators.typeof
val fields : PropertyInfo []
type FSharpType =
  static member GetExceptionFields : exceptionType:Type * ?bindingFlags:BindingFlags -> PropertyInfo []
  static member GetFunctionElements : functionType:Type -> Type * Type
  static member GetRecordFields : recordType:Type * ?bindingFlags:BindingFlags -> PropertyInfo []
  static member GetTupleElements : tupleType:Type -> Type []
  static member GetUnionCases : unionType:Type * ?bindingFlags:BindingFlags -> UnionCaseInfo []
  static member IsExceptionRepresentation : exceptionType:Type * ?bindingFlags:BindingFlags -> bool
  static member IsFunction : typ:Type -> bool
  static member IsModule : typ:Type -> bool
  static member IsRecord : typ:Type * ?bindingFlags:BindingFlags -> bool
  static member IsTuple : typ:Type -> bool
  ...

Full name: Microsoft.FSharp.Reflection.FSharpType
static member FSharpType.GetRecordFields : recordType:Type * ?allowAccessToPrivateRepresentation:bool -> PropertyInfo []
static member FSharpType.GetRecordFields : recordType:Type * ?bindingFlags:BindingFlags -> PropertyInfo []
val objectBuilder : (obj [] -> obj)
type FSharpValue =
  static member GetExceptionFields : exn:obj * ?bindingFlags:BindingFlags -> obj []
  static member GetRecordField : record:obj * info:PropertyInfo -> obj
  static member GetRecordFields : record:obj * ?bindingFlags:BindingFlags -> obj []
  static member GetTupleField : tuple:obj * index:int -> obj
  static member GetTupleFields : tuple:obj -> obj []
  static member GetUnionFields : value:obj * unionType:Type * ?bindingFlags:BindingFlags -> UnionCaseInfo * obj []
  static member MakeFunction : functionType:Type * implementation:(obj -> obj) -> obj
  static member MakeRecord : recordType:Type * values:obj [] * ?bindingFlags:BindingFlags -> obj
  static member MakeTuple : tupleElements:obj [] * tupleType:Type -> obj
  static member MakeUnion : unionCase:UnionCaseInfo * args:obj [] * ?bindingFlags:BindingFlags -> obj
  ...

Full name: Microsoft.FSharp.Reflection.FSharpValue
static member FSharpValue.PreComputeRecordConstructor : recordType:Type * ?allowAccessToPrivateRepresentation:bool -> (obj [] -> obj)
static member FSharpValue.PreComputeRecordConstructor : recordType:Type * ?bindingFlags:BindingFlags -> (obj [] -> obj)
val split : (char -> string -> string [])
val delim : char
Multiple items
val char : value:'T -> char (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.char

--------------------
type char = Char

Full name: Microsoft.FSharp.Core.char
val line : string
String.Split([<ParamArray>] separator: char []) : string []
String.Split(separator: string [], options: StringSplitOptions) : string []
String.Split(separator: char [], options: StringSplitOptions) : string []
String.Split(separator: char [], count: int) : string []
String.Split(separator: string [], count: int, options: StringSplitOptions) : string []
String.Split(separator: char [], count: int, options: StringSplitOptions) : string []
type Array =
  member Clone : unit -> obj
  member CopyTo : array:Array * index:int -> unit + 1 overload
  member GetEnumerator : unit -> IEnumerator
  member GetLength : dimension:int -> int
  member GetLongLength : dimension:int -> int64
  member GetLowerBound : dimension:int -> int
  member GetUpperBound : dimension:int -> int
  member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
  member Initialize : unit -> unit
  member IsFixedSize : bool
  ...

Full name: System.Array
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.map
val s : string
String.Trim() : string
String.Trim([<ParamArray>] trimChars: char []) : string
val x : CsvReader<'a>
member CsvReader.CreateRecord : header:Map<string,int> * delim:char * line:string -> 'a

Full name: Csv.CsvReader`1.CreateRecord
val header : Map<string,int>
val lookupFromHeader : (ColumnAttribute -> int option)
val column : ColumnAttribute
property ColumnAttribute.Name: string
val name : string
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
field string.Empty
val e : exn
val failwithf : format:Printf.StringFormat<'T,'Result> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.failwithf
val schema : (int * string * int * (string -> obj)) []
val mapi : mapping:(int -> 'T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.mapi
val fieldIndex : int
val field : PropertyInfo
val propertyInfo : PropertyInfo
Type.GetProperty(name: string) : PropertyInfo
Type.GetProperty(name: string, returnType: Type) : PropertyInfo
Type.GetProperty(name: string, types: Type []) : PropertyInfo
Type.GetProperty(name: string, bindingAttr: BindingFlags) : PropertyInfo
Type.GetProperty(name: string, returnType: Type, types: Type []) : PropertyInfo
Type.GetProperty(name: string, returnType: Type, types: Type [], modifiers: ParameterModifier []) : PropertyInfo
Type.GetProperty(name: string, bindingAttr: BindingFlags, binder: Binder, returnType: Type, types: Type [], modifiers: ParameterModifier []) : PropertyInfo
property MemberInfo.Name: string
val deserializeColumnData : (string -> obj)
property PropertyInfo.PropertyType: Type
val columnIndex : int
MemberInfo.GetCustomAttributes(inherit: bool) : obj []
MemberInfo.GetCustomAttributes(attributeType: Type, inherit: bool) : obj []
val col : ColumnAttribute
property ColumnAttribute.Index: int
val fieldContentFromSchema : (string [] -> obj [])
val words : string []
val deserializedData : obj []
val fieldName : string
val words : obj []
val convertColumn : (string -> string * (string -> 'b) -> 'b)
val colText : string
val deserializeColumnData : (string -> 'b)
Multiple items
val obj : obj

--------------------
type obj = Object

Full name: Microsoft.FSharp.Core.obj
val unbox : value:obj -> 'T

Full name: Microsoft.FSharp.Core.Operators.unbox
member CsvReader.ReadFile : file:string * separator:char * firstLineHasHeader:bool -> seq<'a>

Full name: Csv.CsvReader`1.ReadFile
val file : string
val separator : char
val firstLineHasHeader : bool
type bool = Boolean

Full name: Microsoft.FSharp.Core.bool
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
val textReader : StreamReader
type File =
  static member AppendAllLines : path:string * contents:IEnumerable<string> -> unit + 1 overload
  static member AppendAllText : path:string * contents:string -> unit + 1 overload
  static member AppendText : path:string -> StreamWriter
  static member Copy : sourceFileName:string * destFileName:string -> unit + 1 overload
  static member Create : path:string -> FileStream + 3 overloads
  static member CreateText : path:string -> StreamWriter
  static member Decrypt : path:string -> unit
  static member Delete : path:string -> unit
  static member Encrypt : path:string -> unit
  static member Exists : path:string -> bool
  ...

Full name: System.IO.File
File.OpenText(path: string) : StreamReader
StreamReader.ReadLine() : string
val filter : predicate:('T -> bool) -> array:'T [] -> 'T []

Full name: Microsoft.FSharp.Collections.Array.filter
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
String.IsNullOrWhiteSpace(value: string) : bool
val ofArray : elements:('Key * 'T) [] -> Map<'Key,'T> (requires comparison)

Full name: Microsoft.FSharp.Collections.Map.ofArray
property StreamReader.EndOfStream: bool
String.IsNullOrEmpty(value: string) : bool
member CsvReader.CreateRecord : header:Map<string,int> * delim:char * line:string -> 'a
type Substance =
  {Id: int;
   Name: string;
   Sequence: string;}

Full name: Csv.Substance
Substance.Id: int
Substance.Name: string
Substance.Sequence: string
type Probe =
  {Name: string;
   Mismatches: int;
   Feature: string;
   HitLocation: string;
   Strain: string;}

Full name: Csv.Probe
Probe.Name: string
Probe.Mismatches: int
Probe.Feature: string
Probe.HitLocation: string
Probe.Strain: string
type ProbeAlt =
  {Strain: string;
   Name: string;}

Full name: Csv.ProbeAlt
ProbeAlt.Strain: string
ProbeAlt.Name: string
val typeConverter : _type:Type -> (string -> obj)

Full name: Csv.typeConverter
val _type : Type
val t : Type
Multiple items
val float : value:'T -> float (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.float

--------------------
type float = Double

Full name: Microsoft.FSharp.Core.float

--------------------
type float<'Measure> = float

Full name: Microsoft.FSharp.Core.float<_>
type Double =
  struct
    member CompareTo : value:obj -> int + 1 overload
    member Equals : obj:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member GetTypeCode : unit -> TypeCode
    member ToString : unit -> string + 3 overloads
    static val MinValue : float
    static val MaxValue : float
    static val Epsilon : float
    static val NegativeInfinity : float
    static val PositiveInfinity : float
    ...
  end

Full name: System.Double
Double.Parse(s: string) : float
Double.Parse(s: string, provider: IFormatProvider) : float
Double.Parse(s: string, style: Globalization.NumberStyles) : float
Double.Parse(s: string, style: Globalization.NumberStyles, provider: IFormatProvider) : float
val box : value:'T -> obj

Full name: Microsoft.FSharp.Core.Operators.box
type Int32 =
  struct
    member CompareTo : value:obj -> int + 1 overload
    member Equals : obj:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member GetTypeCode : unit -> TypeCode
    member ToString : unit -> string + 3 overloads
    static val MaxValue : int
    static val MinValue : int
    static member Parse : s:string -> int + 3 overloads
    static member TryParse : s:string * result:int -> bool + 1 overload
  end

Full name: System.Int32
Int32.Parse(s: string) : int
Int32.Parse(s: string, provider: IFormatProvider) : int
Int32.Parse(s: string, style: Globalization.NumberStyles) : int
Int32.Parse(s: string, style: Globalization.NumberStyles, provider: IFormatProvider) : int
type Boolean =
  struct
    member CompareTo : obj:obj -> int + 1 overload
    member Equals : obj:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member GetTypeCode : unit -> TypeCode
    member ToString : unit -> string + 1 overload
    static val TrueString : string
    static val FalseString : string
    static member Parse : value:string -> bool
    static member TryParse : value:string * result:bool -> bool
  end

Full name: System.Boolean
Boolean.Parse(value: string) : bool
val path : string

Full name: Csv.path
val reader : CsvReader<Probe>

Full name: Csv.reader
val hasHeader : bool

Full name: Csv.hasHeader
val separator : char

Full name: Csv.separator
val probes : seq<Probe>

Full name: Csv.probes
member CsvReader.ReadFile : file:string * separator:char * firstLineHasHeader:bool -> seq<'a>

More information

Link:http://fssnip.net/3T
Posted:12 years ago
Author:Rainer Schuster
Tags: csv , deserialize , reader