3 people like it.

PDF Text Extract

Extracts text content from PDF file using PDFsharp.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
open System.Text
open PdfSharp.Pdf.IO
open PdfSharp.Pdf.Content
open PdfSharp.Pdf.Content.Objects

let rec extractText(content:CObject, sb:StringBuilder) =
   match content with
   | :? CArray as xs -> for x in xs do extractText(x, sb)
   | :? CComment -> ()
   | :? CInteger -> ()
   | :? CName -> ()
   | :? CNumber -> ()
   | :? COperator as op // Tj/TJ = Show text
      when op.OpCode.OpCodeName = OpCodeName.Tj ||
            op.OpCode.OpCodeName = OpCodeName.TJ ->
      for element in op.Operands do extractText(element, sb)
      sb.Append(" ") |> ignore
   | :? COperator -> ()
   | :? CSequence as xs -> for x in xs do extractText(x, sb)
   | :? CString as s -> sb.Append(s.Value) |> ignore
   | x -> raise <| System.NotImplementedException(x.ToString())

let readAllText (pdfPath:string) =
   use document = PdfReader.Open(pdfPath, PdfDocumentOpenMode.ReadOnly)
   let result = StringBuilder()
   for page in document.Pages do
      let content = ContentReader.ReadContent(page)
      extractText(content, result)
      result.AppendLine() |> ignore
   result.ToString()

let text = readAllText @"<path>.pdf"
namespace System
namespace System.Text
val extractText : content:'a * sb:StringBuilder -> 'b

Full name: Script.extractText
val content : 'a
val sb : StringBuilder
Multiple items
type StringBuilder =
  new : unit -> StringBuilder + 5 overloads
  member Append : value:string -> StringBuilder + 18 overloads
  member AppendFormat : format:string * arg0:obj -> StringBuilder + 4 overloads
  member AppendLine : unit -> StringBuilder + 1 overload
  member Capacity : int with get, set
  member Chars : int -> char with get, set
  member Clear : unit -> StringBuilder
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EnsureCapacity : capacity:int -> int
  member Equals : sb:StringBuilder -> bool
  ...

Full name: System.Text.StringBuilder

--------------------
StringBuilder() : unit
StringBuilder(capacity: int) : unit
StringBuilder(value: string) : unit
StringBuilder(value: string, capacity: int) : unit
StringBuilder(capacity: int, maxCapacity: int) : unit
StringBuilder(value: string, startIndex: int, length: int, capacity: int) : unit
StringBuilder.Append(value: char []) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: obj) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint64) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint32) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint16) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: decimal) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: float) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: float32) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: int64) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: int) : StringBuilder
   (+0 other overloads)
val ignore : value:'T -> unit

Full name: Microsoft.FSharp.Core.Operators.ignore
val raise : exn:System.Exception -> 'T

Full name: Microsoft.FSharp.Core.Operators.raise
Multiple items
type NotImplementedException =
  inherit SystemException
  new : unit -> NotImplementedException + 2 overloads

Full name: System.NotImplementedException

--------------------
System.NotImplementedException() : unit
System.NotImplementedException(message: string) : unit
System.NotImplementedException(message: string, inner: exn) : unit
val readAllText : pdfPath:string -> string

Full name: Script.readAllText
val pdfPath : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = System.String

Full name: Microsoft.FSharp.Core.string
val document : System.IDisposable
val result : StringBuilder
val page : obj
val content : obj
StringBuilder.AppendLine() : StringBuilder
StringBuilder.AppendLine(value: string) : StringBuilder
StringBuilder.ToString() : string
StringBuilder.ToString(startIndex: int, length: int) : string
val text : string

Full name: Script.text

More information

Link:http://fssnip.net/lT
Posted:3 years ago
Author:Phillip Trelford
Tags: pdf