3 people like it.

PDF Text Extract

Extracts the text content of a PDF file using PDFsharp.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
open System.Text
open PdfSharp.Pdf.IO
open PdfSharp.Pdf.Content
open PdfSharp.Pdf.Content.Objects

/// Walks a parsed PDF content object and appends the text it shows to `sb`.
///
/// content: the node to visit; sequences and arrays are traversed recursively.
/// sb: receives the extracted text and is mutated in place.
///
/// Operands are only visited for the text-showing operators (Tj, TJ, ' and ");
/// operands of every other operator are skipped. A single space is appended
/// after each text-showing operator.
/// Raises System.NotImplementedException for any object type not listed below.
let rec extractText(content:CObject, sb:StringBuilder) =
   // Text-showing operators per the PDF specification: Tj and TJ show text,
   // ' (QuoteSingle) and " (QuoteDbl) move to the next line and then show text.
   let showsText (op:COperator) =
      match op.OpCode.OpCodeName with
      | OpCodeName.Tj
      | OpCodeName.TJ
      | OpCodeName.QuoteSingle
      | OpCodeName.QuoteDbl -> true
      | _ -> false
   match content with
   | :? CArray as xs -> for x in xs do extractText(x, sb)
   | :? CComment -> ()
   | :? CInteger -> ()
   | :? CName -> ()
   | :? CNumber -> ()
   | :? COperator as op when showsText op ->
      // Numeric operands fall into the CNumber/CInteger cases above and are
      // ignored, so only the string operands contribute.
      for element in op.Operands do extractText(element, sb)
      sb.Append(" ") |> ignore
   | :? COperator -> ()
   | :? CSequence as xs -> for x in xs do extractText(x, sb)
   | :? CString as s -> sb.Append(s.Value) |> ignore
   | x -> raise <| System.NotImplementedException(x.ToString())

/// Opens the PDF at `pdfPath` in read-only mode and returns the text extracted
/// from all of its pages, with a line break appended after each page.
let readAllText (pdfPath:string) =
   let buffer = StringBuilder()
   use pdf = PdfReader.Open(pdfPath, PdfDocumentOpenMode.ReadOnly)
   for pdfPage in pdf.Pages do
      // Parse the page's content and collect the text it shows.
      extractText(ContentReader.ReadContent(pdfPage), buffer)
      // Terminate the page's text with a line break.
      buffer.AppendLine() |> ignore
   buffer.ToString()

// Example usage: extract the text of the PDF at the given path.
let text = @"<path>.pdf" |> readAllText
namespace System
namespace System.Text
namespace PdfSharp
namespace PdfSharp.Pdf
namespace PdfSharp.Pdf.IO
namespace PdfSharp.Pdf.Content
namespace PdfSharp.Pdf.Content.Objects
val extractText : content:CObject * sb:StringBuilder -> unit

Full name: Script.extractText
val content : CObject
type CObject =
  member Clone : unit -> CObject

Full name: PdfSharp.Pdf.Content.Objects.CObject
val sb : StringBuilder
Multiple items
type StringBuilder =
  new : unit -> StringBuilder + 5 overloads
  member Append : value:string -> StringBuilder + 18 overloads
  member AppendFormat : format:string * arg0:obj -> StringBuilder + 4 overloads
  member AppendLine : unit -> StringBuilder + 1 overload
  member Capacity : int with get, set
  member Chars : int -> char with get, set
  member Clear : unit -> StringBuilder
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EnsureCapacity : capacity:int -> int
  member Equals : sb:StringBuilder -> bool
  ...

Full name: System.Text.StringBuilder

--------------------
StringBuilder() : unit
StringBuilder(capacity: int) : unit
StringBuilder(value: string) : unit
StringBuilder(value: string, capacity: int) : unit
StringBuilder(capacity: int, maxCapacity: int) : unit
StringBuilder(value: string, startIndex: int, length: int, capacity: int) : unit
Multiple items
type CArray =
  inherit CSequence
  new : unit -> CArray
  member Clone : unit -> CArray
  member ToString : unit -> string

Full name: PdfSharp.Pdf.Content.Objects.CArray

--------------------
CArray() : unit
val xs : CArray
val x : CObject
Multiple items
type CComment =
  inherit CObject
  new : unit -> CComment
  member Clone : unit -> CComment
  member Text : string with get, set
  member ToString : unit -> string

Full name: PdfSharp.Pdf.Content.Objects.CComment

--------------------
CComment() : unit
Multiple items
type CInteger =
  inherit CNumber
  new : unit -> CInteger
  member Clone : unit -> CInteger
  member ToString : unit -> string
  member Value : int with get, set

Full name: PdfSharp.Pdf.Content.Objects.CInteger

--------------------
CInteger() : unit
Multiple items
type CName =
  inherit CObject
  new : unit -> CName + 1 overload
  member Clone : unit -> CName
  member Name : string with get, set
  member ToString : unit -> string

Full name: PdfSharp.Pdf.Content.Objects.CName

--------------------
CName() : unit
CName(name: string) : unit
type CNumber =
  inherit CObject
  member Clone : unit -> CNumber

Full name: PdfSharp.Pdf.Content.Objects.CNumber
type COperator =
  inherit CObject
  member Clone : unit -> COperator
  member Name : string
  member OpCode : OpCode
  member Operands : CSequence
  member ToString : unit -> string

Full name: PdfSharp.Pdf.Content.Objects.COperator
val op : COperator
property COperator.OpCode: OpCode
field OpCode.OpCodeName
type OpCodeName =
  | b = 0
  | B = 1
  | bx = 2
  | Bx = 3
  | BDC = 4
  | BI = 5
  | BMC = 6
  | BT = 7
  | BX = 8
  | c = 9
  ...

Full name: PdfSharp.Pdf.Content.Objects.OpCodeName
field OpCodeName.Tj = 58
field OpCodeName.TJ = 59
val element : CObject
property COperator.Operands: CSequence
StringBuilder.Append(value: char []) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: obj) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint64) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint32) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: uint16) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: decimal) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: float) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: float32) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: int64) : StringBuilder
   (+0 other overloads)
StringBuilder.Append(value: int) : StringBuilder
   (+0 other overloads)
val ignore : value:'T -> unit

Full name: Microsoft.FSharp.Core.Operators.ignore
Multiple items
type CSequence =
  inherit CObject
  new : unit -> CSequence
  member Add : sequence:CSequence -> unit + 1 overload
  member Clear : unit -> unit
  member Clone : unit -> CSequence
  member Contains : value:CObject -> bool
  member CopyTo : array:CObject[] * index:int -> unit
  member Count : int
  member GetEnumerator : unit -> IEnumerator<CObject>
  member IndexOf : value:CObject -> int
  member Insert : index:int * value:CObject -> unit
  ...

Full name: PdfSharp.Pdf.Content.Objects.CSequence

--------------------
CSequence() : unit
val xs : CSequence
Multiple items
type CString =
  inherit CObject
  new : unit -> CString
  member Clone : unit -> CString
  member ToString : unit -> string
  member Value : string with get, set

Full name: PdfSharp.Pdf.Content.Objects.CString

--------------------
CString() : unit
val s : CString
property CString.Value: string
val raise : exn:System.Exception -> 'T

Full name: Microsoft.FSharp.Core.Operators.raise
Multiple items
type NotImplementedException =
  inherit SystemException
  new : unit -> NotImplementedException + 2 overloads

Full name: System.NotImplementedException

--------------------
System.NotImplementedException() : unit
System.NotImplementedException(message: string) : unit
System.NotImplementedException(message: string, inner: exn) : unit
System.Object.ToString() : string
val readAllText : pdfPath:string -> string

Full name: Script.readAllText
val pdfPath : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = System.String

Full name: Microsoft.FSharp.Core.string
val document : PdfSharp.Pdf.PdfDocument
type PdfReader =
  static member Open : path:string -> PdfDocument + 10 overloads
  static member TestPdfFile : path:string -> int + 2 overloads

Full name: PdfSharp.Pdf.IO.PdfReader
PdfReader.Open(stream: System.IO.Stream) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(path: string) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(path: string, password: string) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(path: string, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, password: string, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, openmode: PdfDocumentOpenMode, passwordProvider: PdfPasswordProvider) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(path: string, password: string, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(path: string, openmode: PdfDocumentOpenMode, provider: PdfPasswordProvider) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, password: string, openmode: PdfDocumentOpenMode, passwordProvider: PdfPasswordProvider) : PdfSharp.Pdf.PdfDocument
   (+0 other overloads)
type PdfDocumentOpenMode =
  | Modify = 0
  | Import = 1
  | ReadOnly = 2
  | InformationOnly = 3

Full name: PdfSharp.Pdf.IO.PdfDocumentOpenMode
field PdfDocumentOpenMode.ReadOnly = 2
val result : StringBuilder
val page : PdfSharp.Pdf.PdfPage
property PdfSharp.Pdf.PdfDocument.Pages: PdfSharp.Pdf.PdfPages
val content : CSequence
type ContentReader =
  static member ReadContent : page:PdfPage -> CSequence + 1 overload

Full name: PdfSharp.Pdf.Content.ContentReader
ContentReader.ReadContent(content: byte []) : CSequence
ContentReader.ReadContent(page: PdfSharp.Pdf.PdfPage) : CSequence
StringBuilder.AppendLine() : StringBuilder
StringBuilder.AppendLine(value: string) : StringBuilder
StringBuilder.ToString() : string
StringBuilder.ToString(startIndex: int, length: int) : string
val text : string

Full name: Script.text
Next Version Raw view Test code New version

More information

Link:http://fssnip.net/lT
Posted:3 years ago
Author:Phillip Trelford
Tags: pdf