3 people like it.
Like the snippet!
PDF Text Extract
Extracts the text content of a PDF file using PDFsharp.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
|
open System.Text
open PdfSharp.Pdf.IO
open PdfSharp.Pdf.Content
open PdfSharp.Pdf.Content.Objects
/// Recursively walks a parsed PDF content object and appends to `sb` every string
/// operand of the text-showing operators.
///
/// Text-showing operators handled: Tj, TJ, ' (OpCodeName.QuoteSingle) and
/// " (OpCodeName.QuoteDbl). A single space is appended after each such operator.
/// Every other operator, and any comment, name or number, contributes nothing.
///
/// Raises System.NotImplementedException (message = the object's ToString()) when
/// `content` is of a type not listed below.
let rec extractText(content:CObject, sb:StringBuilder) =
    match content with
    // CArray inherits from CSequence, so it has to be tested before CSequence.
    | :? CArray as xs -> for x in xs do extractText(x, sb)
    | :? CComment -> ()
    | :? CInteger -> ()
    | :? CName -> ()
    // Numeric operands (e.g. the spacing adjustments inside a TJ array) carry no text.
    | :? CNumber -> ()
    | :? COperator as op // Tj / TJ / ' / " = Show text
        when op.OpCode.OpCodeName = OpCodeName.Tj ||
             op.OpCode.OpCodeName = OpCodeName.TJ ||
             op.OpCode.OpCodeName = OpCodeName.QuoteSingle ||
             op.OpCode.OpCodeName = OpCodeName.QuoteDbl ->
        // Only the CString operands end up in sb; the recursion skips the rest.
        for element in op.Operands do extractText(element, sb)
        sb.Append(" ") |> ignore
    | :? COperator -> ()
    | :? CSequence as xs -> for x in xs do extractText(x, sb)
    | :? CString as s -> sb.Append(s.Value) |> ignore
    | x -> raise <| System.NotImplementedException(x.ToString())
/// Opens the PDF at `pdfPath` in read-only mode and returns the text collected by
/// `extractText` from every page, with a line terminator appended after each page.
let readAllText (pdfPath:string) =
    let buffer = StringBuilder()
    // `use` disposes the document when the function returns.
    use pdf = PdfReader.Open(pdfPath, PdfDocumentOpenMode.ReadOnly)
    for pdfPage in pdf.Pages do
        extractText(ContentReader.ReadContent(pdfPage), buffer)
        buffer.AppendLine() |> ignore
    buffer.ToString()
// Example usage: extract the text of the PDF at the given path.
let text = @"<path>.pdf" |> readAllText
|
namespace System
namespace System.Text
namespace PdfSharp
namespace PdfSharp.Pdf
namespace PdfSharp.Pdf.IO
namespace PdfSharp.Pdf.Content
namespace PdfSharp.Pdf.Content.Objects
val extractText : content:CObject * sb:StringBuilder -> unit
Full name: Script.extractText
val content : CObject
type CObject =
member Clone : unit -> CObject
Full name: PdfSharp.Pdf.Content.Objects.CObject
val sb : StringBuilder
Multiple items
type StringBuilder =
new : unit -> StringBuilder + 5 overloads
member Append : value:string -> StringBuilder + 18 overloads
member AppendFormat : format:string * arg0:obj -> StringBuilder + 4 overloads
member AppendLine : unit -> StringBuilder + 1 overload
member Capacity : int with get, set
member Chars : int -> char with get, set
member Clear : unit -> StringBuilder
member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
member EnsureCapacity : capacity:int -> int
member Equals : sb:StringBuilder -> bool
...
Full name: System.Text.StringBuilder
--------------------
StringBuilder() : unit
StringBuilder(capacity: int) : unit
StringBuilder(value: string) : unit
StringBuilder(value: string, capacity: int) : unit
StringBuilder(capacity: int, maxCapacity: int) : unit
StringBuilder(value: string, startIndex: int, length: int, capacity: int) : unit
Multiple items
type CArray =
inherit CSequence
new : unit -> CArray
member Clone : unit -> CArray
member ToString : unit -> string
Full name: PdfSharp.Pdf.Content.Objects.CArray
--------------------
CArray() : unit
val xs : CArray
val x : CObject
Multiple items
type CComment =
inherit CObject
new : unit -> CComment
member Clone : unit -> CComment
member Text : string with get, set
member ToString : unit -> string
Full name: PdfSharp.Pdf.Content.Objects.CComment
--------------------
CComment() : unit
Multiple items
type CInteger =
inherit CNumber
new : unit -> CInteger
member Clone : unit -> CInteger
member ToString : unit -> string
member Value : int with get, set
Full name: PdfSharp.Pdf.Content.Objects.CInteger
--------------------
CInteger() : unit
Multiple items
type CName =
inherit CObject
new : unit -> CName + 1 overload
member Clone : unit -> CName
member Name : string with get, set
member ToString : unit -> string
Full name: PdfSharp.Pdf.Content.Objects.CName
--------------------
CName() : unit
CName(name: string) : unit
type CNumber =
inherit CObject
member Clone : unit -> CNumber
Full name: PdfSharp.Pdf.Content.Objects.CNumber
type COperator =
inherit CObject
member Clone : unit -> COperator
member Name : string
member OpCode : OpCode
member Operands : CSequence
member ToString : unit -> string
Full name: PdfSharp.Pdf.Content.Objects.COperator
val op : COperator
property COperator.OpCode: OpCode
field OpCode.OpCodeName
type OpCodeName =
| b = 0
| B = 1
| bx = 2
| Bx = 3
| BDC = 4
| BI = 5
| BMC = 6
| BT = 7
| BX = 8
| c = 9
...
Full name: PdfSharp.Pdf.Content.Objects.OpCodeName
field OpCodeName.Tj = 58
field OpCodeName.TJ = 59
val element : CObject
property COperator.Operands: CSequence
StringBuilder.Append(value: char []) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: obj) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: uint64) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: uint32) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: uint16) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: decimal) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: float) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: float32) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: int64) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: int) : StringBuilder
(+0 other overloads)
val ignore : value:'T -> unit
Full name: Microsoft.FSharp.Core.Operators.ignore
Multiple items
type CSequence =
inherit CObject
new : unit -> CSequence
member Add : sequence:CSequence -> unit + 1 overload
member Clear : unit -> unit
member Clone : unit -> CSequence
member Contains : value:CObject -> bool
member CopyTo : array:CObject[] * index:int -> unit
member Count : int
member GetEnumerator : unit -> IEnumerator<CObject>
member IndexOf : value:CObject -> int
member Insert : index:int * value:CObject -> unit
...
Full name: PdfSharp.Pdf.Content.Objects.CSequence
--------------------
CSequence() : unit
val xs : CSequence
Multiple items
type CString =
inherit CObject
new : unit -> CString
member Clone : unit -> CString
member ToString : unit -> string
member Value : string with get, set
Full name: PdfSharp.Pdf.Content.Objects.CString
--------------------
CString() : unit
val s : CString
property CString.Value: string
val raise : exn:System.Exception -> 'T
Full name: Microsoft.FSharp.Core.Operators.raise
Multiple items
type NotImplementedException =
inherit SystemException
new : unit -> NotImplementedException + 2 overloads
Full name: System.NotImplementedException
--------------------
System.NotImplementedException() : unit
System.NotImplementedException(message: string) : unit
System.NotImplementedException(message: string, inner: exn) : unit
System.Object.ToString() : string
val readAllText : pdfPath:string -> string
Full name: Script.readAllText
val pdfPath : string
Multiple items
val string : value:'T -> string
Full name: Microsoft.FSharp.Core.Operators.string
--------------------
type string = System.String
Full name: Microsoft.FSharp.Core.string
val document : PdfSharp.Pdf.PdfDocument
type PdfReader =
static member Open : path:string -> PdfDocument + 10 overloads
static member TestPdfFile : path:string -> int + 2 overloads
Full name: PdfSharp.Pdf.IO.PdfReader
PdfReader.Open(stream: System.IO.Stream) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(path: string) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(path: string, password: string) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(path: string, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, password: string, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, openmode: PdfDocumentOpenMode, passwordProvider: PdfPasswordProvider) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(path: string, password: string, openmode: PdfDocumentOpenMode) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(path: string, openmode: PdfDocumentOpenMode, provider: PdfPasswordProvider) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
PdfReader.Open(stream: System.IO.Stream, password: string, openmode: PdfDocumentOpenMode, passwordProvider: PdfPasswordProvider) : PdfSharp.Pdf.PdfDocument
(+0 other overloads)
type PdfDocumentOpenMode =
| Modify = 0
| Import = 1
| ReadOnly = 2
| InformationOnly = 3
Full name: PdfSharp.Pdf.IO.PdfDocumentOpenMode
field PdfDocumentOpenMode.ReadOnly = 2
val result : StringBuilder
val page : PdfSharp.Pdf.PdfPage
property PdfSharp.Pdf.PdfDocument.Pages: PdfSharp.Pdf.PdfPages
val content : CSequence
type ContentReader =
static member ReadContent : page:PdfPage -> CSequence + 1 overload
Full name: PdfSharp.Pdf.Content.ContentReader
ContentReader.ReadContent(content: byte []) : CSequence
ContentReader.ReadContent(page: PdfSharp.Pdf.PdfPage) : CSequence
StringBuilder.AppendLine() : StringBuilder
StringBuilder.AppendLine(value: string) : StringBuilder
StringBuilder.ToString() : string
StringBuilder.ToString(startIndex: int, length: int) : string
val text : string
Full name: Script.text
More information