3 people like it.
Like the snippet!
PDF Text Extract
Extracts the text content of a PDF file using PDFsharp.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
|
open System.Text
open PdfSharp.Pdf.IO
open PdfSharp.Pdf.Content
open PdfSharp.Pdf.Content.Objects
/// Recursively walks a parsed PDF content object and appends the string
/// operands of every text-showing operator (Tj / TJ) to `sb`.
///
/// A single space is appended after each Tj/TJ operator. Comments, names,
/// numbers and every other operator contribute nothing. An object that is
/// none of the handled kinds raises System.NotImplementedException whose
/// message is the object's ToString() text.
let rec extractText(content:CObject, sb:StringBuilder) =
    match content with
    // Container: visit each child in order.
    | :? CArray as items -> for item in items do extractText(item, sb)
    // Kinds that carry no text of their own.
    | :? CComment | :? CInteger | :? CName | :? CNumber -> ()
    | :? COperator as op ->
        match op.OpCode.OpCodeName with
        | OpCodeName.Tj | OpCodeName.TJ ->
            // Tj/TJ = Show text: descend into the operands, then separate
            // this run from whatever follows with one space.
            for operand in op.Operands do extractText(operand, sb)
            sb.Append(" ") |> ignore
        | _ -> ()
    // Container: visit each child in order.
    | :? CSequence as items -> for item in items do extractText(item, sb)
    // Literal string: append its value.
    | :? CString as str -> sb.Append(str.Value) |> ignore
    | other -> raise (System.NotImplementedException(other.ToString()))
/// Opens the PDF at `pdfPath` in read-only mode and returns the text
/// collected from all of its pages as one string.
///
/// Pages are processed in document order; after each page's text a line
/// terminator is appended. The document is bound with `use`, so it is
/// disposed when the function exits.
let readAllText (pdfPath:string) =
    use pdf = PdfReader.Open(pdfPath, PdfDocumentOpenMode.ReadOnly)
    let buffer = StringBuilder()
    for pdfPage in pdf.Pages do
        // Parse the page's content and harvest its text into the buffer.
        extractText(ContentReader.ReadContent(pdfPage), buffer)
        buffer.AppendLine() |> ignore
    buffer.ToString()
// Example usage: replace the placeholder with the path of a real PDF file.
let text = @"<path>.pdf" |> readAllText
|
namespace System
namespace System.Text
val extractText : content:'a * sb:StringBuilder -> 'b
val content : 'a
val sb : StringBuilder
Multiple items
type StringBuilder =
new : unit -> StringBuilder + 5 overloads
member Append : value:string -> StringBuilder + 23 overloads
member AppendFormat : format:string * arg0:obj -> StringBuilder + 7 overloads
member AppendJoin : separator:string * [<ParamArray>] values:obj[] -> StringBuilder + 5 overloads
member AppendLine : unit -> StringBuilder + 1 overload
member Capacity : int with get, set
member Chars : int -> char with get, set
member Clear : unit -> StringBuilder
member CopyTo : sourceIndex:int * destination:Span<char> * count:int -> unit + 1 overload
member EnsureCapacity : capacity:int -> int
...
nested type ChunkEnumerator
--------------------
StringBuilder() : StringBuilder
StringBuilder(capacity: int) : StringBuilder
StringBuilder(value: string) : StringBuilder
StringBuilder(value: string, capacity: int) : StringBuilder
StringBuilder(capacity: int, maxCapacity: int) : StringBuilder
StringBuilder(value: string, startIndex: int, length: int, capacity: int) : StringBuilder
StringBuilder.Append(value: System.ReadOnlyMemory<char>) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: System.ReadOnlySpan<char>) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: char []) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: obj) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: uint64) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: uint32) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: uint16) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: decimal) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: float) : StringBuilder
(+0 other overloads)
StringBuilder.Append(value: float32) : StringBuilder
(+0 other overloads)
val ignore : value:'T -> unit
val raise : exn:System.Exception -> 'T
Multiple items
type NotImplementedException =
inherit SystemException
new : unit -> NotImplementedException + 2 overloads
--------------------
System.NotImplementedException() : System.NotImplementedException
System.NotImplementedException(message: string) : System.NotImplementedException
System.NotImplementedException(message: string, inner: exn) : System.NotImplementedException
val readAllText : pdfPath:string -> string
val pdfPath : string
Multiple items
val string : value:'T -> string
--------------------
type string = System.String
val document : System.IDisposable
val result : StringBuilder
val page : obj
val content : obj
StringBuilder.AppendLine() : StringBuilder
StringBuilder.AppendLine(value: string) : StringBuilder
StringBuilder.ToString() : string
StringBuilder.ToString(startIndex: int, length: int) : string
val text : string
More information