4 people like it.

Extract script blocks from html page

This module extracts all the script blocks out of an HTML page's header. This can be useful if you are writing tools to merge all the JS for future minification, or for otherwise manipulating the files.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
56: 
namespace MergeJsFiles

open HtmlAgilityPack 
open System.Text.RegularExpressions
open System.IO
open System

module JsRetriever =
    
    /// Strips `<script>` markup from `text` by running a fixed, ordered list of
    /// regex replacements over it, trimming whitespace after every step.
    /// For a tag such as `<script type="text/javascript" src="a.js"></script>`
    /// what remains is the bare `src` value (`a.js`).
    /// If any replacement throws, the error is written to the console and ""
    /// is returned.
    let stripHtml (text:string) = 
        // Order matters: each rule runs against the output of the previous one
        // (e.g. "</script>" has to be removed before the bare ">" and "<" rules).
        let rules = 
            [ "<script\s*", ""
              "\"?\s*type\s*=\s*\"\s*text/javascript\s*\"\s*", ""
              "</script>", ""
              "src\s*=\s*", ""
              "\"", ""
              ">", ""
              "</", ""
              "<", "" ]

        // One step of the fold: apply a single rule, then trim the result.
        let applyRule (current:string) (pattern:string, replacement:string) =
            Regex.Replace(current, pattern, replacement).Trim()

        try
            rules |> List.fold applyRule text
        with ex ->
            Console.WriteLine ("Error handling " + text + ", " + ex.ToString())
            ""

    /// Resolves `path` relative to the directory that contains the file `parent`
    /// and returns the resulting full (normalized) path.
    /// If the path cannot be built (for example `parent` has no directory part,
    /// or `path` is not a valid path), the failure is written to the console
    /// and "" is returned.
    let convertToAbsolute (parent:string) (path:string) =
        try            
            let parentDirectory = Path.GetDirectoryName(parent)
            Path.GetFullPath(Path.Combine(parentDirectory, path))
        with
            | ex -> 
                // Report the exception as well as the offending path, the same way
                // stripHtml does, so the cause of the failure is not lost.
                Console.WriteLine ("Error handling " + path + ", " + ex.ToString())
                ""
        

    /// Returns true when the extension of `file` (as reported by
    /// Path.GetExtension, i.e. including the leading dot) equals `ext`.
    /// The comparison ignores case, so ".js" also matches "App.JS".
    let endsOn (ext:string) (file:string) = 
        String.Equals(Path.GetExtension(file), ext, StringComparison.OrdinalIgnoreCase)
            
    /// Loads the HTML page at `defaultAspxPath` and returns the absolute paths of
    /// the `.js` files referenced by the script nodes selected under
    /// `/html/head`. Relative references are resolved against the directory of
    /// `defaultAspxPath`. Entries that fail to convert (they come back as "")
    /// or that do not have a `.js` extension are dropped.
    /// Returns an empty sequence when the page has no matching script nodes.
    let getJsFiles (defaultAspxPath:string) = 
        let doc = new HtmlDocument()

        doc.Load defaultAspxPath

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
        // when the XPath matches nothing; Seq.map would throw on a null source.
        match doc.DocumentNode.SelectNodes "/html/head/script/@src" with
        | null -> Seq.empty
        | scriptNodes ->
            scriptNodes
            |> Seq.map (fun node -> node.OuterHtml) 
            |> Seq.map stripHtml            
            |> Seq.map (convertToAbsolute defaultAspxPath)
            |> Seq.filter (endsOn ".js")
namespace HtmlAgilityPack
namespace System
namespace System.Text
namespace System.Text.RegularExpressions
namespace System.IO
module JsRetriever

from MergeJsFiles
val stripHtml : text:string -> string

Full name: MergeJsFiles.JsRetriever.stripHtml
val text : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
val mutable target : string
val regex : (string * string) list
val pattern : string
val replacement : string
Multiple items
type Regex =
  new : pattern:string -> Regex + 1 overload
  member GetGroupNames : unit -> string[]
  member GetGroupNumbers : unit -> int[]
  member GroupNameFromNumber : i:int -> string
  member GroupNumberFromName : name:string -> int
  member IsMatch : input:string -> bool + 1 overload
  member Match : input:string -> Match + 2 overloads
  member Matches : input:string -> MatchCollection + 1 overload
  member Options : RegexOptions
  member Replace : input:string * replacement:string -> string + 5 overloads
  ...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator) : string
Regex.Replace(input: string, pattern: string, replacement: string) : string
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator, options: RegexOptions) : string
Regex.Replace(input: string, pattern: string, replacement: string, options: RegexOptions) : string
val ex : exn
type Console =
  static member BackgroundColor : ConsoleColor with get, set
  static member Beep : unit -> unit + 1 overload
  static member BufferHeight : int with get, set
  static member BufferWidth : int with get, set
  static member CapsLock : bool
  static member Clear : unit -> unit
  static member CursorLeft : int with get, set
  static member CursorSize : int with get, set
  static member CursorTop : int with get, set
  static member CursorVisible : bool with get, set
  ...

Full name: System.Console
Console.WriteLine() : unit
   (+0 other overloads)
Console.WriteLine(value: string) : unit
   (+0 other overloads)
Console.WriteLine(value: obj) : unit
   (+0 other overloads)
Console.WriteLine(value: uint64) : unit
   (+0 other overloads)
Console.WriteLine(value: int64) : unit
   (+0 other overloads)
Console.WriteLine(value: uint32) : unit
   (+0 other overloads)
Console.WriteLine(value: int) : unit
   (+0 other overloads)
Console.WriteLine(value: float32) : unit
   (+0 other overloads)
Console.WriteLine(value: float) : unit
   (+0 other overloads)
Console.WriteLine(value: decimal) : unit
   (+0 other overloads)
Exception.ToString() : string
val convertToAbsolute : parent:string -> path:string -> string

Full name: MergeJsFiles.JsRetriever.convertToAbsolute
val parent : string
val path : string
type Path =
  static val DirectorySeparatorChar : char
  static val AltDirectorySeparatorChar : char
  static val VolumeSeparatorChar : char
  static val InvalidPathChars : char[]
  static val PathSeparator : char
  static member ChangeExtension : path:string * extension:string -> string
  static member Combine : [<ParamArray>] paths:string[] -> string + 3 overloads
  static member GetDirectoryName : path:string -> string
  static member GetExtension : path:string -> string
  static member GetFileName : path:string -> string
  ...

Full name: System.IO.Path
Path.Combine([<ParamArray>] paths: string []) : string
Path.Combine(path1: string, path2: string) : string
Path.Combine(path1: string, path2: string, path3: string) : string
Path.Combine(path1: string, path2: string, path3: string, path4: string) : string
Path.GetDirectoryName(path: string) : string
Path.GetFullPath(path: string) : string
val endsOn : ext:string -> file:string -> bool

Full name: MergeJsFiles.JsRetriever.endsOn
val ext : string
val file : string
Path.GetExtension(path: string) : string
val getJsFiles : defaultAspxPath:string -> seq<string>

Full name: MergeJsFiles.JsRetriever.getJsFiles
val defaultAspxPath : string
val doc : HtmlDocument
Multiple items
type HtmlDocument =
  new : unit -> HtmlDocument
  val OptionAddDebuggingAttributes : bool
  val OptionAutoCloseOnEnd : bool
  val OptionCheckSyntax : bool
  val OptionComputeChecksum : bool
  val OptionDefaultStreamEncoding : Encoding
  val OptionExtractErrorSourceText : bool
  val OptionExtractErrorSourceTextMaxLength : int
  val OptionFixNestedTags : bool
  val OptionOutputAsXml : bool
  ...

Full name: HtmlAgilityPack.HtmlDocument

--------------------
HtmlDocument() : unit
HtmlDocument.Load(reader: TextReader) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, encoding: Text.Encoding) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string, encoding: Text.Encoding) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, buffersize: int) : unit
   (+0 other overloads)
property HtmlDocument.DocumentNode: HtmlNode
HtmlNode.SelectNodes(xpath: string) : HtmlNodeCollection
module Seq

from Microsoft.FSharp.Collections
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
val i : HtmlNode
property HtmlNode.OuterHtml: string
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.filter
Raw view New version

More information

Link:http://fssnip.net/iR
Posted:3 years ago
Author:devshorts
Tags: html , parsing