6 people like it.

Read a Stack Overflow article aloud automatically

This snippet automatically reads aloud the text of the question, answers, and comments in a Stack Overflow article. It requires HtmlAgilityPack (available from the NuGet package manager). (Attention: you need to reset F# Interactive to stop the speech.)

Html formatting functions

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
open System.Text.RegularExpressions
/// Strips HTML tags from `text`, leaving only the raw text between them.
/// Every shortest `<...>` run is deleted. Singleline mode lets `.` match
/// newlines too, so a tag whose attributes wrap across lines is removed
/// as well instead of being left in the output.
let removeTags (text: string) : string =
  Regex.Replace(text, "<.*?>", "", RegexOptions.Singleline)
/// Rewrites fragments of `text` so that they are pronounced correctly.
/// The rules are regex (pattern, replacement) pairs applied in order, each
/// one to the output of the previous one:
///   - a "." directly after a non-word character becomes a space
///   - a "." directly before a non-word character becomes a space
///   - "F#" becomes "F sharp"
///   - ";" and ":" become a space
let replaceWords text =
  [
    // .NET regex syntax: a capture group is written (...) and referenced as
    // $1 in the replacement. \( and \) would match literal parentheses, and
    // \1 in a replacement string would be emitted verbatim.
    @"(\W)\."   , "$1 "
    @"\.(\W)"   , " $1"
    "F#"        , "F sharp"
    ";|:"       , " "
  ]
  |> Seq.fold (fun input (pattern,replacement) -> 
    Regex.Replace(input,pattern,replacement)) text


open System.Web
/// Decodes HTML-escaped characters in `text` (for example `&lt;` back to `<`)
/// by delegating to `HttpUtility.HtmlDecode`.
let unescapeHtml (text: string) : string =
  HttpUtility.HtmlDecode(text)


/// Runs the whole clean-up pipeline on an HTML fragment: tags are stripped
/// first, then HTML entities are decoded, and finally the pronunciation
/// replacements are applied.
let formatHtml text =
  (removeTags >> unescapeHtml >> replaceWords) text

Get the text to speak

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
//TODO: please rewrite the below reference to your HtmlAgilityPack.dll
#r @"C:\Users\nagat01\Documents\Visual Studio 11\Projects\WebTrawler\packages\HtmlAgilityPack.1.4.3\lib\HtmlAgilityPack.dll"
open HtmlAgilityPack
/// XPath union expression selecting the nodes whose content is read aloud:
/// links inside the div with id "question-header", paragraphs inside divs
/// with class "post-text", spans with class "comment-copy", and anchors
/// with class "comment-user".
let xpath = 
  """
    //div[@id="question-header"]//a
  | //div[@class="post-text"]//p
  | //span[@class="comment-copy"]
  | //a[@class="comment-user"]
  """
/// Downloads the page at `url`, selects every node matched by `xpath`,
/// formats each node's inner HTML with `formatHtml`, and joins the results
/// with newlines.
/// Returns an empty string when no node on the page matches `xpath`.
let getTextToSheech url =
  let documentNode = HtmlWeb().Load(url).DocumentNode
  // SelectNodes yields null (not an empty collection) when nothing matches,
  // so guard against it instead of iterating a null reference.
  match documentNode.SelectNodes xpath with
  | null -> ""
  | nodes ->
    nodes
    |> Seq.map (fun node -> formatHtml node.InnerHtml)
    |> String.concat "\n"

Usage

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
#r "System.Speech"  
open System.Speech.Synthesis
/// Fetches the speakable text for `url`, echoes it to standard output, and
/// then reads it aloud with a `SpeechSynthesizer` whose rate is set to -3.
let speech url =
  let spoken = getTextToSheech url
  printfn "%s" spoken
  // The synthesizer is disposed when this function returns.
  use synth = new SpeechSynthesizer()
  synth.Rate <- -3
  synth.Speak spoken

// Sample article; evaluating the last line downloads it and starts speaking.
let url = "http://stackoverflow.com/questions/181613/hidden-features-of-f"
url |> speech
namespace System
namespace System.Text
namespace System.Text.RegularExpressions
val removeTags : text:string -> string

Full name: Script.removeTags


 remove html tags to make raw text
val text : string
Multiple items
type Regex =
  new : pattern:string -> Regex + 1 overload
  member GetGroupNames : unit -> string[]
  member GetGroupNumbers : unit -> int[]
  member GroupNameFromNumber : i:int -> string
  member GroupNumberFromName : name:string -> int
  member IsMatch : input:string -> bool + 1 overload
  member Match : input:string -> Match + 2 overloads
  member Matches : input:string -> MatchCollection + 1 overload
  member Options : RegexOptions
  member Replace : input:string * replacement:string -> string + 5 overloads
  ...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator) : string
Regex.Replace(input: string, pattern: string, replacement: string) : string
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator, options: RegexOptions) : string
Regex.Replace(input: string, pattern: string, replacement: string, options: RegexOptions) : string
val replaceWords : text:string -> string

Full name: Script.replaceWords


 replace some words for correct pronunciation
module Seq

from Microsoft.FSharp.Collections
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State

Full name: Microsoft.FSharp.Collections.Seq.fold
val input : string
val pattern : string
val replacement : string
namespace System.Web
val unescapeHtml : text:string -> string

Full name: Script.unescapeHtml


 unescape some escaped characters
Multiple items
type HttpUtility =
  new : unit -> HttpUtility
  static member HtmlAttributeEncode : s:string -> string + 1 overload
  static member HtmlDecode : s:string -> string + 1 overload
  static member HtmlEncode : s:string -> string + 2 overloads
  static member JavaScriptStringEncode : value:string -> string + 1 overload
  static member ParseQueryString : query:string -> NameValueCollection + 1 overload
  static member UrlDecode : str:string -> string + 3 overloads
  static member UrlDecodeToBytes : str:string -> byte[] + 3 overloads
  static member UrlEncode : str:string -> string + 3 overloads
  static member UrlEncodeToBytes : str:string -> byte[] + 3 overloads
  ...

Full name: System.Web.HttpUtility

--------------------
HttpUtility() : unit
HttpUtility.HtmlDecode(s: string) : string
HttpUtility.HtmlDecode(s: string, output: IO.TextWriter) : unit
val formatHtml : text:string -> string

Full name: Script.formatHtml


 do all formatting operations
val xpath : string

Full name: Script.xpath


 this xpath specifies the text to speech
val getTextToSheech : url:'a -> string

Full name: Script.getTextToSheech
val url : 'a
val documentNode : obj
Multiple items
val seq : sequence:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Core.Operators.seq

--------------------
type seq<'T> = Collections.Generic.IEnumerable<'T>

Full name: Microsoft.FSharp.Collections.seq<_>
val text : obj
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
val concat : sep:string -> strings:seq<string> -> string

Full name: Microsoft.FSharp.Core.String.concat
namespace System.Speech
namespace System.Speech.Synthesis
val speech : url:'a -> unit

Full name: Script.speech
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
val speechSynthesizer : SpeechSynthesizer
Multiple items
type SpeechSynthesizer =
  new : unit -> SpeechSynthesizer
  member AddLexicon : uri:Uri * mediaType:string -> unit
  member Dispose : unit -> unit
  member GetCurrentlySpokenPrompt : unit -> Prompt
  member GetInstalledVoices : unit -> ReadOnlyCollection<InstalledVoice> + 1 overload
  member Pause : unit -> unit
  member Rate : int with get, set
  member RemoveLexicon : uri:Uri -> unit
  member Resume : unit -> unit
  member SelectVoice : name:string -> unit
  ...

Full name: System.Speech.Synthesis.SpeechSynthesizer

--------------------
SpeechSynthesizer() : unit
SpeechSynthesizer.Speak(promptBuilder: PromptBuilder) : unit
SpeechSynthesizer.Speak(prompt: Prompt) : unit
SpeechSynthesizer.Speak(textToSpeak: string) : unit
val url : string

Full name: Script.url
Next Version Raw view Test code New version

More information

Link:http://fssnip.net/bX
Posted:12 years ago
Author:nagat01
Tags: web , html , parsing , regular expressions