open System open System.Net open System.Text open System.IO open System.Text.RegularExpressions //just a short snippet to measure time spent in an eagerly executed function //not gonna work with lazy function, e.g. function returning a sequence (IEnumerable) let time jobName job = let startTime = DateTime.Now; let returnValue = job() let endTime = DateTime.Now; printfn "%s took %d ms" jobName (int((endTime - startTime).TotalMilliseconds)) returnValue //goes through 2 lists in linear time and looks for equal keys //for elements with equal keys a given function is called to produce output merged element //NOTE: //1. function assumes that keys in the second list are unique, otherwise results will be surprising, see (*) below //2. function assumes both lists are ordered ascending let rec orderedListsMerge xs ys keyExtractor merger = match xs, ys with | [],_ | _,[] -> [] | x::xs', y::ys' -> let xkey = keyExtractor x let ykey = keyExtractor y if(xkey = ykey) then //here we move xs forward, but keep ys the same, //because we assume that next y will have different key while next x might still have the same key, //otherwise this logic is incorrect (merger x y) :: orderedListsMerge xs' ys keyExtractor merger // (*) elif(xkey > ykey) then orderedListsMerge xs ys' keyExtractor merger else orderedListsMerge xs' ys keyExtractor merger let webRequestHtml (url : string) = let req = WebRequest.Create(url) let resp = req.GetResponse() let stream = resp.GetResponseStream() let reader = new StreamReader(stream, Encoding.GetEncoding("Windows-1251")) //don't forget the Encoding, when you work with international documents let html = reader.ReadToEnd() resp.Close() html let regexSingleLineMatch input pattern = Regex.Match(input, pattern, RegexOptions.Singleline).Groups.Item(1).Value let regexMatches input pattern = seq { for m in Regex.Matches(input, pattern) -> m.Groups.Item(1).Value } //only named hrefs point to poems let extractNamedHrefs html = //I tried XmlDocument here, but it doesn't work as HTML can contain some "invalid" elements like   //Stand back now, I'm going to use regular expressions! let hrefPattern = ".*" regexMatches html hrefPattern //remove all html markup from the line let cleanupHtml text = let htmlTagPattern = "<.+?>" Regex.Replace(text, htmlTagPattern, String.Empty) //remove all html markup from the line let takeFirstLine text = let firstLinePattern = "(.*)" Regex.Match(text, firstLinePattern).Groups.Item(1).Value type Poem(poemHref : string, title : string, lines : seq) = let MAX_TITLE_LENGTH = 30 member this.Href = poemHref member this.Title = let newTitle = match title with | "* * *" -> (lines |> Seq.nth 0) | _ -> title if(newTitle.Length > MAX_TITLE_LENGTH) then newTitle.Substring(0, MAX_TITLE_LENGTH-3) + "..." else newTitle member this.Lines = seq { for line in lines -> let nbspPattern = " " Regex.Replace(line, nbspPattern, "") } member this.LineTokens = seq { for line in lines -> let russianWordPattern = "([а-яА-Я]+)" regexMatches (line.ToLower()) russianWordPattern } //TODO: add more structural analysis -> handle sub-titles and personas let producePoem poemHref poemHtml = //titles can be multiline, sometimes they include sub-titles let titlePattern = "

(.+?)

" let linePattern = "(.+?)" new Poem( poemHref, (regexSingleLineMatch poemHtml titlePattern) |> cleanupHtml |> takeFirstLine, regexMatches poemHtml linePattern |> Seq.map cleanupHtml) //check that the given link is a link to a final edition poem, not early edition to avoid duplicate texts in index let isFinalEditionHref (href : string) = not (href.Contains("03edit")) let crawlPoems = let domainUrl = "http://www.rvb.ru/pushkin/" let volumeUrlTemplate = domainUrl + "tocvol{0}.htm" let poemUrlTemplate = domainUrl + "{0}" //take only first 4 volumes -- they contain poems seq { for volumeNumber in 1..4 -> String.Format(volumeUrlTemplate, volumeNumber) } |> Seq.map webRequestHtml |> Seq.collect extractNamedHrefs |> Seq.filter isFinalEditionHref |> Seq.map (fun href -> String.Format(poemUrlTemplate, href)) // //development mode -- comment later // |> Seq.take 40 //requesting individual poems |> Seq.map (fun href -> (producePoem href (webRequestHtml href))) |> Seq.cache //building inversed index of tokens in poems //so that we have a way to index (token -> poem number -> (line number,position in line)) let indexPoems (poems : seq) = poems |> Seq.mapi ( fun poemNumber poem -> poem.LineTokens |> Seq.mapi ( fun lineNumber tokens -> tokens |> Seq.mapi ( fun position token -> (token, poemNumber, lineNumber, position) ) ) |> Seq.concat ) |> Seq.concat //now we have raw list of tuples, we will turn it into ordered inversed index |> Seq.groupBy (fun (token, _, _, _) -> token) |> Seq.sortBy (fun (token, _) -> token) |> Seq.map ( fun (token, tuples) -> let poems = tuples |> Seq.map ( fun (token, poemNumber, lineNumber, position) -> (poemNumber,lineNumber,position) ) |> Seq.groupBy (fun (poemNumber,lineNumber,position) -> poemNumber) |> Seq.sortBy (fun (poemNumber, _) -> poemNumber) |> Seq.map ( fun (poemNumber, tuples) -> let linesPositions = tuples |> Seq.map (fun (poemNumber,lineNumber,position) -> (lineNumber,position)) |> Seq.sortBy ( fun (lineNumber,position) -> position) |> Seq.sortBy ( fun (lineNumber,position) -> lineNumber) //sortBy is stable according to MSDN |> Seq.toList (poemNumber, linesPositions) ) |> Seq.toList (token, poems) ) |> Seq.toList //token index is a subtree of full index only including poems and lines with the given token in given position let tokenIndex fullIndex filterToken filterPosition = let (token, poems) = fullIndex |> List.find (fun(token, poems) -> token=filterToken) poems |> List.map ( fun (poemNumber, linesPositions) -> let filteredLines = linesPositions |> List.filter (fun (lineNumber, position) -> position = filterPosition) (poemNumber, filteredLines) ) |> List.filter (fun (poemNumber, linesPositions) -> not (Seq.isEmpty linesPositions)) //intersect current index with token index //we want to only keep tokens and poems which are present in the token index (which is a subtree of full index, see above) let intersectIndex currentIndex tokenIndex = currentIndex |> List.map ( fun (token, poems) -> let mergePoems currentPoems tokenPoems = let mergeLinesPositions currentLinesPositions tokenLinesPositions = let keyExtractor = (fun (lineNumber, _) -> lineNumber) let merger = (fun (currentLineNumber, currentPosition) (_,_) -> (currentLineNumber, currentPosition)) orderedListsMerge currentLinesPositions tokenLinesPositions keyExtractor merger let keyExtractor = (fun (poemNumber, _) -> poemNumber) let merger = (fun (currentPoemNumber, currentLinesPositions) (_, tokenLinesPositions) -> (currentPoemNumber, mergeLinesPositions currentLinesPositions tokenLinesPositions)) orderedListsMerge currentPoems tokenPoems keyExtractor merger |> List.filter (fun (poemNumber, linesPositions) -> not (List.isEmpty linesPositions)) (token, mergePoems poems tokenIndex) ) |> List.filter (fun (token, poems) -> not (List.isEmpty poems)) // The main function to query reverse index // index -- index per se, we assume that the index is already filtered by caller using intersect\tokenFilter // findPosition -- number of position to search tokens for // the query function will return a list of terms that can be in this position // count -- number of most frequent terms to return let queryIndex index findPosition count = index |> List.map ( fun (token, poems) -> let tokenFreq = poems |> List.sumBy ( fun (_, linesPositions) -> linesPositions |> List.sumBy ( fun (lineNumber, position) -> if (position = findPosition) then 1 else 0 ) ) (token, tokenFreq) ) |> Seq.filter (fun (token, tokenFreq) -> tokenFreq > 0) |> Seq.sortBy (fun (token, tokenFreq) -> -tokenFreq) |> Seq.zip [1..count] // Seq.take fails if there is less than "count" elements |> Seq.map (fun (index, element) -> element) |> Seq.toList //acquire first poem for given token and position //used to resolve single query result token into poems let getPoemResult index findToken findPosition = let (token, poems) = index |> List.find (fun (token, poems) -> token = findToken) poems |> List.collect ( fun (poemNumber, linesPositions) -> linesPositions |> List.filter (fun (lineNumber, position) -> position = findPosition) |> List.map (fun (lineNumber, position) -> (poemNumber, lineNumber)) ) |> Seq.nth 0 type QueryResult = //token + count | LineVariant of string*int //poemNumber, lineNumber | SinglePoem of int*int //TODO: identical strings currently will not be resolved to their poems //this is a wrapper around query index that will perform the same action, //but the result will be translated and wrapped into QueryResult type //single result tokens will be returned as (poemNumber, lineNumber) tuple let wrappedQueryIndex filteredIndex searchPosition count = queryIndex filteredIndex searchPosition count |> List.map ( fun (token, count) -> match count with | 1 -> SinglePoem(getPoemResult filteredIndex token searchPosition) | _ -> LineVariant(token,count) ) type PrettyResult = | PrettyLineVariant of string*int | PrettySinglePoem of string*string*int*string let prettifyQueryResults queryResults poems = queryResults |> List.map ( fun result -> match result with | SinglePoem (poemNumber, lineNumber) -> let (poem : Poem) = poems |> Seq.nth poemNumber let line = poem.Lines |> Seq.nth lineNumber PrettySinglePoem(poem.Title, poem.Href, lineNumber, line) | LineVariant (token, count) -> PrettyLineVariant(token,count) ) type PushkinTreeNode = | VariantNode of string*int*seq | PoemNode of string*string*int*string let createPushkinTree pushkinPoems poemsIndex count = let rec createTreeLevel count currentQuery currentIndex = let searchPosition = List.length currentQuery let queryResult = wrappedQueryIndex currentIndex searchPosition count let prettyResult = prettifyQueryResults queryResult pushkinPoems prettyResult |> List.map ( fun result -> match result with | PrettySinglePoem (title, href, lineNumber, line) -> PoemNode(title, href, lineNumber, line) | PrettyLineVariant (token, freq) -> VariantNode(token, freq, createTreeLevel count (currentQuery @ [token]) (intersectIndex currentIndex (tokenIndex currentIndex token searchPosition))) ) createTreeLevel count [] poemsIndex let resultsToHtml pushkinTree = let rec treeToHtml tree currentPath (currentNumber:int) startingNumber = let zippedTreeLevel = tree |> Seq.zip (Seq.initInfinite (fun i -> startingNumber+i)) let pathToString path = let parts = path |> Seq.map (fun x -> "'"+x+"'") String.Join(",", parts) let thisLevelStart = String.Format("
", currentNumber) + Environment.NewLine let thisLevelTable = zippedTreeLevel |> Seq.fold ( fun acc treeNode -> match treeNode with | (number, PoemNode (title, href, lineNumber, line)) -> acc + String.Format("",line, href, title, lineNumber+1) + Environment.NewLine | (number, VariantNode (token, freq, subtree)) -> acc + String.Format("", number, token, (pathToString (currentPath@[string(number)])), freq) + Environment.NewLine ) "" let thisLevelEnd = @"
{0}{2}, строка {3}
{1} ⇒ {3}
" + Environment.NewLine + Environment.NewLine let thisLevelOutput = (thisLevelStart+thisLevelTable+thisLevelEnd) let levelLength = tree |> Seq.length let (subTreeCount, subTreeOutput) = zippedTreeLevel |> Seq.fold ( fun (acc, result) treeNode -> match treeNode with | (number, PoemNode (title, href, lineNumber, line)) -> (acc, result) | (number, VariantNode (token, freq, subtree)) -> let (subTreeCount, subTreeOutput) = treeToHtml subtree (currentPath@[string(number)]) number (startingNumber+acc+levelLength) (acc + subTreeCount, result + subTreeOutput) ) (0, "") (levelLength + subTreeCount, thisLevelOutput + subTreeOutput) let (_, content) = treeToHtml pushkinTree ["0"] 0 1 content let outputResultsToFile (content:string) = let templateFile = "template.htm" let outputFile = "output.htm" let templateReplacePattern = "#HERE_GOES_CONTENT#" let templateHtml = File.ReadAllText(templateFile) let resultHtml = Regex.Replace(templateHtml, templateReplacePattern, content) File.WriteAllText(outputFile, resultHtml) let poems = crawlPoems //lazy operation, so we can't time it here, we do it the in next line printfn "Crawled %d poems" (time "Crawling poems" (fun() -> poems |> Seq.length)) printfn "Crawled %d lines" (poems |> Seq.sumBy ( fun poem -> poem.Lines |> Seq.length)) let poemIndex = time "Indexing poems" (fun () -> poems |> indexPoems) printfn "Index contains %d terms" poemIndex.Length let tree = time "Generating result tree" (fun () -> createPushkinTree poems poemIndex 20) let htmlContent = time "Generating html content" (fun () -> resultsToHtml tree) time "Output content" (fun() -> outputResultsToFile htmlContent) //let queryResult = queryIndex poemIndex ["но"] 10 //let prettyResult = prettifyQueryResults queryResult poems