Web Crawler example in XQuery

This page describes a web crawler example written in XQuery.

The idea is to crawl through the pages of a website, keeping a list of internal and external pages, and to check whether each of them works. This example uses Zorba's http-client module for accessing the web pages, and the html module for converting the HTML to XML. The complete code can be found in the test directory of the html converter module.

import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";

(: namespaces used by the %ann:sequential annotations and the request elements below :)
declare namespace ann = "http://www.zorba-xquery.com/annotations";
declare namespace httpsch = "http://expath.org/ns/http-client";

The internal pages are checked recursively, while the external ones are only checked for existence. The distinction between internal and external links is made by comparing the URI with the global string variable $uri-host. Change this variable to point to your website, or to a subdirectory of your website.

declare variable $top-uri  as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";

declare function local:is-internal($x as xs:string) as xs:boolean
{
 starts-with($x, $uri-host)
};

The crawling starts from the URI pointed to by $top-uri.

Visited links are stored as nodes in two maps, one for internal pages and one for external pages. The keys are the URIs, and the values are the strings "broken" or "clean". The maps are used to avoid processing the same page twice.

declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
declare variable $local:processed-external-links := xs:QName("processed-external-links");

declare %ann:sequential function local:create-containers()
{
  map:create($local:processed-internal-links, xs:QName("xs:string"));
  map:create($local:processed-external-links, xs:QName("xs:string"));
};

declare %ann:sequential function local:delete-containers()
{
  for $x in map:available-maps()
  return map:delete($x);
};

After an internal page is parsed with the html module, all its links are extracted and processed recursively, unless they have already been processed. The html module uses the Tidy library, so we pass Tidy options that configure the conversion from HTML to XML. Some HTML tags are declared as inline tags in the new-inline-tags parameter; this is specific to this particular website. You can add or remove tags to suit your website's needs.

declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
{
  distinct-values(
    for $y in ($content//*:a/string(@href),
               $content//*:link/string(@href),
               $content//*:script/string(@src),
               $content//*:img/string(@src),
               $content//*:area/string(@href))
    return local:get-real-link($y, $uri))
};
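
The helper local:get-real-link is defined in the complete example but not shown here. A minimal sketch of such a helper, assuming it drops in-page anchors and non-HTTP schemes and resolves relative links against the URI of the page they were found on:

declare function local:get-real-link($href as xs:string, $uri as xs:string) as xs:string?
{
  (: hypothetical sketch: skip fragment-only and non-HTTP links,
     resolve everything else against the containing page's URI :)
  if (starts-with($href, "#") or
      starts-with($href, "mailto:") or
      starts-with($href, "javascript:"))
  then ()
  else xs:string(resolve-uri($href, $uri))
};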

declare function local:tidy-options()
{
  <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options">
    <tidyParam name="output-xml" value="yes" />
    <tidyParam name="doctype" value="omit" />
    <tidyParam name="quote-nbsp" value="no" />
    <tidyParam name="char-encoding" value="utf8" />
    <tidyParam name="newline" value="LF" />
    <tidyParam name="tidy-mark" value="no" />
    <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
  </options>
};

declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer)
{
  (: stop at the maximum crawling depth :)
  if ($n = 3) then exit returning (); else {}

  (: skip links that have already been processed :)
  if (not(empty(map:get($local:processed-internal-links, $x))))
  then exit returning false();
  else {}

  variable $http-call := ();
  try {
    $http-call := http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
  }
  catch * {}

  (: unreachable pages are marked as broken :)
  if (not(local:alive($http-call)))
  then { map:insert($local:processed-internal-links, "broken", $x); exit returning (); }
  else {}

  (: pages with unsupported media types are reachable but are not crawled further :)
  if (not(local:get-media-type($http-call[1]) = $supported-media-types))
  then { map:insert($local:processed-internal-links, "clean", $x); exit returning (); }
  else {}

  (: mark the page as clean up front so it is not processed a second time :)
  map:insert($local:processed-internal-links, "clean", $x);

  variable $string-content := xs:string($http-call[2]);
  variable $content := ();
  try {
    $content := html:parse($string-content, local:tidy-options());
  }
  catch * {
    map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x);
    try {
      $content := parse-xml:parse-xml-fragment($string-content, "");
    }
    catch * {
      map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);
    }
  }

  (: extract the links and crawl them recursively :)
  variable $links := ();
  if (empty($content))
  then $links := local:get-out-links-unparsed($string-content, $x);
  else $links := local:get-out-links-parsed($content, $x);

  for $l in $links
  return local:process-link($l, $n + 1);
};
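
The function above relies on the helpers local:alive and local:get-media-type and on the global variable $supported-media-types, all of which belong to the complete example but are not shown in this excerpt. A minimal sketch, assuming the first item returned by http:send-request is an EXPath-style response element carrying the status code and the body's media type as attributes:

declare variable $supported-media-types := ("text/html", "application/xhtml+xml");

declare function local:alive($http-call as item()*) as xs:boolean
{
  (: hypothetical: treat any 2xx status on the response element as alive;
     an empty $http-call means the request itself failed :)
  not(empty($http-call)) and
  xs:integer($http-call[1]/@status) >= 200 and
  xs:integer($http-call[1]/@status) < 300
};

declare function local:get-media-type($response as element()) as xs:string
{
  (: assumption: the media type is exposed on the response's body element :)
  string($response/httpsch:body/@media-type)
};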

Some HTML pages contain errors, and the Tidy library is very strict about checking them. When parsing fails, we fall back to using a regular expression to extract the links. In the pattern below, group 7 captures the quote character and group 8 the link itself; for example, in <a href='about.html'> the backreference \7 matches the closing apostrophe and group 8 captures about.html.

declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*
{
  distinct-values(
    let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
    for $other-uri2 in $search//group[@nr=8]/string()
    let $y := fn:normalize-space($other-uri2)
    return local:get-real-link($y, $uri))
};

For external links, we only check that they exist, so the HTTP request uses the HEAD method.

declare %ann:sequential function local:process-external-link($x as xs:string)
{
  (: skip links that have already been checked :)
  if (not(empty(map:get($local:processed-external-links, $x))))
  then exit returning false();
  else {}

  variable $http-call := ();
  try {
    $http-call := http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
  }
  catch * {}

  if (local:alive($http-call))
  then map:insert($local:processed-external-links, "clean", $x);
  else map:insert($local:processed-external-links, "broken", $x);
};
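
The dispatcher local:process-link, called from local:process-internal-link, is also part of the complete example. A plausible sketch, assuming it merely routes each link to the matching processor:

declare %ann:sequential function local:process-link($x as xs:string, $n as xs:integer)
{
  if (local:is-internal($x))
  then local:process-internal-link($x, $n)
  else local:process-external-link($x)
};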

After the crawl finishes, the results are returned in XML format.

declare function local:print-results() as element()*
{
  for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string()
  return <INTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-internal-links, $x)}</RESULT></INTERNAL>,

  for $x in map:keys($local:processed-external-links)/map:attribute/@value/string()
  return <EXTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-external-links, $x)}</RESULT></EXTERNAL>
};
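
Putting it all together, the main query body (not shown in this excerpt) creates the maps, starts the crawl at $top-uri, prints the results, and deletes the maps again. A minimal sketch, assuming local:process-link as sketched above:

local:create-containers();
local:process-link($top-uri, 1);
variable $results := local:print-results();
local:delete-containers();
$results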