#!/usr/bin/perl -w  
#*-------------------------------------------------------------------
#*- spider.pl: 
#*-   Author: Manu Konchady
#*-   License: GPL
#*-   Input: A query
#*-   Output: A list of the hubs and authorities for the query  
#*-   Module Dependencies: HTML::Parser 3.0, LWP::UserAgent,
#*-   URI::URL, HTTP::Request, DBI and Posix. 
#*-
#*-   Description: We assume that two tables have been created for this
#*-   particular query to store URLs and associated links
#*-   prior to the execution of the spider. An entry has also
#*-   been made in a queries table for this query. 
#*- 
#*-   An initial collection of URLs are stored in the URL table 
#*-   for the query followed by a parallel execution of spiders 
#*-   to selectively generate the web tree for the query. 
#*-   After all spiders have terminated, the hubs and authorities 
#*-   for the query are listed. The list of all URLs sorted by 
#*-   relevancy can also be viewed.     
#*-
#*-   Notes: This code is a sample and should be modified as
#*-   needed. For example, relevancy can be computed using several
#*-   options as described in the article. Links can be pruned or
#*-   followed depending on user criteria. Priorities can be
#*-   set for certain domains. Entity tables can be used to track
#*-   entities across web pages.
#*-   
#*-   RAL hacked to do postgres instead of MySQL
#*-   tables are utab, queries, ltab
#*- ( url varchar(120), status varchar(10), site varchar(120), relevancy float4, level int4, spider int4)
#*- ( query varchar(250), qword varchar(250), url_limit int4, url_count int4, status varchar(10))
#*- the "query" is the text to match, the qword is a name for reference and 
#*- which was used to set the tables up
#*- ( url varchar(250), link varchar(250)) 
#*-------------------------------------------------------------------

 use POSIX ":sys_wait_h";
 use PgDBI;
 use MyWeb;
 use strict;

 my $dbname = 'querynet1';
 my ($i, @pid, $done, $num_spiders, $kid, $query, %urls, $count);
 my ($qword, $utab, $ltab, $dbh, $sth, $command, $url_limit, $url);

 #*-- The first argument is a table prefix associated with the query.
 #*-- It is used to assign the url and link tables for the query.
 #*-- The database name for the db connection must be entered.
 #*-- The prefix is interpolated into table names and SQL below, so
 #*-- restrict it to word characters (also guards against injection).
 die "Usage: $0 qword\n"
   unless (defined($ARGV[0]) && $ARGV[0] =~ /^\w+$/);
 $qword = $ARGV[0];
 $utab = "$qword" . "_u"; $ltab = "$qword" . "_l";
 $dbh = PgDBI::connect_db("dbname=$dbname");

 #*-- get the full query and maximum number of urls for the query
 $command = "select query, url_limit from queries where qword = '$qword'";
 $sth = PgDBI::execute_stmt($dbh, $command);
 ($query, $url_limit) = PgDBI::fetch_row($sth);

 #*-- get an initial set of urls from Google for the query
 &call_google();
 #*-- close the database in case google has crashed it

 PgDBI::disconnect_db($dbh, $sth);

 #*-- start the parallel operation with 4 spiders, this
 #*-- works on Linux. On a Windows platform, you may need
 #*-- to use the Win32 module to start individual processes
$num_spiders = 4;
 for $i (1..$num_spiders)
  {
   if ($pid[$i] = fork()) #*-- the parent code continues
    { next; }
   else			  #*-- the child code continues
    { defined($pid[$i]) or die "fork: $!"; &fetch_text($i); }
  }
####################################################################
 #*-- wait till all the spiders are complete. Every two seconds,
 #*-- check the status of the spiders
 $done = 0;
 while (!($done))
  {
   for $i (1..$num_spiders)
    {
     next if ($pid[$i] == 0);
     $kid = waitpid($pid[$i], &WNOHANG);
     #*-- waitpid returns the child pid when the child is reaped, 0
     #*-- while it is still running, and -1 if it was reaped earlier;
     #*-- clear the slot as soon as the child is gone rather than
     #*-- waiting for an extra -1 poll cycle
     $pid[$i] = 0 if ($kid == -1 || $kid == $pid[$i]);
    }
   $done = 1;
   for $i (1..$num_spiders) { $done = 0 if ($pid[$i] != 0); }
   sleep(2);
  }

 #*-- clean up after the spiders, find all URLs which were
 #*-- not processed and delete them from the link table
 #*-- (status 'd' marks a fully processed URL; anything else was
 #*-- left unfinished when the url_limit was reached)

 $dbh = PgDBI::connect_db("dbname=$dbname")
;
 $command = "select url from $utab where status != 'd'";
 $sth = PgDBI::execute_stmt($dbh, $command);
 while ( ($url) = PgDBI::fetch_row($sth) ) { $urls{$url}++; }
 #*-- NOTE(review): $url is interpolated into the SQL unescaped here;
 #*-- a url containing a single quote would break the statement -- confirm
 foreach $url (keys %urls)
  { $command = "delete from $ltab where url = '$url'";
    $sth = PgDBI::execute_stmt($dbh, $command);
  }

 #*-- delete the unprocessed urls from the url table
 $command = "delete from $utab where status != 'd'";
 $sth = PgDBI::execute_stmt($dbh, $command);

 #*-- mark the query as finished in the queries table
 $command = "update queries set status = 'done' where qword = '$qword'";
 $sth = PgDBI::execute_stmt($dbh, $command);

 #*-- print the top 10 hubs and authorities
 #*-- a hub is a source url with many outgoing rows in $ltab
 #*-- (group by url); an authority is a target with many incoming
 #*-- rows (group by link)
 print ("List of top 10 Hubs\n"); $i = 1;
 $command = "select count(*), url from $ltab group by url order by 1 desc";
 $sth = PgDBI::execute_stmt($dbh, $command);
 while ( ($count, $url) = PgDBI::fetch_row($sth) )
  { print ("$i. $url\n"); $i++; last if ($i == 11); }

 print ("List of top 10 Authorities\n"); $i = 1;
 $command = "select count(*), link from $ltab group by link order by 1 desc";
 $sth = PgDBI::execute_stmt($dbh, $command);
 #*-- NOTE(review): output format here ("1-  url") differs from the
 #*-- hubs list ("1. url") -- looks accidental; confirm before changing
 while ( ($count, $url) = PgDBI::fetch_row($sth) )
  { print ("$i". "- "." $url\n"); $i++; last if ($i == 11); }
 
 PgDBI::disconnect_db($dbh, $sth);

 exit(0);


 #*------------------------------------------------------------
 #*-  Each of the spiders run the following code and then terminate.
 #*-  An unprocessed URL is selected and traversed. New links
 #*-  are added to the tables. The spider number is passed
 #*-  as a parameter
 #*------------------------------------------------------------
 sub fetch_text()
  {
   #*-- spider worker: repeatedly claims an unprocessed URL (status ''),
   #*-- fetches and parses it, removes it if irrelevant to the query,
   #*-- otherwise records its outgoing links in $ltab and new URLs in
   #*-- $utab, until no unprocessed URLs remain or url_limit is hit.
   #*-- Runs in a forked child and terminates the process on exit.
   #*-- Parameter: $snum - the spider number stored with inserted URLs.
   #*-- Status lifecycle in $utab: '' unprocessed -> 'v' visiting -> 'd' done.

   my ($snum) = @_;
   my ($dbh, $sth, $done, $command, $count, $links, $ref_text, $usite);
   my ($ref_links, $url, $level, $link, %links, $site, $url_temp);

   #*-- create a handle for the database (one per child process)
   $dbh = PgDBI::connect_db("dbname=$dbname");
   $done = 0;
   while (!($done)) 
    {
     $link = '';
##     $command = "lock tables $utab write, queries write"; # no good in postgres
#     $command = "begin work;"; # ral new
#     $sth = PgDBI::execute_stmt($dbh, $command);
#     $command = "lock table $utab"; # ral new
#     $sth = PgDBI::execute_stmt($dbh, $command); 
#     $command = "lock table queries"; # ral new
#     $sth = PgDBI::execute_stmt($dbh, $command);

     #*-- check if there are any unprocessed URLs
     #*-- NOTE(review): with the lock code above disabled, two spiders
     #*-- can select the same URL between this select and the status
     #*-- update below -- a known race in the parallel operation
     $command = "select url, level, site from $utab where status = ''"; # Pg
     $sth = PgDBI::execute_stmt($dbh, $command);
     ($url, $level, $usite) = PgDBI::fetch_row($sth);
     $command = "select count(*) from $utab where status = ''"; # Pg
     $sth = PgDBI::execute_stmt($dbh, $command);
     ($count) = PgDBI::fetch_row($sth);

     #*-- no unprocessed URLs left: this spider is finished
     if ($count == 0)
       { $done = 1;        next; } # Pg

     #*-- update the number of URLs processed in the queries table
     $command = "select count(*) from $utab where status = 'd'";
     $sth = PgDBI::execute_stmt($dbh, $command);
     ($count) = PgDBI::fetch_row($sth);
     $command = "update queries set url_count = $count where qword = '$qword'";
     $sth = PgDBI::execute_stmt($dbh, $command);
     #*-- check if the number of processed URLs exceeds the limit
     if ($count > $url_limit)
#       { $done = 1; $sth = PgDBI::execute_stmt($dbh, "unlock tables"); next; } # no good in postgres
         { $done = 1; 
# $sth = PgDBI::execute_stmt($dbh, "commit work"); #
          next; } # Pg

     #*-- set the status of the URL and unlock the tables
     #*-- the url is escaped only for this SQL statement; the raw
     #*-- value is restored afterwards for fetching
     #*-- NOTE(review): s/\$/\\\$/ lacks /g, so only the first '$' in
     #*-- the url is escaped -- confirm intent
     $url_temp = $url;
     $url =~ s/'/\\'/g; $url =~ s/\$/\\\$/;
     $command = "update $utab set status = 'v' where url = '$url'";
     $sth = PgDBI::execute_stmt($dbh, $command);

     #*-- if this is an irrelevant URL, then remove it
     $url = $url_temp;
     ($ref_text, $ref_links) = MyWeb::parse_URL($url);
     if (!(&match($$ref_text, $query) ) ) 
      {
       $command = "delete from $utab where url = '$url'";
       $sth = PgDBI::execute_stmt($dbh, $command);
       #*-- NOTE(review): $link is reset to '' at the top of every loop
       #*-- iteration and is not assigned before this point, so the
       #*-- else branch (delete from $ltab) appears unreachable -- confirm
       if ($link eq '') { next; } else {
       $command = "delete from $ltab where url = '$url'";
       $sth = PgDBI::execute_stmt($dbh, $command);
       next;}
      }

     #*-- add all links for this relevant URL to $ltab
     $level++; %links = %$ref_links;
     foreach $link (keys %links)
      {
       #*-- derive the site by stripping everything after the first '/'
       ($site = $link) =~ s#/.*$##i; 
       #*-- skip links to ads and other formatted documents
       next if ($link =~ /doubleclick/i);
       next if ($link =~ /affiliates\./i);
       next if ($link =~ /\.(jpg|gif|pdf|ps|doc|xls|ppt|mpg|mpeg)$/i);
       #*-- skip links to the same site, except when one of the
       #*-- query keywords matches the link
       next if ( ($site eq $usite) && (!(&match_link($link, $query))) );
       $link =~ s/'/\\'/g; $link =~ s/\$/\\\$/;
       $link = lc($link);
       #*-- check for duplicates
       $command = "select count(*) from $utab where url = '$link'";
       $sth = PgDBI::execute_stmt($dbh, $command);
       ($count) = PgDBI::fetch_row($sth);
       next if ($count != 0);
       #*-- check for too many URLs from the same site
       $command = "select count(*) from $utab where site = '$site'";
       $sth = PgDBI::execute_stmt($dbh, $command);
       ($count) = PgDBI::fetch_row($sth);
       next if ($count > 200);
       #*-- create entries in the link and url tables
       #*-- NOTE(review): $url here is the unescaped original; a url
       #*-- containing a quote would break this insert -- confirm
       $command = "insert into $ltab values ('$url', '$link')";
       $sth = PgDBI::execute_stmt($dbh, $command);
       $command = "insert into $utab values ('$link', '','$site', 0.0, $level, $snum)";
       $sth = PgDBI::execute_stmt($dbh, $command);
         
      }

     #*-- set the status and relevancy of the URL, text from
     #*-- the web page can be used to set relevancy values
     #*-- (relevancy is a fixed placeholder of 0.2 here)
     $command = "update $utab set status = 'd' where url = '$url'";
     $sth = PgDBI::execute_stmt($dbh, $command);
     $command = "update $utab set relevancy = 0.2 where url = '$url'";
     $sth = PgDBI::execute_stmt($dbh, $command);

    }
   #*-- child cleanup: close the db handle and terminate this spider
   PgDBI::disconnect_db($dbh, $sth);
   sleep (3);
   exit(0);

  }

 #*------------------------------------------------------
 #*- check if query satisfies the boolean query. 
 #*------------------------------------------------------
 sub match
 {
  #*-- evaluate a boolean query against a block of text.
  #*-- Parameters: $text - the document text to search,
  #*--             $query_terms - query with optional AND/OR operators,
  #*--             quoted phrases, and parenthesized sub-expressions
  #*-- Returns: 1 if the text satisfies the query, 0 otherwise.
  #*-- Query terms are matched with \Q..\E so regex metacharacters in
  #*-- a term (e.g. "c++") cannot crash or subvert the match.
  my ( $text, $query_terms) = @_;
  my ($word, @words, $quotes, $retval);

  $quotes = '"\'';
  #*-- split a query with the AND operator and recursively
  #*-- parse any parenthesized sub-expression; every operand must match
  if ($query_terms =~ /\bAND\b/i)
   {
    @words = split(/\bAND\b/i, $query_terms);
    foreach $word (@words)
     { 
      if ($word =~ /\((.*?)\)/)  
       { $retval = match($text, $1); 
         if ($retval == 0) { return(0); } else { next; } 
       }
      #*-- strip quotes and surrounding whitespace from the term
      $word =~ s/[$quotes]//g; $word =~ s/^\s+//; $word =~ s/\s+$//;
      return(0) if ($text !~ /\b\Q$word\E\b/i);
     }
    return(1);
   }

  #*-- split a query with the OR operator and recursively
  #*-- parse any parenthesized sub-expression; any operand may match
  if ($query_terms =~ /\bOR\b/i)
   {
    @words = split(/\bOR\b/i, $query_terms);
    foreach $word (@words)
     { 
      if ($word =~ /\((.*?)\)/)  
       { $retval = match($text, $1); 
         if ($retval == 1) { return(1); } else { next; } 
       }
      $word =~ s/[$quotes]//g; $word =~ s/^\s+//; $word =~ s/\s+$//;
      return(1) if ($text =~ /\b\Q$word\E\b/i);
     }
    return(0);
   }

  #*-- assume an AND operator for a query with multiple terms
  $query_terms =~ s/[$quotes]//g; $query_terms =~ s/^\s+//; $query_terms =~ s/\s+$//;
  @words = split(/\s+/, $query_terms);
  foreach $word (@words)
   { return(0) if ($text !~ /\b\Q$word\E\b/i); }
  return(1);

 }


 #*------------------------------------------------------
 #*- check if any words in the query match the query text
 #*------------------------------------------------------
 sub match_link
 {
  #*-- check if any content word of the query appears in the link text.
  #*-- Parameters: $link - a url/link string, $query - the query text
  #*-- Returns: 1 if any non-operator query word occurs in the link
  #*-- (case-insensitive substring), 0 otherwise.
  my ($link, $query) = @_;

  my (@words, $word, $match);

  $match = 0;
  @words = split(/\s+/, $query);
  foreach $word (@words)
   {
    #*-- boolean operators are not content words
    next if ( ($word =~ /\bAND\b/i) || ($word =~ /\bOR\b/i) || ($word =~ /\bNOT\b/i) );
    #*-- \Q..\E escapes regex metacharacters in the query word, so a
    #*-- term like "c++" cannot blow up the match
    $match = 1 if ($link =~ /\Q$word\E/i);
   }

  return($match);

 }


#*-------------------------------------------
#*- load the initial set of URLs from Google
#*-------------------------------------------
 sub call_google()
 {
  #*-- seed $utab with an initial set of urls for the query by scraping
  #*-- a Google result page. Clears any leftover rows for the query,
  #*-- looks up the full query text, fetches the results, and inserts
  #*-- each unique url with status '', level 0, and spider 0.
  #*-- Uses the file-scoped $dbh, $sth, $command, $qword, $utab, $ltab.

  my ($sql, $query, $url_string, $reg, $j, $script, $ref_html);
  my ($output, $site, %urls, $url);
  my ($count) = 0;

  #*-- clear any rows left over from a previous run for this query
  $sth = PgDBI::execute_stmt($dbh, "delete from $utab");
  $sth = PgDBI::execute_stmt($dbh, "delete from $ltab");

  #*-- get the full query
  $sql = "select query from queries where qword = '$qword'";
  $sth = PgDBI::execute_stmt($dbh, "$sql");
  ($query) = PgDBI::fetch_row($sth);

  #*-- get urls from Google; %-encode every non-word character
  #*-- for the search query string
  $query =~ s/(\W)/sprintf("%%%x", ord($1))/eg;
  $url_string  = "http://www.google.com/search?num=100&safe=off&btnG=Google+Search";
  $url_string .= "&q=$query&filter=0&start=";

  #*-- pattern for result anchors in the Google result page
  $reg = '<p><a href=http://(.*?)>';
  #*-- NOTE(review): this leaves $/ undef (slurp mode) for the rest of
  #*-- the program -- presumably MyWeb's fetch relies on it; confirm
  #*-- before scoping it with local
  undef $/;
  for ($j = 0; $j < 46; $j += 50)  #*-- a single page of up to 100 results
   {
    $script = "$url_string$j";
    ($ref_html) = MyWeb::get_URL($script);
    $output = $$ref_html;  #*-- get all the data
    while ($output =~ /$reg/sgi)
     { ($url = $1) =~ tr/A-Z/a-z/; $urls{$url}++; }
    sleep(3);              #*-- pause between requests
   }

  #*-- put the urls in a table; quotes and dollar signs are escaped
  #*-- for the SQL insert (all occurrences, hence /g on both)
  foreach $url (keys %urls)        
   {
    $count++;
    ($site = $url) =~ s#/.*$##i; $url =~ s/'/\\'/g; $url =~ s/\$/\\\$/g;
    $command = "insert into $utab values ('$url', '', '$site', 0.0, 0, 0)";
    $sth = PgDBI::execute_stmt($dbh, $command);
   }

 }
