
From wjones@tc.fluke.com Fri Apr  3 23:17:53 1998
Date: Wed, 10 Sep 1997 14:40:21 -0700
From: Warren Jones <wjones@tc.fluke.com>
To: Andrew Scherpbier <andrew@contigo.com>
Cc: htdig@sdsu.edu
Subject: Re: htdig: excluding directories?

On Tue, Sep 02, 1997 at 01:59:50PM -0700, Andrew Scherpbier wrote:

> Hmmm...  Maybe htdig could store an MD5 signature for each file that it
> finds and only store unique ones.  It would mean another hashtable in
> the code but it would definitely reduce these problems.
> Would it be important to see all URLs that reference the same document?
> Should the MD5's only be valid for any one host or domain?

I modified ht://Dig to allow it to reject URLs on a local host that
are links (through the file system) to a URL that has already been
indexed.  This works with the local_urls option in version 3.0.8b2.
I didn't bother to create another hashtable, but just added a key
based on the file's device and inode numbers to Retriever::visited.
The following patch was made against version 3.0.8b2.

--------------------------------------------------------------------
Warren Jones              | To keep every cog and wheel is the first
Fluke Corporation         | precaution of intelligent tinkering.
Everett, Washington, USA  |                          -- Aldo Leopold
--------------------------------------------------------------------

Index: Retriever.cc
===================================================================
RCS file: /usr0/wjones/src/CVS.repo/htdig/htdig/Retriever.cc,v
retrieving revision 1.1.1.2
diff -c -r1.1.1.2 Retriever.cc
*** Retriever.cc	1997/09/04 19:22:52	1.1.1.2
--- Retriever.cc	1997/09/10 21:14:09
***************
*** 32,37 ****
--- 32,38 ----
  #include "Parsable.h"
  #include "Document.h"
  #include <StringList.h>
+ #include <sys/stat.h>
  
  static WordList	words;
  
***************
*** 441,447 ****
      url = u;
      url.lowercase();
  
!     return !visited.Exists(url);
  }
  
  
--- 442,478 ----
      url = u;
      url.lowercase();
  
!     if ( visited.Exists(url) )
!     	return FALSE;
!     	
!     String *local_filename = IsLocal(u);    // For local URL's, check
!     if ( local_filename )		    // list for device and inode
!     {					    // to make sure we haven't
! 	struct stat buf;		    // already indexed a link
! 					    // to this file.
! 
! 	if ( stat(local_filename->get(),&buf) == 0 )
! 	{
! 	    char key[2*sizeof(ino_t)+2*sizeof(dev_t)+2];      // Make hash key
! 	    sprintf( key, "%x+%x", buf.st_dev, buf.st_ino );  // from device
! 	    if ( visited.Exists(key) )			      // and inode.
! 	    {
! 		if ( debug ) {
! 		    String *dup = (String*)visited.Find(key);
! 		    cout << endl
! 			 << "Duplicate: " << local_filename->get()
! 			 << " -> "        << dup->get() << endl;
! 		}
! 		delete local_filename;
! 		return FALSE;
! 	    }
! 	    visited.Add(key,local_filename);
! 	    return TRUE;
! 	}
! 	delete local_filename;
!     }
!     return TRUE;
! 
  }

----------------------------------------------------------------------
To unsubscribe from the htdig mailing list, send a message to
htdig-request@sdsu.edu containing the single word "unsubscribe" in
the body of the message.
