
From Geoffrey.R.Hutchison@williams.edu Thu Apr 16 17:15:24 1998
Date: Thu, 16 Apr 1998 19:05:05 -0400
From: Geoff Hutchison <Geoffrey.R.Hutchison@williams.edu>
To: htdig@sdsu.edu
Cc: htdig-patches@sol.ccsf.cc.ca.us
Subject: META Description patch

Hi all,

In the spirit of continuing patches, here's a patch against htdig-3.0.8b2 that
I wrote at the request of Brian Kariger. It defines a new config file option,
"use_meta_description", which is false by default. When it is set to true,
htdig checks each document for a <META NAME="description"> tag and, if the tag
has non-empty content, uses that content (truncated to the maximum excerpt
length) as the document's excerpt; no further text is appended to the excerpt
after that point.

Comments, questions, and bug reports should be directed to me.
-Geoff Hutchison
Williams Students Online
http://wso.williams.edu/

*** htcommon/defaults.cc.orig	Tue Jan  6 13:18:12 1998
--- htcommon/defaults.cc	Sat Mar 21 10:33:47 1998
***************
*** 112,117 ****
--- 112,118 ----
      {"title_factor",			"100"},
      {"url_list",			"${database_base}.urls"},
      {"use_star_image",			"true"},
+     {"use_meta_description",            "false"},
      {"valid_punctuation",		".-_/!#$%^&*'"},
      {"version",				HTDIG_VERSION},
      {"word_db",				"${database_base}.words.gdbm"},
*** htdig/HTML.h.orig	Sat Mar 21 13:31:49 1998
--- htdig/HTML.h	Sat Mar 21 10:44:22 1998
***************
*** 45,50 ****
--- 45,51 ----
      int			in_ref;
      int			in_heading;
      int			doindex;
+     int                 	dohead;
      int			minimumWordLength;
      URL			*base;

*** htdig/HTML.cc.orig	Sat Mar 21 21:12:00 1998
--- htdig/HTML.cc	Sat Mar 21 20:41:50 1998
***************
*** 66,71 ****
--- 66,72 ----
      in_heading = 0;
      base = 0;
      doindex = 1;
+     dohead = 1;
      minimumWordLength = config.Value("minimum_word_length", 3);
  }

***************
*** 103,108 ****
--- 104,110 ----
      start = position;
      title = 0;
      head = 0;
+     dohead = 1;
      doindex = 1;
      in_heading = 0;
      in_title = 0;
***************
*** 231,237 ****
  		//
  		// Append the word to the head (excerpt)
  		//
! 		head << word;
  	    }

  	    if (word.length() >= minimumWordLength && doindex)
--- 233,240 ----
  		//
  		// Append the word to the head (excerpt)
  		//
! 		if (dohead)
! 		  head << word;
  	    }

  	    if (word.length() >= minimumWordLength && doindex)
***************
*** 260,266 ****
  		    //
  		    if (!in_space)
  		    {
! 			if (head.length() < max_head_length)
  			{
  			    head << ' ';
  			}
--- 263,269 ----
  		    //
  		    if (!in_space)
  		    {
! 			if (head.length() < max_head_length && dohead)
  			{
  			    head << ' ';
  			}
***************
*** 280,286 ****
  		    //
  		    // Not whitespace
  		    //
! 		    if (head.length() < max_head_length)
  		    {
  			head << *position;
  		    }
--- 283,289 ----
  		    //
  		    // Not whitespace
  		    //
! 		    if (head.length() < max_head_length && dohead)
  		    {
  			head << *position;
  		    }
***************
*** 503,509 ****
  	}

  	case 19:	// "li"
! 	    if (doindex && head.length() < max_head_length)
  		head << "* ";
  	    break;

--- 506,512 ----
  	}

  	case 19:	// "li"
! 	    if (doindex && head.length() < max_head_length && dohead)
  		head << "* ";
  	    break;

***************
*** 588,593 ****
--- 591,608 ----
  		{
  		    doindex = 0;
  		}
+ 		else if (mystrcasecmp(cache, "description") == 0
+ 			 && config.Boolean("use_meta_description")
+ 			 && strlen(conf["content"]) != 0)
+ 		  {
+ 		    head = conf["content"];
+ 		    if (head.length() > max_head_length)
+ 		      head = head.sub(0, max_head_length);
+ 		    if (debug > 0)
+ 		      cout << "META Description: " << conf["content"] << endl;
+ 		    retriever.got_head(head);
+ 		    dohead = 0;
+ 		  }
  	    }
  	    else if (conf["name"] &&
  		     mystrcasecmp(conf["name"], "htdig-noindex") == 0)

