
From grdetil@scrc.umanitoba.ca Fri Jul 30 13:28:50 1999
Date: Fri, 30 Jul 1999 12:38:23 -0500 (CDT)
From: Gilles Detillieux <grdetil@scrc.umanitoba.ca>
To: htdig@htdig.org
Cc: htdig@htdig.org
Subject: Re: [htdig] Problem with &..; entities in meta tags


According to Lennart Almkvist:
> Some more testing gave the following results:
> 
> The German flower name "Stiefm&uuml;tterchen" and the Icelandic
> "&thorn;renningarfj&oacute;la" are treated differently in meta content
> than in the body or title part of an HTML document.
> 
> When in the body or in the title, the "&uuml;", "&thorn;" and "&oacute;"
> are each decoded to a one-byte character in the .wordlist and .words.db files.
> 
> In meta content, however, these words are decoded to "stiefmuuml;t"
> and "thorn;rennin" in the .wordlist and .words.db files. That is, the "&" is
> removed and the rest is kept as letters ("&" is in valid_punctuation but
> ";" is not, by default).
> 
> Shouldn't they be decoded the same way as in the title or body?

Here's a patch for 3.1.2 that should do what you want.  Please give it a
try and let us know if it fixes this bug.

--- htdig-3.1.2.bak/htdig/HTML.h	Wed Apr 21 21:47:57 1999
+++ htdig-3.1.2/htdig/HTML.h	Fri Jul 30 12:23:25 1999
@@ -72,6 +72,7 @@ private:
     // Helper functions
     //
     void		do_tag(Retriever &, String &);
+    char		*transSGML(char *);
 };
 
 #endif
--- htdig-3.1.2.bak/htdig/HTML.cc	Wed Apr 21 21:47:57 1999
+++ htdig-3.1.2/htdig/HTML.cc	Fri Jul 30 12:24:14 1999
@@ -744,7 +744,7 @@ HTML::do_tag(Retriever &retriever, Strin
 	    }
 	    if (conf["htdig-email-subject"])
 	    {
-		retriever.got_meta_subject(conf["htdig-email-subject"]);
+		retriever.got_meta_subject(transSGML(conf["htdig-email-subject"]));
 	    }
 	    if (conf["htdig-keywords"] || conf["keywords"])
 	    {
@@ -757,7 +757,7 @@ HTML::do_tag(Retriever &retriever, Strin
 		char	*keywords = conf["htdig-keywords"];
 		if (!keywords)
 		    keywords = conf["keywords"];
-		char	*w = strtok(keywords, " ,\t\r\n");
+		char	*w = strtok(transSGML(keywords), " ,\t\r\n");
 		while (w)
 		{
 		    if (strlen(w) >= minimumWordLength)
@@ -811,7 +811,7 @@ HTML::do_tag(Retriever &retriever, Strin
 		    //
 		    // We need to do two things. First grab the description
 		    //
-		    meta_dsc = conf["content"];
+		    meta_dsc = transSGML(conf["content"]);
 		   if (meta_dsc.length() > max_meta_description_length)
 		     meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
 		   if (debug > 1)
@@ -824,7 +824,7 @@ HTML::do_tag(Retriever &retriever, Strin
 		   // (slot 11 is the new slot for this)
 		   //
 
-		   char        *w = strtok(conf["content"], " \t\r\n");
+		   char        *w = strtok(transSGML(conf["content"]), " \t\r\n");
                    while (w)
 		     {
 			if (strlen(w) >= minimumWordLength)
@@ -836,7 +836,7 @@ HTML::do_tag(Retriever &retriever, Strin
 
 		if (keywordsMatch.CompareWord(cache))
 		{
-		    char	*w = strtok(conf["content"], " ,\t\r\n");
+		    char	*w = strtok(transSGML(conf["content"]), " ,\t\r\n");
 		    while (w)
 		    {
 			if (strlen(w) >= minimumWordLength)
@@ -855,7 +855,7 @@ HTML::do_tag(Retriever &retriever, Strin
 		}
 		else if (mystrcasecmp(cache, "htdig-email-subject") == 0)
 		{
-		    retriever.got_meta_subject(conf["content"]);
+		    retriever.got_meta_subject(transSGML(conf["content"]));
 		}
 		else if (mystrcasecmp(cache, "htdig-noindex") == 0)
 		  {
@@ -1095,4 +1095,26 @@ HTML::do_tag(Retriever &retriever, Strin
 	default:
 	    return;						// Nothing...
     }
+}
+
+
+//*****************************************************************************
+// char * HTML::transSGML(char *str)
+//   Rebuilds str in a function-static String: each '&' is handed to SGMLEntities::translateAndUpdate, any other byte is appended unchanged; returns convert.get(). NOTE(review): result presumably lives in that static String and is overwritten by the next call -- confirm.
+char *
+HTML::transSGML(char *str)
+{
+    static String	convert;
+    unsigned char	*text = (unsigned char *)str;
+
+    convert = 0;
+    while (*text)
+    {
+	if (*text == '&')
+	    convert << SGMLEntities::translateAndUpdate(text);
+	else
+	    convert << *text;
+	text++;
+    }
+    return convert.get();
 }

-- 
Gilles R. Detillieux              E-mail: <grdetil@scrc.umanitoba.ca>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------
To unsubscribe from the htdig mailing list, send a message to
htdig@htdig.org containing the single word unsubscribe in
the SUBJECT of the message.
