From grdetil@scrc.umanitoba.ca Fri Dec 3 14:02:03 1999 Date: Fri, 3 Dec 1999 15:36:22 -0600 (CST) From: Gilles Detillieux To: htdig3-dev@htdig.org Subject: [htdig3-dev] could I borrow some eyes? Hi, folks. Among the many changes I've committed to the 3.1.x source tree, is the one below. I'm a bit nervous about it, because it's in a pretty critical piece of code, but as far as I can tell it's doing what it should. I'd appreciate a few more eyeballs looking over it, and perhaps also testing it, just to be sure. In the process of adding support for img alt text, and fixing the meta description parsing, I also uncovered a few other problems, as far as I can tell. 1) The handling of meta keywords and meta descriptions kept right on going even if doindex was 0, so a noindex tag had no effect on these. 2) The handling of meta keywords and meta descriptions didn't consider word offsets in the document - it used a relative offset of 1 for everything. While not really a problem for the keywords tags, it seemed wrong to me that it did that for meta descriptions too. I changed the latter. 3) The relative offset calculation was done by dividing by the total size of the document, before any stripping of comments, JavaScript, etc. That meant that the more stuff got stripped out, the lower the offset (and the higher the importance) of remaining words. I've changed it to use the size after stripping. It appears that (2) and (3) are no longer an issue in 3.2, but (1) still is. Anyway, I'd appreciate some feedback on these fixes, as well as the img alt handling and my HtWordToken() function. It all seems to work, as far as I can tell, but I may have missed something. Fri Dec 3 10:52:57 1999 Gilles Detillieux * htdig/HTML.cc(parse, do_tag): Add handling of img alt text, fix parsing of words in meta tags, disable indexing of meta tags when "noindex" state in effect, fix calculations of word positions to more accurately reflect relative positions. 
* htlib/HtWordType.h, htlib/HtWordType.cc: Add HtWordToken() function, to replace strtok() in HTML parser. --- htdig-3.1.3/htdig/HTML.cc Fri Nov 26 17:10:36 1999 +++ htdig-3.1.4-dev/htdig/HTML.cc Fri Dec 3 10:05:49 1999 @@ -27,6 +27,8 @@ static StringMatch attrs; static StringMatch srcMatch; static StringMatch hrefMatch; static StringMatch keywordsMatch; +static int offset; +static int totlength; //***************************************************************************** @@ -139,7 +141,6 @@ HTML::parse(Retriever &retriever, URL &b // We have some variables which will contain the various items we // are looking for // - int offset = 0; int in_space; int in_punct; unsigned char *q, *start; @@ -149,6 +150,7 @@ HTML::parse(Retriever &retriever, URL &b static char *skip_start = config["noindex_start"]; static char *skip_end = config["noindex_end"]; + offset = 0; title = 0; head = 0; meta_dsc = 0; @@ -268,6 +270,7 @@ HTML::parse(Retriever &retriever, URL &b } } *ptext++ = '\0'; + totlength = ptext - text; position = text; start = position; @@ -388,7 +391,7 @@ HTML::parse(Retriever &retriever, URL &b if (word.length() >= minimumWordLength && doindex) { retriever.got_word(word, - int(offset * 1000 / contents->length()), + int(offset * 1000 / totlength), in_heading); } } @@ -662,6 +665,33 @@ HTML::do_tag(Retriever &retriever, Strin case 18: // "img" { + // Handle alt parameter + Configuration conf; + conf.NameValueSeparators("="); + conf.Add(position+length); + if (conf["alt"]) + { + char *alttxt = transSGML(conf["alt"]); + if (doindex && in_title) + title << alttxt << " "; + if (in_ref && description.length() < max_description_length) + description << alttxt << " "; + if (doindex && !in_title && head.length() < max_head_length) + head << alttxt << " "; + char *w = HtWordToken(alttxt); + while (w && doindex) + { + if (strlen(w) >= minimumWordLength) + retriever.got_word(w, + int((offset+(w-alttxt)) * 1000 + / totlength), + in_heading); + w = HtWordToken(0); + } + w = 
'\0'; + } + + // Handle src parameter which = -1; int pos = attrs.FindFirstWord(position, which, length); if (pos < 0 || which != 0) @@ -759,12 +789,12 @@ HTML::do_tag(Retriever &retriever, Strin char *keywords = conf["htdig-keywords"]; if (!keywords) keywords = conf["keywords"]; - char *w = strtok(transSGML(keywords), " ,\t\r\n"); - while (w) + char *w = HtWordToken(transSGML(keywords)); + while (w && doindex) { if (strlen(w) >= minimumWordLength) retriever.got_word(w, 1, 10); - w = strtok(0, " ,\t\r\n"); + w = HtWordToken(0); } w = '\0'; } @@ -826,24 +856,28 @@ HTML::do_tag(Retriever &retriever, Strin // (slot 11 is the new slot for this) // - char *w = strtok(transSGML(conf["content"]), " \t\r\n"); - while (w) + char *words = HtWordToken(transSGML(conf["content"])); + char *w = words; + while (w && doindex) { if (strlen(w) >= minimumWordLength) - retriever.got_word(w, 1, 11); - w = strtok(0, " \t\r\n"); + retriever.got_word(w, + int((offset+(w-words)) * 1000 + / totlength), + 11); + w = HtWordToken(0); } w = '\0'; } if (keywordsMatch.CompareWord(cache)) { - char *w = strtok(transSGML(conf["content"]), " ,\t\r\n"); - while (w) + char *w = HtWordToken(transSGML(conf["content"])); + while (w && doindex) { if (strlen(w) >= minimumWordLength) retriever.got_word(w, 1, 10); - w = strtok(0, " ,\t\r\n"); + w = HtWordToken(0); } w = '\0'; } --- htdig-3.1.3/htlib/HtWordType.h Wed Sep 22 11:18:43 1999 +++ htdig-3.1.4-dev/htlib/HtWordType.h Fri Dec 3 11:15:33 1999 @@ -76,5 +76,8 @@ HtStripPunctuation(String &s) { s.remove(HtWordType::statics.valid_punctuation); } + +// Like strtok(), but using our rules for word separation. 
+char *HtWordToken(char *s); #endif /* __HtWordType_h */ --- htdig-3.1.3/htlib/HtWordType.cc Wed Sep 22 11:18:43 1999 +++ htdig-3.1.4-dev/htlib/HtWordType.cc Fri Dec 3 11:15:57 1999 @@ -36,4 +36,30 @@ HtWordType::Initialize(Configuration &co if (strchr(valid_punct, i)) HtWordType::statics.chrtypes[i] |= HtWt_ValidPunct; } +} + + +// much like strtok(), and destructive of the source string like strtok(), +// but does word separation by our rules. +char * +HtWordToken(char *str) +{ + unsigned char *text = (unsigned char *)str; + char *ret = 0; + static unsigned char *prev = 0; + + if (!text) + text = prev; + while (text && *text && !HtIsStrictWordChar(*text)) + text++; + if (text && *text) + { + ret = (char *)text; + while (*text && HtIsWordChar(*text)) + text++; + if (*text) + *text++ = '\0'; + } + prev = text; + return ret; } -- Gilles R. Detillieux E-mail: Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil Dept. Physiology, U. of Manitoba Phone: (204)789-3766 Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930 ------------------------------------ To unsubscribe from the htdig3-dev mailing list, send a message to htdig3-dev-unsubscribe@htdig.org You will receive a message to confirm this.