*** /dev/null Tue Jan 1 05:00:00 1980 --- htlib/HtURLSeedScore.h Fri Jan 7 09:13:08 2000 *************** *** 0 **** --- 1,55 ---- + // + // HtURLSeedScore.h + // + // URLSeedScore: Constructed from a Configuration, see doc + // for format of config item "url_seed_score". + // Method "double adjust_score(double score, const String &url)" + // returns an adjusted score, given the original score, or returns the + // original score if there was no adjustment to do. + // + // $Id$ + // + // Part of the ht://Dig package + // Copyright (c) 2000 The ht://Dig Group + // For copyright details, see the file COPYING in your distribution + // or the GNU Public License version 2 or later + // + // + #ifndef __HtURLSeedScore_h + #define __HtURLSeedScore_h + + #include "Configuration.h" + #include "List.h" + + class URLSeedScore + { + public: + URLSeedScore(Configuration &); + ~URLSeedScore(); + + // Return the "adjusted" score. Use an inline method to avoid + // function-call overhead when this feature is unused. + double adjust_score(double score, const String& url) + { + return myAdjustmentList->Count() == 0 + ? score : noninline_adjust_score(score, url); + } + + // If an error was discovered during the parsing of + // the configuration, this member gives a + // nonempty String with an error message. + const String& ErrMsg() { return myErrMsg; } + + private: + double noninline_adjust_score(double score, const String& url); + + // These member functions are not supposed to be implemented. + URLSeedScore(); + URLSeedScore(const URLSeedScore &); + void operator= (const URLSeedScore &); + + List *myAdjustmentList; + String myErrMsg; + }; + + #endif /* __HtURLSeedScore_h */ *** /dev/null Tue Jan 1 05:00:00 1980 --- htlib/HtURLSeedScore.cc Fri Jan 7 09:13:08 2000 *************** *** 0 **** --- 1,214 ---- + // + // HtURLSeedScore.cc + // + // URLSeedScore: + // Holds a list of configured adjustments to be applied on a given + // score and given URL. 
+ // + // Part of the ht://Dig package + // Copyright (c) 2000 The ht://Dig Group + // For copyright details, see the file COPYING in your distribution + // or the GNU Public License version 2 or later + // + // + // $Id$ + + #include "StringList.h" + #include "StringMatch.h" + #include "HtURLSeedScore.h" + #include + #include + + // This class is only used in private members of URLSeedScore. + // The OO-right thing would be to nest this inside the private + // declaration of HtURLSeedScore, but that would cause portability + // problems according to + // . + + class ScoreAdjustItem : public Object + { + public: + // Construct from a string applicable to StringMatch, and a string to + // parse for a formula. + ScoreAdjustItem(String &, String &); + + ~ScoreAdjustItem(); + + // Does this item match? + inline bool Match(const String &s) { return match.FindFirst(s.get()) != -1; } + + // Return the argument adjusted according to this item. + double adjust_score(double orig) + { return orig*my_mul_factor + my_add_constant; } + + // Error in parsing? Message given here if non-empty string. + String& ErrMsg() { return myErrMsg; } + + private: + double my_add_constant; + double my_mul_factor; + StringMatch match; + + static String myErrMsg; + + // These member functions are not supposed to be implemented, but + // mentioned here as private so the compiler will not generate them if + // someone puts in buggy code that would use them. + ScoreAdjustItem(); + ScoreAdjustItem(const ScoreAdjustItem &); + void operator= (const ScoreAdjustItem &); + }; + + // Definition of myErrMsg. + String ScoreAdjustItem::myErrMsg(""); + + ScoreAdjustItem::ScoreAdjustItem(String &url_regex, String &formula) + { + double mul_factor = 1; + double add_constant = 0; + bool factor_found = false; + bool constant_found = false; + int chars_so_far; + match.Pattern(url_regex); + + // FIXME: Missing method to check if the regex was in error. 
+ // We'll check hasPattern for the time being as a placeholder. + if (! match.hasPattern()) + { + myErrMsg = form("%s is not a valid regex", url_regex.get()); + return; + } + + char *s = formula.get(); + + // Parse the ([*]N[ ]*)?[+]?M format. + if (s[0] == '*') + { + // Skip past the '*'. + s++; + + // There is a mul_factor. Let's parse it. + chars_so_far = 0; + sscanf(s, "%lf%n", &mul_factor, &chars_so_far); + + // If '%lf' failed to match, then it will show up as either no + // assignment to chars_so_far, or as writing 0 there. + if (chars_so_far == 0) + { + myErrMsg = form("%s is not a valid adjustment formula", s); + return; + } + + // Skip past the number. + s += chars_so_far; + + // Skip any whitespaces. + while (isspace(*s)) + s++; + + // Eat any plus-sign; it's redundant if alone, and may come before a + // minus. + if (*s == '+') + s++; + + factor_found = true; + } + + // If there's anything here, it must be the additive constant. + if (*s) + { + chars_so_far = 0; + sscanf(s, "%lf%n", &add_constant, &chars_so_far); + + // If '%lf' failed to match, then it will show up as either no + // assignment to chars_so_far, or as writing 0 there. + // We also need to check that it was the end of the input. + if (chars_so_far == 0 || s[chars_so_far] != 0) + { + myErrMsg = form("%s is not a valid adjustment formula", + formula.get()); + return; + } + + constant_found = true; + } + + // Either part must be there. 
+ if (!factor_found && !constant_found) + { + myErrMsg = form("%s is not a valid formula", formula.get()); + return; + } + + my_add_constant = add_constant; + my_mul_factor = mul_factor; + } + + ScoreAdjustItem::~ScoreAdjustItem() + { + } + + URLSeedScore::URLSeedScore(Configuration &config) + { + char *config_item = "url_seed_score"; + + StringList sl(config[config_item], "\t \r\n"); + + myAdjustmentList = new List(); + + if (sl.Count() % 2) + { + myErrMsg = form("%s is not a list of pairs (odd number of items)", + config_item); + + // We *could* continue, but that just means the error will be harder + // to find, unless someone actually sees the error message. + return; + } + + // Parse each as in TemplateList::createFromString. + for (int i = 0; i < sl.Count(); i += 2) + { + String url_regex = sl[i]; + String adjust_formula = sl[i+1]; + + ScoreAdjustItem *adjust_item + = new ScoreAdjustItem(url_regex, adjust_formula); + + if (adjust_item->ErrMsg().length() != 0) + { + // No point in continuing beyond the error; we might just + // overwrite the first error. + myErrMsg = form("While parsing %s: %s", + config_item, + adjust_item->ErrMsg().get()); + return; + } + + myAdjustmentList->Add(adjust_item); + } + } + + URLSeedScore::~URLSeedScore() + { + delete myAdjustmentList; + } + + double + URLSeedScore::noninline_adjust_score(double orig_score, const String &url) + { + List *adjlist = myAdjustmentList; + ScoreAdjustItem *adjust_item; + + adjlist->Start_Get(); + + while ((adjust_item = (ScoreAdjustItem *) adjlist->Get_Next())) + { + // Use the first match only. + if (adjust_item->Match(url)) + return adjust_item->adjust_score(orig_score); + } + + // We'll get here if no match was found. 
+ return orig_score; + } Index: htcommon/defaults.cc =================================================================== RCS file: /opt/htdig/cvs/htdig3/htcommon/defaults.cc,v retrieving revision 1.43.2.12 diff -p -c -r1.43.2.12 defaults.cc *** htcommon/defaults.cc 1999/12/06 22:26:46 1.43.2.12 --- htcommon/defaults.cc 2000/01/07 09:32:39 *************** ConfigDefaults defaults[] = *** 148,153 **** --- 148,154 ---- {"translate_amp", "false"}, {"translate_lt_gt", "false"}, {"translate_quot", "false"}, + {"url_seed_score", ""}, {"url_list", "${database_base}.urls"}, {"url_part_aliases", ""}, {"url_log", "${database_base}.log"}, Index: htdoc/attrs.html =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdoc/attrs.html,v retrieving revision 1.27.2.25 diff -p -c -r1.27.2.25 attrs.html *** htdoc/attrs.html 1999/12/07 04:29:26 1.27.2.25 --- htdoc/attrs.html 2000/01/07 09:32:50 *************** *** 6816,6821 **** --- 6816,6895 ----
+ url_seed_score +
+
+
+
+ type: +
+
+ string list +
+
+ used by: +
+
+ htsearch +
+
+ default: +
+
+ <empty> +
+
+ description: +
+
+ This is a list of pairs, pattern + formula, used to weigh the score of + hits, depending on the URL of the document.
+ The pattern part is a substring to match + against the URL. Pipe ('|') characters can be + used in the pattern to concatenate substrings for + web-areas that have the same formula.
+ The formula describes a factor and a + constant, by which the hit score is + weighed. The factor part is multiplied + by the original score, then the constant + part is added.
+ The format of the formula is the factor part: + "*N" optionally followed by comma and + spaces, followed by the constant part: + "+M", where the plus sign may be omitted + for negative numbers. Either part is optional, + but must come in this order.
+ The numbers N and M are floating + point constants.
+ More straightforward is to think of the format as + "newscore = oldscore*N+M", + but with the "newscore = oldscore" part left out. +
+
+ example: +
+
+ + + + + +
+ url_seed_score: + + /mailinglist/ *.5-1e6 \
+ /docs/|/news/ *1.5 \
+ /testresults/ "*.7 -200" \
+ /faq-area/ *2+10000 +
+
+
+
+
+
+
+
use_meta_description
Index: htdoc/cf_byname.html =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdoc/cf_byname.html,v retrieving revision 1.18.2.13 diff -p -c -r1.18.2.13 cf_byname.html *** htdoc/cf_byname.html 1999/12/06 22:26:48 1.18.2.13 --- htdoc/cf_byname.html 2000/01/07 09:32:51 *************** *** 176,181 **** --- 176,182 ---- * url_list
* url_log
* url_part_aliases
+ * url_seed_score
* use_meta_description
* use_star_image
* user_agent
Index: htdoc/cf_byprog.html =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdoc/cf_byprog.html,v retrieving revision 1.17.2.13 diff -p -c -r1.17.2.13 cf_byprog.html *** htdoc/cf_byprog.html 1999/12/06 22:26:48 1.17.2.13 --- htdoc/cf_byprog.html 2000/01/07 09:32:52 *************** *** 175,180 **** --- 175,181 ---- * syntax_error_file
* uncoded_db_compatible
* url_part_aliases
+ * url_seed_score
* use_meta_description
* use_star_image
* valid_punctuation
Index: htlib/Makefile.in =================================================================== RCS file: /opt/htdig/cvs/htdig3/htlib/Makefile.in,v retrieving revision 1.13.2.2 diff -p -c -r1.13.2.2 Makefile.in *** htlib/Makefile.in 1999/03/29 15:53:48 1.13.2.2 --- htlib/Makefile.in 2000/01/07 09:32:52 *************** OBJS= Configuration.o Connection.o Datab *** 16,22 **** URL.o URLTrans.o cgi.o \ good_strtok.o io.o strcasecmp.o \ strptime.o mytimegm.o HtCodec.o HtWordCodec.o \ ! HtURLCodec.o regex.o HtWordType.o TARGET= libht.a --- 16,22 ---- URL.o URLTrans.o cgi.o \ good_strtok.o io.o strcasecmp.o \ strptime.o mytimegm.o HtCodec.o HtWordCodec.o \ ! HtURLCodec.o regex.o HtWordType.o HtURLSeedScore.o TARGET= libht.a Index: htsearch/Display.cc =================================================================== RCS file: /opt/htdig/cvs/htdig3/htsearch/Display.cc,v retrieving revision 1.54.2.22 diff -p -c -r1.54.2.22 Display.cc *** htsearch/Display.cc 1999/12/07 16:52:35 1.54.2.22 --- htsearch/Display.cc 2000/01/07 09:32:56 *************** static char RCSid[] = "$Id: Display.cc,v *** 21,28 **** --- 21,30 ---- #include #include #include + #include #include "HtURLCodec.h" #include "HtWordType.h" + #include "HtURLSeedScore.h" //***************************************************************************** // *************** Display::Display(char *indexFile, char * *** 43,49 **** templateError = 0; maxStars = config.Value("max_stars"); ! maxScore = 100; setupImages(); setupTemplates(); --- 45,52 ---- templateError = 0; maxStars = config.Value("max_stars"); ! maxScore = -DBL_MAX; ! minScore = DBL_MAX; setupImages(); setupTemplates(); *************** Display::displayMatch(ResultMatch *match *** 304,310 **** if (maxScore != 0) { ! int percent = (int)(ref->DocScore() * 100 / (double)maxScore); if (percent <= 0) percent = 1; vars.Add("PERCENT", new String(form("%d", percent))); --- 307,314 ---- if (maxScore != 0) { ! int percent = (int)((ref->DocScore() - minScore) * 100 / ! 
(maxScore - minScore)); if (percent <= 0) percent = 1; vars.Add("PERCENT", new String(form("%d", percent))); *************** Display::generateStars(DocumentRef *ref, *** 742,748 **** if (maxScore != 0) { ! score = ref->DocScore() / (double)maxScore; } else { --- 746,752 ---- if (maxScore != 0) { ! score = (ref->DocScore() - minScore) / (maxScore - minScore); } else { *************** Display::buildMatchList() *** 938,943 **** --- 942,951 ---- double backlink_factor = config.Double("backlink_factor"); double date_factor = config.Double("date_factor"); SortType typ = sortType(); + URLSeedScore adjustments(config); + + // If we knew where to pass it, this would be a good place to pass + // on errors from adjustments.ErrMsg(). results->Start_Get(); while ((id = results->Get_Next())) *************** Display::buildMatchList() *** 1007,1012 **** --- 1015,1023 ---- sortRef->DocTitle(thisRef->DocTitle()); thisMatch->setRef(sortRef); } + + score = adjustments.adjust_score(score, thisRef->DocURL()); + } // Get rid of it to free the memory! delete thisRef; *************** Display::buildMatchList() *** 1019,1024 **** --- 1030,1039 ---- // Append this match to our list of matches. 
// matches->Add(thisMatch); + if (matches->Count() == 1 || maxScore < score) + maxScore = score; + if (matches->Count() == 1 || minScore > score) + minScore = score; } // *************** Display::sort(List *matches) *** 1163,1170 **** for (i = 0; i < numberOfMatches; i++) { array[i] = (ResultMatch *)(*matches)[i]; - if (i == 0 || maxScore < array[i]->getScore()) - maxScore = array[i]->getScore(); } matches->Release(); --- 1178,1183 ---- Index: htsearch/Display.h =================================================================== RCS file: /opt/htdig/cvs/htdig3/htsearch/Display.h,v retrieving revision 1.8.2.4 diff -p -c -r1.8.2.4 Display.h *** htsearch/Display.h 1999/11/24 05:17:10 1.8.2.4 --- htsearch/Display.h 2000/01/07 09:32:57 *************** protected: *** 125,131 **** // Maximum number of stars to display // int maxStars; ! int maxScore; // // For display, we have different versions of the list of words. --- 125,132 ---- // Maximum number of stars to display // int maxStars; ! double maxScore; ! double minScore; // // For display, we have different versions of the list of words. Compilation exited abnormally with code 1 at Fri Jan 7 11:21:03