From pasi.eronen@hut.fi Sun Aug 3 14:33:10 1997 Date: Fri, 1 Aug 1997 23:27:19 +0300 (EEST) From: Pasi Eronen To: HTDig mailing list Subject: htdig: Local filesystem access: 4-5X speedup for digging Hi! I'm running HTDig on the same machine as the WWW server (as I imagine many people are doing), so accessing every document with HTTP seemed a little silly to me. So I wrote a set of patches which allow HTDig to access files on the local filesystem. The patches aren't very well tested yet (but should work), and the performance increase may vary. When running HTDig on a collection of 19 documents (total of 429 KB) on an SGI Indigo^2 with Boa 0.92o WWW-server, the digging time dropped from 43.6 seconds to 9.7 seconds -- a 4.5X speedup. The patches add a new configuration directive to HTDig, which specifies where the documents are found on the local filesystem. The syntax is: # Access certain URLs on the local filesystem. # For example, local_urls: http://www.foo.com/=/usr/www/htdocs/ # local_urls: prefix1=path1 prefix2=path2 ... Any URL with the given prefix is first tried using the local filename, and if anything goes wrong (e.g., the file doesn't exist, it's actually a directory, or it doesn't look like HTML), it is retrieved using HTTP instead. This fall-back to HTTP allows things like automatic directory index generation to work. Note that HTDig doesn't know anything about the security settings in your WWW server, so if you're using things like passwords or domain filtering, you probably don't want to give HTDig unlimited access to those parts of your server. Here are the patches, all six of them (against HTDig 3.0.8b1): As always, comments are welcome. 
Pasi --- Pasi Eronen , +358-50-5123499 *** ../htdig-3.0.8b1.orig/htcommon//defaults.cc Mon Mar 17 07:54:25 1997 --- htcommon/defaults.cc Fri Aug 1 22:27:05 1997 *************** *** 64,67 **** --- 64,68 ---- {"limit_urls_to", "${start_url}"}, {"locale", "iso_8859_1"}, + {"local_urls", ""}, {"maintainer", "andrew@contigo.com"}, {"match_method", "or"}, *** ../htdig-3.0.8b1.orig/htdig/Document.h Mon Mar 24 06:33:28 1997 --- htdig/Document.h Fri Aug 1 22:26:02 1997 *************** *** 87,93 **** Document_no_server, Document_no_host, ! Document_not_authorized }; ! DocStatus Retrieve(time_t date); // --- 87,95 ---- Document_no_server, Document_no_host, ! Document_not_authorized, ! Document_not_local }; ! DocStatus RetrieveHTTP(time_t date); ! DocStatus RetrieveLocal(time_t date, char *filename); // *************** *** 109,113 **** Header_redirect, Header_not_text, ! Header_not_authorized }; --- 111,115 ---- Header_redirect, Header_not_text, ! Header_not_authorized }; *** ../htdig-3.0.8b1.orig/htdig/Document.cc Fri Aug 1 23:09:57 1997 --- htdig/Document.cc Fri Aug 1 22:29:22 1997 *************** *** 21,24 **** --- 21,25 ---- #include + #include #include #include "Document.h" *************** *** 254,262 **** //***************************************************************************** ! // DocStatus Document::Retrieve(time_t date) // Attempt to retrieve the document pointed to by our internal URL // Document::DocStatus ! Document::Retrieve(time_t date) { Connection c; --- 255,263 ---- //***************************************************************************** ! // DocStatus Document::RetrieveHTTP(time_t date) // Attempt to retrieve the document pointed to by our internal URL // Document::DocStatus ! 
Document::RetrieveHTTP(time_t date) { Connection c; *************** *** 483,486 **** --- 484,545 ---- cout << "returnStatus = " << returnStatus << endl; return returnStatus; + } + + + //***************************************************************************** + // DocStatus Document::RetrieveLocal(time_t date, char *filename) + // Attempt to retrieve the document pointed to by our internal URL + // from the given local file. Returns Document_ok on success, + // Document_not_changed if the file's mtime is not newer than date, or + // Document_not_local (in which case the retriever tries again using HTTP). + // + Document::DocStatus + Document::RetrieveLocal(time_t date, char *filename) + { + struct stat stat_buf; + // Check that it exists and is a regular file (e.g. not a directory). + if ((stat(filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)) + return Document_not_local; + + modtime = stat_buf.st_mtime; + if (modtime <= date) + return Document_not_changed; + + // Process only .html/.htm files (this could be changed if we read + // the server's mime.types file). 
+ const char *ext = strrchr(filename, '.'); + if (ext == NULL) + return Document_not_local; + if ((strcasecmp(ext, ".html") == 0) || (strcasecmp(ext, ".htm") == 0)) + contentType = "text/html"; + else + return Document_not_local; + + // Open it; if that fails, report not-local so HTTP is tried instead + FILE *f = fopen(filename, "r"); + if (f == NULL) + return Document_not_local; + + // + // Read in the document itself, stopping before max_doc_size is exceeded + // + contents = 0; + char docBuffer[8192]; + int bytesRead; + + while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), f)) > 0) + { + if (debug > 2) + cout << "Read " << bytesRead << " from document\n"; + if (contents.length() + bytesRead > max_doc_size) + break; + contents.append(docBuffer, bytesRead); + } + fclose(f); + document_length = contents.length(); + + if (debug > 2) + cout << "Read a total of " << document_length << " bytes\n"; + return Document_ok; } *** ../htdig-3.0.8b1.orig/htdig/Retriever.h Fri Feb 7 11:14:19 1997 --- htdig/Retriever.h Fri Aug 1 22:26:03 1997 *************** *** 116,119 **** --- 116,120 ---- DocumentRef * GetRef(char *url); int IsValidURL(char *url); + String * IsLocal(char *url); void RetrievedDocument(Document &, char *url, DocumentRef *ref); void parse_url(URLRef &urlRef); *** ../htdig-3.0.8b1.orig/htdig/Retriever.cc Fri Feb 7 11:14:16 1997 --- htdig/Retriever.cc Fri Aug 1 22:46:34 1997 *************** *** 258,262 **** base = doc->Url(); ! Document::DocStatus status = doc->Retrieve(date); if (status == Document::Document_not_found) { --- 258,280 ---- base = doc->Url(); ! // Retrieve document, first trying local file access if possible. ! Document::DocStatus status; ! String *local_filename = IsLocal(url.get()); ! if (local_filename) ! { ! if (debug > 1) ! cout << "Trying local file " << *local_filename << endl; ! status = doc->RetrieveLocal(date, *local_filename); ! if (status == Document::Document_not_local) ! { ! if (debug > 1) ! cout << "Local retrieval failed, trying HTTP" << endl; ! status = doc->RetrieveHTTP(date); ! } ! delete local_filename; ! } ! else ! 
status = doc->RetrieveHTTP(date); ! if (status == Document::Document_not_found) { *************** *** 271,275 **** doc->Url(tempurl); base = doc->Url(); ! status = doc->Retrieve(date); } } --- 289,293 ---- doc->Url(tempurl); base = doc->Url(); ! status = doc->RetrieveHTTP(date); } } *************** *** 481,484 **** --- 499,555 ---- return TRUE; return FALSE; + } + + + //***************************************************************************** + // String* Retriever::IsLocal(char *url) + // Returns a newly allocated string containing the (possible) local + // filename of the given url, or 0 if no local_urls prefix matches it. + // THE CALLER MUST delete THE STRING AFTER USE! + // + String* + Retriever::IsLocal(char *url) + { + static StringList *prefixes = 0; + static StringList *paths = 0; + + // + // Initialize prefix/path list if this is the first time. + // The list is given in format "prefix1=path1 prefix2=path2 ..." + // + if (!prefixes) + { + prefixes = new StringList(); + paths = new StringList(); + + String t = config["local_urls"]; + for (char *p = strtok(t, " \t"); p; p = strtok(0, " \t")) + { + char *path = strchr(p, '='); + // Skip malformed entries that lack '='. The for header still + // advances p, so such an entry can no longer loop forever. + if (!path) + continue; + *path++ = '\0'; + prefixes->Add(p); + paths->Add(path); + } + } + + String *prefix, *path; + prefixes->Start_Get(); + paths->Start_Get(); + while (prefix = (String*) prefixes->Get_Next()) + { + path = (String*) paths->Get_Next(); + if (strncasecmp(*prefix, url, prefix->length()) == 0) + { + int l = strlen(url)-prefix->length()+path->length()+4; + String *local = new String(*path, l); + *local += &url[prefix->length()]; + return local; + } + } + return 0; } *** ../htdig-3.0.8b1.orig/htdig/Server.cc Mon Mar 24 06:34:00 1997 --- htdig/Server.cc Fri Aug 1 22:34:35 1997 *************** *** 46,50 **** url << host << ':' << port << "/robots.txt"; Document doc(url, 10000); ! switch (doc.Retrieve(0)) { case Document::Document_ok: --- 46,50 ---- url << host << ':' << port << "/robots.txt"; Document doc(url, 10000); ! 
switch (doc.RetrieveHTTP(0)) { case Document::Document_ok: