From lha@ee.mu.oz.au Sun Jun 23 22:24:50 2002
Date: Mon, 24 Jun 2002 11:54:21 +1000
From: Lachlan Andrew <lha@ee.mu.oz.au>
To: Gilles Detillieux <grdetil@scrc.umanitoba.ca>
Cc: Geoff Hutchison <ghutchis@wso.williams.edu>,
     htdig-dev@lists.sourceforge.net
Subject: [htdig-dev] Re: [413879] local files / external parsers

On Fri, Jun 14, 2002 at 06:03:09PM -0500, Gilles Detillieux wrote:

> I'd recommend two changes:
> 1) Grab the most recent 3.2.0b4 snapshot
> 2) The HtFile::Request() and Document::RetrieveLocal() methods both
> have some hardcoded extensions, which should probably be kept in the
> new HtFile::Ext2Mime() method.  HtFile::Request() currently falls back
> on these when it can't open mime.types.


Greetings,

Below is the patch against 3.2.0b4-20020616.  This includes the
hardcoded types, and  bad_local_extensions  to allow  .php  etc. not
to be parsed locally.  If  bad_local_extensions  is explicitly set
empty, do you think it would be good to allow *all* files to be parsed
locally (even those with no extensions)?  Of course, ones for which no
MIME type is known would have to be treated as  text/plain  but it
would be good if a site has a lot of text files with no extensions.

Also, would there be any demand to index compressed files?  If someone
has a lot of  .ps.gz  files, for example, it could be useful to include
them in the index.

Finally, I think someone has been editing the files with a tab size other
than 8...  Is there a policy on that?

Cheers,
Lachlan


*** htdig/Document.cc	Sun Jan 13 19:13:13 2002
--- htdig/Document.cc.lha	Mon Jun 24 01:06:48 2002
***************
*** 72,78 ****
      FileConnect = 0;
      NNTPConnect = 0;
      externalConnect = 0;
! 	HtConfiguration* config= HtConfiguration::config();
  
      // We probably need to move assignment of max_doc_size, according
      // to a server or url configuration value. The same is valid for
--- 72,78 ----
      FileConnect = 0;
      NNTPConnect = 0;
      externalConnect = 0;
!     HtConfiguration* config= HtConfiguration::config();
  
      // We probably need to move assignment of max_doc_size, according
      // to a server or url configuration value. The same is valid for
***************
*** 549,555 ****
  Transport::DocStatus
  Document::RetrieveLocal(HtDateTime date, StringList *filenames)
  {
! 	HtConfiguration* config= HtConfiguration::config();
      struct stat stat_buf;
      String *filename;
  
--- 549,555 ----
  Transport::DocStatus
  Document::RetrieveLocal(HtDateTime date, StringList *filenames)
  {
!     HtConfiguration* config= HtConfiguration::config();
      struct stat stat_buf;
      String *filename;
  
***************
*** 558,564 ****
      // Loop through list of potential filenames until the list is exhausted
      // or a suitable file is found to exist as a regular file.
      while ((filename = (String *)filenames->Get_Next()) &&
! 	   ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))
          if (debug > 1)
  	    cout << "  tried local file " << *filename << endl;
      
--- 558,564 ----
      // Loop through list of potential filenames until the list is exhausted
      // or a suitable file is found to exist as a regular file.
      while ((filename = (String *)filenames->Get_Next()) &&
!        ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))
          if (debug > 1)
  	    cout << "  tried local file " << *filename << endl;
      
***************
*** 572,593 ****
      if (modtime <= date)
        return Transport::Document_not_changed;
  
-     // Process only HTML files (this could be changed if we read
-     // the server's mime.types file).
-     // (...and handle a select few other types for now...  this should
-     //  eventually be handled by the "file://..." handler, which uses
-     //  mime.types to determine the file type.) -- FIXME!!
      char *ext = strrchr((char*)*filename, '.');
      if (ext == NULL)
        return Transport::Document_not_local;
!     if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))
!         contentType = "text/html";
!     else if ((mystrcasecmp(ext, ".txt") == 0) || (mystrcasecmp(ext, ".asc") == 0))
!         contentType = "text/plain";
!     else if ((mystrcasecmp(ext, ".pdf") == 0))
!         contentType = "application/pdf";
!     else if ((mystrcasecmp(ext, ".ps") == 0) || (mystrcasecmp(ext, ".eps") == 0))
!         contentType = "application/postscript";
      else 
        return Transport::Document_not_local;
  
--- 572,585 ----
      if (modtime <= date)
        return Transport::Document_not_changed;
  
      char *ext = strrchr((char*)*filename, '.');
+     if (ext && strchr(ext,'/'))		// Ignore a dot if it's not in the
+       ext = NULL;			// final component of the path.
      if (ext == NULL)
        return Transport::Document_not_local;
!     const String *type = HtFile::Ext2Mime (ext + 1);
!     if (type != NULL)
!       contentType = *type;
      else 
        return Transport::Document_not_local;
  
*** htnet/HtFile.h	Mon Jun 24 01:02:42 2002
--- htnet/HtFile.h.lha	Mon Jun 24 01:02:51 2002
***************
*** 64,69 ****
--- 64,73 ----
     // manages a Transport request (method inherited from Transport class)
     virtual DocStatus Request ();
  
+    // Determine Mime type of file
+    // (Does it belong here??)
+    static const String *Ext2Mime (const char *);
+ 
   ///////
      //    Interface for resource retrieving
   ///////
*** htnet/HtFile.cc	Sun Dec 23 19:13:14 2001
--- htnet/HtFile.cc.lha	Mon Jun 24 00:48:34 2002
***************
*** 76,96 ****
  }
  
  
! ///////
!    //    Manages the requesting process
! ///////
! 
! HtFile::DocStatus HtFile::Request()
  {
-    HtConfiguration* config= HtConfiguration::config();
     static Dictionary *mime_map = 0;
  
     if (!mime_map)
       {
         mime_map = new Dictionary();
         ifstream in(config->Find("mime_types").get());
         if (in)
           {
             String line;
             while (in >> line)
               {
--- 76,110 ----
  }
  
  
! // Return mime type indicated by extension  ext  (which is assumed not
! // to contain the '.'), or  NULL  if  ext  is not a know mime type, or
! // is listed in  bad_local_extensions.
! const String *HtFile::Ext2Mime (const char *ext)
  {
     static Dictionary *mime_map = 0;
  
     if (!mime_map)
       {
+        HtConfiguration* config= HtConfiguration::config();
         mime_map = new Dictionary();
+        if (!mime_map)
+ 	 return NULL;
+ 
+        if (debug > 2)
+  	    cout << "MIME types: " << config->Find("mime_types").get() << endl;
         ifstream in(config->Find("mime_types").get());
         if (in)
           {
+ 	   // Set up temporary dictionary of extensions not to parse locally
+ 	   Dictionary bad_local_exts;
+ 	   StringList split_exts(config->Find("bad_local_extensions"), "\t .");
+ 	   for (int i = 0; i < split_exts.Count(); i++)
+ 	   {
+ 	      if (debug > 3)
+ 		cout << "Bad local extension: " << split_exts[i] << endl;
+ 	      bad_local_exts.Add(split_exts[i], 0);
+ 	   }
+ 
             String line;
             while (in >> line)
               {
***************
*** 99,114 ****
                 if ((cmt = line.indexOf('#')) >= 0)
                   line = line.sub(0, cmt);
                 StringList split_line(line, "\t ");
!                // Let's cache mime type to lesser the number of 
!                // operator [] callings
                 String mime_type = split_line[0];
                 // Fill map with values.
                 for (int i = 1; i < split_line.Count(); i++)
!                  mime_map->Add(split_line[i], new String(mime_type));
               }
           }
       }
  
     // Reset the response
     _response.Reset();
     
--- 113,161 ----
                 if ((cmt = line.indexOf('#')) >= 0)
                   line = line.sub(0, cmt);
                 StringList split_line(line, "\t ");
!                // cache mime type to lessen the number of operator [] callings
                 String mime_type = split_line[0];
                 // Fill map with values.
                 for (int i = 1; i < split_line.Count(); i++)
! 	       {
! 		 const char *ext = split_line [i];
! 		 if (bad_local_exts.Exists(ext))
! 		 {
! 	           if (debug > 3)
! 		     cout << "Bad local extension: " << ext << endl;
! 		   continue;
! 		 }
! 
! 	         if (debug > 3)
! 		   cout << "MIME: " << ext << "\t-> " << mime_type << endl;
!                  mime_map->Add(ext, new String(mime_type));
! 	       }
               }
           }
+        else
+ 	 {
+ 	   if (debug > 2)
+ 		cout << "MIME types file not found.  Using default types.\n";
+ 	   mime_map->Add(String("html"), new String("text/html"));
+ 	   mime_map->Add(String("htm"),  new String("text/html"));
+ 	   mime_map->Add(String("txt"),  new String("text/plain"));
+ 	   mime_map->Add(String("asc"),  new String("text/plain"));
+ 	   mime_map->Add(String("pdf"),  new String("application/pdf"));
+ 	   mime_map->Add(String("ps"),   new String("application/postscript"));
+ 	   mime_map->Add(String("eps"),  new String("application/postscript"));
+ 	 }
       }
  
+    // return MIME type, or NULL if not found
+    return (String *)mime_map->Find(ext);
+ }
+ 
+ ///////
+    //    Manages the requesting process
+ ///////
+ 
+ HtFile::DocStatus HtFile::Request()
+ {
     // Reset the response
     _response.Reset();
     
***************
*** 166,191 ****
       return Transport::Document_not_changed;
  
     char *ext = strrchr(_url.path(), '.');
     if (ext == NULL)
       return Transport::Document_not_local;
  
!    if (mime_map && mime_map->Count())
!      {
!        String *mime_type = (String *)mime_map->Find(ext + 1);
!        if (mime_type)
!          _response._content_type = *mime_type;
!        else
!          return Transport::Document_not_local;
!      }
     else
!      {
!        if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))
!          _response._content_type = "text/html";
!        else if (mystrcasecmp(ext, ".txt") == 0)
!          _response._content_type = "text/plain";
!        else
!          return Transport::Document_not_local;
!      }
  
     _response._modification_time = new HtDateTime(stat_buf.st_mtime);
  
--- 213,228 ----
       return Transport::Document_not_changed;
  
     char *ext = strrchr(_url.path(), '.');
+    if (ext && strchr(ext,'/'))		// Ignore a dot if it's not in the
+      ext = NULL;			// final component of the path.
     if (ext == NULL)
       return Transport::Document_not_local;
  
!    const String *mime_type = Ext2Mime(ext + 1);
!    if (mime_type)
!      _response._content_type = *mime_type;
     else
!      return Transport::Document_not_local;
  
     _response._modification_time = new HtDateTime(stat_buf.st_mtime);
  
*** htcommon/defaults.cc	Sun Jun 23 23:55:41 2002
--- htcommon/defaults.cc.lha	Mon Jun 24 01:01:09 2002
***************
*** 145,151 ****
  	documents as text while they are some binary format. \
  	If the list is empty, then all extensions are acceptable, \
  	provided they pass other criteria for acceptance or rejection. \
! 	See also <a href=\"#valid_extensions\">valid_extensions</a>. \
  " }, \
  { "bad_querystr", "",  \
  	"pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&amp;passwd=required", " \
--- 145,165 ----
  	documents as text while they are some binary format. \
  	If the list is empty, then all extensions are acceptable, \
  	provided they pass other criteria for acceptance or rejection. \
! 	See also <a href=\"#valid_extensions\">valid_extensions</a> and \
! 	<a href=\"#bad_local_extensions\">bad_local_extensions</a>. \
! " }, \
! { "bad_local_extensions", ".php .shtml",  \
! 	"string list", "htdig", "URL", "all", "Indexing:Where", "bad_local_extensions: .php .foo .bar", " \
! 	This is a list of extensions on URLs which are \
! 	considered active, that is, the content delivered by the web \
! 	server is not simply the text of the file, but is generated \
! 	on-the-fly. This list is used mainly to allow URLs on the local \
! 	machine to be read using the local filesystem, rather than \
! 	through HTTP.  \
! 	If the list is empty, then all extensions are acceptable, \
! 	provided they pass other criteria for acceptance or rejection. \
! 	See also <a href=\"#valid_extensions\">valid_extensions</a> and \
! 	<a href=\"#bad_extensions\">bad_extensions</a>. \
  " }, \
  { "bad_querystr", "",  \
  	"pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&amp;passwd=required", " \

-- 
Lachlan Andrew  lha@ee.mu.oz.au  Phone: +613 8344-3816 Fax: +613 8344-6678
Department of Electrical and Electronic Engineering	   CRICOS Provider Code
University of Melbourne, Victoria, 3010    AUSTRALIA		00116K


-------------------------------------------------------
Sponsored by:
ThinkGeek at http://www.ThinkGeek.com/
_______________________________________________
htdig-dev mailing list
htdig-dev@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/htdig-dev

