Date: Mon, 26 Apr 2004 17:27:24 -0700 (PDT)
From: Joe R. Jah <jjah@cloud.ccsf.cc.ca.us>
To: Gilles Detillieux <grdetil@scrc.umanitoba.ca>
Cc: "ht://Dig mailing list" <htdig-general@lists.sourceforge.net>
Subject: Adding the allow_space_in_url attribute

This patch, fileSpace.1, is fileSpace.0 plus the corresponding defaults.cc,
defaults.xml, attrs.html, cf_byname.html, and cf_byprog.html entries.  I used
Gilles' excellent comments in fileSpace.0 :)

Regards,

Joe
--
     _/   _/_/_/       _/              ____________    __o
     _/   _/   _/      _/         ______________     _-\<,_
 _/  _/   _/_/_/   _/  _/                     ......(_)/ (_)
  _/_/ oe _/   _/.  _/_/ ah        jjah@cloud.ccsf.cc.ca.us


--- htcommon/URL.cc.orig	2003-07-21 07:40:16.000000000 -0500
+++ htcommon/URL.cc	2003-11-13 16:50:03.000000000 -0600
@@ -144,8 +144,26 @@ URL::URL(const String &url, const URL &p
     _signature(parent._signature),
     _user(parent._user)
 {
-    String	temp(url);
-    temp.remove(" \r\n\t");
+	HtConfiguration* config= HtConfiguration::config();
+    int  allowspace = config->Boolean("allow_space_in_url", 0);
+    String      temp;
+    const char *urp = url.get();
+    while (*urp)
+    {
+	if (*urp == ' ' && temp.length() > 0 && allowspace)
+	{
+	    // Replace space character with %20 if there's more non-space
+	    // characters to come...
+	    const char *s = urp+1;
+	    while (*s && isspace(*s))
+		s++;
+	    if (*s)
+		temp << "%20";
+	}
+	else if (!isspace(*urp))
+	    temp << *urp;
+	urp++;
+    }
     char* ref = temp;
 
     //
@@ -314,8 +332,26 @@ void URL::rewrite()
 //
 void URL::parse(const String &u)
 {
-    String	temp(u);
-    temp.remove(" \t\r\n");
+	HtConfiguration* config= HtConfiguration::config();
+    int  allowspace = config->Boolean("allow_space_in_url", 0);
+    String	temp;
+    const char *urp = u.get();
+    while (*urp)
+    {
+	if (*urp == ' ' && temp.length() > 0 && allowspace)
+	{
+	    // Replace space character with %20 if there's more non-space
+	    // characters to come...
+	    const char *s = urp+1;
+	    while (*s && isspace(*s))
+		s++;
+	    if (*s)
+		temp << "%20";
+	}
+	else if (!isspace(*urp))
+	    temp << *urp;
+	urp++;
+    }
     char	*nurl = temp;
 
     //

--- htcommon/defaults.cc.orig	Sat Nov  8 19:46:34 2003
+++ htcommon/defaults.cc	Mon Apr 26 16:07:23 2004
@@ -125,6 +125,19 @@
 	To disallow digits in words, add the digits to \
 	<a href=\"#valid_punctuation\">valid_punctuation</a>. \
 " }, \
+{ "allow_space_in_url", "false",  \
+	"boolean", "htdig", "", "3.2.0b5", "Indexing:Where", "allow_space_in_url: true", " \
+	If set to true, htdig will handle URLs that contain \
+	embedded spaces. Technically, this is a violation of \
+	RFC 2396, which says spaces should be stripped out \
+	(as htdig does by default).  However, many web browsers \
+	and HTML code generators violate this standard already, \
+	so enabling this attribute allows htdig to handle these \
+	non-compliant URLs.  Even with this attribute set, htdig \
+	still strips out all white space (leading, trailing and \
+	embedded), except that space characters embedded within \
+	the URL will be encoded as %20. \
+" }, \
 { "allow_virtual_hosts", "true",  \
 	"boolean", "htdig", "", "3.0.8b2", "Indexing:Where", "allow_virtual_hosts: false", " \
 	If set to true, htdig will index virtual web sites as \

--- htcommon/defaults.xml.orig	Sat Nov  8 19:46:34 2003
+++ htcommon/defaults.xml	Mon Apr 26 17:01:06 2004
@@ -121,6 +121,27 @@
      </description>
    </attribute>
 
+   <attribute name="allow_space_in_url" 
+              type="boolean" 
+              programs="htdig" 
+              version="3.2.0b5" 
+              category="Indexing:Where" >
+     <default>false</default>
+     <example>true</example>
+     <description> 
+	If set to true, htdig will handle URLs that contain
+	embedded spaces. Technically, this is a violation of
+	<em>RFC 2396</em>, which says spaces should be stripped out
+	(as htdig does by default).  However, many web browsers
+	and HTML code generators violate this standard already,
+	so enabling this attribute allows htdig to handle these
+	non-compliant URLs.  Even with this attribute set, htdig
+	still strips out all white space (leading, trailing and
+	embedded), except that space characters embedded within
+	the URL will be encoded as %20.
+     </description>
+   </attribute>
+
    <attribute name="allow_virtual_hosts" 
               type="boolean" 
               programs="htdig" 

--- htdoc/attrs.html.orig	Sat Nov  8 19:46:34 2003
+++ htdoc/attrs.html	Mon Apr 26 17:50:03 2004
@@ -457,6 +457,77 @@
 	<hr>
 	<dl>
 	  <dt>
+		<strong><a name="allow_space_in_url">
+		allow_space_in_url</a></strong>
+	  </dt>
+	  <dd>
+		<dl>
+		  <dt>
+			<em>type:</em>
+		  </dt>
+		  <dd>
+			boolean
+		  </dd>
+		  <dt>
+			<em>used by:</em>
+		  </dt>
+		  <dd>
+			<a href="htdig.html">htdig</a>
+		  </dd>
+		  <dt>
+			<em>default:</em>
+		  </dt>
+		  <dd>
+			false
+		  </dd>
+		  <dt>
+			<em>block:</em>
+		  </dt>
+		  <dd>
+			Global
+		  </dd>
+		  <dt>
+		      <em>version:</em>
+		  </dt>
+		  <dd>
+		      3.2.0b5
+		  </dd>
+		  <dt>
+			<em>description:</em>
+		  </dt>
+		  <dd>
+	If set to true, htdig will handle URLs that contain
+	embedded spaces. Technically, this is a violation of
+	<em>RFC 2396</em>, which says spaces should be stripped out
+	(as htdig does by default).  However, many web browsers
+	and HTML code generators violate this standard already,
+	so enabling this attribute allows htdig to handle these
+	non-compliant URLs.  Even with this attribute set, htdig
+	still strips out all white space (leading, trailing and
+	embedded), except that space characters embedded within
+	the URL will be encoded as %20.
+		  </dd>
+		  <dt>
+			<em>example:</em>
+		  </dt>
+		  <dd>
+			<table border="0">
+			  <tr>
+				<td valign="top">
+				  allow_space_in_url:
+				</td>
+				<td nowrap>
+				     true
+				</td>
+			  </tr>
+			</table>
+		  </dd>
+		</dl>
+	  </dd>
+	</dl>
+	<hr>
+	<dl>
+	  <dt>
 		<strong><a name="allow_virtual_hosts">
 		allow_virtual_hosts</a></strong>
 	  </dt>

--- htdoc/cf_byname.html.orig	Sat Nov  8 19:46:34 2003
+++ htdoc/cf_byname.html	Mon Apr 26 17:39:19 2004
@@ -25,6 +25,7 @@
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_double_slash">allow_double_slash</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_in_form">allow_in_form</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_numbers">allow_numbers</a><br>
+	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_space_in_url">allow_space_in_url</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_virtual_hosts">allow_virtual_hosts</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#anchor_target">anchor_target</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#any_keywords">any_keywords</a><br>

--- htdoc/cf_byprog.html.orig	Sat Nov  8 19:46:34 2003
+++ htdoc/cf_byprog.html	Mon Apr 26 17:40:24 2004
@@ -44,6 +44,7 @@
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#accept_language">accept_language</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_double_slash">allow_double_slash</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_numbers">allow_numbers</a><br>
+	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_space_in_url">allow_space_in_url</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#allow_virtual_hosts">allow_virtual_hosts</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#authorization">authorization</a><br>
 	 <img src="dot.gif" alt="*" width=9 height=9> <a target="body" href="attrs.html#bad_extensions">bad_extensions</a><br>
