
From tim@nz.eds.com Wed Apr  8 18:28:36 1998
Date: Thu, 9 Apr 1998 10:06:19 +1200 (NZST)
From: Tim Frost <tim@nz.eds.com>
Reply-To: Tim Frost <Tim.Frost@nz.eds.com>
To: htdig-patches@sol.ccsf.cc.ca.us
Cc: Andrew Scherpbier <andrew@contigo.com>
Subject: ht://dig - URL quoting patch - correction

While checking the unofficial patch site for ht://dig, I found that it
still holds an early, broken version of my fix for handling URL quoting.

The file HTML.cc-h.0, found under the 3.0.8b2 directory, should be
deleted, and the following patch should be made available (as HTML.cc.1?).
This patch is a unified diff created against V3.0.8b2; I can produce a
context diff if desired.  The original patch attempted to move the
duplicated work into a new function, but this failed because that function
did not return the (at least) two pointer values that were needed.

Tim


diff -u htdig-3.0.8b2/htdig/HTML.cc-orig htdig-3.0.8b2/htdig/HTML.cc
--- htdig-3.0.8b2/htdig/HTML.cc-orig	Sun Dec  7 22:14:40 1997
+++ htdig-3.0.8b2/htdig/HTML.cc	Fri Jan  9 21:24:03 1998
@@ -309,7 +309,7 @@
 HTML::do_tag(Retriever &retriever, String &tag)
 {
     char	*position = tag.get() + 1;		// Skip the '<'
-    char	*q;
+    char	*q, *t;
     int		which, length;
 
     while (isspace(*position))
@@ -358,12 +358,34 @@
 			position++;
 			while (isspace(*position))
 			    position++;
-			if (*position == '"')
+			//
+			// Allow either single quotes or double quotes
+			// around the URL itself
+			//
+			if (*position == '"'||*position == '\'')
 			{
 			    position++;
-			    q = strchr(position, '"');
+			    q = strchr(position, position[-1]);
 			    if (!q)
 				break;
+			    //
+			    // We seem to have matched the opening quote char
+			    // Mark the end of the quotes as our endpoint, so
+			    // that we can continue parsing after the current 
+			    // text
+			    //
+			    *q = '\0';
+			    //
+			    // If a '?' or '#' is present in a quoted URL,
+			    //  treat that as the end of the URL, but we skip
+			    //  past the quote to parse the rest of the anchor.
+			    //
+			    // Is there a better way of looking for these?
+			    //
+			    if ((t = strchr(position, '#')) != NULL)
+				*t = '\0';
+			    if ((t = strchr(position, '?')) != NULL)
+				*t = '\0';
 			}
 			else
 			{
@@ -374,8 +396,8 @@
 				   *q != '?' &&
 				   *q != '#')
 				q++;
+			    *q = '\0';
 			}
-			*q = '\0';
 			delete href;
 			href = new URL(position, *base);
 			in_ref = 1;
@@ -396,20 +418,42 @@
 			position++;
 			while (isspace(*position))
 			    position++;
-			if (*position == '"')
+			//
+			// Allow either single quotes or double quotes
+			// around the URL itself
+			//
+			if (*position == '"'||*position == '\'')
 			{
 			    position++;
-			    q = strchr(position, '"');
+			    q = strchr(position, position[-1]);
 			    if (!q)
 				break;
+			    //
+			    // We seem to have matched the opening quote char
+			    // Mark the end of the quotes as our endpoint, so
+			    // that we can continue parsing after the current 
+			    // text
+			    //
+			    *q = '\0';
+			    //
+			    // If a '?' or '#' is present in a quoted URL,
+			    //  treat that as the end of the URL, but we skip
+			    //  past the quote to parse the rest of the anchor.
+			    //
+			    // Is there a better way of looking for these?
+			    //
+			    if ((t = strchr(position, '#')) != NULL)
+				*t = '\0';
+			    if ((t = strchr(position, '?')) != NULL)
+				*t = '\0';
 			}
 			else
 			{
 			    q = position;
 			    while (*q && *q != '>' && !isspace(*q))
 				q++;
+			    *q = '\0';
 			}
-			*q = '\0';
 			retriever.got_anchor(position);
 			position = q + 1;
 			break;
@@ -484,20 +528,42 @@
 	    position++;
 	    while (isspace(*position))
 		position++;
-	    if (*position == '"')
+	    //
+	    // Allow either single quotes or double quotes
+	    // around the URL itself
+	    //
+	    if (*position == '"'||*position == '\'')
 	    {
 		position++;
-		q = strchr(position, '"');
+		q = strchr(position, position[-1]);
 		if (!q)
 		    break;
+		//
+		// We seem to have matched the opening quote char
+		// Mark the end of the quotes as our endpoint, so
+		// that we can continue parsing after the current 
+		// text
+		//
+		*q = '\0';
+		//
+		// If a '?' or '#' is present in a quoted URL,
+		//  treat that as the end of the URL, but we skip
+		//  past the quote to parse the rest of the anchor.
+		//
+		// Is there a better way of looking for these?
+		//
+		if ((t = strchr(position, '#')) != NULL)
+		    *t = '\0';
+		if ((t = strchr(position, '?')) != NULL)
+		    *t = '\0';
 	    }
 	    else
 	    {
 		q = position;
 		while (*q && *q != '>' && !isspace(*q))
 		    q++;
+		*q = '\0';
 	    }
-	    *q = '\0';
 	    retriever.got_image(position);
 	    break;
 	}
@@ -616,12 +682,34 @@
 		    position++;
 		    while (isspace(*position))
 			position++;
-		    if (*position == '"')
+		    //
+		    // Allow either single quotes or double quotes
+		    // around the URL itself
+		    //
+		    if (*position == '"'||*position == '\'')
 		    {
 			position++;
-			q = strchr(position, '"');
+			q = strchr(position, position[-1]);
 			if (!q)
 			    break;
+			//
+			// We seem to have matched the opening quote char
+			// Mark the end of the quotes as our endpoint, so
+			// that we can continue parsing after the current 
+			// text
+			//
+			*q = '\0';
+			//
+			// If a '?' or '#' is present in a quoted URL,
+			//  treat that as the end of the URL, but we skip
+			//  past the quote to parse the rest of the anchor.
+			//
+			// Is there a better way of looking for these?
+			//
+			if ((t = strchr(position, '#')) != NULL)
+			    *t = '\0';
+			if ((t = strchr(position, '?')) != NULL)
+			    *t = '\0';
 		    }
 		    else
 		    {
@@ -632,8 +720,8 @@
 			       *q != '?' &&
 			       *q != '#')
 			    q++;
+			*q = '\0';
 		    }
-		    *q = '\0';
 		    delete href;
 		    href = new URL(position, *base);
 		    if (doindex)
@@ -668,12 +756,34 @@
 		    position++;
 		    while (isspace(*position))
 			position++;
-		    if (*position == '"')
+		    //
+		    // Allow either single quotes or double quotes
+		    // around the URL itself
+		    //
+		    if (*position == '"'||*position == '\'')
 		    {
 			position++;
-			q = strchr(position, '"');
+			q = strchr(position, position[-1]);
 			if (!q)
 			    break;
+			//
+			// We seem to have matched the opening quote char
+			// Mark the end of the quotes as our endpoint, so
+			// that we can continue parsing after the current 
+			// text
+			//
+			*q = '\0';
+			//
+			// If a '?' or '#' is present in a quoted URL,
+			//  treat that as the end of the URL, but we skip
+			//  past the quote to parse the rest of the anchor.
+			//
+			// Is there a better way of looking for these?
+			//
+			if ((t = strchr(position, '#')) != NULL)
+			    *t = '\0';
+			if ((t = strchr(position, '?')) != NULL)
+			    *t = '\0';
 		    }
 		    else
 		    {
@@ -684,8 +794,8 @@
 			       *q != '?' &&
 			       *q != '#')
 			    q++;
+			*q = '\0';
 		    }
-		    *q = '\0';
 		    delete href;
 		    href = new URL(position, *base);
 		    if (doindex)
@@ -719,12 +829,34 @@
 		    position++;
 		    while (isspace(*position))
 			position++;
-		    if (*position == '"')
+		    //
+		    // Allow either single quotes or double quotes
+		    // around the URL itself
+		    //
+		    if (*position == '"'||*position == '\'')
 		    {
 			position++;
-			q = strchr(position, '"');
+			q = strchr(position, position[-1]);
 			if (!q)
 			    break;
+			//
+			// We seem to have matched the opening quote char
+			// Mark the end of the quotes as our endpoint, so
+			// that we can continue parsing after the current 
+			// text
+			//
+			*q = '\0';
+			//
+			// If a '?' or '#' is present in a quoted URL,
+			//  treat that as the end of the URL, but we skip
+			//  past the quote to parse the rest of the anchor.
+			//
+			// Is there a better way of looking for these?
+			//
+			if ((t = strchr(position, '#')) != NULL)
+			    *t = '\0';
+			if ((t = strchr(position, '?')) != NULL)
+			    *t = '\0';
 		    }
 		    else
 		    {
@@ -735,8 +867,8 @@
 			       *q != '?' &&
 			       *q != '#')
 			    q++;
+			*q = '\0';
 		    }
-		    *q = '\0';
 		    URL tempBase(position, *base);
 		    *base = tempBase;
 		}

