
From grdetil@scrc.umanitoba.ca Wed Mar 17 10:00:18 1999
Date: Wed, 17 Mar 1999 11:03:47 -0600 (CST)
From: Gilles Detillieux <grdetil@scrc.umanitoba.ca>
To: htdig3-dev@htdig.org
Cc: htdig3-dev@htdig.org
Subject: Re: [htdig3-dev] Problem (and) solved with comments in HTML.cc


According to Benjamin Smedberg:
> Having seen the argument about comment parsing both here and on the regular
> list, I think that there is a solution that will please everybody:
> 
> Once you've started a comment, check each dash: if it is followed by only
> dashes and whitespace up to the next right bracket, then end the comment.
> Otherwise, keep going.
> 
> This follows the standard and allows for quite a bit of non-standard coding
> as well.

Yes, this is more or less what we had come to agree upon.  I've finally
taken the time to implement it.  Here is the new code, first as a snippet,
so you can see what it does, and as a patch to the 3.1.1 source, so you
can give it a try.  I'd appreciate it if everyone who participated in this
discussion, or who had problems with parsing comments in the past, would give
this a try.  It passed the few test cases I threw at it, but I'd like people
to hammer away at it for a while.

      if (strncmp((char *)position, "<!", 2) == 0)
	{
	  //
	  // Possible comment declaration (but could be DTD declaration!)
	  // A comment can contain other '<' and '>':
	  // we have to ignore complete comment declarations
	  // but of course also DTD declarations.
	  //
	  position += 2;	// Get past declaration start
	  if (strncmp((char *)position, "--", 2) == 0)
	    {
	      // Found start of comment - now find the end
	      position += 2;
	      do
		{
		  q = (unsigned char*)strstr((char *)position, "--");
		  if (!q)
		    {
		      *position = '\0';
		      break;	// Rest of document seems to be a comment...
		    }
		  else
		    {
		      position = q + 2;
		      // Skip dashes after a badly formed comment
		      while (*position == '-')
			  position++;
		      // Skip whitespace after an individual comment
		      while (isspace(*position))
			  position++;
		    }
		  // if comment declaration hasn't ended, skip another comment
		}
	      while (*position && *position != '>');
	      if (*position == '>')
		{
		  position++;	// End of comment declaration
		}
	    }
	  else
	    {
	      // Not a comment declaration after all
	      // but possibly DTD: get to the end
	      q = (unsigned char*)strstr((char *)position, ">");
	      if (q)
		{
		  position = q + 1;
		  // End of (whatever) declaration
		}
	      else
		{
		  *position = '\0'; // Rest of document is DTD?
		}
	    }
	  continue;
	}


Apply this patch to htdig/HTML.cc from ht://Dig 3.1.1 (or later).

--- htdig/HTML.cc.commentbug	Wed Feb 24 12:36:59 1999
+++ htdig/HTML.cc	Wed Mar 17 11:01:08 1999
@@ -140,54 +140,54 @@ HTML::parse(Retriever &retriever, URL &b
 	  //
 	  // Possible comment declaration (but could be DTD declaration!)
 	  // A comment can contain other '<' and '>':
-	  // we have to ignore a complete comment declarations
+	  // we have to ignore complete comment declarations
 	  // but of course also DTD declarations.
 	  //
 	  position += 2;	// Get past declaration start
-	  while (*position)
+	  if (strncmp((char *)position, "--", 2) == 0)
 	    {
-	      // Let's see if the declaration ends here
-	      if (*position == '>')
-		{
-		  position++;
-		  break;	// End of comment declaration
-		}
-	      // Not the end of the declaration yet:
-	      // we'll try to find an actual comment
-	      if (strncmp((char *)position, "--", 2) == 0)
+	      // Found start of comment - now find the end
+	      position += 2;
+	      do
 		{
-		  // Found start of comment - now find the end
-		  position += 2;
 		  q = (unsigned char*)strstr((char *)position, "--");
 		  if (!q)
 		    {
 		      *position = '\0';
 		      break;	// Rest of document seems to be a comment...
 		    }
-		  position = q + 2;
-		}
-	      else
-		{
-		  // Not a comment declaration after all
-		  // but possibly DTD: get to the end
-		  q = (unsigned char*)strstr((char *)position, ">");
-		  if (q)
-		    {
-		      position = q + 1;
-		      break;
-		      // End of (whatever) declaration
-		    }
 		  else
 		    {
-		      *position = '\0'; // Rest of document is DTD?
-		      break;
+		      position = q + 2;
+		      // Skip extra dashes after a badly formed comment
+		      while (*position == '-')
+			  position++;
+		      // Skip whitespace after an individual comment
+		      while (isspace(*position))
+			  position++;
 		    }
-		  
+		  // if comment declaration hasn't ended, skip another comment
+		}
+	      while (*position && *position != '>');
+	      if (*position == '>')
+		{
+		  position++;	// End of comment declaration
+		}
+	    }
+	  else
+	    {
+	      // Not a comment declaration after all
+	      // but possibly DTD: get to the end
+	      q = (unsigned char*)strstr((char *)position, ">");
+	      if (q)
+		{
+		  position = q + 1;
+		  // End of (whatever) declaration
+		}
+	      else
+		{
+		  *position = '\0'; // Rest of document is DTD?
 		}
-	      
-	      // Skip whitespace after an individual comment
-	      while (isspace(*position))
-		position++;
 	    }
 	  continue;
 	}

-- 
Gilles R. Detillieux              E-mail: <grdetil@scrc.umanitoba.ca>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
htdig3-dev@htdig.org containing the single word "unsubscribe" in
the SUBJECT of the message.
