Date: Sat, 3 Mar 2001 12:41:35 -0600 From: Geoff Hutchison To: htdig3-dev Subject: [htdig-dev] Patch to limit based on IP address I've been going through the old bug tracker and migrating requests to the SourceForge pages. For some reason, I cannot currently attach files on SourceForge, so I'm forwarding this patch to the list to keep it from getting lost. This is for feature request #405688. Submitted originally by Geerd Kakes Until now when I run htdig I used the limit_urls_to: in the config file to limit htdig to only our clients. The number of urls which we need to limit to is getting rather big though. So big that htdig aborts its search. I thought of a solution to this problem and think the solution lies in not limiting htdig to the written domain name but limiting it to a certain IP range. -Geoff diff -Naur htdig-3.1.5.org/htdig/Retriever.cc htdig-3.1.5/htdig/Retriever.cc --- htdig-3.1.5.org/htdig/Retriever.cc Fri Feb 25 03:29:10 2000 +++ htdig-3.1.5/htdig/Retriever.cc Mon Feb 28 22:59:42 2000 @@ -18,6 +18,7 @@ #include #include #include +#include #include "HtWordType.h" static WordList words; @@ -618,6 +619,7 @@ { static Dictionary *invalids = 0; static Dictionary *valids = 0; + static Dictionary *validip = 0; // // Invalid extensions will be kept in a dictionary for quick @@ -661,6 +663,27 @@ } } + // + // Valid IP address check, build valid IP Dict. + // + if (!validip) + { + // A list of ip addresses, separated by spaces or tabs + String t = config["limit_ip"]; + String lowerp; + char *p = strtok(t, " \t"); + validip = new Dictionary; + while (p) + { + // Entries are stored in lowercase + lowerp = p; + lowerp.lowercase(); + validip->Add(lowerp, 0); + p = strtok(0, " \t"); + } + } + + static String url; url = u; @@ -743,6 +766,46 @@ // if (limits.FindFirst(url) >= 0) return TRUE; + + // + // Check also the IP based limits. 
+ // + char *hostaddr, ipaddr[16]; + int i, len; + len = 0; + struct hostent *ht; + for ( i=7; i < strlen(url); i++) + if (url[i] == '/' || url[i] == ':') + { + len = i - 7; + break; + } + if (!len) + len = strlen(url) - 6; + if ((hostaddr = (char *) malloc((len + 1) *sizeof(char))) == NULL) + return FALSE; + strncpy(hostaddr,url + 7,len); + hostaddr[len] = '\0'; + ht = gethostbyname(hostaddr); + for (i = 0 ; ht->h_addr_list[i] != NULL; i++) + { + sprintf(ipaddr,"%d.%d.%d.%d", + ((unsigned)ht->h_addr_list[i][0]>127) ? ht->h_addr_list[i][0] + 256 : ht->h_addr_list[i][0], + ((unsigned)ht->h_addr_list[i][1]>127) ? ht->h_addr_list[i][1] + 256 : ht->h_addr_list[i][1], + ((unsigned)ht->h_addr_list[i][2]>127) ? ht->h_addr_list[i][2] + 256 : ht->h_addr_list[i][2], + ((unsigned)ht->h_addr_list[i][3]>127) ? ht->h_addr_list[i][3] + 256 : ht->h_addr_list[i][3] + ); + if (debug > 3) + cout << endl <<" Hostname: " << hostaddr << " Resolved to: " << ipaddr; + if (validip->Exists(ipaddr)) + { + if (debug > 2) + cout << endl <<" Accepted: " << ipaddr << " <-> " << hostaddr << endl; + free(hostaddr); + return TRUE; + } + } + free(hostaddr); if (debug > 2) cout << endl <<" Rejected: URL not in the limits!"; diff -Naur htdig-3.1.5.org/installdir/htdig.conf htdig-3.1.5/installdir/htdig.conf --- htdig-3.1.5.org/installdir/htdig.conf Fri Feb 25 03:29:12 2000 +++ htdig-3.1.5/installdir/htdig.conf Mon Feb 28 23:15:40 2000 @@ -40,6 +40,15 @@ limit_urls_to: ${start_url} # +# This attribute limits the scope of the indexing process. It contains +# the ip addresses of the hosts you want to visit. This means every website +# which resolves to the same address will be indexed. The limits of limit_urls_to +# and limit_ip are or'ed. +# +limit_ip: 10.0.0.1 + + +# # If there are particular pages that you definately do NOT want to index, you # can use the exclude_urls attribute. The value is a list of string patterns. # If a URL matches any of the patterns, it will NOT be indexed. This is