xref: /openbsd-src/usr.sbin/nsd/rrl.c (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1 
2 /* rrl.c - Response Rate Limiting for NSD.
3  * By W.C.A. Wijngaards
4  * Copyright 2012, NLnet Labs.
5  * BSD, see LICENSE.
6  */
7 #include "config.h"
8 #include <errno.h>
9 #include "rrl.h"
10 #include "util.h"
11 #include "lookup3.h"
12 #include "options.h"
13 
14 #ifdef RATELIMIT
15 
16 #ifdef HAVE_MMAP
17 #include <sys/mman.h>
18 #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
19 #define MAP_ANONYMOUS   MAP_ANON
20 #endif
21 #endif /* HAVE_MMAP */
22 
23 
/**
 * The rate limiting data structure bucket, this represents one rate of
 * packets from a single source.
 * Smoothed average rates.
 *
 * One bucket lives in every slot of the rrl_array hash table; a slot is
 * selected with (hash % rrl_array_size) and is simply overwritten when a
 * different source/flags/hash combination lands on it (see rrl_update).
 */
struct rrl_bucket {
	/* the source netmask: the masked source address as produced by
	 * rrl_get_source() */
	uint64_t source;
	/* rate, in queries per second, which due to rate=r(t)+r(t-1)/2 is
	 * equal to double the queries per second */
	uint32_t rate;
	/* the full hash (before reduction modulo the table size), kept so
	 * that different keys sharing a slot can be told apart */
	uint32_t hash;
	/* counter for queries arrived in this second */
	uint32_t counter;
	/* timestamp, which time is the time of the counter, the rate is from
	 * one timestep before that. */
	int32_t stamp;
	/* flags for the source mask and type: the classification returned
	 * by rrl_classify() or-ed with the address family flag from
	 * rrl_get_source() */
	uint16_t flags;
};
45 
46 /* the (global) array of RRL buckets */
47 static struct rrl_bucket* rrl_array = NULL;
48 static size_t rrl_array_size = RRL_BUCKETS;
49 static uint32_t rrl_ratelimit = RRL_LIMIT; /* 2x qps */
50 static uint8_t rrl_slip_ratio = RRL_SLIP;
51 static uint8_t rrl_ipv4_prefixlen = RRL_IPV4_PREFIX_LENGTH;
52 static uint8_t rrl_ipv6_prefixlen = RRL_IPV6_PREFIX_LENGTH;
53 static uint64_t rrl_ipv6_mask; /* max prefixlen 64 */
54 static uint32_t rrl_whitelist_ratelimit = RRL_WLIST_LIMIT; /* 2x qps */
55 
56 /* the array of mmaps for the children (saved between reloads) */
57 static void** rrl_maps = NULL;
58 static size_t rrl_maps_num = 0;
59 
60 void rrl_mmap_init(int numch, size_t numbuck, size_t lm, size_t wlm, size_t sm,
61 	size_t plf, size_t pls)
62 {
63 #ifdef HAVE_MMAP
64 	size_t i;
65 #endif
66 	if(numbuck != 0)
67 		rrl_array_size = numbuck;
68 	rrl_ratelimit = lm*2;
69 	rrl_slip_ratio = sm;
70 	rrl_ipv4_prefixlen = plf;
71 	rrl_ipv6_prefixlen = pls;
72 	if (pls <= 32) {
73 		rrl_ipv6_mask = ((uint64_t) htonl(0xffffffff << (32-pls))) << 32;
74 	} else {
75 		rrl_ipv6_mask =  ((uint64_t) htonl(0xffffffff << (64-pls))) |
76 			(((uint64_t)0xffffffff)<<32);
77 	}
78 	rrl_whitelist_ratelimit = wlm*2;
79 #ifdef HAVE_MMAP
80 	/* allocate the ratelimit hashtable in a memory map so it is
81 	 * preserved across reforks (every child its own table) */
82 	rrl_maps_num = (size_t)numch;
83 	rrl_maps = (void**)xmallocarray(rrl_maps_num, sizeof(void*));
84 	for(i=0; i<rrl_maps_num; i++) {
85 		rrl_maps[i] = mmap(NULL,
86 			sizeof(struct rrl_bucket)*rrl_array_size,
87 			PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
88 		if(rrl_maps[i] == MAP_FAILED) {
89 			log_msg(LOG_ERR, "rrl: mmap failed: %s",
90 				strerror(errno));
91 			exit(1);
92 		}
93 		memset(rrl_maps[i], 0,
94 			sizeof(struct rrl_bucket)*rrl_array_size);
95 	}
96 #else
97 	(void)numch;
98 	rrl_maps_num = 0;
99 	rrl_maps = NULL;
100 #endif
101 }
102 
103 void rrl_set_limit(size_t lm, size_t wlm, size_t sm)
104 {
105 	rrl_ratelimit = lm*2;
106 	rrl_whitelist_ratelimit = wlm*2;
107 	rrl_slip_ratio = sm;
108 }
109 
110 void rrl_init(size_t ch)
111 {
112 	if(!rrl_maps || ch >= rrl_maps_num)
113 	    rrl_array = xalloc_array_zero(sizeof(struct rrl_bucket),
114 	    	rrl_array_size);
115 #ifdef HAVE_MMAP
116 	else rrl_array = (struct rrl_bucket*)rrl_maps[ch];
117 #endif
118 }
119 
120 /** return the source netblock of the query, this is the genuine source
121  * for genuine queries and the target for reflected packets */
122 static uint64_t rrl_get_source(query_type* query, uint16_t* c2)
123 {
124 	/* note there is an IPv6 subnet, that maps
125 	 * to the same buckets as IPv4 space, but there is a flag in c2
126 	 * that makes the hash different */
127 #ifdef INET6
128 	if( ((struct sockaddr_in*)&query->addr)->sin_family == AF_INET) {
129 		*c2 = 0;
130 		return ((struct sockaddr_in*)&query->addr)->
131 			sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
132 	} else {
133 		uint64_t s;
134 		*c2 = rrl_ip6;
135 		memmove(&s, &((struct sockaddr_in6*)&query->addr)->sin6_addr,
136 			sizeof(s));
137 		return s & rrl_ipv6_mask;
138 	}
139 #else
140 	*c2 = 0;
141 	return query->addr.sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
142 #endif
143 }
144 
145 /** debug source to string */
146 static const char* rrlsource2str(uint64_t s, uint16_t c2)
147 {
148 	static char buf[64];
149 	struct in_addr a4;
150 #ifdef INET6
151 	if(c2) {
152 		/* IPv6 */
153 		struct in6_addr a6;
154 		memset(&a6, 0, sizeof(a6));
155 		memmove(&a6, &s, sizeof(s));
156 		if(!inet_ntop(AF_INET6, &a6, buf, sizeof(buf)))
157 			strlcpy(buf, "[ip6 ntop failed]", sizeof(buf));
158 		else {
159 			static char prefix[4];
160 			snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv6_prefixlen);
161 			strlcat(buf, &prefix[0], sizeof(buf));
162 		}
163 		return buf;
164 	}
165 #else
166 	(void)c2;
167 #endif
168 	/* ipv4 */
169 	a4.s_addr = (uint32_t)s;
170 	if(!inet_ntop(AF_INET, &a4, buf, sizeof(buf)))
171 		strlcpy(buf, "[ip4 ntop failed]", sizeof(buf));
172 	else {
173 		static char prefix[4];
174 		snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv4_prefixlen);
175 		strlcat(buf, &prefix[0], sizeof(buf));
176 	}
177 	return buf;
178 }
179 
180 enum rrl_type rrlstr2type(const char* s)
181 {
182 	if(strcmp(s, "nxdomain")==0) return rrl_type_nxdomain;
183 	else if(strcmp(s, "error")==0) return rrl_type_error;
184 	else if(strcmp(s, "referral")==0) return rrl_type_referral;
185 	else if(strcmp(s, "any")==0) return rrl_type_any;
186 	else if(strcmp(s, "wildcard")==0) return rrl_type_wildcard;
187 	else if(strcmp(s, "nodata")==0) return rrl_type_nodata;
188 	else if(strcmp(s, "dnskey")==0) return rrl_type_dnskey;
189 	else if(strcmp(s, "positive")==0) return rrl_type_positive;
190 	else if(strcmp(s, "rrsig")==0) return rrl_type_rrsig;
191 	else if(strcmp(s, "all")==0) return rrl_type_all;
192 	return 0; /* unknown */
193 }
194 
195 const char* rrltype2str(enum rrl_type c)
196 {
197 	switch(c & 0x0fff) {
198 		case rrl_type_nxdomain: return "nxdomain";
199 		case rrl_type_error: return "error";
200 		case rrl_type_referral: return "referral";
201 		case rrl_type_any: return "any";
202 		case rrl_type_wildcard: return "wildcard";
203 		case rrl_type_nodata: return "nodata";
204 		case rrl_type_dnskey: return "dnskey";
205 		case rrl_type_positive: return "positive";
206 		case rrl_type_rrsig: return "rrsig";
207 		case rrl_type_all: return "all";
208 	}
209 	return "unknown";
210 }
211 
212 /** classify the query in a number of different types, each has separate
213  * ratelimiting, so that positive queries are not impeded by others */
214 static uint16_t rrl_classify(query_type* query, const uint8_t** d,
215 	size_t* d_len)
216 {
217 	if(RCODE(query->packet) == RCODE_NXDOMAIN) {
218 		if(query->zone && query->zone->apex) {
219 			*d = dname_name(domain_dname(query->zone->apex));
220 			*d_len = domain_dname(query->zone->apex)->name_size;
221 		}
222 		return rrl_type_nxdomain;
223 	}
224 	if(RCODE(query->packet) != RCODE_OK) {
225 		if(query->zone && query->zone->apex) {
226 			*d = dname_name(domain_dname(query->zone->apex));
227 			*d_len = domain_dname(query->zone->apex)->name_size;
228 		}
229 		return rrl_type_error;
230 	}
231 	if(query->delegation_domain) {
232 		*d = dname_name(domain_dname(query->delegation_domain));
233 		*d_len = domain_dname(query->delegation_domain)->name_size;
234 		return rrl_type_referral;
235 	}
236 	if(query->qtype == TYPE_ANY) {
237 		if(query->qname) {
238 			*d = dname_name(query->qname);
239 			*d_len = query->qname->name_size;
240 		}
241 		return rrl_type_any;
242 	}
243 	if(query->qtype == TYPE_RRSIG) {
244 		if(query->qname) {
245 			*d = dname_name(query->qname);
246 			*d_len = query->qname->name_size;
247 		}
248 		return rrl_type_rrsig;
249 	}
250 	if(query->wildcard_domain) {
251 		*d = dname_name(domain_dname(query->wildcard_domain));
252 		*d_len = domain_dname(query->wildcard_domain)->name_size;
253 		return rrl_type_wildcard;
254 	}
255 	if(ANCOUNT(query->packet) == 0) {
256 		if(query->zone && query->zone->apex) {
257 			*d = dname_name(domain_dname(query->zone->apex));
258 			*d_len = domain_dname(query->zone->apex)->name_size;
259 		}
260 		return rrl_type_nodata;
261 	}
262 	if(query->qtype == TYPE_DNSKEY) {
263 		if(query->qname) {
264 			*d = dname_name(query->qname);
265 			*d_len = query->qname->name_size;
266 		}
267 		return rrl_type_dnskey;
268 	}
269 	/* positive */
270 	if(query->qname) {
271 		*d = dname_name(query->qname);
272 		*d_len = query->qname->name_size;
273 	}
274 	return rrl_type_positive;
275 }
276 
277 /** Examine the query and return hash and source of netblock. */
278 static void examine_query(query_type* query, uint32_t* hash, uint64_t* source,
279 	uint16_t* flags, uint32_t* lm)
280 {
281 	/* compile a binary string representing the query */
282 	uint16_t c, c2;
283 	/* size with 16 bytes to spare */
284 	uint8_t buf[MAXDOMAINLEN + sizeof(*source) + sizeof(c) + 16];
285 	const uint8_t* dname = NULL; size_t dname_len = 0;
286 	uint32_t r = 0x267fcd16;
287 
288 	*source = rrl_get_source(query, &c2);
289 	c = rrl_classify(query, &dname, &dname_len);
290 	if(query->zone && query->zone->opts &&
291 		(query->zone->opts->pattern->rrl_whitelist & c))
292 		*lm = rrl_whitelist_ratelimit;
293 	if(*lm == 0) return;
294 	c |= c2;
295 	*flags = c;
296 	memmove(buf, source, sizeof(*source));
297 	memmove(buf+sizeof(*source), &c, sizeof(c));
298 
299 	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "rrl_examine type %s name %s", rrltype2str(c), dname?wiredname2str(dname):"NULL"));
300 
301 	/* and hash it */
302 	if(dname && dname_len <= MAXDOMAINLEN) {
303 		memmove(buf+sizeof(*source)+sizeof(c), dname, dname_len);
304 		*hash = hashlittle(buf, sizeof(*source)+sizeof(c)+dname_len, r);
305 	} else
306 		*hash = hashlittle(buf, sizeof(*source)+sizeof(c), r);
307 }
308 
309 /* age the bucket because elapsed time steps have gone by */
310 static void rrl_attenuate_bucket(struct rrl_bucket* b, int32_t elapsed)
311 {
312 	if(elapsed > 16) {
313 		b->rate = 0;
314 	} else {
315 		/* divide rate /2 for every elapsed time step, because
316 		 * the counters in the inbetween steps were 0 */
317 		/* r(t) = 0 + 0/2 + 0/4 + .. + oldrate/2^dt */
318 		b->rate >>= elapsed;
319 		/* we know that elapsed >= 2 */
320 		b->rate += (b->counter>>(elapsed-1));
321 	}
322 }
323 
324 /** log a message about ratelimits */
325 static void
326 rrl_msg(query_type* query, const char* str)
327 {
328 	uint16_t c, c2, wl = 0;
329 	const uint8_t* d = NULL;
330 	size_t d_len;
331 	uint64_t s;
332 	char address[128];
333 	if(verbosity < 1) return;
334 	addr2str(&query->addr, address, sizeof(address));
335 	s = rrl_get_source(query, &c2);
336 	c = rrl_classify(query, &d, &d_len) | c2;
337 	if(query->zone && query->zone->opts &&
338 		(query->zone->opts->pattern->rrl_whitelist & c))
339 		wl = 1;
340 	log_msg(LOG_INFO, "ratelimit %s %s type %s%s target %s query %s %s",
341 		str, d?wiredname2str(d):"", rrltype2str(c),
342 		wl?"(whitelisted)":"", rrlsource2str(s, c2),
343 		address, rrtype_to_string(query->qtype));
344 }
345 
/** true if the query used to be blocked by the ratelimit: either the
 * smoothed rate, or the projected next rate (counter plus half the rate),
 * has reached the limit lm */
static int
used_to_block(uint32_t rate, uint32_t counter, uint32_t lm)
{
	uint32_t projected = counter + rate/2;

	if(rate >= lm)
		return 1;
	return projected >= lm;
}
352 
353 /** update the rate in a ratelimit bucket, return actual rate */
354 uint32_t rrl_update(query_type* query, uint32_t hash, uint64_t source,
355 	uint16_t flags, int32_t now, uint32_t lm)
356 {
357 	struct rrl_bucket* b = &rrl_array[hash % rrl_array_size];
358 
359 	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "source %llx hash %x oldrate %d oldcount %d stamp %d",
360 		(long long unsigned)source, hash, b->rate, b->counter, b->stamp));
361 
362 	/* check if different source */
363 	if(b->source != source || b->flags != flags || b->hash != hash) {
364 		/* initialise */
365 		/* potentially the wrong limit here, used lower nonwhitelim */
366 		if(verbosity >= 1 &&
367 			used_to_block(b->rate, b->counter, rrl_ratelimit)) {
368 			char address[128];
369 			addr2str(&query->addr, address, sizeof(address));
370 			log_msg(LOG_INFO, "ratelimit unblock ~ type %s target %s query %s %s (%s collision)",
371 				rrltype2str(b->flags),
372 				rrlsource2str(b->source, b->flags),
373 				address, rrtype_to_string(query->qtype),
374 				(b->hash!=hash?"bucket":"hash"));
375 		}
376 		b->hash = hash;
377 		b->source = source;
378 		b->flags = flags;
379 		b->counter = 1;
380 		b->rate = 0;
381 		b->stamp = now;
382 		return 1;
383 	}
384 	/* this is the same source */
385 
386 	/* check if old, zero or smooth it */
387 	/* circular arith for time */
388 	if(now - b->stamp == 1) {
389 		/* very busy bucket and time just stepped one step */
390 		int oldblock = used_to_block(b->rate, b->counter, lm);
391 		b->rate = b->rate/2 + b->counter;
392 		if(oldblock && b->rate < lm)
393 			rrl_msg(query, "unblock");
394 		b->counter = 1;
395 		b->stamp = now;
396 	} else if(now - b->stamp > 0) {
397 		/* older bucket */
398 		int olderblock = used_to_block(b->rate, b->counter, lm);
399 		rrl_attenuate_bucket(b, now - b->stamp);
400 		if(olderblock && b->rate < lm)
401 			rrl_msg(query, "unblock");
402 		b->counter = 1;
403 		b->stamp = now;
404 	} else if(now != b->stamp) {
405 		/* robust, timestamp from the future */
406 		if(used_to_block(b->rate, b->counter, lm))
407 			rrl_msg(query, "unblock");
408 		b->rate = 0;
409 		b->counter = 1;
410 		b->stamp = now;
411 	} else {
412 		/* bucket is from the current timestep, update counter */
413 		b->counter ++;
414 
415 		/* log what is blocked for operational debugging */
416 		if(b->counter + b->rate/2 == lm && b->rate < lm)
417 			rrl_msg(query, "block");
418 	}
419 
420 	/* return max from current rate and projected next-value for rate */
421 	/* so that if the rate increases suddenly very high, it is
422 	 * stopped halfway into the time step */
423 	if(b->counter > b->rate/2)
424 		return b->counter + b->rate/2;
425 	return b->rate;
426 }
427 
428 int rrl_process_query(query_type* query)
429 {
430 	uint64_t source;
431 	uint32_t hash;
432 	/* we can use circular arithmetic here, so int32 works after 2038 */
433 	int32_t now = (int32_t)time(NULL);
434 	uint32_t lm = rrl_ratelimit;
435 	uint16_t flags;
436 	if(rrl_ratelimit == 0 && rrl_whitelist_ratelimit == 0)
437 		return 0;
438 
439 	/* examine query */
440 	examine_query(query, &hash, &source, &flags, &lm);
441 
442 	if(lm == 0)
443 		return 0; /* no limit for this */
444 
445 	/* update rate */
446 	return (rrl_update(query, hash, source, flags, now, lm) >= lm);
447 }
448 
449 query_state_type rrl_slip(query_type* query)
450 {
451 	/* discard number the packets, randomly */
452 #ifdef HAVE_ARC4RANDOM_UNIFORM
453 	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random_uniform(rrl_slip_ratio)) == 0))) {
454 #elif HAVE_ARC4RANDOM
455 	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random() % rrl_slip_ratio) == 0))) {
456 #else
457 	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((random() % rrl_slip_ratio) == 0))) {
458 #endif
459 		/* set TC on the rest */
460 		TC_SET(query->packet);
461 		ANCOUNT_SET(query->packet, 0);
462 		NSCOUNT_SET(query->packet, 0);
463 		ARCOUNT_SET(query->packet, 0);
464 		if(query->qname)
465 			/* header, type, class, qname */
466 			buffer_set_position(query->packet,
467 				QHEADERSZ+4+query->qname->name_size);
468 		else 	buffer_set_position(query->packet, QHEADERSZ);
469 		return QUERY_PROCESSED;
470 	}
471 	return QUERY_DISCARDED;
472 }
473 
474 #endif /* RATELIMIT */
475