/* rrl.c - Response Rate Limiting for NSD.
 * By W.C.A. Wijngaards
 * Copyright 2012, NLnet Labs.
 * BSD, see LICENSE.
 */
#include "config.h"
#include <errno.h>
#include "rrl.h"
#include "util.h"
#include "lookup3.h"
#include "options.h"

#ifdef RATELIMIT

#ifdef HAVE_MMAP
#include <sys/mman.h>
#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif /* HAVE_MMAP */


/**
 * The rate limiting data structure bucket, this represents one rate of
 * packets from a single source.
 * Smoothed average rates.
 */
struct rrl_bucket {
	/* the source netmask */
	uint64_t source;
	/* rate, in queries per second, which due to rate=r(t)+r(t-1)/2 is
	 * equal to double the queries per second */
	uint32_t rate;
	/* the full hash */
	uint32_t hash;
	/* counter for queries arrived in this second */
	uint32_t counter;
	/* timestamp, which time is the time of the counter, the rate is from
	 * one timestep before that. */
	int32_t stamp;
	/* flags for the source mask and type */
	uint16_t flags;
};

/* the (global) array of RRL buckets */
static struct rrl_bucket* rrl_array = NULL;
static size_t rrl_array_size = RRL_BUCKETS;
static uint32_t rrl_ratelimit = RRL_LIMIT; /* 2x qps */
static uint8_t rrl_slip_ratio = RRL_SLIP;
static uint8_t rrl_ipv4_prefixlen = RRL_IPV4_PREFIX_LENGTH;
static uint8_t rrl_ipv6_prefixlen = RRL_IPV6_PREFIX_LENGTH;
static uint64_t rrl_ipv6_mask; /* max prefixlen 64 */
static uint32_t rrl_whitelist_ratelimit = RRL_WLIST_LIMIT; /* 2x qps */

/* the array of mmaps for the children (saved between reloads) */
static void** rrl_maps = NULL;
static size_t rrl_maps_num = 0;

void rrl_mmap_init(int numch, size_t numbuck, size_t lm, size_t wlm, size_t sm,
	size_t plf, size_t pls)
{
#ifdef HAVE_MMAP
	size_t i;
#endif
	if(numbuck != 0)
		rrl_array_size = numbuck;
	rrl_ratelimit = lm*2;
	rrl_slip_ratio = sm;
	rrl_ipv4_prefixlen = plf;
	rrl_ipv6_prefixlen = pls;
	if (pls <= 32) {
		rrl_ipv6_mask = ((uint64_t) htonl(0xffffffff << (32-pls))) << 32;
	} else {
		rrl_ipv6_mask = ((uint64_t) htonl(0xffffffff << (64-pls))) |
			(((uint64_t)0xffffffff)<<32);
	}
	rrl_whitelist_ratelimit = wlm*2;
#ifdef HAVE_MMAP
	/* allocate the ratelimit hashtable in a memory map so it is
	 * preserved across reforks (every child its own table) */
	rrl_maps_num = (size_t)numch;
	rrl_maps = (void**)xmallocarray(rrl_maps_num, sizeof(void*));
	for(i=0; i<rrl_maps_num; i++) {
		rrl_maps[i] = mmap(NULL,
			sizeof(struct rrl_bucket)*rrl_array_size,
			PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
		if(rrl_maps[i] == MAP_FAILED) {
			log_msg(LOG_ERR, "rrl: mmap failed: %s",
				strerror(errno));
			exit(1);
		}
		memset(rrl_maps[i], 0,
			sizeof(struct rrl_bucket)*rrl_array_size);
	}
#else
	(void)numch;
	rrl_maps_num = 0;
	rrl_maps = NULL;
#endif
}

void rrl_mmap_deinit(void)
{
#ifdef HAVE_MMAP
	size_t i;
	for(i=0; i<rrl_maps_num; i++) {
		munmap(rrl_maps[i], sizeof(struct rrl_bucket)*rrl_array_size);
		rrl_maps[i] = NULL;
	}
	free(rrl_maps);
	rrl_maps = NULL;
#endif
}

void rrl_mmap_deinit_keep_mmap(void)
{
#ifdef HAVE_MMAP
	free(rrl_maps);
	rrl_maps = NULL;
#endif
}

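/* Note on units (added remark, derived from the code in this file): the
 * configured limits are given in queries per second but stored internally
 * multiplied by two (see the lm*2 and wlm*2 assignments above and in
 * rrl_set_limit below).  The per-bucket smoothed rate is updated in
 * rrl_update as r(t) = c(t-1) + r(t-1)/2, so a steady stream of q qps
 * settles at r = q + q/2 + q/4 + ... = 2q.  Comparing that smoothed value
 * against 2*limit thus compares the real query rate against the
 * configured qps limit. */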
void rrl_set_limit(size_t lm, size_t wlm, size_t sm)
{
	rrl_ratelimit = lm*2;
	rrl_whitelist_ratelimit = wlm*2;
	rrl_slip_ratio = sm;
}

void rrl_init(size_t ch)
{
	if(!rrl_maps || ch >= rrl_maps_num)
		rrl_array = xalloc_array_zero(sizeof(struct rrl_bucket),
			rrl_array_size);
#ifdef HAVE_MMAP
	else rrl_array = (struct rrl_bucket*)rrl_maps[ch];
#endif
}

void rrl_deinit(size_t ch)
{
	if(!rrl_maps || ch >= rrl_maps_num)
		free(rrl_array);
	rrl_array = NULL;
}

/** return the source netblock of the query, this is the genuine source
 * for genuine queries and the target for reflected packets */
static uint64_t rrl_get_source(query_type* query, uint16_t* c2)
{
	/* note there is an IPv6 subnet, that maps
	 * to the same buckets as IPv4 space, but there is a flag in c2
	 * that makes the hash different */
#ifdef INET6
	if( ((struct sockaddr_in*)&query->addr)->sin_family == AF_INET) {
		*c2 = 0;
		return ((struct sockaddr_in*)&query->addr)->
			sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
	} else {
		uint64_t s;
		*c2 = rrl_ip6;
		memmove(&s, &((struct sockaddr_in6*)&query->addr)->sin6_addr,
			sizeof(s));
		return s & rrl_ipv6_mask;
	}
#else
	*c2 = 0;
	return query->addr.sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
#endif
}

/** debug source to string */
static const char* rrlsource2str(uint64_t s, uint16_t c2)
{
	static char buf[64];
	struct in_addr a4;
#ifdef INET6
	if(c2) {
		/* IPv6 */
		struct in6_addr a6;
		memset(&a6, 0, sizeof(a6));
		memmove(&a6, &s, sizeof(s));
		if(!inet_ntop(AF_INET6, &a6, buf, sizeof(buf)))
			strlcpy(buf, "[ip6 ntop failed]", sizeof(buf));
		else {
			static char prefix[5];
			snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv6_prefixlen);
			strlcat(buf, &prefix[0], sizeof(buf));
		}
		return buf;
	}
#else
	(void)c2;
#endif
	/* ipv4 */
	a4.s_addr = (uint32_t)s;
	if(!inet_ntop(AF_INET, &a4, buf, sizeof(buf)))
		strlcpy(buf, "[ip4 ntop failed]", sizeof(buf));
	else {
		static char prefix[5];
		snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv4_prefixlen);
		strlcat(buf, &prefix[0], sizeof(buf));
	}
	return buf;
}

enum rrl_type rrlstr2type(const char* s)
{
	if(strcmp(s, "nxdomain")==0) return rrl_type_nxdomain;
	else if(strcmp(s, "error")==0) return rrl_type_error;
	else if(strcmp(s, "referral")==0) return rrl_type_referral;
	else if(strcmp(s, "any")==0) return rrl_type_any;
	else if(strcmp(s, "wildcard")==0) return rrl_type_wildcard;
	else if(strcmp(s, "nodata")==0) return rrl_type_nodata;
	else if(strcmp(s, "dnskey")==0) return rrl_type_dnskey;
	else if(strcmp(s, "positive")==0) return rrl_type_positive;
	else if(strcmp(s, "rrsig")==0) return rrl_type_rrsig;
	else if(strcmp(s, "all")==0) return rrl_type_all;
	return 0; /* unknown */
}

const char* rrltype2str(enum rrl_type c)
{
	switch(c & 0x0fff) {
	case rrl_type_nxdomain: return "nxdomain";
	case rrl_type_error: return "error";
	case rrl_type_referral: return "referral";
	case rrl_type_any: return "any";
	case rrl_type_wildcard: return "wildcard";
	case rrl_type_nodata: return "nodata";
	case rrl_type_dnskey: return "dnskey";
	case rrl_type_positive: return "positive";
	case rrl_type_rrsig: return "rrsig";
	case rrl_type_all: return "all";
	}
	return "unknown";
}

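/* Added remark: the rrl_type_* values are single-bit flags in the low
 * 12 bits (hence the 'c & 0x0fff' mask in rrltype2str above), so a zone's
 * rrl_whitelist option can be a bitmask of types tested with a plain '&'.
 * The rrl_ip6 marker returned by rrl_get_source lives outside that range
 * (see rrl.h for the exact values) and is OR'ed into the same flags word,
 * which keeps IPv4 and IPv6 sources in distinct buckets even when their
 * masked addresses happen to coincide. */
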
/** classify the query in a number of different types, each has separate
 * ratelimiting, so that positive queries are not impeded by others */
static uint16_t rrl_classify(query_type* query, const uint8_t** d,
	size_t* d_len)
{
	if(RCODE(query->packet) == RCODE_NXDOMAIN) {
		if(query->zone && query->zone->apex) {
			*d = dname_name(domain_dname(query->zone->apex));
			*d_len = domain_dname(query->zone->apex)->name_size;
		}
		return rrl_type_nxdomain;
	}
	if(RCODE(query->packet) != RCODE_OK) {
		if(query->zone && query->zone->apex) {
			*d = dname_name(domain_dname(query->zone->apex));
			*d_len = domain_dname(query->zone->apex)->name_size;
		}
		return rrl_type_error;
	}
	if(query->delegation_domain) {
		*d = dname_name(domain_dname(query->delegation_domain));
		*d_len = domain_dname(query->delegation_domain)->name_size;
		return rrl_type_referral;
	}
	if(query->qtype == TYPE_ANY) {
		if(query->qname) {
			*d = dname_name(query->qname);
			*d_len = query->qname->name_size;
		}
		return rrl_type_any;
	}
	if(query->qtype == TYPE_RRSIG) {
		if(query->qname) {
			*d = dname_name(query->qname);
			*d_len = query->qname->name_size;
		}
		return rrl_type_rrsig;
	}
	if(query->wildcard_domain) {
		*d = dname_name(domain_dname(query->wildcard_domain));
		*d_len = domain_dname(query->wildcard_domain)->name_size;
		return rrl_type_wildcard;
	}
	if(ANCOUNT(query->packet) == 0) {
		if(query->zone && query->zone->apex) {
			*d = dname_name(domain_dname(query->zone->apex));
			*d_len = domain_dname(query->zone->apex)->name_size;
		}
		return rrl_type_nodata;
	}
	if(query->qtype == TYPE_DNSKEY) {
		if(query->qname) {
			*d = dname_name(query->qname);
			*d_len = query->qname->name_size;
		}
		return rrl_type_dnskey;
	}
	/* positive */
	if(query->qname) {
		*d = dname_name(query->qname);
		*d_len = query->qname->name_size;
	}
	return rrl_type_positive;
}

/** Examine the query and return hash and source of netblock. */
static void examine_query(query_type* query, uint32_t* hash, uint64_t* source,
	uint16_t* flags, uint32_t* lm)
{
	/* compile a binary string representing the query */
	uint16_t c, c2;
	/* size with 16 bytes to spare */
	uint8_t buf[MAXDOMAINLEN + sizeof(*source) + sizeof(c) + 16];
	const uint8_t* dname = NULL; size_t dname_len = 0;
	uint32_t r = 0x267fcd16;

	*source = rrl_get_source(query, &c2);
	c = rrl_classify(query, &dname, &dname_len);
	if(query->zone && query->zone->opts &&
		(query->zone->opts->pattern->rrl_whitelist & c))
		*lm = rrl_whitelist_ratelimit;
	if(*lm == 0) return;
	c |= c2;
	*flags = c;
	memmove(buf, source, sizeof(*source));
	memmove(buf+sizeof(*source), &c, sizeof(c));

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "rrl_examine type %s name %s", rrltype2str(c), dname?wiredname2str(dname):"NULL"));

	/* and hash it */
	if(dname && dname_len <= MAXDOMAINLEN) {
		memmove(buf+sizeof(*source)+sizeof(c), dname, dname_len);
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c)+dname_len, r);
	} else
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c), r);
}

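/* Worked example (added remark) for the attenuation below: a bucket last
 * updated 3 seconds ago with rate=64 and counter=8 becomes
 * (64>>3) + (8>>(3-1)) = 8 + 2 = 10, i.e. the old rate is halved for every
 * idle second and the last active second's counter is folded in with the
 * same decay.  Buckets idle for more than 16 steps are simply reset to 0. */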
/* age the bucket because elapsed time steps have gone by */
static void rrl_attenuate_bucket(struct rrl_bucket* b, int32_t elapsed)
{
	if(elapsed > 16) {
		b->rate = 0;
	} else {
		/* divide rate /2 for every elapsed time step, because
		 * the counters in the inbetween steps were 0 */
		/* r(t) = 0 + 0/2 + 0/4 + .. + oldrate/2^dt */
		b->rate >>= elapsed;
		/* we know that elapsed >= 2 */
		b->rate += (b->counter>>(elapsed-1));
	}
}

/** log a message about ratelimits */
static void
rrl_msg(query_type* query, const char* str)
{
	uint16_t c, c2, wl = 0;
	const uint8_t* d = NULL;
	size_t d_len;
	uint64_t s;
	char address[128];
	if(verbosity < 1) return;
	addr2str(&query->addr, address, sizeof(address));
	s = rrl_get_source(query, &c2);
	c = rrl_classify(query, &d, &d_len) | c2;
	if(query->zone && query->zone->opts &&
		(query->zone->opts->pattern->rrl_whitelist & c))
		wl = 1;
	log_msg(LOG_INFO, "ratelimit %s %s type %s%s target %s query %s %s",
		str, d?wiredname2str(d):"", rrltype2str(c),
		wl?"(whitelisted)":"", rrlsource2str(s, c2),
		address, rrtype_to_string(query->qtype));
}

/** true if the query used to be blocked by the ratelimit */
static int
used_to_block(uint32_t rate, uint32_t counter, uint32_t lm)
{
	return rate >= lm || counter+rate/2 >= lm;
}

/** update the rate in a ratelimit bucket, return actual rate */
uint32_t rrl_update(query_type* query, uint32_t hash, uint64_t source,
	uint16_t flags, int32_t now, uint32_t lm)
{
	struct rrl_bucket* b = &rrl_array[hash % rrl_array_size];

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "source %llx hash %x oldrate %d oldcount %d stamp %d",
		(long long unsigned)source, hash, b->rate, b->counter, b->stamp));

	/* check if different source */
	if(b->source != source || b->flags != flags || b->hash != hash) {
		/* initialise */
		/* potentially the wrong limit here, used lower nonwhitelim */
		if(verbosity >= 1 &&
			used_to_block(b->rate, b->counter, rrl_ratelimit)) {
			char address[128];
			addr2str(&query->addr, address, sizeof(address));
			log_msg(LOG_INFO, "ratelimit unblock ~ type %s target %s query %s %s (%s collision)",
				rrltype2str(b->flags),
				rrlsource2str(b->source, b->flags),
				address, rrtype_to_string(query->qtype),
				(b->hash!=hash?"bucket":"hash"));
		}
		b->hash = hash;
		b->source = source;
		b->flags = flags;
		b->counter = 1;
		b->rate = 0;
		b->stamp = now;
		return 1;
	}
	/* this is the same source */

	/* check if old, zero or smooth it */
	/* circular arith for time */
	if(now - b->stamp == 1) {
		/* very busy bucket and time just stepped one step */
		int oldblock = used_to_block(b->rate, b->counter, lm);
		b->rate = b->rate/2 + b->counter;
		if(oldblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now - b->stamp > 0) {
		/* older bucket */
		int olderblock = used_to_block(b->rate, b->counter, lm);
		rrl_attenuate_bucket(b, now - b->stamp);
		if(olderblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now != b->stamp) {
		/* robust, timestamp from the future */
		if(used_to_block(b->rate, b->counter, lm))
			rrl_msg(query, "unblock");
		b->rate = 0;
		b->counter = 1;
		b->stamp = now;
	} else {
		/* bucket is from the current timestep, update counter */
		b->counter ++;

		/* log what is blocked for operational debugging */
		if(b->counter + b->rate/2 == lm && b->rate < lm)
			rrl_msg(query, "block");
	}

	/* return max from current rate and projected next-value for rate */
	/* so that if the rate increases suddenly very high, it is
	 * stopped halfway into the time step */
	if(b->counter > b->rate/2)
		return b->counter + b->rate/2;
	return b->rate;
}

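/* Added remark: rrl_process_query returns nonzero when the (projected)
 * rate for this query's bucket reaches the applicable limit.  The expected
 * calling pattern is roughly the following (a sketch, not copied from the
 * NSD server code):
 *
 *   if(rrl_process_query(q))
 *       return rrl_slip(q);  // discard, or answer truncated so genuine
 *                            // clients can retry over TCP
 *
 * rrl_slip() below sends a TC=1 reply for about 1 out of rrl_slip_ratio
 * limited queries and discards the rest. */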
int rrl_process_query(query_type* query)
{
	uint64_t source;
	uint32_t hash;
	/* we can use circular arithmetic here, so int32 works after 2038 */
	int32_t now = (int32_t)time(NULL);
	uint32_t lm = rrl_ratelimit;
	uint16_t flags;
	if(rrl_ratelimit == 0 && rrl_whitelist_ratelimit == 0)
		return 0;

	/* examine query */
	examine_query(query, &hash, &source, &flags, &lm);

	if(lm == 0)
		return 0; /* no limit for this */

	/* update rate */
	return (rrl_update(query, hash, source, flags, now, lm) >= lm);
}

query_state_type rrl_slip(query_type* query)
{
	/* discard number the packets, randomly */
#ifdef HAVE_ARC4RANDOM_UNIFORM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random_uniform(rrl_slip_ratio)) == 0))) {
#elif HAVE_ARC4RANDOM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random() % rrl_slip_ratio) == 0))) {
#else
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((random() % rrl_slip_ratio) == 0))) {
#endif
		/* set TC on the rest */
		TC_SET(query->packet);
		ANCOUNT_SET(query->packet, 0);
		NSCOUNT_SET(query->packet, 0);
		ARCOUNT_SET(query->packet, 0);
		if(query->qname)
			/* header, type, class, qname */
			buffer_set_position(query->packet,
				QHEADERSZ+4+query->qname->name_size);
		else buffer_set_position(query->packet, QHEADERSZ);
		return QUERY_PROCESSED;
	}
	return QUERY_DISCARDED;
}

#endif /* RATELIMIT */