1 /* $NetBSD: bsearch.c,v 1.3 2023/06/19 21:41:42 christos Exp $ */
2
3 /*
4 * Copyright (c) 2011, Secure Endpoints Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * - Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * - Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
30 * OF THE POSSIBILITY OF SUCH DAMAGE.
31 *
32 */
33
34 #include "baselocl.h"
35
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #ifdef HAVE_IO_H
39 #include <io.h>
40 #endif
41 #ifdef HAVE_UNISTD_H
42 #include <unistd.h>
43 #endif
44 #include <fcntl.h>
45 #include <ctype.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #ifdef HAVE_STRINGS_H
50 #include <strings.h>
51 #endif
52 #include <errno.h>
53 #include <assert.h>
54
55 /*
56 * This file contains functions for binary searching flat text in memory
57 * and in text files where each line is a [variable length] record.
58 * Each record has a key and an optional value separated from the key by
59 * unquoted whitespace. Whitespace in the key, and leading whitespace
60 * for the value, can be quoted with backslashes (but CR and LF must be
61 * quoted in such a way that they don't appear in the quoted result).
62 *
63 * Binary searching a tree are normally a dead simple algorithm. It
64 * turns out that binary searching flat text with *variable* length
65 * records is... tricky. There's no indexes to record beginning bytes,
66 * thus any index selected during the search is likely to fall in the
67 * middle of a record. When deciding to search a left sub-tree one
68 * might fail to find the last record in that sub-tree on account of the
69 * right boundary falling in the middle of it -- the chosen solution to
70 * this makes left sub-tree searches slightly less efficient than right
71 * sub-tree searches.
72 *
73 * If binary searching flat text in memory is tricky, using block-wise
74 * I/O instead is trickier! But it's necessary in order to support
75 * large files (which we either can't or wouldn't want to read or map
76 * into memory). Each block we read has to be large enough that the
77 * largest record can fit in it. And each block might start and/or end
78 * in the middle of a record. Here it is the right sub-tree searches
79 * that are less efficient than left sub-tree searches.
80 *
81 * bsearch_common() contains the common text block binary search code.
82 *
83 * _bsearch_text() is the interface for searching in-core text.
84 * _bsearch_file() is the interface for block-wise searching files.
85 */
86
87 struct bsearch_file_handle {
88 int fd; /* file descriptor */
89 char *cache; /* cache bytes */
90 char *page; /* one double-size page worth of bytes */
91 size_t file_sz; /* file size */
92 size_t cache_sz; /* cache size */
93 size_t page_sz; /* page size */
94 };
95
96 /* Find a new-line */
97 static const char *
find_line(const char * buf,size_t i,size_t right)98 find_line(const char *buf, size_t i, size_t right)
99 {
100 if (i == 0)
101 return &buf[i];
102 for (; i < right; i++) {
103 if (buf[i] == '\n') {
104 if ((i + 1) < right)
105 return &buf[i + 1];
106 return NULL;
107 }
108 }
109 return NULL;
110 }
111
112 /*
113 * Common routine for binary searching text in core.
114 *
115 * Perform a binary search of a char array containing a block from a
116 * text file where each line is a record (LF and CRLF supported). Each
117 * record consists of a key followed by an optional value separated from
118 * the key by whitespace. Whitespace can be quoted with backslashes.
119 * It's the caller's responsibility to encode/decode keys/values if
120 * quoting is desired; newlines should be encoded such that a newline
121 * does not appear in the result.
122 *
123 * All output arguments are optional.
124 *
125 * Returns 0 if key is found, -1 if not found, or an error code such as
126 * ENOMEM in case of error.
127 *
128 * Inputs:
129 *
130 * @buf String to search
131 * @sz Size of string to search
132 * @key Key string to search for
133 * @buf_is_start True if the buffer starts with a record, false if it
134 * starts in the middle of a record or if the caller
135 * doesn't know.
136 *
137 * Outputs:
138 *
139 * @value Location to store a copy of the value (caller must free)
140 * @location Record location if found else the location where the
141 * record should be inserted (index into @buf)
142 * @cmp Set to less than or greater than 0 to indicate that a
143 * key not found would have fit in an earlier or later
144 * part of a file. Callers should use this to decide
145 * whether to read a block to the left or to the right and
146 * search that.
147 * @loops Location to store a count of bisections required for
148 * search (useful for confirming logarithmic performance)
149 */
150 static int
bsearch_common(const char * buf,size_t sz,const char * key,int buf_is_start,char ** value,size_t * location,int * cmp,size_t * loops)151 bsearch_common(const char *buf, size_t sz, const char *key,
152 int buf_is_start, char **value, size_t *location,
153 int *cmp, size_t *loops)
154 {
155 const char *linep;
156 size_t key_start, key_len; /* key string in buf */
157 size_t val_start, val_len; /* value string in buf */
158 int key_cmp = -1;
159 size_t k;
160 size_t l; /* left side of buffer for binary search */
161 size_t r; /* right side of buffer for binary search */
162 size_t rmax; /* right side of buffer for binary search */
163 size_t i; /* index into buffer, typically in the middle of l and r */
164 size_t loop_count = 0;
165 int ret = -1;
166
167 if (value)
168 *value = NULL;
169 if (cmp)
170 *cmp = 0;
171 if (loops)
172 *loops = 0;
173
174 /* Binary search; file should be sorted */
175 for (l = 0, r = rmax = sz, i = sz >> 1; i >= l && i < rmax; loop_count++) {
176 heim_assert(i < sz, "invalid aname2lname db index");
177
178 /* buf[i] is likely in the middle of a line; find the next line */
179 linep = find_line(buf, i, rmax);
180 k = linep ? linep - buf : i;
181 if (linep == NULL || k >= rmax) {
182 /*
183 * No new line found to the right; search to the left then
184 * but don't change rmax (this isn't optimal, but it's
185 * simple).
186 */
187 if (i == l)
188 break;
189 r = i;
190 i = l + ((r - l) >> 1);
191 continue;
192 }
193 i = k;
194 heim_assert(i >= l && i < rmax, "invalid aname2lname db index");
195
196 /* Got a line; check it */
197
198 /* Search for and split on unquoted whitespace */
199 val_start = 0;
200 for (key_start = i, key_len = 0, val_len = 0, k = i; k < rmax; k++) {
201 if (buf[k] == '\\') {
202 k++;
203 continue;
204 }
205 if (buf[k] == '\r' || buf[k] == '\n') {
206 /* We now know where the key ends, and there's no value */
207 key_len = k - i;
208 break;
209 }
210 if (!isspace((unsigned char)buf[k]))
211 continue;
212
213 while (k < rmax && isspace((unsigned char)buf[k])) {
214 key_len = k - i;
215 k++;
216 }
217 if (k < rmax)
218 val_start = k;
219 /* Find end of value */
220 for (; k < rmax && buf[k] != '\0'; k++) {
221 if (buf[k] == '\r' || buf[k] == '\n') {
222 val_len = k - val_start;
223 break;
224 }
225 }
226 break;
227 }
228
229 /*
230 * The following logic is for dealing with partial buffers,
231 * which we use for block-wise binary searches of large files
232 */
233 if (key_start == 0 && !buf_is_start) {
234 /*
235 * We're at the beginning of a block that might have started
236 * in the middle of a record whose "key" might well compare
237 * as greater than the key we're looking for, so we don't
238 * bother comparing -- we know key_cmp must be -1 here.
239 */
240 key_cmp = -1;
241 break;
242 }
243 if ((val_len && buf[val_start + val_len] != '\n') ||
244 (!val_len && buf[key_start + key_len] != '\n')) {
245 /*
246 * We're at the end of a block that ends in the middle of a
247 * record whose "key" might well compare as less than the
248 * key we're looking for, so we don't bother comparing -- we
249 * know key_cmp must be >= 0 but we can't tell. Our caller
250 * will end up reading a double-size block to handle this.
251 */
252 key_cmp = 1;
253 break;
254 }
255
256 key_cmp = strncmp(key, &buf[key_start], key_len);
257 if (key_cmp == 0 && strlen(key) != key_len)
258 key_cmp = 1;
259 if (key_cmp < 0) {
260 /* search left */
261 r = rmax = (linep - buf);
262 i = l + ((r - l) >> 1);
263 if (location)
264 *location = key_start;
265 } else if (key_cmp > 0) {
266 /* search right */
267 if (l == i)
268 break; /* not found */
269 l = i;
270 i = l + ((r - l) >> 1);
271 if (location)
272 *location = val_start + val_len;
273 } else {
274 /* match! */
275 if (location)
276 *location = key_start;
277 ret = 0;
278 if (val_len && value) {
279 /* Avoid strndup() so we don't need libroken here yet */
280 if ((*value = malloc(val_len + 1))) {
281 (void) memcpy(*value, &buf[val_start], val_len);
282 (*value)[val_len] = '\0';
283 } else {
284 ret = errno;
285 }
286 }
287 break;
288 }
289 }
290
291 if (cmp)
292 *cmp = key_cmp;
293 if (loops)
294 *loops = loop_count;
295
296 return ret;
297 }
298
299 /*
300 * Binary search a char array containing sorted text records separated
301 * by new-lines (or CRLF). Each record consists of a key and an
302 * optional value following the key, separated from the key by unquoted
303 * whitespace.
304 *
305 * All output arguments are optional.
306 *
307 * Returns 0 if key is found, -1 if not found, or an error code such as
308 * ENOMEM in case of error.
309 *
310 * Inputs:
311 *
312 * @buf Char array pointer
313 * @buf_sz Size of buf
314 * @key Key to search for
315 *
316 * Outputs:
317 *
318 * @value Location where to put the value, if any (caller must free)
319 * @location Record location if found else the location where the record
320 * should be inserted (index into @buf)
321 * @loops Location where to put a number of loops (or comparisons)
322 * needed for the search (useful for benchmarking)
323 */
324 int
_bsearch_text(const char * buf,size_t buf_sz,const char * key,char ** value,size_t * location,size_t * loops)325 _bsearch_text(const char *buf, size_t buf_sz, const char *key,
326 char **value, size_t *location, size_t *loops)
327 {
328 return bsearch_common(buf, buf_sz, key, 1, value, location, NULL, loops);
329 }
330
331 #define MAX_BLOCK_SIZE (1024 * 1024)
332 #define DEFAULT_MAX_FILE_SIZE (1024 * 1024)
333 /*
334 * Open a file for binary searching. The file will be read in entirely
335 * if it is smaller than @max_sz, else a cache of @max_sz bytes will be
336 * allocated.
337 *
338 * Returns 0 on success, else an error number or -1 if the file is empty.
339 *
340 * Inputs:
341 *
342 * @fname Name of file to open
343 * @max_sz Maximum size of cache to allocate, in bytes (if zero, default)
344 * @page_sz Page size (must be a power of two, larger than 256, smaller
345 * than 1MB; if zero use default)
346 *
347 * Outputs:
348 *
349 * @bfh Handle for use with _bsearch_file() and _bsearch_file_close()
350 * @reads Number of reads performed
351 */
352 int
_bsearch_file_open(const char * fname,size_t max_sz,size_t page_sz,bsearch_file_handle * bfh,size_t * reads)353 _bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz,
354 bsearch_file_handle *bfh, size_t *reads)
355 {
356 bsearch_file_handle new_bfh = NULL;
357 struct stat st;
358 size_t i;
359 int fd;
360 int ret;
361
362 *bfh = NULL;
363
364 if (reads)
365 *reads = 0;
366
367 fd = open(fname, O_RDONLY);
368 if (fd == -1)
369 return errno;
370
371 if (fstat(fd, &st) == -1) {
372 ret = errno;
373 goto err;
374 }
375
376 if (st.st_size == 0) {
377 ret = -1; /* no data -> no binary search */
378 goto err;
379 }
380
381 /* Validate / default arguments */
382 if (max_sz == 0)
383 max_sz = DEFAULT_MAX_FILE_SIZE;
384 for (i = page_sz; i; i >>= 1) {
385 /* Make sure page_sz is a power of two */
386 if ((i % 2) && (i >> 1)) {
387 page_sz = 0;
388 break;
389 }
390 }
391 if (page_sz == 0)
392 #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
393 page_sz = st.st_blksize;
394 #else
395 page_sz = 4096;
396 #endif
397 for (i = page_sz; i; i >>= 1) {
398 /* Make sure page_sz is a power of two */
399 if ((i % 2) && (i >> 1)) {
400 /* Can't happen! Filesystems always use powers of two! */
401 page_sz = 4096;
402 break;
403 }
404 }
405 if (page_sz > MAX_BLOCK_SIZE)
406 page_sz = MAX_BLOCK_SIZE;
407
408 new_bfh = calloc(1, sizeof (*new_bfh));
409 if (new_bfh == NULL) {
410 ret = ENOMEM;
411 goto err;
412 }
413
414 new_bfh->fd = fd;
415 new_bfh->page_sz = page_sz;
416 new_bfh->file_sz = st.st_size;
417
418 if (max_sz >= st.st_size) {
419 /* Whole-file method */
420 new_bfh->cache = malloc(st.st_size + 1);
421 if (new_bfh->cache) {
422 new_bfh->cache[st.st_size] = '\0';
423 new_bfh->cache_sz = st.st_size;
424 ret = read(fd, new_bfh->cache, st.st_size);
425 if (ret < 0) {
426 ret = errno;
427 goto err;
428 }
429 if (ret != st.st_size) {
430 ret = EIO; /* XXX ??? */
431 goto err;
432 }
433 if (reads)
434 *reads = 1;
435 (void) close(fd);
436 new_bfh->fd = -1;
437 *bfh = new_bfh;
438 return 0;
439 }
440 }
441
442 /* Block-size method, or above malloc() failed */
443 new_bfh->page = malloc(new_bfh->page_sz << 1);
444 if (new_bfh->page == NULL) {
445 /* Can't even allocate a single double-size page! */
446 ret = ENOMEM;
447 goto err;
448 }
449
450 new_bfh->cache_sz = max_sz < st.st_size ? max_sz : st.st_size;
451 new_bfh->cache = malloc(new_bfh->cache_sz);
452 *bfh = new_bfh;
453
454 /*
455 * malloc() may have failed because we were asking for a lot of
456 * memory, but we may still be able to operate without a cache,
457 * so let's not fail.
458 */
459 if (new_bfh->cache == NULL) {
460 new_bfh->cache_sz = 0;
461 return 0;
462 }
463
464 /* Initialize cache */
465 for (i = 0; i < new_bfh->cache_sz; i += new_bfh->page_sz)
466 new_bfh->cache[i] = '\0';
467 return 0;
468
469 err:
470 (void) close(fd);
471 if (new_bfh) {
472 free(new_bfh->page);
473 free(new_bfh->cache);
474 free(new_bfh);
475 }
476 return ret;
477 }
478
479 /*
480 * Indicate whether the given binary search file handle will be searched
481 * with block-wise method.
482 */
483 void
_bsearch_file_info(bsearch_file_handle bfh,size_t * page_sz,size_t * max_sz,int * blockwise)484 _bsearch_file_info(bsearch_file_handle bfh,
485 size_t *page_sz, size_t *max_sz, int *blockwise)
486 {
487 if (page_sz)
488 *page_sz = bfh->page_sz;
489 if (max_sz)
490 *max_sz = bfh->cache_sz;
491 if (blockwise)
492 *blockwise = (bfh->file_sz != bfh->cache_sz);
493 }
494
495 /*
496 * Close the given binary file search handle.
497 *
498 * Inputs:
499 *
500 * @bfh Pointer to variable containing handle to close.
501 */
502 void
_bsearch_file_close(bsearch_file_handle * bfh)503 _bsearch_file_close(bsearch_file_handle *bfh)
504 {
505 if (!*bfh)
506 return;
507 if ((*bfh)->fd >= 0)
508 (void) close((*bfh)->fd);
509 if ((*bfh)->page)
510 free((*bfh)->page);
511 if ((*bfh)->cache)
512 free((*bfh)->cache);
513 free(*bfh);
514 *bfh = NULL;
515 }
516
517 /*
518 * Private function to get a page from a cache. The cache is a char
519 * array of 2^n - 1 double-size page worth of bytes, where n is the
520 * number of tree levels that the cache stores. The cache can be
521 * smaller than n implies.
522 *
523 * The page may or may not be valid. If the first byte of it is NUL
524 * then it's not valid, else it is.
525 *
526 * Returns 1 if page is in cache and valid, 0 if the cache is too small
527 * or the page is invalid. The page address is output in @buf if the
528 * cache is large enough to contain it regardless of whether the page is
529 * valid.
530 *
531 * Inputs:
532 *
533 * @bfh Binary search file handle
534 * @level Level in the tree that we want a page for
535 * @page_idx Page number in the given level (0..2^level - 1)
536 *
537 * Outputs:
538 *
539 * @buf Set to address of page if the cache is large enough
540 */
541 static int
get_page_from_cache(bsearch_file_handle bfh,size_t level,size_t page_idx,char ** buf)542 get_page_from_cache(bsearch_file_handle bfh, size_t level, size_t page_idx,
543 char **buf)
544 {
545 size_t idx = 0;
546 size_t page_sz;
547
548 page_sz = bfh->page_sz << 1; /* we use double-size pages in the cache */
549
550 *buf = NULL;
551
552 /*
553 * Compute index into cache. The cache is basically an array of
554 * double-size pages. The first (zeroth) double-size page in the
555 * cache will be the middle page of the file -- the root of the
556 * tree. The next two double-size pages will be the left and right
557 * pages of the second level in the tree. The next four double-size
558 * pages will be the four pages at the next level. And so on for as
559 * many pages as fit in the cache.
560 *
561 * The page index is the number of the page at the given level. We
562 * then compute (2^level - 1 + page index) * 2page size, check that
563 * we have that in the cache, check that the page has been read (it
564 * doesn't start with NUL).
565 */
566 if (level)
567 idx = (1 << level) - 1 + page_idx;
568 if (((idx + 1) * page_sz * 2) > bfh->cache_sz)
569 return 0;
570
571 *buf = &bfh->cache[idx * page_sz * 2];
572 if (bfh->cache[idx * page_sz * 2] == '\0')
573 return 0; /* cache[idx] == NUL -> page not loaded in cache */
574 return 1;
575 }
576
577 /*
578 * Private function to read a page of @page_sz from @fd at offset @off
579 * into @buf, outputing the number of bytes read, which will be the same
580 * as @page_sz unless the page being read is the last page, in which
581 * case the number of remaining bytes in the file will be output.
582 *
583 * Returns 0 on success or an errno value otherwise (EIO if reads are
584 * short).
585 *
586 * Inputs:
587 *
588 * @bfh Binary search file handle
589 * @level Level in the binary search tree that we're at
590 * @page_idx Page "index" at the @level of the tree that we want
591 * @page Actual page number that we want
592 * want_double Whether we need a page or double page read
593 *
594 * Outputs:
595 *
596 * @buf Page read or cached
597 * @bytes Bytes read (may be less than page or double page size in
598 * the case of the last page, of course)
599 */
600 static int
read_page(bsearch_file_handle bfh,size_t level,size_t page_idx,size_t page,int want_double,const char ** buf,size_t * bytes)601 read_page(bsearch_file_handle bfh, size_t level, size_t page_idx, size_t page,
602 int want_double, const char **buf, size_t *bytes)
603 {
604 int ret;
605 off_t off;
606 size_t expected;
607 size_t wanted;
608 char *page_buf;
609
610 /* Figure out where we're reading and how much */
611 off = page * bfh->page_sz;
612 if (off < 0)
613 return EOVERFLOW;
614
615 wanted = bfh->page_sz << want_double;
616 expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
617
618 if (get_page_from_cache(bfh, level, page_idx, &page_buf)) {
619 *buf = page_buf;
620 *bytes = expected;
621 return 0; /* found in cache */
622 }
623
624
625 *bytes = 0;
626 *buf = NULL;
627
628 /* OK, we have to read a page or double-size page */
629
630 if (page_buf)
631 want_double = 1; /* we'll be caching; we cache double-size pages */
632 else
633 page_buf = bfh->page; /* we won't cache this page */
634
635 wanted = bfh->page_sz << want_double;
636 expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
637
638 #ifdef HAVE_PREAD
639 ret = pread(bfh->fd, page_buf, expected, off);
640 #else
641 if (lseek(bfh->fd, off, SEEK_SET) == (off_t)-1)
642 return errno;
643 ret = read(bfh->fd, page_buf, expected);
644 #endif
645 if (ret < 0)
646 return errno;
647
648 if (ret != expected)
649 return EIO; /* XXX ??? */
650
651 *buf = page_buf;
652 *bytes = expected;
653 return 0;
654 }
655
656 /*
657 * Perform a binary search of a file where each line is a record (LF and
658 * CRLF supported). Each record consists of a key followed by an
659 * optional value separated from the key by whitespace. Whitespace can
660 * be quoted with backslashes. It's the caller's responsibility to
661 * encode/decode keys/values if quoting is desired; newlines should be
662 * encoded such that a newline does not appear in the result.
663 *
664 * The search is done with block-wise I/O (i.e., the whole file is not
665 * read into memory).
666 *
667 * All output arguments are optional.
668 *
669 * Returns 0 if key is found, -1 if not found, or an error code such as
670 * ENOMEM in case of error.
671 *
672 * NOTE: We could improve this by not freeing the buffer, instead
673 * requiring that the caller provide it. Further, we could cache
674 * the top N levels of [double-size] pages (2^N - 1 pages), which
675 * should speed up most searches by reducing the number of reads
676 * by N.
677 *
678 * Inputs:
679 *
680 * @fd File descriptor (file to search)
681 * @page_sz Page size (if zero then the file's st_blksize will be used)
682 * @key Key string to search for
683 *
684 * Outputs:
685 *
686 * @value Location to store a copy of the value (caller must free)
687 * @location Record location if found else the location where the
688 * record should be inserted (index into @buf)
689 * @loops Location to store a count of bisections required for
690 * search (useful for confirming logarithmic performance)
691 * @reads Location to store a count of pages read during search
692 * (useful for confirming logarithmic performance)
693 */
694 int
_bsearch_file(bsearch_file_handle bfh,const char * key,char ** value,size_t * location,size_t * loops,size_t * reads)695 _bsearch_file(bsearch_file_handle bfh, const char *key,
696 char **value, size_t *location, size_t *loops, size_t *reads)
697 {
698 int ret;
699 const char *buf;
700 size_t buf_sz;
701 size_t page, l, r;
702 size_t my_reads = 0;
703 size_t my_loops_total = 0;
704 size_t my_loops;
705 size_t level; /* level in the tree */
706 size_t page_idx = 0; /* page number in the tree level */
707 size_t buf_location;
708 int cmp;
709 int buf_ends_in_eol = 0;
710 int buf_is_start = 0;
711
712 if (reads)
713 *reads = 0;
714 if (value)
715 *value = NULL;
716 if (loops)
717 *loops = 0;
718
719 /* If whole file is in memory then search that and we're done */
720 if (bfh->file_sz == bfh->cache_sz)
721 return _bsearch_text(bfh->cache, bfh->cache_sz, key, value, location, loops);
722
723 /* Else block-wise binary search */
724
725 l = 0;
726 r = (bfh->file_sz / bfh->page_sz) + 1;
727 for (level = 0, page = r >> 1; page >= l && page < r ; level++) {
728 ret = read_page(bfh, level, page_idx, page, 0, &buf, &buf_sz);
729 if (ret != 0)
730 return ret;
731 my_reads++;
732 if (buf[buf_sz - 1] == '\r' || buf[buf_sz - 1] == '\n')
733 buf_ends_in_eol = 1;
734 else
735 buf_ends_in_eol = 0;
736
737 buf_is_start = page == 0 ? 1 : 0;
738 ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start,
739 value, &buf_location, &cmp, &my_loops);
740 if (ret > 0)
741 return ret;
742 /* Found or no we update stats */
743 my_loops_total += my_loops;
744 if (loops)
745 *loops = my_loops_total;
746 if (reads)
747 *reads = my_reads;
748 if (location)
749 *location = page * bfh->page_sz + buf_location;
750 if (ret == 0)
751 return 0; /* found! */
752 /* Not found */
753 if (cmp < 0) {
754 /* Search left */
755 page_idx <<= 1;
756 r = page;
757 page = l + ((r - l) >> 1);
758 continue;
759 } else {
760 /*
761 * Search right, but first search the current and next
762 * blocks in case that the record we're looking for either
763 * straddles the boundary between this and the next record,
764 * or in case the record starts exactly at the next page.
765 */
766 heim_assert(cmp > 0, "cmp > 0");
767
768 if (!buf_ends_in_eol || page == l || page == (r - 1)) {
769 ret = read_page(bfh, level, page_idx, page, 1, &buf, &buf_sz);
770 if (ret != 0)
771 return ret;
772 my_reads++;
773
774 buf_is_start = page == l ? 1 : 0;
775
776 ret = bsearch_common(buf, (size_t)buf_sz, key, buf_is_start,
777 value, &buf_location, &cmp, &my_loops);
778 if (ret > 0)
779 return ret;
780 my_loops_total += my_loops;
781 if (loops)
782 *loops = my_loops_total;
783 if (reads)
784 *reads = my_reads;
785 if (location)
786 *location = page * bfh->page_sz + buf_location;
787 if (ret == 0)
788 return 0;
789 }
790
791 /* Oh well, search right */
792 if (l == page && r == (l + 1))
793 break;
794 page_idx = (page_idx << 1) + 1;
795 l = page;
796 page = l + ((r - l) >> 1);
797 continue;
798 }
799 }
800 return -1;
801 }
802
803
804 static int
stdb_open(void * plug,const char * dbtype,const char * dbname,heim_dict_t options,void ** db,heim_error_t * error)805 stdb_open(void *plug, const char *dbtype, const char *dbname,
806 heim_dict_t options, void **db, heim_error_t *error)
807 {
808 bsearch_file_handle bfh;
809 char *p;
810 int ret;
811
812 if (error)
813 *error = NULL;
814 if (dbname == NULL || *dbname == '\0') {
815 if (error)
816 *error = heim_error_create(EINVAL,
817 N_("DB name required for sorted-text DB "
818 "plugin", ""));
819 return EINVAL;
820 }
821 p = strrchr(dbname, '.');
822 if (p == NULL || strcmp(p, ".txt") != 0) {
823 if (error)
824 *error = heim_error_create(ENOTSUP,
825 N_("Text file (name ending in .txt) "
826 "required for sorted-text DB plugin",
827 ""));
828 return ENOTSUP;
829 }
830
831 ret = _bsearch_file_open(dbname, 0, 0, &bfh, NULL);
832 if (ret)
833 return ret;
834
835 *db = bfh;
836 return 0;
837 }
838
839 static int
stdb_close(void * db,heim_error_t * error)840 stdb_close(void *db, heim_error_t *error)
841 {
842 bsearch_file_handle bfh = db;
843
844 if (error)
845 *error = NULL;
846 _bsearch_file_close(&bfh);
847 return 0;
848 }
849
850 static heim_data_t
stdb_copy_value(void * db,heim_string_t table,heim_data_t key,heim_error_t * error)851 stdb_copy_value(void *db, heim_string_t table, heim_data_t key,
852 heim_error_t *error)
853 {
854 bsearch_file_handle bfh = db;
855 const char *k;
856 char *v = NULL;
857 heim_data_t value;
858 int ret;
859
860 if (error)
861 *error = NULL;
862
863 if (table == NULL)
864 table = HSTR("");
865
866 if (table != HSTR(""))
867 return NULL;
868
869 if (heim_get_tid(key) == HEIM_TID_STRING)
870 k = heim_string_get_utf8((heim_string_t)key);
871 else
872 k = (const char *)heim_data_get_ptr(key);
873 ret = _bsearch_file(bfh, k, &v, NULL, NULL, NULL);
874 if (ret == 0 && v == NULL)
875 ret = -1; /* Quiet lint */
876 if (ret != 0) {
877 if (ret > 0 && error)
878 *error = heim_error_create(ret, "%s", strerror(ret));
879 return NULL;
880 }
881 value = heim_data_create(v, strlen(v));
882 free(v);
883 /* XXX Handle ENOMEM */
884 return value;
885 }
886
887 struct heim_db_type heim_sorted_text_file_dbtype = {
888 1, stdb_open, NULL, stdb_close, NULL, NULL, NULL, NULL, NULL, NULL,
889 stdb_copy_value, NULL, NULL, NULL
890 };
891