1*f14fb602SLionel Sambuc /* $NetBSD: wcscspn_bloom.h,v 1.4 2011/11/25 17:48:22 joerg Exp $ */
2*f14fb602SLionel Sambuc
3*f14fb602SLionel Sambuc /*-
4*f14fb602SLionel Sambuc * Copyright (c) 2011 Joerg Sonnenberger,
5*f14fb602SLionel Sambuc * All rights reserved.
6*f14fb602SLionel Sambuc *
7*f14fb602SLionel Sambuc * Redistribution and use in source and binary forms, with or without
8*f14fb602SLionel Sambuc * modification, are permitted provided that the following conditions
9*f14fb602SLionel Sambuc * are met:
10*f14fb602SLionel Sambuc * 1. Redistributions of source code must retain the above copyright
11*f14fb602SLionel Sambuc * notice, this list of conditions and the following disclaimer.
12*f14fb602SLionel Sambuc * 2. Redistributions in binary form must reproduce the above copyright
13*f14fb602SLionel Sambuc * notice, this list of conditions and the following disclaimer in the
14*f14fb602SLionel Sambuc * documentation and/or other materials provided with the distribution.
15*f14fb602SLionel Sambuc *
16*f14fb602SLionel Sambuc * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17*f14fb602SLionel Sambuc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18*f14fb602SLionel Sambuc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19*f14fb602SLionel Sambuc * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20*f14fb602SLionel Sambuc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21*f14fb602SLionel Sambuc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22*f14fb602SLionel Sambuc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23*f14fb602SLionel Sambuc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24*f14fb602SLionel Sambuc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25*f14fb602SLionel Sambuc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26*f14fb602SLionel Sambuc * SUCH DAMAGE.
27*f14fb602SLionel Sambuc */
28*f14fb602SLionel Sambuc
29*f14fb602SLionel Sambuc /*
30*f14fb602SLionel Sambuc * Bloom filter for fast test if a given character is part of the reject set.
31*f14fb602SLionel Sambuc * The test may have false positives, but doesn't have false negatives.
32*f14fb602SLionel Sambuc * The first hash function is designed to be very fast to evaluate.
33*f14fb602SLionel Sambuc * It is collision free if the input is part of the same European language
34*f14fb602SLionel Sambuc * and shouldn't be too bad even other input. The second hash function
35*f14fb602SLionel Sambuc * tries to provide a much better mixing, but involves the slower
36*f14fb602SLionel Sambuc * multiplication.
37*f14fb602SLionel Sambuc */
38*f14fb602SLionel Sambuc
39*f14fb602SLionel Sambuc #include <limits.h>
40*f14fb602SLionel Sambuc
41*f14fb602SLionel Sambuc #define BLOOM_SIZE 64
42*f14fb602SLionel Sambuc #define BLOOM_ARRAY_SIZE (BLOOM_SIZE / sizeof(size_t))
43*f14fb602SLionel Sambuc #define BLOOM_BITS (BLOOM_SIZE * CHAR_BIT)
44*f14fb602SLionel Sambuc #define BLOOM_DIV (sizeof(size_t) * CHAR_BIT)
45*f14fb602SLionel Sambuc
46*f14fb602SLionel Sambuc static inline size_t
wcscspn_bloom1(size_t x)47*f14fb602SLionel Sambuc wcscspn_bloom1(size_t x)
48*f14fb602SLionel Sambuc {
49*f14fb602SLionel Sambuc return x % BLOOM_BITS;
50*f14fb602SLionel Sambuc }
51*f14fb602SLionel Sambuc
52*f14fb602SLionel Sambuc static inline size_t
wcscspn_bloom2(size_t x)53*f14fb602SLionel Sambuc wcscspn_bloom2(size_t x)
54*f14fb602SLionel Sambuc {
55*f14fb602SLionel Sambuc return (size_t)((uint32_t)(x * 2654435761U) /
56*f14fb602SLionel Sambuc (0x100000000ULL / BLOOM_BITS));
57*f14fb602SLionel Sambuc }
58*f14fb602SLionel Sambuc
59*f14fb602SLionel Sambuc static inline void
wcsspn_bloom_init(size_t * bloom,const wchar_t * charset)60*f14fb602SLionel Sambuc wcsspn_bloom_init(size_t *bloom, const wchar_t *charset)
61*f14fb602SLionel Sambuc {
62*f14fb602SLionel Sambuc size_t val;
63*f14fb602SLionel Sambuc
64*f14fb602SLionel Sambuc memset(bloom, 0, BLOOM_SIZE);
65*f14fb602SLionel Sambuc do {
66*f14fb602SLionel Sambuc val = wcscspn_bloom1((size_t)*charset);
67*f14fb602SLionel Sambuc bloom[val / BLOOM_DIV] |= (size_t)1 << (val % BLOOM_DIV);
68*f14fb602SLionel Sambuc val = wcscspn_bloom2((size_t)*charset);
69*f14fb602SLionel Sambuc bloom[val / BLOOM_DIV] |= (size_t)1 << (val % BLOOM_DIV);
70*f14fb602SLionel Sambuc }
71*f14fb602SLionel Sambuc while (*++charset);
72*f14fb602SLionel Sambuc }
73*f14fb602SLionel Sambuc
74*f14fb602SLionel Sambuc static inline int
wcsspn_in_bloom(const size_t * bloom,wchar_t ch)75*f14fb602SLionel Sambuc wcsspn_in_bloom(const size_t *bloom, wchar_t ch)
76*f14fb602SLionel Sambuc {
77*f14fb602SLionel Sambuc size_t val;
78*f14fb602SLionel Sambuc
79*f14fb602SLionel Sambuc val = wcscspn_bloom1((size_t)ch);
80*f14fb602SLionel Sambuc if (bloom[val / BLOOM_DIV] & ((size_t)1 << (val % BLOOM_DIV)))
81*f14fb602SLionel Sambuc return 1;
82*f14fb602SLionel Sambuc val = wcscspn_bloom2((size_t)ch);
83*f14fb602SLionel Sambuc if (bloom[val / BLOOM_DIV] & ((size_t)1 << (val % BLOOM_DIV)))
84*f14fb602SLionel Sambuc return 1;
85*f14fb602SLionel Sambuc return 0;
86*f14fb602SLionel Sambuc }
87