xref: /dflybsd-src/contrib/gcc-8.0/libstdc++-v3/config/cpu/i486/opt/bits/opt_random.h (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj // Optimizations for random number functions, x86 version -*- C++ -*-
2*38fd1498Szrj 
3*38fd1498Szrj // Copyright (C) 2012-2018 Free Software Foundation, Inc.
4*38fd1498Szrj //
5*38fd1498Szrj // This file is part of the GNU ISO C++ Library.  This library is free
6*38fd1498Szrj // software; you can redistribute it and/or modify it under the
7*38fd1498Szrj // terms of the GNU General Public License as published by the
8*38fd1498Szrj // Free Software Foundation; either version 3, or (at your option)
9*38fd1498Szrj // any later version.
10*38fd1498Szrj 
11*38fd1498Szrj // This library is distributed in the hope that it will be useful,
12*38fd1498Szrj // but WITHOUT ANY WARRANTY; without even the implied warranty of
13*38fd1498Szrj // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14*38fd1498Szrj // GNU General Public License for more details.
15*38fd1498Szrj 
16*38fd1498Szrj // Under Section 7 of GPL version 3, you are granted additional
17*38fd1498Szrj // permissions described in the GCC Runtime Library Exception, version
18*38fd1498Szrj // 3.1, as published by the Free Software Foundation.
19*38fd1498Szrj 
20*38fd1498Szrj // You should have received a copy of the GNU General Public License and
21*38fd1498Szrj // a copy of the GCC Runtime Library Exception along with this program;
22*38fd1498Szrj // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23*38fd1498Szrj // <http://www.gnu.org/licenses/>.
24*38fd1498Szrj 
25*38fd1498Szrj /** @file bits/opt_random.h
26*38fd1498Szrj  *  This is an internal header file, included by other library headers.
27*38fd1498Szrj  *  Do not attempt to use it directly. @headername{random}
28*38fd1498Szrj  */
29*38fd1498Szrj 
30*38fd1498Szrj #ifndef _BITS_OPT_RANDOM_H
31*38fd1498Szrj #define _BITS_OPT_RANDOM_H 1
32*38fd1498Szrj 
33*38fd1498Szrj #ifdef __SSE3__
34*38fd1498Szrj #include <pmmintrin.h>
35*38fd1498Szrj #endif
36*38fd1498Szrj 
37*38fd1498Szrj 
38*38fd1498Szrj #pragma GCC system_header
39*38fd1498Szrj 
40*38fd1498Szrj 
_GLIBCXX_VISIBILITY(default)41*38fd1498Szrj namespace std _GLIBCXX_VISIBILITY(default)
42*38fd1498Szrj {
43*38fd1498Szrj _GLIBCXX_BEGIN_NAMESPACE_VERSION
44*38fd1498Szrj 
45*38fd1498Szrj #ifdef __SSE3__
46*38fd1498Szrj   template<>
47*38fd1498Szrj     template<typename _UniformRandomNumberGenerator>
48*38fd1498Szrj       void
49*38fd1498Szrj       normal_distribution<double>::
50*38fd1498Szrj       __generate(typename normal_distribution<double>::result_type* __f,
51*38fd1498Szrj 		 typename normal_distribution<double>::result_type* __t,
52*38fd1498Szrj 		 _UniformRandomNumberGenerator& __urng,
53*38fd1498Szrj 		 const param_type& __param)
54*38fd1498Szrj       {
55*38fd1498Szrj 	typedef uint64_t __uctype;
56*38fd1498Szrj 
57*38fd1498Szrj 	if (__f == __t)
58*38fd1498Szrj 	  return;
59*38fd1498Szrj 
60*38fd1498Szrj 	if (_M_saved_available)
61*38fd1498Szrj 	  {
62*38fd1498Szrj 	    _M_saved_available = false;
63*38fd1498Szrj 	    *__f++ = _M_saved * __param.stddev() + __param.mean();
64*38fd1498Szrj 
65*38fd1498Szrj 	    if (__f == __t)
66*38fd1498Szrj 	      return;
67*38fd1498Szrj 	  }
68*38fd1498Szrj 
69*38fd1498Szrj 	constexpr uint64_t __maskval = 0xfffffffffffffull;
70*38fd1498Szrj 	static const __m128i __mask = _mm_set1_epi64x(__maskval);
71*38fd1498Szrj 	static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull);
72*38fd1498Szrj 	static const __m128d __three = _mm_set1_pd(3.0);
73*38fd1498Szrj 	const __m128d __av = _mm_set1_pd(__param.mean());
74*38fd1498Szrj 
75*38fd1498Szrj 	const __uctype __urngmin = __urng.min();
76*38fd1498Szrj 	const __uctype __urngmax = __urng.max();
77*38fd1498Szrj 	const __uctype __urngrange = __urngmax - __urngmin;
78*38fd1498Szrj 	const __uctype __uerngrange = __urngrange + 1;
79*38fd1498Szrj 
80*38fd1498Szrj 	while (__f + 1 < __t)
81*38fd1498Szrj 	  {
82*38fd1498Szrj 	    double __le;
83*38fd1498Szrj 	    __m128d __x;
84*38fd1498Szrj 	    do
85*38fd1498Szrj 	      {
86*38fd1498Szrj                 union
87*38fd1498Szrj                 {
88*38fd1498Szrj                   __m128i __i;
89*38fd1498Szrj                   __m128d __d;
90*38fd1498Szrj 		} __v;
91*38fd1498Szrj 
92*38fd1498Szrj 		if (__urngrange > __maskval)
93*38fd1498Szrj 		  {
94*38fd1498Szrj 		    if (__detail::_Power_of_2(__uerngrange))
95*38fd1498Szrj 		      __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(),
96*38fd1498Szrj 							     __urng()),
97*38fd1498Szrj 					      __mask);
98*38fd1498Szrj 		    else
99*38fd1498Szrj 		      {
100*38fd1498Szrj 			const __uctype __uerange = __maskval + 1;
101*38fd1498Szrj 			const __uctype __scaling = __urngrange / __uerange;
102*38fd1498Szrj 			const __uctype __past = __uerange * __scaling;
103*38fd1498Szrj 			uint64_t __v1;
104*38fd1498Szrj 			do
105*38fd1498Szrj 			  __v1 = __uctype(__urng()) - __urngmin;
106*38fd1498Szrj 			while (__v1 >= __past);
107*38fd1498Szrj 			__v1 /= __scaling;
108*38fd1498Szrj 			uint64_t __v2;
109*38fd1498Szrj 			do
110*38fd1498Szrj 			  __v2 = __uctype(__urng()) - __urngmin;
111*38fd1498Szrj 			while (__v2 >= __past);
112*38fd1498Szrj 			__v2 /= __scaling;
113*38fd1498Szrj 
114*38fd1498Szrj 			__v.__i = _mm_set_epi64x(__v1, __v2);
115*38fd1498Szrj 		      }
116*38fd1498Szrj 		  }
117*38fd1498Szrj 		else if (__urngrange == __maskval)
118*38fd1498Szrj 		  __v.__i = _mm_set_epi64x(__urng(), __urng());
119*38fd1498Szrj 		else if ((__urngrange + 2) * __urngrange >= __maskval
120*38fd1498Szrj 			 && __detail::_Power_of_2(__uerngrange))
121*38fd1498Szrj 		  {
122*38fd1498Szrj 		    uint64_t __v1 = __urng() * __uerngrange + __urng();
123*38fd1498Szrj 		    uint64_t __v2 = __urng() * __uerngrange + __urng();
124*38fd1498Szrj 
125*38fd1498Szrj 		    __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2),
126*38fd1498Szrj 					    __mask);
127*38fd1498Szrj 		  }
128*38fd1498Szrj 		else
129*38fd1498Szrj 		  {
130*38fd1498Szrj 		    size_t __nrng = 2;
131*38fd1498Szrj 		    __uctype __high = __maskval / __uerngrange / __uerngrange;
132*38fd1498Szrj 		    while (__high > __uerngrange)
133*38fd1498Szrj 		      {
134*38fd1498Szrj 			++__nrng;
135*38fd1498Szrj 			__high /= __uerngrange;
136*38fd1498Szrj 		      }
137*38fd1498Szrj 		    const __uctype __highrange = __high + 1;
138*38fd1498Szrj 		    const __uctype __scaling = __urngrange / __highrange;
139*38fd1498Szrj 		    const __uctype __past = __highrange * __scaling;
140*38fd1498Szrj 		    __uctype __tmp;
141*38fd1498Szrj 
142*38fd1498Szrj 		    uint64_t __v1;
143*38fd1498Szrj 		    do
144*38fd1498Szrj 		      {
145*38fd1498Szrj 			do
146*38fd1498Szrj 			  __tmp = __uctype(__urng()) - __urngmin;
147*38fd1498Szrj 			while (__tmp >= __past);
148*38fd1498Szrj 			__v1 = __tmp / __scaling;
149*38fd1498Szrj 			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
150*38fd1498Szrj 			  {
151*38fd1498Szrj 			    __tmp = __v1;
152*38fd1498Szrj 			    __v1 *= __uerngrange;
153*38fd1498Szrj 			    __v1 += __uctype(__urng()) - __urngmin;
154*38fd1498Szrj 			  }
155*38fd1498Szrj 		      }
156*38fd1498Szrj 		    while (__v1 > __maskval || __v1 < __tmp);
157*38fd1498Szrj 
158*38fd1498Szrj 		    uint64_t __v2;
159*38fd1498Szrj 		    do
160*38fd1498Szrj 		      {
161*38fd1498Szrj 			do
162*38fd1498Szrj 			  __tmp = __uctype(__urng()) - __urngmin;
163*38fd1498Szrj 			while (__tmp >= __past);
164*38fd1498Szrj 			__v2 = __tmp / __scaling;
165*38fd1498Szrj 			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
166*38fd1498Szrj 			  {
167*38fd1498Szrj 			    __tmp = __v2;
168*38fd1498Szrj 			    __v2 *= __uerngrange;
169*38fd1498Szrj 			    __v2 += __uctype(__urng()) - __urngmin;
170*38fd1498Szrj 			  }
171*38fd1498Szrj 		      }
172*38fd1498Szrj 		    while (__v2 > __maskval || __v2 < __tmp);
173*38fd1498Szrj 
174*38fd1498Szrj 		    __v.__i = _mm_set_epi64x(__v1, __v2);
175*38fd1498Szrj 		  }
176*38fd1498Szrj 
177*38fd1498Szrj 		__v.__i = _mm_or_si128(__v.__i, __two);
178*38fd1498Szrj 		__x = _mm_sub_pd(__v.__d, __three);
179*38fd1498Szrj 		__m128d __m = _mm_mul_pd(__x, __x);
180*38fd1498Szrj 		__le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m));
181*38fd1498Szrj               }
182*38fd1498Szrj             while (__le == 0.0 || __le >= 1.0);
183*38fd1498Szrj 
184*38fd1498Szrj             double __mult = (std::sqrt(-2.0 * std::log(__le) / __le)
185*38fd1498Szrj                              * __param.stddev());
186*38fd1498Szrj 
187*38fd1498Szrj             __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av);
188*38fd1498Szrj 
189*38fd1498Szrj             _mm_storeu_pd(__f, __x);
190*38fd1498Szrj             __f += 2;
191*38fd1498Szrj           }
192*38fd1498Szrj 
193*38fd1498Szrj         if (__f != __t)
194*38fd1498Szrj           {
195*38fd1498Szrj             result_type __x, __y, __r2;
196*38fd1498Szrj 
197*38fd1498Szrj             __detail::_Adaptor<_UniformRandomNumberGenerator, result_type>
198*38fd1498Szrj               __aurng(__urng);
199*38fd1498Szrj 
200*38fd1498Szrj             do
201*38fd1498Szrj               {
202*38fd1498Szrj                 __x = result_type(2.0) * __aurng() - 1.0;
203*38fd1498Szrj                 __y = result_type(2.0) * __aurng() - 1.0;
204*38fd1498Szrj                 __r2 = __x * __x + __y * __y;
205*38fd1498Szrj               }
206*38fd1498Szrj             while (__r2 > 1.0 || __r2 == 0.0);
207*38fd1498Szrj 
208*38fd1498Szrj             const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2);
209*38fd1498Szrj             _M_saved = __x * __mult;
210*38fd1498Szrj             _M_saved_available = true;
211*38fd1498Szrj             *__f = __y * __mult * __param.stddev() + __param.mean();
212*38fd1498Szrj           }
213*38fd1498Szrj       }
214*38fd1498Szrj #endif
215*38fd1498Szrj 
216*38fd1498Szrj 
217*38fd1498Szrj _GLIBCXX_END_NAMESPACE_VERSION
218*38fd1498Szrj } // namespace
219*38fd1498Szrj 
220*38fd1498Szrj 
221*38fd1498Szrj #endif // _BITS_OPT_RANDOM_H
222