1 /********************************************************************** 2 Copyright(c) 2011-2016 Intel Corporation All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 * Redistributions of source code must retain the above copyright 8 notice, this list of conditions and the following disclaimer. 9 * Redistributions in binary form must reproduce the above copyright 10 notice, this list of conditions and the following disclaimer in 11 the documentation and/or other materials provided with the 12 distribution. 13 * Neither the name of Intel Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 **********************************************************************/ 29 30 /** 31 * @file memcpy_inline.h 32 * @brief Defines intrinsic memcpy functions used by the new hashing API 33 * 34 */ 35 36 #ifndef _MEMCPY_H_ 37 #define _MEMCPY_H_ 38 39 #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) 40 #include "intrinreg.h" 41 #endif 42 #include <string.h> 43 #include <assert.h> 44 45 #ifdef __cplusplus 46 extern "C" { 47 #endif 48 49 #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) 50 51 #define memcpy_varlen memcpy_sse_varlen 52 #define memcpy_fixedlen memcpy_sse_fixedlen 53 54 #define memclr_varlen memclr_sse_varlen 55 #define memclr_fixedlen memclr_sse_fixedlen 56 57 static inline void 58 memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes); 59 static inline void 60 memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes); 61 static inline void 62 memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes); 63 64 static inline void 65 memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes); 66 static inline void 67 memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes); 68 static inline void 69 memcpy_sse_varlen(void *dst, const void *src, size_t nbytes); 70 71 static inline void 72 memclr_lte32_sse_fixedlen(void *dst, size_t nbytes); 73 static inline void 74 memclr_gte16_sse_fixedlen(void *dst, size_t nbytes); 75 static inline void 76 memclr_sse_fixedlen(void *dst, size_t nbytes); 77 78 static inline void 79 memclr_lte32_sse_varlen(void *dst, size_t nbytes); 80 static inline void 81 memclr_gte16_sse_varlen(void *dst, size_t nbytes); 82 static inline void 83 memclr_sse_varlen(void *dst, size_t nbytes); 84 85 #define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \ 86 do { \ 87 intrinreg##N head; \ 88 intrinreg##N tail; \ 89 assert(N <= nbytes && nbytes <= 2 * N); \ 90 if (N == 1 || (fixedwidth && nbytes == N)) { \ 91 head = load_intrinreg##N(src); \ 92 store_intrinreg##N(dst, head); \ 93 } else { \ 94 head = load_intrinreg##N(src); \ 95 tail = load_intrinreg##N( \ 96 (const void *) ((const char *) src + (nbytes - N))); \ 97 store_intrinreg##N(dst, head); \ 98 store_intrinreg##N((void *) ((char *) dst + (nbytes - N)), tail); \ 99 } \ 100 } while (0) 101 102 #define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \ 103 do { \ 104 const intrinreg##N zero = { 0 }; \ 105 assert(N <= nbytes && nbytes <= 2 * N); \ 106 if (N == 1 || (fixedwidth && nbytes == N)) { \ 107 store_intrinreg##N(dst, zero); \ 108 } else { \ 109 store_intrinreg##N(dst, zero); \ 110 store_intrinreg##N((void *) ((char *) dst + (nbytes - N)), zero); \ 111 } \ 112 } while (0) 113 114 // Define load/store functions uniformly. 115 116 #define load_intrinreg16(src) _mm_loadu_ps((const float *) src) 117 #define store_intrinreg16(dst, val) _mm_storeu_ps((float *) dst, val) 118 119 static inline intrinreg8 120 load_intrinreg8(const void *src) 121 { 122 return *(intrinreg8 *) src; 123 } 124 125 static inline void 126 store_intrinreg8(void *dst, intrinreg8 val) 127 { 128 *(intrinreg8 *) dst = val; 129 } 130 131 static inline intrinreg4 132 load_intrinreg4(const void *src) 133 { 134 return *(intrinreg4 *) src; 135 } 136 137 static inline void 138 store_intrinreg4(void *dst, intrinreg4 val) 139 { 140 *(intrinreg4 *) dst = val; 141 } 142 143 static inline intrinreg2 144 load_intrinreg2(const void *src) 145 { 146 return *(intrinreg2 *) src; 147 } 148 149 static inline void 150 store_intrinreg2(void *dst, intrinreg2 val) 151 { 152 *(intrinreg2 *) dst = val; 153 } 154 155 static inline intrinreg1 156 load_intrinreg1(const void *src) 157 { 158 return *(intrinreg1 *) src; 159 } 160 161 static inline void 162 store_intrinreg1(void *dst, intrinreg1 val) 163 { 164 *(intrinreg1 *) dst = val; 165 } 166 167 static inline void 168 memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes) 169 { 170 size_t i; 171 size_t j; 172 intrinreg16 pool[4]; 173 size_t remaining_moves; 174 size_t tail_offset; 175 int do_tail; 176 assert(nbytes >= 16); 177 178 for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) { 179 for (j = 0; j < 4; j++) 180 pool[j] = 181 load_intrinreg16((const void *) ((const char *) src + i + 16 * j)); 182 for (j = 0; j < 4; j++) 183 store_intrinreg16((void *) ((char *) dst + i + 16 * j), pool[j]); 184 } 185 186 remaining_moves = (nbytes - i) / 16; 187 tail_offset = nbytes - 16; 188 do_tail = (tail_offset & (16 - 1)); 189 190 for (j = 0; j < remaining_moves; j++) 191 pool[j] = load_intrinreg16((const void *) ((const char *) src + i + 16 * j)); 192 193 if (do_tail) 194 pool[j] = load_intrinreg16((const void *) ((const char *) src + tail_offset)); 195 196 for (j = 0; j < remaining_moves; j++) 197 store_intrinreg16((void *) ((char *) dst + i + 16 * j), pool[j]); 198 199 if (do_tail) 200 store_intrinreg16((void *) ((char *) dst + tail_offset), pool[j]); 201 } 202 203 static inline void 204 memclr_gte16_sse_fixedlen(void *dst, size_t nbytes) 205 { 206 size_t i; 207 size_t j; 208 const intrinreg16 zero = { 0 }; 209 size_t remaining_moves; 210 size_t tail_offset; 211 int do_tail; 212 assert(nbytes >= 16); 213 214 for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) 215 for (j = 0; j < 4; j++) 216 store_intrinreg16((void *) ((char *) dst + i + 16 * j), zero); 217 218 remaining_moves = (nbytes - i) / 16; 219 tail_offset = nbytes - 16; 220 do_tail = (tail_offset & (16 - 1)); 221 222 for (j = 0; j < remaining_moves; j++) 223 store_intrinreg16((void *) ((char *) dst + i + 16 * j), zero); 224 225 if (do_tail) 226 store_intrinreg16((void *) ((char *) dst + tail_offset), zero); 227 } 228 229 static inline void 230 memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes) 231 { 232 assert(nbytes <= 32); 233 if (nbytes >= 16) 234 MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes); 235 else if (nbytes >= 8) 236 MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes); 237 else if (nbytes >= 4) 238 MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes); 239 else if (nbytes >= 2) 240 MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes); 241 else if (nbytes >= 1) 242 MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes); 243 } 244 245 static inline void 246 memclr_lte32_sse_fixedlen(void *dst, size_t nbytes) 247 { 248 assert(nbytes <= 32); 249 if (nbytes >= 16) 250 MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes); 251 else if (nbytes >= 8) 252 MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes); 253 else if (nbytes >= 4) 254 MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes); 255 else if (nbytes >= 2) 256 MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes); 257 else if (nbytes >= 1) 258 MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes); 259 } 260 261 static inline void 262 memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes) 263 { 264 assert(nbytes <= 32); 265 if (nbytes >= 16) 266 MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes); 267 else if (nbytes >= 8) 268 MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes); 269 else if (nbytes >= 4) 270 MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes); 271 else if (nbytes >= 2) 272 MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes); 273 else if (nbytes >= 1) 274 MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes); 275 } 276 277 static inline void 278 memclr_lte32_sse_varlen(void *dst, size_t nbytes) 279 { 280 assert(nbytes <= 32); 281 if (nbytes >= 16) 282 MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes); 283 else if (nbytes >= 8) 284 MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes); 285 else if (nbytes >= 4) 286 MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes); 287 else if (nbytes >= 2) 288 MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes); 289 else if (nbytes >= 1) 290 MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes); 291 } 292 293 static inline void 294 memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes) 295 { 296 size_t i = 0; 297 intrinreg16 tail; 298 299 assert(nbytes >= 16); 300 301 while (i + 128 <= nbytes) { 302 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i), 303 (const void *) ((const char *) src + i), 128); 304 i += 128; 305 } 306 if (i + 64 <= nbytes) { 307 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i), 308 (const void *) ((const char *) src + i), 64); 309 i += 64; 310 } 311 if (i + 32 <= nbytes) { 312 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i), 313 (const void *) ((const char *) src + i), 32); 314 i += 32; 315 } 316 if (i + 16 <= nbytes) { 317 memcpy_gte16_sse_fixedlen((void *) ((char *) dst + i), 318 (const void *) ((const char *) src + i), 16); 319 } 320 321 i = nbytes - 16; 322 tail = load_intrinreg16((const void *) ((const char *) src + i)); 323 store_intrinreg16((void *) ((char *) dst + i), tail); 324 } 325 326 static inline void 327 memclr_gte16_sse_varlen(void *dst, size_t nbytes) 328 { 329 size_t i = 0; 330 const intrinreg16 zero = { 0 }; 331 332 assert(nbytes >= 16); 333 334 while (i + 128 <= nbytes) { 335 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 128); 336 i += 128; 337 } 338 if (i + 64 <= nbytes) { 339 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 64); 340 i += 64; 341 } 342 if (i + 32 <= nbytes) { 343 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 32); 344 i += 32; 345 } 346 if (i + 16 <= nbytes) { 347 memclr_gte16_sse_fixedlen((void *) ((char *) dst + i), 16); 348 } 349 350 i = nbytes - 16; 351 store_intrinreg16((void *) ((char *) dst + i), zero); 352 } 353 354 static inline void 355 memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes) 356 { 357 if (nbytes >= 16) 358 memcpy_gte16_sse_fixedlen(dst, src, nbytes); 359 else 360 memcpy_lte32_sse_fixedlen(dst, src, nbytes); 361 } 362 363 static inline void 364 memclr_sse_fixedlen(void *dst, size_t nbytes) 365 { 366 if (nbytes >= 16) 367 memclr_gte16_sse_fixedlen(dst, nbytes); 368 else 369 memclr_lte32_sse_fixedlen(dst, nbytes); 370 } 371 372 static inline void 373 memcpy_sse_varlen(void *dst, const void *src, size_t nbytes) 374 { 375 if (nbytes >= 16) 376 memcpy_gte16_sse_varlen(dst, src, nbytes); 377 else 378 memcpy_lte32_sse_varlen(dst, src, nbytes); 379 } 380 381 static inline void 382 memclr_sse_varlen(void *dst, size_t nbytes) 383 { 384 if (nbytes >= 16) 385 memclr_gte16_sse_varlen(dst, nbytes); 386 else 387 memclr_lte32_sse_varlen(dst, nbytes); 388 } 389 #else 390 #define memcpy_varlen memcpy 391 #define memcpy_fixedlen memcpy 392 393 #define memclr_varlen(dst, n) memset(dst, 0, n) 394 #define memclr_fixedlen(dst, n) memset(dst, 0, n) 395 396 #endif 397 398 #ifdef __cplusplus 399 } 400 #endif 401 402 #endif // __MEMCPY_H 403