1 /********************************************************************** 2 Copyright(c) 2011-2016 Intel Corporation All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 * Redistributions of source code must retain the above copyright 8 notice, this list of conditions and the following disclaimer. 9 * Redistributions in binary form must reproduce the above copyright 10 notice, this list of conditions and the following disclaimer in 11 the documentation and/or other materials provided with the 12 distribution. 13 * Neither the name of Intel Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 **********************************************************************/ 29 30 31 /** 32 * @file memcpy_inline.h 33 * @brief Defines intrinsic memcpy functions used by the new hashing API 34 * 35 */ 36 37 #ifndef _MEMCPY_H_ 38 #define _MEMCPY_H_ 39 40 #if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \ 41 || defined(_M_IX86) 42 #include "intrinreg.h" 43 #endif 44 #include <string.h> 45 #include <assert.h> 46 47 #ifdef __cplusplus 48 extern "C" { 49 #endif 50 51 #if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \ 52 || defined(_M_IX86) 53 54 #define memcpy_varlen memcpy_sse_varlen 55 #define memcpy_fixedlen memcpy_sse_fixedlen 56 57 #define memclr_varlen memclr_sse_varlen 58 #define memclr_fixedlen memclr_sse_fixedlen 59 60 static inline void memcpy_lte32_sse_fixedlen(void* dst, const void* src, size_t nbytes); 61 static inline void memcpy_gte16_sse_fixedlen(void* dst, const void* src, size_t nbytes); 62 static inline void memcpy_sse_fixedlen (void* dst, const void* src, size_t nbytes); 63 64 static inline void memcpy_lte32_sse_varlen (void* dst, const void* src, size_t nbytes); 65 static inline void memcpy_gte16_sse_varlen (void* dst, const void* src, size_t nbytes); 66 static inline void memcpy_sse_varlen (void* dst, const void* src, size_t nbytes); 67 68 69 static inline void memclr_lte32_sse_fixedlen(void* dst, size_t nbytes); 70 static inline void memclr_gte16_sse_fixedlen(void* dst, size_t nbytes); 71 static inline void memclr_sse_fixedlen (void* dst, size_t nbytes); 72 73 static inline void memclr_lte32_sse_varlen (void* dst, size_t nbytes); 74 static inline void memclr_gte16_sse_varlen (void* dst, size_t nbytes); 75 static inline void memclr_sse_varlen (void* dst, size_t nbytes); 76 77 #define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \ 78 do { \ 79 intrinreg##N head; \ 80 intrinreg##N tail; \ 81 assert(N <= nbytes && nbytes <= 2*N); \ 82 if(N == 1 || (fixedwidth && nbytes==N) ) { \ 83 head = load_intrinreg##N(src); \ 84 store_intrinreg##N(dst, head); \ 85 } \ 86 else { \ 87 head = load_intrinreg##N(src); \ 88 tail = load_intrinreg##N((const void*)((const char*)src + (nbytes - N))); \ 89 store_intrinreg##N(dst, head); \ 90 store_intrinreg##N((void*)((char*)dst + (nbytes - N)), tail); \ 91 } \ 92 } while(0) 93 94 #define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \ 95 do { \ 96 const intrinreg##N zero = {0}; \ 97 assert(N <= nbytes && nbytes <= 2*N); \ 98 if(N == 1 || (fixedwidth && nbytes==N) ) { \ 99 store_intrinreg##N(dst, zero); \ 100 } \ 101 else { \ 102 store_intrinreg##N(dst, zero); \ 103 store_intrinreg##N((void*)((char*)dst + (nbytes - N)), zero); \ 104 } \ 105 } while(0) 106 107 // Define load/store functions uniformly. 108 109 #define load_intrinreg16(src) _mm_loadu_ps((const float*) src) 110 #define store_intrinreg16(dst,val) _mm_storeu_ps((float*) dst, val) 111 112 static inline intrinreg8 load_intrinreg8(const void *src) 113 { 114 return *(intrinreg8 *) src; 115 } 116 117 static inline void store_intrinreg8(void *dst, intrinreg8 val) 118 { 119 *(intrinreg8 *) dst = val; 120 } 121 122 static inline intrinreg4 load_intrinreg4(const void *src) 123 { 124 return *(intrinreg4 *) src; 125 } 126 127 static inline void store_intrinreg4(void *dst, intrinreg4 val) 128 { 129 *(intrinreg4 *) dst = val; 130 } 131 132 static inline intrinreg2 load_intrinreg2(const void *src) 133 { 134 return *(intrinreg2 *) src; 135 } 136 137 static inline void store_intrinreg2(void *dst, intrinreg2 val) 138 { 139 *(intrinreg2 *) dst = val; 140 } 141 142 static inline intrinreg1 load_intrinreg1(const void *src) 143 { 144 return *(intrinreg1 *) src; 145 } 146 147 static inline void store_intrinreg1(void *dst, intrinreg1 val) 148 { 149 *(intrinreg1 *) dst = val; 150 } 151 152 static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes) 153 { 154 size_t i; 155 size_t j; 156 intrinreg16 pool[4]; 157 size_t remaining_moves; 158 size_t tail_offset; 159 int do_tail; 160 assert(nbytes >= 16); 161 162 for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) { 163 for (j = 0; j < 4; j++) 164 pool[j] = 165 load_intrinreg16((const void *)((const char *)src + i + 16 * j)); 166 for (j = 0; j < 4; j++) 167 store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]); 168 } 169 170 remaining_moves = (nbytes - i) / 16; 171 tail_offset = nbytes - 16; 172 do_tail = (tail_offset & (16 - 1)); 173 174 for (j = 0; j < remaining_moves; j++) 175 pool[j] = load_intrinreg16((const void *)((const char *)src + i + 16 * j)); 176 177 if (do_tail) 178 pool[j] = load_intrinreg16((const void *)((const char *)src + tail_offset)); 179 180 for (j = 0; j < remaining_moves; j++) 181 store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]); 182 183 if (do_tail) 184 store_intrinreg16((void *)((char *)dst + tail_offset), pool[j]); 185 } 186 187 static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes) 188 { 189 size_t i; 190 size_t j; 191 const intrinreg16 zero = { 0 }; 192 size_t remaining_moves; 193 size_t tail_offset; 194 int do_tail; 195 assert(nbytes >= 16); 196 197 for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) 198 for (j = 0; j < 4; j++) 199 store_intrinreg16((void *)((char *)dst + i + 16 * j), zero); 200 201 remaining_moves = (nbytes - i) / 16; 202 tail_offset = nbytes - 16; 203 do_tail = (tail_offset & (16 - 1)); 204 205 for (j = 0; j < remaining_moves; j++) 206 store_intrinreg16((void *)((char *)dst + i + 16 * j), zero); 207 208 if (do_tail) 209 store_intrinreg16((void *)((char *)dst + tail_offset), zero); 210 } 211 212 static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes) 213 { 214 assert(nbytes <= 32); 215 if (nbytes >= 16) 216 MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes); 217 else if (nbytes >= 8) 218 MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes); 219 else if (nbytes >= 4) 220 MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes); 221 else if (nbytes >= 2) 222 MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes); 223 else if (nbytes >= 1) 224 MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes); 225 } 226 227 static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes) 228 { 229 assert(nbytes <= 32); 230 if (nbytes >= 16) 231 MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes); 232 else if (nbytes >= 8) 233 MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes); 234 else if (nbytes >= 4) 235 MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes); 236 else if (nbytes >= 2) 237 MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes); 238 else if (nbytes >= 1) 239 MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes); 240 } 241 242 static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes) 243 { 244 assert(nbytes <= 32); 245 if (nbytes >= 16) 246 MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes); 247 else if (nbytes >= 8) 248 MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes); 249 else if (nbytes >= 4) 250 MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes); 251 else if (nbytes >= 2) 252 MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes); 253 else if (nbytes >= 1) 254 MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes); 255 } 256 257 static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes) 258 { 259 assert(nbytes <= 32); 260 if (nbytes >= 16) 261 MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes); 262 else if (nbytes >= 8) 263 MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes); 264 else if (nbytes >= 4) 265 MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes); 266 else if (nbytes >= 2) 267 MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes); 268 else if (nbytes >= 1) 269 MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes); 270 } 271 272 static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes) 273 { 274 size_t i = 0; 275 intrinreg16 tail; 276 277 assert(nbytes >= 16); 278 279 while (i + 128 <= nbytes) { 280 memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), 281 (const void *)((const char *)src + i), 128); 282 i += 128; 283 } 284 if (i + 64 <= nbytes) { 285 memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), 286 (const void *)((const char *)src + i), 64); 287 i += 64; 288 } 289 if (i + 32 <= nbytes) { 290 memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), 291 (const void *)((const char *)src + i), 32); 292 i += 32; 293 } 294 if (i + 16 <= nbytes) { 295 memcpy_gte16_sse_fixedlen((void *)((char *)dst + i), 296 (const void *)((const char *)src + i), 16); 297 } 298 299 i = nbytes - 16; 300 tail = load_intrinreg16((const void *)((const char *)src + i)); 301 store_intrinreg16((void *)((char *)dst + i), tail); 302 } 303 304 static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes) 305 { 306 size_t i = 0; 307 const intrinreg16 zero = { 0 }; 308 309 assert(nbytes >= 16); 310 311 while (i + 128 <= nbytes) { 312 memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 128); 313 i += 128; 314 } 315 if (i + 64 <= nbytes) { 316 memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 64); 317 i += 64; 318 } 319 if (i + 32 <= nbytes) { 320 memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 32); 321 i += 32; 322 } 323 if (i + 16 <= nbytes) { 324 memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 16); 325 } 326 327 i = nbytes - 16; 328 store_intrinreg16((void *)((char *)dst + i), zero); 329 } 330 331 static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes) 332 { 333 if (nbytes >= 16) 334 memcpy_gte16_sse_fixedlen(dst, src, nbytes); 335 else 336 memcpy_lte32_sse_fixedlen(dst, src, nbytes); 337 } 338 339 static inline void memclr_sse_fixedlen(void *dst, size_t nbytes) 340 { 341 if (nbytes >= 16) 342 memclr_gte16_sse_fixedlen(dst, nbytes); 343 else 344 memclr_lte32_sse_fixedlen(dst, nbytes); 345 } 346 347 static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes) 348 { 349 if (nbytes >= 16) 350 memcpy_gte16_sse_varlen(dst, src, nbytes); 351 else 352 memcpy_lte32_sse_varlen(dst, src, nbytes); 353 } 354 355 static inline void memclr_sse_varlen(void *dst, size_t nbytes) 356 { 357 if (nbytes >= 16) 358 memclr_gte16_sse_varlen(dst, nbytes); 359 else 360 memclr_lte32_sse_varlen(dst, nbytes); 361 } 362 #else 363 #define memcpy_varlen memcpy 364 #define memcpy_fixedlen memcpy 365 366 #define memclr_varlen(dst,n) memset(dst,0,n) 367 #define memclr_fixedlen(dst,n) memset(dst,0,n) 368 369 #endif 370 371 #ifdef __cplusplus 372 } 373 #endif 374 375 #endif // __MEMCPY_H 376