1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stdint.h> 35 #include <stdio.h> 36 #include <string.h> 37 #include <stdlib.h> 38 39 #include <rte_common.h> 40 #include <rte_cycles.h> 41 #include <rte_random.h> 42 #include <rte_malloc.h> 43 44 #include <rte_memcpy.h> 45 46 #include "test.h" 47 48 /* 49 * Set this to the maximum buffer size you want to test. If it is 0, then the 50 * values in the buf_sizes[] array below will be used. 51 */ 52 #define TEST_VALUE_RANGE 0 53 54 /* List of buffer sizes to test */ 55 #if TEST_VALUE_RANGE == 0 56 static size_t buf_sizes[] = { 57 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 58 129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448, 59 449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600, 60 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192 61 }; 62 /* MUST be as large as largest packet size above */ 63 #define SMALL_BUFFER_SIZE 8192 64 #else /* TEST_VALUE_RANGE != 0 */ 65 static size_t buf_sizes[TEST_VALUE_RANGE]; 66 #define SMALL_BUFFER_SIZE TEST_VALUE_RANGE 67 #endif /* TEST_VALUE_RANGE == 0 */ 68 69 70 /* 71 * Arrays of this size are used for measuring uncached memory accesses by 72 * picking a random location within the buffer. Make this smaller if there are 73 * memory allocation errors. 74 */ 75 #define LARGE_BUFFER_SIZE (100 * 1024 * 1024) 76 77 /* How many times to run timing loop for performance tests */ 78 #define TEST_ITERATIONS 1000000 79 #define TEST_BATCH_SIZE 100 80 81 /* Data is aligned on this many bytes (power of 2) */ 82 #ifdef RTE_MACHINE_CPUFLAG_AVX512F 83 #define ALIGNMENT_UNIT 64 84 #elif defined RTE_MACHINE_CPUFLAG_AVX2 85 #define ALIGNMENT_UNIT 32 86 #else /* RTE_MACHINE_CPUFLAG */ 87 #define ALIGNMENT_UNIT 16 88 #endif /* RTE_MACHINE_CPUFLAG */ 89 90 /* 91 * Pointers used in performance tests. The two large buffers are for uncached 92 * access where random addresses within the buffer are used for each 93 * memcpy. The two small buffers are for cached access. 94 */ 95 static uint8_t *large_buf_read, *large_buf_write; 96 static uint8_t *small_buf_read, *small_buf_write; 97 98 /* Initialise data buffers. */ 99 static int 100 init_buffers(void) 101 { 102 unsigned i; 103 104 large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 105 if (large_buf_read == NULL) 106 goto error_large_buf_read; 107 108 large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 109 if (large_buf_write == NULL) 110 goto error_large_buf_write; 111 112 small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 113 if (small_buf_read == NULL) 114 goto error_small_buf_read; 115 116 small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 117 if (small_buf_write == NULL) 118 goto error_small_buf_write; 119 120 for (i = 0; i < LARGE_BUFFER_SIZE; i++) 121 large_buf_read[i] = rte_rand(); 122 for (i = 0; i < SMALL_BUFFER_SIZE; i++) 123 small_buf_read[i] = rte_rand(); 124 125 return 0; 126 127 error_small_buf_write: 128 rte_free(small_buf_read); 129 error_small_buf_read: 130 rte_free(large_buf_write); 131 error_large_buf_write: 132 rte_free(large_buf_read); 133 error_large_buf_read: 134 printf("ERROR: not enough memory\n"); 135 return -1; 136 } 137 138 /* Cleanup data buffers */ 139 static void 140 free_buffers(void) 141 { 142 rte_free(large_buf_read); 143 rte_free(large_buf_write); 144 rte_free(small_buf_read); 145 rte_free(small_buf_write); 146 } 147 148 /* 149 * Get a random offset into large array, with enough space needed to perform 150 * max copy size. Offset is aligned, uoffset is used for unalignment setting. 151 */ 152 static inline size_t 153 get_rand_offset(size_t uoffset) 154 { 155 return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) & 156 ~(ALIGNMENT_UNIT - 1)) + uoffset; 157 } 158 159 /* Fill in source and destination addresses. */ 160 static inline void 161 fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset, 162 size_t *src_addr, int is_src_cached, size_t src_uoffset) 163 { 164 unsigned int i; 165 166 for (i = 0; i < TEST_BATCH_SIZE; i++) { 167 dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset); 168 src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset); 169 } 170 } 171 172 /* 173 * WORKAROUND: For some reason the first test doing an uncached write 174 * takes a very long time (~25 times longer than is expected). So we do 175 * it once without timing. 176 */ 177 static void 178 do_uncached_write(uint8_t *dst, int is_dst_cached, 179 const uint8_t *src, int is_src_cached, size_t size) 180 { 181 unsigned i, j; 182 size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; 183 184 for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) { 185 fill_addr_arrays(dst_addrs, is_dst_cached, 0, 186 src_addrs, is_src_cached, 0); 187 for (j = 0; j < TEST_BATCH_SIZE; j++) { 188 rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size); 189 } 190 } 191 } 192 193 /* 194 * Run a single memcpy performance test. This is a macro to ensure that if 195 * the "size" parameter is a constant it won't be converted to a variable. 196 */ 197 #define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, \ 198 src, is_src_cached, src_uoffset, size) \ 199 do { \ 200 unsigned int iter, t; \ 201 size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; \ 202 uint64_t start_time, total_time = 0; \ 203 uint64_t total_time2 = 0; \ 204 for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \ 205 fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \ 206 src_addrs, is_src_cached, src_uoffset); \ 207 start_time = rte_rdtsc(); \ 208 for (t = 0; t < TEST_BATCH_SIZE; t++) \ 209 rte_memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \ 210 total_time += rte_rdtsc() - start_time; \ 211 } \ 212 for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \ 213 fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \ 214 src_addrs, is_src_cached, src_uoffset); \ 215 start_time = rte_rdtsc(); \ 216 for (t = 0; t < TEST_BATCH_SIZE; t++) \ 217 memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \ 218 total_time2 += rte_rdtsc() - start_time; \ 219 } \ 220 printf("%8.0f -", (double)total_time /TEST_ITERATIONS); \ 221 printf("%5.0f", (double)total_time2 / TEST_ITERATIONS); \ 222 } while (0) 223 224 /* Run aligned memcpy tests for each cached/uncached permutation */ 225 #define ALL_PERF_TESTS_FOR_SIZE(n) \ 226 do { \ 227 if (__builtin_constant_p(n)) \ 228 printf("\nC%6u", (unsigned)n); \ 229 else \ 230 printf("\n%7u", (unsigned)n); \ 231 SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, n); \ 232 SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, n); \ 233 SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, n); \ 234 SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, n); \ 235 } while (0) 236 237 /* Run unaligned memcpy tests for each cached/uncached permutation */ 238 #define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n) \ 239 do { \ 240 if (__builtin_constant_p(n)) \ 241 printf("\nC%6u", (unsigned)n); \ 242 else \ 243 printf("\n%7u", (unsigned)n); \ 244 SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, n); \ 245 SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, n); \ 246 SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, n); \ 247 SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, n); \ 248 } while (0) 249 250 /* Run memcpy tests for constant length */ 251 #define ALL_PERF_TEST_FOR_CONSTANT \ 252 do { \ 253 TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U); \ 254 TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U); \ 255 TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \ 256 } while (0) 257 258 /* Run all memcpy tests for aligned constant cases */ 259 static inline void 260 perf_test_constant_aligned(void) 261 { 262 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE 263 ALL_PERF_TEST_FOR_CONSTANT; 264 #undef TEST_CONSTANT 265 } 266 267 /* Run all memcpy tests for unaligned constant cases */ 268 static inline void 269 perf_test_constant_unaligned(void) 270 { 271 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED 272 ALL_PERF_TEST_FOR_CONSTANT; 273 #undef TEST_CONSTANT 274 } 275 276 /* Run all memcpy tests for aligned variable cases */ 277 static inline void 278 perf_test_variable_aligned(void) 279 { 280 unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]); 281 unsigned i; 282 for (i = 0; i < n; i++) { 283 ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]); 284 } 285 } 286 287 /* Run all memcpy tests for unaligned variable cases */ 288 static inline void 289 perf_test_variable_unaligned(void) 290 { 291 unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]); 292 unsigned i; 293 for (i = 0; i < n; i++) { 294 ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]); 295 } 296 } 297 298 /* Run all memcpy tests */ 299 static int 300 perf_test(void) 301 { 302 int ret; 303 304 ret = init_buffers(); 305 if (ret != 0) 306 return ret; 307 308 #if TEST_VALUE_RANGE != 0 309 /* Set up buf_sizes array, if required */ 310 unsigned i; 311 for (i = 0; i < TEST_VALUE_RANGE; i++) 312 buf_sizes[i] = i; 313 #endif 314 315 /* See function comment */ 316 do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE); 317 318 printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n" 319 "======= ============== ============== ============== ==============\n" 320 " Size Cache to cache Cache to mem Mem to cache Mem to mem\n" 321 "(bytes) (ticks) (ticks) (ticks) (ticks)\n" 322 "------- -------------- -------------- -------------- --------------"); 323 324 printf("\n========================== %2dB aligned ============================", ALIGNMENT_UNIT); 325 /* Do aligned tests where size is a variable */ 326 perf_test_variable_aligned(); 327 printf("\n------- -------------- -------------- -------------- --------------"); 328 /* Do aligned tests where size is a compile-time constant */ 329 perf_test_constant_aligned(); 330 printf("\n=========================== Unaligned ============================="); 331 /* Do unaligned tests where size is a variable */ 332 perf_test_variable_unaligned(); 333 printf("\n------- -------------- -------------- -------------- --------------"); 334 /* Do unaligned tests where size is a compile-time constant */ 335 perf_test_constant_unaligned(); 336 printf("\n======= ============== ============== ============== ==============\n\n"); 337 338 free_buffers(); 339 340 return 0; 341 } 342 343 static int 344 test_memcpy_perf(void) 345 { 346 int ret; 347 348 ret = perf_test(); 349 if (ret != 0) 350 return -1; 351 return 0; 352 } 353 354 REGISTER_TEST_COMMAND(memcpy_perf_autotest, test_memcpy_perf); 355