1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stdint.h> 35 #include <stdio.h> 36 #include <string.h> 37 #include <stdlib.h> 38 39 #include <rte_common.h> 40 #include <rte_cycles.h> 41 #include <rte_random.h> 42 #include <rte_malloc.h> 43 44 #include <rte_memcpy.h> 45 46 #include "test.h" 47 48 /* 49 * Set this to the maximum buffer size you want to test. If it is 0, then the 50 * values in the buf_sizes[] array below will be used. 51 */ 52 #define TEST_VALUE_RANGE 0 53 54 /* List of buffer sizes to test */ 55 #if TEST_VALUE_RANGE == 0 56 static size_t buf_sizes[] = { 57 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 58 129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448, 59 449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600, 60 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192 61 }; 62 /* MUST be as large as largest packet size above */ 63 #define SMALL_BUFFER_SIZE 8192 64 #else /* TEST_VALUE_RANGE != 0 */ 65 static size_t buf_sizes[TEST_VALUE_RANGE]; 66 #define SMALL_BUFFER_SIZE TEST_VALUE_RANGE 67 #endif /* TEST_VALUE_RANGE == 0 */ 68 69 70 /* 71 * Arrays of this size are used for measuring uncached memory accesses by 72 * picking a random location within the buffer. Make this smaller if there are 73 * memory allocation errors. 74 */ 75 #define LARGE_BUFFER_SIZE (100 * 1024 * 1024) 76 77 /* How many times to run timing loop for performance tests */ 78 #define TEST_ITERATIONS 1000000 79 #define TEST_BATCH_SIZE 100 80 81 /* Data is aligned on this many bytes (power of 2) */ 82 #define ALIGNMENT_UNIT 32 83 84 /* 85 * Pointers used in performance tests. The two large buffers are for uncached 86 * access where random addresses within the buffer are used for each 87 * memcpy. The two small buffers are for cached access. 88 */ 89 static uint8_t *large_buf_read, *large_buf_write; 90 static uint8_t *small_buf_read, *small_buf_write; 91 92 /* Initialise data buffers. */ 93 static int 94 init_buffers(void) 95 { 96 unsigned i; 97 98 large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 99 if (large_buf_read == NULL) 100 goto error_large_buf_read; 101 102 large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 103 if (large_buf_write == NULL) 104 goto error_large_buf_write; 105 106 small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 107 if (small_buf_read == NULL) 108 goto error_small_buf_read; 109 110 small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT); 111 if (small_buf_write == NULL) 112 goto error_small_buf_write; 113 114 for (i = 0; i < LARGE_BUFFER_SIZE; i++) 115 large_buf_read[i] = rte_rand(); 116 for (i = 0; i < SMALL_BUFFER_SIZE; i++) 117 small_buf_read[i] = rte_rand(); 118 119 return 0; 120 121 error_small_buf_write: 122 rte_free(small_buf_read); 123 error_small_buf_read: 124 rte_free(large_buf_write); 125 error_large_buf_write: 126 rte_free(large_buf_read); 127 error_large_buf_read: 128 printf("ERROR: not enough memory\n"); 129 return -1; 130 } 131 132 /* Cleanup data buffers */ 133 static void 134 free_buffers(void) 135 { 136 rte_free(large_buf_read); 137 rte_free(large_buf_write); 138 rte_free(small_buf_read); 139 rte_free(small_buf_write); 140 } 141 142 /* 143 * Get a random offset into large array, with enough space needed to perform 144 * max copy size. Offset is aligned, uoffset is used for unalignment setting. 145 */ 146 static inline size_t 147 get_rand_offset(size_t uoffset) 148 { 149 return (((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) & 150 ~(ALIGNMENT_UNIT - 1)) + uoffset); 151 } 152 153 /* Fill in source and destination addresses. */ 154 static inline void 155 fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset, 156 size_t *src_addr, int is_src_cached, size_t src_uoffset) 157 { 158 unsigned int i; 159 160 for (i = 0; i < TEST_BATCH_SIZE; i++) { 161 dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset); 162 src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset); 163 } 164 } 165 166 /* 167 * WORKAROUND: For some reason the first test doing an uncached write 168 * takes a very long time (~25 times longer than is expected). So we do 169 * it once without timing. 170 */ 171 static void 172 do_uncached_write(uint8_t *dst, int is_dst_cached, 173 const uint8_t *src, int is_src_cached, size_t size) 174 { 175 unsigned i, j; 176 size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; 177 178 for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) { 179 fill_addr_arrays(dst_addrs, is_dst_cached, 0, 180 src_addrs, is_src_cached, 0); 181 for (j = 0; j < TEST_BATCH_SIZE; j++) { 182 rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size); 183 } 184 } 185 } 186 187 /* 188 * Run a single memcpy performance test. This is a macro to ensure that if 189 * the "size" parameter is a constant it won't be converted to a variable. 190 */ 191 #define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, \ 192 src, is_src_cached, src_uoffset, size) \ 193 do { \ 194 unsigned int iter, t; \ 195 size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; \ 196 uint64_t start_time, total_time = 0; \ 197 uint64_t total_time2 = 0; \ 198 for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \ 199 fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \ 200 src_addrs, is_src_cached, src_uoffset); \ 201 start_time = rte_rdtsc(); \ 202 for (t = 0; t < TEST_BATCH_SIZE; t++) \ 203 rte_memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \ 204 total_time += rte_rdtsc() - start_time; \ 205 } \ 206 for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \ 207 fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \ 208 src_addrs, is_src_cached, src_uoffset); \ 209 start_time = rte_rdtsc(); \ 210 for (t = 0; t < TEST_BATCH_SIZE; t++) \ 211 memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \ 212 total_time2 += rte_rdtsc() - start_time; \ 213 } \ 214 printf("%8.0f -", (double)total_time /TEST_ITERATIONS); \ 215 printf("%5.0f", (double)total_time2 / TEST_ITERATIONS); \ 216 } while (0) 217 218 /* Run aligned memcpy tests for each cached/uncached permutation */ 219 #define ALL_PERF_TESTS_FOR_SIZE(n) \ 220 do { \ 221 if (__builtin_constant_p(n)) \ 222 printf("\nC%6u", (unsigned)n); \ 223 else \ 224 printf("\n%7u", (unsigned)n); \ 225 SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, n); \ 226 SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, n); \ 227 SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, n); \ 228 SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, n); \ 229 } while (0) 230 231 /* Run unaligned memcpy tests for each cached/uncached permutation */ 232 #define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n) \ 233 do { \ 234 if (__builtin_constant_p(n)) \ 235 printf("\nC%6u", (unsigned)n); \ 236 else \ 237 printf("\n%7u", (unsigned)n); \ 238 SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, n); \ 239 SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, n); \ 240 SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, n); \ 241 SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, n); \ 242 } while (0) 243 244 /* Run memcpy tests for constant length */ 245 #define ALL_PERF_TEST_FOR_CONSTANT \ 246 do { \ 247 TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U); \ 248 TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U); \ 249 TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \ 250 } while (0) 251 252 /* Run all memcpy tests for aligned constant cases */ 253 static inline void 254 perf_test_constant_aligned(void) 255 { 256 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE 257 ALL_PERF_TEST_FOR_CONSTANT; 258 #undef TEST_CONSTANT 259 } 260 261 /* Run all memcpy tests for unaligned constant cases */ 262 static inline void 263 perf_test_constant_unaligned(void) 264 { 265 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED 266 ALL_PERF_TEST_FOR_CONSTANT; 267 #undef TEST_CONSTANT 268 } 269 270 /* Run all memcpy tests for aligned variable cases */ 271 static inline void 272 perf_test_variable_aligned(void) 273 { 274 unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]); 275 unsigned i; 276 for (i = 0; i < n; i++) { 277 ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]); 278 } 279 } 280 281 /* Run all memcpy tests for unaligned variable cases */ 282 static inline void 283 perf_test_variable_unaligned(void) 284 { 285 unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]); 286 unsigned i; 287 for (i = 0; i < n; i++) { 288 ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]); 289 } 290 } 291 292 /* Run all memcpy tests */ 293 static int 294 perf_test(void) 295 { 296 int ret; 297 298 ret = init_buffers(); 299 if (ret != 0) 300 return ret; 301 302 #if TEST_VALUE_RANGE != 0 303 /* Set up buf_sizes array, if required */ 304 unsigned i; 305 for (i = 0; i < TEST_VALUE_RANGE; i++) 306 buf_sizes[i] = i; 307 #endif 308 309 /* See function comment */ 310 do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE); 311 312 printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n" 313 "======= ============== ============== ============== ==============\n" 314 " Size Cache to cache Cache to mem Mem to cache Mem to mem\n" 315 "(bytes) (ticks) (ticks) (ticks) (ticks)\n" 316 "------- -------------- -------------- -------------- --------------"); 317 318 printf("\n========================== %2dB aligned ============================", ALIGNMENT_UNIT); 319 /* Do aligned tests where size is a variable */ 320 perf_test_variable_aligned(); 321 printf("\n------- -------------- -------------- -------------- --------------"); 322 /* Do aligned tests where size is a compile-time constant */ 323 perf_test_constant_aligned(); 324 printf("\n=========================== Unaligned ============================="); 325 /* Do unaligned tests where size is a variable */ 326 perf_test_variable_unaligned(); 327 printf("\n------- -------------- -------------- -------------- --------------"); 328 /* Do unaligned tests where size is a compile-time constant */ 329 perf_test_constant_unaligned(); 330 printf("\n======= ============== ============== ============== ==============\n\n"); 331 332 free_buffers(); 333 334 return 0; 335 } 336 337 static int 338 test_memcpy_perf(void) 339 { 340 int ret; 341 342 ret = perf_test(); 343 if (ret != 0) 344 return -1; 345 return 0; 346 } 347 348 static struct test_command memcpy_perf_cmd = { 349 .command = "memcpy_perf_autotest", 350 .callback = test_memcpy_perf, 351 }; 352 REGISTER_TEST_COMMAND(memcpy_perf_cmd); 353