1*d6b92ffaSHans Petter Selasky /*
2*d6b92ffaSHans Petter Selasky * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
3*d6b92ffaSHans Petter Selasky * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
4*d6b92ffaSHans Petter Selasky *
5*d6b92ffaSHans Petter Selasky * This software is available to you under a choice of one of two
6*d6b92ffaSHans Petter Selasky * licenses. You may choose to be licensed under the terms of the GNU
7*d6b92ffaSHans Petter Selasky * General Public License (GPL) Version 2, available from the file
8*d6b92ffaSHans Petter Selasky * COPYING in the main directory of this source tree, or the
9*d6b92ffaSHans Petter Selasky * OpenIB.org BSD license below:
10*d6b92ffaSHans Petter Selasky *
11*d6b92ffaSHans Petter Selasky * Redistribution and use in source and binary forms, with or
12*d6b92ffaSHans Petter Selasky * without modification, are permitted provided that the following
13*d6b92ffaSHans Petter Selasky * conditions are met:
14*d6b92ffaSHans Petter Selasky *
15*d6b92ffaSHans Petter Selasky * - Redistributions of source code must retain the above
16*d6b92ffaSHans Petter Selasky * copyright notice, this list of conditions and the following
17*d6b92ffaSHans Petter Selasky * disclaimer.
18*d6b92ffaSHans Petter Selasky *
19*d6b92ffaSHans Petter Selasky * - Redistributions in binary form must reproduce the above
20*d6b92ffaSHans Petter Selasky * copyright notice, this list of conditions and the following
21*d6b92ffaSHans Petter Selasky * disclaimer in the documentation and/or other materials
22*d6b92ffaSHans Petter Selasky * provided with the distribution.
23*d6b92ffaSHans Petter Selasky *
24*d6b92ffaSHans Petter Selasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25*d6b92ffaSHans Petter Selasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26*d6b92ffaSHans Petter Selasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27*d6b92ffaSHans Petter Selasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28*d6b92ffaSHans Petter Selasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29*d6b92ffaSHans Petter Selasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30*d6b92ffaSHans Petter Selasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31*d6b92ffaSHans Petter Selasky * SOFTWARE.
32*d6b92ffaSHans Petter Selasky */
33*d6b92ffaSHans Petter Selasky
34*d6b92ffaSHans Petter Selasky #include <config.h>
35*d6b92ffaSHans Petter Selasky
36*d6b92ffaSHans Petter Selasky #include <errno.h>
37*d6b92ffaSHans Petter Selasky #include <sys/mman.h>
38*d6b92ffaSHans Petter Selasky #include <unistd.h>
39*d6b92ffaSHans Petter Selasky #include <stdlib.h>
40*d6b92ffaSHans Petter Selasky #include <stdint.h>
41*d6b92ffaSHans Petter Selasky #include <stdio.h>
42*d6b92ffaSHans Petter Selasky #include <string.h>
43*d6b92ffaSHans Petter Selasky #include <dirent.h>
44*d6b92ffaSHans Petter Selasky #include <limits.h>
45*d6b92ffaSHans Petter Selasky #include <inttypes.h>
46*d6b92ffaSHans Petter Selasky
47*d6b92ffaSHans Petter Selasky #include "ibverbs.h"
48*d6b92ffaSHans Petter Selasky
/*
 * One node of the red-black tree used to track address ranges.
 *
 * A node describes the inclusive address range [start, end] together
 * with the reference count attached to that range.
 */
struct ibv_mem_node {
	enum {
		IBV_RED,
		IBV_BLACK
	} color;			/* red-black tree color */
	struct ibv_mem_node *parent;	/* NULL for the tree root */
	struct ibv_mem_node *left;	/* subtree with lower start addresses */
	struct ibv_mem_node *right;	/* subtree with higher start addresses */
	uintptr_t start;		/* first address of the range */
	uintptr_t end;			/* last address of the range (inclusive) */
	int refcnt;			/* reference count for the range */
};
59*d6b92ffaSHans Petter Selasky
/* Root of the address-range tree; NULL until ibv_fork_init() succeeds. */
static struct ibv_mem_node *mm_root;
/* Mutex taken around accesses to the address-range tree. */
static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Base page size, filled in from sysconf(_SC_PAGESIZE) by ibv_fork_init(). */
static int page_size;
/* Non-zero once RDMAV_HUGEPAGES_SAFE has been seen in the environment. */
static int huge_page_enabled;
/* When non-zero, ibv_fork_init() refuses to initialize and returns EINVAL. */
static int too_late;
65*d6b92ffaSHans Petter Selasky
smaps_page_size(FILE * file)66*d6b92ffaSHans Petter Selasky static unsigned long smaps_page_size(FILE *file)
67*d6b92ffaSHans Petter Selasky {
68*d6b92ffaSHans Petter Selasky int n;
69*d6b92ffaSHans Petter Selasky unsigned long size = page_size;
70*d6b92ffaSHans Petter Selasky char buf[1024];
71*d6b92ffaSHans Petter Selasky
72*d6b92ffaSHans Petter Selasky while (fgets(buf, sizeof(buf), file) != NULL) {
73*d6b92ffaSHans Petter Selasky if (!strstr(buf, "KernelPageSize:"))
74*d6b92ffaSHans Petter Selasky continue;
75*d6b92ffaSHans Petter Selasky
76*d6b92ffaSHans Petter Selasky n = sscanf(buf, "%*s %lu", &size);
77*d6b92ffaSHans Petter Selasky if (n < 1)
78*d6b92ffaSHans Petter Selasky continue;
79*d6b92ffaSHans Petter Selasky
80*d6b92ffaSHans Petter Selasky /* page size is printed in Kb */
81*d6b92ffaSHans Petter Selasky size = size * 1024;
82*d6b92ffaSHans Petter Selasky
83*d6b92ffaSHans Petter Selasky break;
84*d6b92ffaSHans Petter Selasky }
85*d6b92ffaSHans Petter Selasky
86*d6b92ffaSHans Petter Selasky return size;
87*d6b92ffaSHans Petter Selasky }
88*d6b92ffaSHans Petter Selasky
get_page_size(void * base)89*d6b92ffaSHans Petter Selasky static unsigned long get_page_size(void *base)
90*d6b92ffaSHans Petter Selasky {
91*d6b92ffaSHans Petter Selasky unsigned long ret = page_size;
92*d6b92ffaSHans Petter Selasky pid_t pid;
93*d6b92ffaSHans Petter Selasky FILE *file;
94*d6b92ffaSHans Petter Selasky char buf[1024];
95*d6b92ffaSHans Petter Selasky
96*d6b92ffaSHans Petter Selasky pid = getpid();
97*d6b92ffaSHans Petter Selasky snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid);
98*d6b92ffaSHans Petter Selasky
99*d6b92ffaSHans Petter Selasky file = fopen(buf, "r" STREAM_CLOEXEC);
100*d6b92ffaSHans Petter Selasky if (!file)
101*d6b92ffaSHans Petter Selasky goto out;
102*d6b92ffaSHans Petter Selasky
103*d6b92ffaSHans Petter Selasky while (fgets(buf, sizeof(buf), file) != NULL) {
104*d6b92ffaSHans Petter Selasky int n;
105*d6b92ffaSHans Petter Selasky uintptr_t range_start, range_end;
106*d6b92ffaSHans Petter Selasky
107*d6b92ffaSHans Petter Selasky n = sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, &range_start, &range_end);
108*d6b92ffaSHans Petter Selasky
109*d6b92ffaSHans Petter Selasky if (n < 2)
110*d6b92ffaSHans Petter Selasky continue;
111*d6b92ffaSHans Petter Selasky
112*d6b92ffaSHans Petter Selasky if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
113*d6b92ffaSHans Petter Selasky ret = smaps_page_size(file);
114*d6b92ffaSHans Petter Selasky break;
115*d6b92ffaSHans Petter Selasky }
116*d6b92ffaSHans Petter Selasky }
117*d6b92ffaSHans Petter Selasky
118*d6b92ffaSHans Petter Selasky fclose(file);
119*d6b92ffaSHans Petter Selasky
120*d6b92ffaSHans Petter Selasky out:
121*d6b92ffaSHans Petter Selasky return ret;
122*d6b92ffaSHans Petter Selasky }
123*d6b92ffaSHans Petter Selasky
ibv_fork_init(void)124*d6b92ffaSHans Petter Selasky int ibv_fork_init(void)
125*d6b92ffaSHans Petter Selasky {
126*d6b92ffaSHans Petter Selasky void *tmp, *tmp_aligned;
127*d6b92ffaSHans Petter Selasky int ret;
128*d6b92ffaSHans Petter Selasky unsigned long size;
129*d6b92ffaSHans Petter Selasky
130*d6b92ffaSHans Petter Selasky if (getenv("RDMAV_HUGEPAGES_SAFE"))
131*d6b92ffaSHans Petter Selasky huge_page_enabled = 1;
132*d6b92ffaSHans Petter Selasky
133*d6b92ffaSHans Petter Selasky if (mm_root)
134*d6b92ffaSHans Petter Selasky return 0;
135*d6b92ffaSHans Petter Selasky
136*d6b92ffaSHans Petter Selasky if (too_late)
137*d6b92ffaSHans Petter Selasky return EINVAL;
138*d6b92ffaSHans Petter Selasky
139*d6b92ffaSHans Petter Selasky page_size = sysconf(_SC_PAGESIZE);
140*d6b92ffaSHans Petter Selasky if (page_size < 0)
141*d6b92ffaSHans Petter Selasky return errno;
142*d6b92ffaSHans Petter Selasky
143*d6b92ffaSHans Petter Selasky if (posix_memalign(&tmp, page_size, page_size))
144*d6b92ffaSHans Petter Selasky return ENOMEM;
145*d6b92ffaSHans Petter Selasky
146*d6b92ffaSHans Petter Selasky if (huge_page_enabled) {
147*d6b92ffaSHans Petter Selasky size = get_page_size(tmp);
148*d6b92ffaSHans Petter Selasky tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1));
149*d6b92ffaSHans Petter Selasky } else {
150*d6b92ffaSHans Petter Selasky size = page_size;
151*d6b92ffaSHans Petter Selasky tmp_aligned = tmp;
152*d6b92ffaSHans Petter Selasky }
153*d6b92ffaSHans Petter Selasky
154*d6b92ffaSHans Petter Selasky ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
155*d6b92ffaSHans Petter Selasky madvise(tmp_aligned, size, MADV_DOFORK);
156*d6b92ffaSHans Petter Selasky
157*d6b92ffaSHans Petter Selasky free(tmp);
158*d6b92ffaSHans Petter Selasky
159*d6b92ffaSHans Petter Selasky if (ret)
160*d6b92ffaSHans Petter Selasky return ENOSYS;
161*d6b92ffaSHans Petter Selasky
162*d6b92ffaSHans Petter Selasky mm_root = malloc(sizeof *mm_root);
163*d6b92ffaSHans Petter Selasky if (!mm_root)
164*d6b92ffaSHans Petter Selasky return ENOMEM;
165*d6b92ffaSHans Petter Selasky
166*d6b92ffaSHans Petter Selasky mm_root->parent = NULL;
167*d6b92ffaSHans Petter Selasky mm_root->left = NULL;
168*d6b92ffaSHans Petter Selasky mm_root->right = NULL;
169*d6b92ffaSHans Petter Selasky mm_root->color = IBV_BLACK;
170*d6b92ffaSHans Petter Selasky mm_root->start = 0;
171*d6b92ffaSHans Petter Selasky mm_root->end = UINTPTR_MAX;
172*d6b92ffaSHans Petter Selasky mm_root->refcnt = 0;
173*d6b92ffaSHans Petter Selasky
174*d6b92ffaSHans Petter Selasky return 0;
175*d6b92ffaSHans Petter Selasky }
176*d6b92ffaSHans Petter Selasky
__mm_prev(struct ibv_mem_node * node)177*d6b92ffaSHans Petter Selasky static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
178*d6b92ffaSHans Petter Selasky {
179*d6b92ffaSHans Petter Selasky if (node->left) {
180*d6b92ffaSHans Petter Selasky node = node->left;
181*d6b92ffaSHans Petter Selasky while (node->right)
182*d6b92ffaSHans Petter Selasky node = node->right;
183*d6b92ffaSHans Petter Selasky } else {
184*d6b92ffaSHans Petter Selasky while (node->parent && node == node->parent->left)
185*d6b92ffaSHans Petter Selasky node = node->parent;
186*d6b92ffaSHans Petter Selasky
187*d6b92ffaSHans Petter Selasky node = node->parent;
188*d6b92ffaSHans Petter Selasky }
189*d6b92ffaSHans Petter Selasky
190*d6b92ffaSHans Petter Selasky return node;
191*d6b92ffaSHans Petter Selasky }
192*d6b92ffaSHans Petter Selasky
__mm_next(struct ibv_mem_node * node)193*d6b92ffaSHans Petter Selasky static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
194*d6b92ffaSHans Petter Selasky {
195*d6b92ffaSHans Petter Selasky if (node->right) {
196*d6b92ffaSHans Petter Selasky node = node->right;
197*d6b92ffaSHans Petter Selasky while (node->left)
198*d6b92ffaSHans Petter Selasky node = node->left;
199*d6b92ffaSHans Petter Selasky } else {
200*d6b92ffaSHans Petter Selasky while (node->parent && node == node->parent->right)
201*d6b92ffaSHans Petter Selasky node = node->parent;
202*d6b92ffaSHans Petter Selasky
203*d6b92ffaSHans Petter Selasky node = node->parent;
204*d6b92ffaSHans Petter Selasky }
205*d6b92ffaSHans Petter Selasky
206*d6b92ffaSHans Petter Selasky return node;
207*d6b92ffaSHans Petter Selasky }
208*d6b92ffaSHans Petter Selasky
__mm_rotate_right(struct ibv_mem_node * node)209*d6b92ffaSHans Petter Selasky static void __mm_rotate_right(struct ibv_mem_node *node)
210*d6b92ffaSHans Petter Selasky {
211*d6b92ffaSHans Petter Selasky struct ibv_mem_node *tmp;
212*d6b92ffaSHans Petter Selasky
213*d6b92ffaSHans Petter Selasky tmp = node->left;
214*d6b92ffaSHans Petter Selasky
215*d6b92ffaSHans Petter Selasky node->left = tmp->right;
216*d6b92ffaSHans Petter Selasky if (node->left)
217*d6b92ffaSHans Petter Selasky node->left->parent = node;
218*d6b92ffaSHans Petter Selasky
219*d6b92ffaSHans Petter Selasky if (node->parent) {
220*d6b92ffaSHans Petter Selasky if (node->parent->right == node)
221*d6b92ffaSHans Petter Selasky node->parent->right = tmp;
222*d6b92ffaSHans Petter Selasky else
223*d6b92ffaSHans Petter Selasky node->parent->left = tmp;
224*d6b92ffaSHans Petter Selasky } else
225*d6b92ffaSHans Petter Selasky mm_root = tmp;
226*d6b92ffaSHans Petter Selasky
227*d6b92ffaSHans Petter Selasky tmp->parent = node->parent;
228*d6b92ffaSHans Petter Selasky
229*d6b92ffaSHans Petter Selasky tmp->right = node;
230*d6b92ffaSHans Petter Selasky node->parent = tmp;
231*d6b92ffaSHans Petter Selasky }
232*d6b92ffaSHans Petter Selasky
__mm_rotate_left(struct ibv_mem_node * node)233*d6b92ffaSHans Petter Selasky static void __mm_rotate_left(struct ibv_mem_node *node)
234*d6b92ffaSHans Petter Selasky {
235*d6b92ffaSHans Petter Selasky struct ibv_mem_node *tmp;
236*d6b92ffaSHans Petter Selasky
237*d6b92ffaSHans Petter Selasky tmp = node->right;
238*d6b92ffaSHans Petter Selasky
239*d6b92ffaSHans Petter Selasky node->right = tmp->left;
240*d6b92ffaSHans Petter Selasky if (node->right)
241*d6b92ffaSHans Petter Selasky node->right->parent = node;
242*d6b92ffaSHans Petter Selasky
243*d6b92ffaSHans Petter Selasky if (node->parent) {
244*d6b92ffaSHans Petter Selasky if (node->parent->right == node)
245*d6b92ffaSHans Petter Selasky node->parent->right = tmp;
246*d6b92ffaSHans Petter Selasky else
247*d6b92ffaSHans Petter Selasky node->parent->left = tmp;
248*d6b92ffaSHans Petter Selasky } else
249*d6b92ffaSHans Petter Selasky mm_root = tmp;
250*d6b92ffaSHans Petter Selasky
251*d6b92ffaSHans Petter Selasky tmp->parent = node->parent;
252*d6b92ffaSHans Petter Selasky
253*d6b92ffaSHans Petter Selasky tmp->left = node;
254*d6b92ffaSHans Petter Selasky node->parent = tmp;
255*d6b92ffaSHans Petter Selasky }
256*d6b92ffaSHans Petter Selasky
#if 0
/*
 * Debugging aid (compiled out): check the red-black invariants of the
 * subtree rooted at node.
 *
 * Returns 0 if the subtree is invalid, i.e. the left and right subtrees
 * have different black heights or a red node has a red child.
 * Otherwise returns the black height of the subtree plus one, so that
 * an empty subtree yields 1 and every valid result is non-zero.
 */
static int verify(struct ibv_mem_node *node)
{
	int hl, hr;

	if (!node)
		return 1;

	hl = verify(node->left);
	/* Must descend into the right subtree; checking left twice hides errors. */
	hr = verify(node->right);

	if (!hl || !hr)
		return 0;
	if (hl != hr)
		return 0;

	if (node->color == IBV_RED) {
		/* A red node may only have black (or absent) children. */
		if (node->left && node->left->color != IBV_BLACK)
			return 0;
		if (node->right && node->right->color != IBV_BLACK)
			return 0;
		return hl;
	}

	/* A black node adds one to the black height. */
	return hl + 1;
}
#endif
284*d6b92ffaSHans Petter Selasky
__mm_add_rebalance(struct ibv_mem_node * node)285*d6b92ffaSHans Petter Selasky static void __mm_add_rebalance(struct ibv_mem_node *node)
286*d6b92ffaSHans Petter Selasky {
287*d6b92ffaSHans Petter Selasky struct ibv_mem_node *parent, *gp, *uncle;
288*d6b92ffaSHans Petter Selasky
289*d6b92ffaSHans Petter Selasky while (node->parent && node->parent->color == IBV_RED) {
290*d6b92ffaSHans Petter Selasky parent = node->parent;
291*d6b92ffaSHans Petter Selasky gp = node->parent->parent;
292*d6b92ffaSHans Petter Selasky
293*d6b92ffaSHans Petter Selasky if (parent == gp->left) {
294*d6b92ffaSHans Petter Selasky uncle = gp->right;
295*d6b92ffaSHans Petter Selasky
296*d6b92ffaSHans Petter Selasky if (uncle && uncle->color == IBV_RED) {
297*d6b92ffaSHans Petter Selasky parent->color = IBV_BLACK;
298*d6b92ffaSHans Petter Selasky uncle->color = IBV_BLACK;
299*d6b92ffaSHans Petter Selasky gp->color = IBV_RED;
300*d6b92ffaSHans Petter Selasky
301*d6b92ffaSHans Petter Selasky node = gp;
302*d6b92ffaSHans Petter Selasky } else {
303*d6b92ffaSHans Petter Selasky if (node == parent->right) {
304*d6b92ffaSHans Petter Selasky __mm_rotate_left(parent);
305*d6b92ffaSHans Petter Selasky node = parent;
306*d6b92ffaSHans Petter Selasky parent = node->parent;
307*d6b92ffaSHans Petter Selasky }
308*d6b92ffaSHans Petter Selasky
309*d6b92ffaSHans Petter Selasky parent->color = IBV_BLACK;
310*d6b92ffaSHans Petter Selasky gp->color = IBV_RED;
311*d6b92ffaSHans Petter Selasky
312*d6b92ffaSHans Petter Selasky __mm_rotate_right(gp);
313*d6b92ffaSHans Petter Selasky }
314*d6b92ffaSHans Petter Selasky } else {
315*d6b92ffaSHans Petter Selasky uncle = gp->left;
316*d6b92ffaSHans Petter Selasky
317*d6b92ffaSHans Petter Selasky if (uncle && uncle->color == IBV_RED) {
318*d6b92ffaSHans Petter Selasky parent->color = IBV_BLACK;
319*d6b92ffaSHans Petter Selasky uncle->color = IBV_BLACK;
320*d6b92ffaSHans Petter Selasky gp->color = IBV_RED;
321*d6b92ffaSHans Petter Selasky
322*d6b92ffaSHans Petter Selasky node = gp;
323*d6b92ffaSHans Petter Selasky } else {
324*d6b92ffaSHans Petter Selasky if (node == parent->left) {
325*d6b92ffaSHans Petter Selasky __mm_rotate_right(parent);
326*d6b92ffaSHans Petter Selasky node = parent;
327*d6b92ffaSHans Petter Selasky parent = node->parent;
328*d6b92ffaSHans Petter Selasky }
329*d6b92ffaSHans Petter Selasky
330*d6b92ffaSHans Petter Selasky parent->color = IBV_BLACK;
331*d6b92ffaSHans Petter Selasky gp->color = IBV_RED;
332*d6b92ffaSHans Petter Selasky
333*d6b92ffaSHans Petter Selasky __mm_rotate_left(gp);
334*d6b92ffaSHans Petter Selasky }
335*d6b92ffaSHans Petter Selasky }
336*d6b92ffaSHans Petter Selasky }
337*d6b92ffaSHans Petter Selasky
338*d6b92ffaSHans Petter Selasky mm_root->color = IBV_BLACK;
339*d6b92ffaSHans Petter Selasky }
340*d6b92ffaSHans Petter Selasky
__mm_add(struct ibv_mem_node * new)341*d6b92ffaSHans Petter Selasky static void __mm_add(struct ibv_mem_node *new)
342*d6b92ffaSHans Petter Selasky {
343*d6b92ffaSHans Petter Selasky struct ibv_mem_node *node, *parent = NULL;
344*d6b92ffaSHans Petter Selasky
345*d6b92ffaSHans Petter Selasky node = mm_root;
346*d6b92ffaSHans Petter Selasky while (node) {
347*d6b92ffaSHans Petter Selasky parent = node;
348*d6b92ffaSHans Petter Selasky if (node->start < new->start)
349*d6b92ffaSHans Petter Selasky node = node->right;
350*d6b92ffaSHans Petter Selasky else
351*d6b92ffaSHans Petter Selasky node = node->left;
352*d6b92ffaSHans Petter Selasky }
353*d6b92ffaSHans Petter Selasky
354*d6b92ffaSHans Petter Selasky if (parent->start < new->start)
355*d6b92ffaSHans Petter Selasky parent->right = new;
356*d6b92ffaSHans Petter Selasky else
357*d6b92ffaSHans Petter Selasky parent->left = new;
358*d6b92ffaSHans Petter Selasky
359*d6b92ffaSHans Petter Selasky new->parent = parent;
360*d6b92ffaSHans Petter Selasky new->left = NULL;
361*d6b92ffaSHans Petter Selasky new->right = NULL;
362*d6b92ffaSHans Petter Selasky
363*d6b92ffaSHans Petter Selasky new->color = IBV_RED;
364*d6b92ffaSHans Petter Selasky __mm_add_rebalance(new);
365*d6b92ffaSHans Petter Selasky }
366*d6b92ffaSHans Petter Selasky
__mm_remove(struct ibv_mem_node * node)367*d6b92ffaSHans Petter Selasky static void __mm_remove(struct ibv_mem_node *node)
368*d6b92ffaSHans Petter Selasky {
369*d6b92ffaSHans Petter Selasky struct ibv_mem_node *child, *parent, *sib, *tmp;
370*d6b92ffaSHans Petter Selasky int nodecol;
371*d6b92ffaSHans Petter Selasky
372*d6b92ffaSHans Petter Selasky if (node->left && node->right) {
373*d6b92ffaSHans Petter Selasky tmp = node->left;
374*d6b92ffaSHans Petter Selasky while (tmp->right)
375*d6b92ffaSHans Petter Selasky tmp = tmp->right;
376*d6b92ffaSHans Petter Selasky
377*d6b92ffaSHans Petter Selasky nodecol = tmp->color;
378*d6b92ffaSHans Petter Selasky child = tmp->left;
379*d6b92ffaSHans Petter Selasky tmp->color = node->color;
380*d6b92ffaSHans Petter Selasky
381*d6b92ffaSHans Petter Selasky if (tmp->parent != node) {
382*d6b92ffaSHans Petter Selasky parent = tmp->parent;
383*d6b92ffaSHans Petter Selasky parent->right = tmp->left;
384*d6b92ffaSHans Petter Selasky if (tmp->left)
385*d6b92ffaSHans Petter Selasky tmp->left->parent = parent;
386*d6b92ffaSHans Petter Selasky
387*d6b92ffaSHans Petter Selasky tmp->left = node->left;
388*d6b92ffaSHans Petter Selasky node->left->parent = tmp;
389*d6b92ffaSHans Petter Selasky } else
390*d6b92ffaSHans Petter Selasky parent = tmp;
391*d6b92ffaSHans Petter Selasky
392*d6b92ffaSHans Petter Selasky tmp->right = node->right;
393*d6b92ffaSHans Petter Selasky node->right->parent = tmp;
394*d6b92ffaSHans Petter Selasky
395*d6b92ffaSHans Petter Selasky tmp->parent = node->parent;
396*d6b92ffaSHans Petter Selasky if (node->parent) {
397*d6b92ffaSHans Petter Selasky if (node->parent->left == node)
398*d6b92ffaSHans Petter Selasky node->parent->left = tmp;
399*d6b92ffaSHans Petter Selasky else
400*d6b92ffaSHans Petter Selasky node->parent->right = tmp;
401*d6b92ffaSHans Petter Selasky } else
402*d6b92ffaSHans Petter Selasky mm_root = tmp;
403*d6b92ffaSHans Petter Selasky } else {
404*d6b92ffaSHans Petter Selasky nodecol = node->color;
405*d6b92ffaSHans Petter Selasky
406*d6b92ffaSHans Petter Selasky child = node->left ? node->left : node->right;
407*d6b92ffaSHans Petter Selasky parent = node->parent;
408*d6b92ffaSHans Petter Selasky
409*d6b92ffaSHans Petter Selasky if (child)
410*d6b92ffaSHans Petter Selasky child->parent = parent;
411*d6b92ffaSHans Petter Selasky if (parent) {
412*d6b92ffaSHans Petter Selasky if (parent->left == node)
413*d6b92ffaSHans Petter Selasky parent->left = child;
414*d6b92ffaSHans Petter Selasky else
415*d6b92ffaSHans Petter Selasky parent->right = child;
416*d6b92ffaSHans Petter Selasky } else
417*d6b92ffaSHans Petter Selasky mm_root = child;
418*d6b92ffaSHans Petter Selasky }
419*d6b92ffaSHans Petter Selasky
420*d6b92ffaSHans Petter Selasky free(node);
421*d6b92ffaSHans Petter Selasky
422*d6b92ffaSHans Petter Selasky if (nodecol == IBV_RED)
423*d6b92ffaSHans Petter Selasky return;
424*d6b92ffaSHans Petter Selasky
425*d6b92ffaSHans Petter Selasky while ((!child || child->color == IBV_BLACK) && child != mm_root) {
426*d6b92ffaSHans Petter Selasky if (parent->left == child) {
427*d6b92ffaSHans Petter Selasky sib = parent->right;
428*d6b92ffaSHans Petter Selasky
429*d6b92ffaSHans Petter Selasky if (sib->color == IBV_RED) {
430*d6b92ffaSHans Petter Selasky parent->color = IBV_RED;
431*d6b92ffaSHans Petter Selasky sib->color = IBV_BLACK;
432*d6b92ffaSHans Petter Selasky __mm_rotate_left(parent);
433*d6b92ffaSHans Petter Selasky sib = parent->right;
434*d6b92ffaSHans Petter Selasky }
435*d6b92ffaSHans Petter Selasky
436*d6b92ffaSHans Petter Selasky if ((!sib->left || sib->left->color == IBV_BLACK) &&
437*d6b92ffaSHans Petter Selasky (!sib->right || sib->right->color == IBV_BLACK)) {
438*d6b92ffaSHans Petter Selasky sib->color = IBV_RED;
439*d6b92ffaSHans Petter Selasky child = parent;
440*d6b92ffaSHans Petter Selasky parent = child->parent;
441*d6b92ffaSHans Petter Selasky } else {
442*d6b92ffaSHans Petter Selasky if (!sib->right || sib->right->color == IBV_BLACK) {
443*d6b92ffaSHans Petter Selasky if (sib->left)
444*d6b92ffaSHans Petter Selasky sib->left->color = IBV_BLACK;
445*d6b92ffaSHans Petter Selasky sib->color = IBV_RED;
446*d6b92ffaSHans Petter Selasky __mm_rotate_right(sib);
447*d6b92ffaSHans Petter Selasky sib = parent->right;
448*d6b92ffaSHans Petter Selasky }
449*d6b92ffaSHans Petter Selasky
450*d6b92ffaSHans Petter Selasky sib->color = parent->color;
451*d6b92ffaSHans Petter Selasky parent->color = IBV_BLACK;
452*d6b92ffaSHans Petter Selasky if (sib->right)
453*d6b92ffaSHans Petter Selasky sib->right->color = IBV_BLACK;
454*d6b92ffaSHans Petter Selasky __mm_rotate_left(parent);
455*d6b92ffaSHans Petter Selasky child = mm_root;
456*d6b92ffaSHans Petter Selasky break;
457*d6b92ffaSHans Petter Selasky }
458*d6b92ffaSHans Petter Selasky } else {
459*d6b92ffaSHans Petter Selasky sib = parent->left;
460*d6b92ffaSHans Petter Selasky
461*d6b92ffaSHans Petter Selasky if (sib->color == IBV_RED) {
462*d6b92ffaSHans Petter Selasky parent->color = IBV_RED;
463*d6b92ffaSHans Petter Selasky sib->color = IBV_BLACK;
464*d6b92ffaSHans Petter Selasky __mm_rotate_right(parent);
465*d6b92ffaSHans Petter Selasky sib = parent->left;
466*d6b92ffaSHans Petter Selasky }
467*d6b92ffaSHans Petter Selasky
468*d6b92ffaSHans Petter Selasky if ((!sib->left || sib->left->color == IBV_BLACK) &&
469*d6b92ffaSHans Petter Selasky (!sib->right || sib->right->color == IBV_BLACK)) {
470*d6b92ffaSHans Petter Selasky sib->color = IBV_RED;
471*d6b92ffaSHans Petter Selasky child = parent;
472*d6b92ffaSHans Petter Selasky parent = child->parent;
473*d6b92ffaSHans Petter Selasky } else {
474*d6b92ffaSHans Petter Selasky if (!sib->left || sib->left->color == IBV_BLACK) {
475*d6b92ffaSHans Petter Selasky if (sib->right)
476*d6b92ffaSHans Petter Selasky sib->right->color = IBV_BLACK;
477*d6b92ffaSHans Petter Selasky sib->color = IBV_RED;
478*d6b92ffaSHans Petter Selasky __mm_rotate_left(sib);
479*d6b92ffaSHans Petter Selasky sib = parent->left;
480*d6b92ffaSHans Petter Selasky }
481*d6b92ffaSHans Petter Selasky
482*d6b92ffaSHans Petter Selasky sib->color = parent->color;
483*d6b92ffaSHans Petter Selasky parent->color = IBV_BLACK;
484*d6b92ffaSHans Petter Selasky if (sib->left)
485*d6b92ffaSHans Petter Selasky sib->left->color = IBV_BLACK;
486*d6b92ffaSHans Petter Selasky __mm_rotate_right(parent);
487*d6b92ffaSHans Petter Selasky child = mm_root;
488*d6b92ffaSHans Petter Selasky break;
489*d6b92ffaSHans Petter Selasky }
490*d6b92ffaSHans Petter Selasky }
491*d6b92ffaSHans Petter Selasky }
492*d6b92ffaSHans Petter Selasky
493*d6b92ffaSHans Petter Selasky if (child)
494*d6b92ffaSHans Petter Selasky child->color = IBV_BLACK;
495*d6b92ffaSHans Petter Selasky }
496*d6b92ffaSHans Petter Selasky
__mm_find_start(uintptr_t start,uintptr_t end)497*d6b92ffaSHans Petter Selasky static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
498*d6b92ffaSHans Petter Selasky {
499*d6b92ffaSHans Petter Selasky struct ibv_mem_node *node = mm_root;
500*d6b92ffaSHans Petter Selasky
501*d6b92ffaSHans Petter Selasky while (node) {
502*d6b92ffaSHans Petter Selasky if (node->start <= start && node->end >= start)
503*d6b92ffaSHans Petter Selasky break;
504*d6b92ffaSHans Petter Selasky
505*d6b92ffaSHans Petter Selasky if (node->start < start)
506*d6b92ffaSHans Petter Selasky node = node->right;
507*d6b92ffaSHans Petter Selasky else
508*d6b92ffaSHans Petter Selasky node = node->left;
509*d6b92ffaSHans Petter Selasky }
510*d6b92ffaSHans Petter Selasky
511*d6b92ffaSHans Petter Selasky return node;
512*d6b92ffaSHans Petter Selasky }
513*d6b92ffaSHans Petter Selasky
merge_ranges(struct ibv_mem_node * node,struct ibv_mem_node * prev)514*d6b92ffaSHans Petter Selasky static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node,
515*d6b92ffaSHans Petter Selasky struct ibv_mem_node *prev)
516*d6b92ffaSHans Petter Selasky {
517*d6b92ffaSHans Petter Selasky prev->end = node->end;
518*d6b92ffaSHans Petter Selasky prev->refcnt = node->refcnt;
519*d6b92ffaSHans Petter Selasky __mm_remove(node);
520*d6b92ffaSHans Petter Selasky
521*d6b92ffaSHans Petter Selasky return prev;
522*d6b92ffaSHans Petter Selasky }
523*d6b92ffaSHans Petter Selasky
split_range(struct ibv_mem_node * node,uintptr_t cut_line)524*d6b92ffaSHans Petter Selasky static struct ibv_mem_node *split_range(struct ibv_mem_node *node,
525*d6b92ffaSHans Petter Selasky uintptr_t cut_line)
526*d6b92ffaSHans Petter Selasky {
527*d6b92ffaSHans Petter Selasky struct ibv_mem_node *new_node = NULL;
528*d6b92ffaSHans Petter Selasky
529*d6b92ffaSHans Petter Selasky new_node = malloc(sizeof *new_node);
530*d6b92ffaSHans Petter Selasky if (!new_node)
531*d6b92ffaSHans Petter Selasky return NULL;
532*d6b92ffaSHans Petter Selasky new_node->start = cut_line;
533*d6b92ffaSHans Petter Selasky new_node->end = node->end;
534*d6b92ffaSHans Petter Selasky new_node->refcnt = node->refcnt;
535*d6b92ffaSHans Petter Selasky node->end = cut_line - 1;
536*d6b92ffaSHans Petter Selasky __mm_add(new_node);
537*d6b92ffaSHans Petter Selasky
538*d6b92ffaSHans Petter Selasky return new_node;
539*d6b92ffaSHans Petter Selasky }
540*d6b92ffaSHans Petter Selasky
get_start_node(uintptr_t start,uintptr_t end,int inc)541*d6b92ffaSHans Petter Selasky static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end,
542*d6b92ffaSHans Petter Selasky int inc)
543*d6b92ffaSHans Petter Selasky {
544*d6b92ffaSHans Petter Selasky struct ibv_mem_node *node, *tmp = NULL;
545*d6b92ffaSHans Petter Selasky
546*d6b92ffaSHans Petter Selasky node = __mm_find_start(start, end);
547*d6b92ffaSHans Petter Selasky if (node->start < start)
548*d6b92ffaSHans Petter Selasky node = split_range(node, start);
549*d6b92ffaSHans Petter Selasky else {
550*d6b92ffaSHans Petter Selasky tmp = __mm_prev(node);
551*d6b92ffaSHans Petter Selasky if (tmp && tmp->refcnt == node->refcnt + inc)
552*d6b92ffaSHans Petter Selasky node = merge_ranges(node, tmp);
553*d6b92ffaSHans Petter Selasky }
554*d6b92ffaSHans Petter Selasky return node;
555*d6b92ffaSHans Petter Selasky }
556*d6b92ffaSHans Petter Selasky
557*d6b92ffaSHans Petter Selasky /*
558*d6b92ffaSHans Petter Selasky * This function is called if madvise() fails to undo merging/splitting
559*d6b92ffaSHans Petter Selasky * operations performed on the node.
560*d6b92ffaSHans Petter Selasky */
undo_node(struct ibv_mem_node * node,uintptr_t start,int inc)561*d6b92ffaSHans Petter Selasky static struct ibv_mem_node *undo_node(struct ibv_mem_node *node,
562*d6b92ffaSHans Petter Selasky uintptr_t start, int inc)
563*d6b92ffaSHans Petter Selasky {
564*d6b92ffaSHans Petter Selasky struct ibv_mem_node *tmp = NULL;
565*d6b92ffaSHans Petter Selasky
566*d6b92ffaSHans Petter Selasky /*
567*d6b92ffaSHans Petter Selasky * This condition can be true only if we merged this
568*d6b92ffaSHans Petter Selasky * node with the previous one, so we need to split them.
569*d6b92ffaSHans Petter Selasky */
570*d6b92ffaSHans Petter Selasky if (start > node->start) {
571*d6b92ffaSHans Petter Selasky tmp = split_range(node, start);
572*d6b92ffaSHans Petter Selasky if (tmp) {
573*d6b92ffaSHans Petter Selasky node->refcnt += inc;
574*d6b92ffaSHans Petter Selasky node = tmp;
575*d6b92ffaSHans Petter Selasky } else
576*d6b92ffaSHans Petter Selasky return NULL;
577*d6b92ffaSHans Petter Selasky }
578*d6b92ffaSHans Petter Selasky
579*d6b92ffaSHans Petter Selasky tmp = __mm_prev(node);
580*d6b92ffaSHans Petter Selasky if (tmp && tmp->refcnt == node->refcnt)
581*d6b92ffaSHans Petter Selasky node = merge_ranges(node, tmp);
582*d6b92ffaSHans Petter Selasky
583*d6b92ffaSHans Petter Selasky tmp = __mm_next(node);
584*d6b92ffaSHans Petter Selasky if (tmp && tmp->refcnt == node->refcnt)
585*d6b92ffaSHans Petter Selasky node = merge_ranges(tmp, node);
586*d6b92ffaSHans Petter Selasky
587*d6b92ffaSHans Petter Selasky return node;
588*d6b92ffaSHans Petter Selasky }
589*d6b92ffaSHans Petter Selasky
ibv_madvise_range(void * base,size_t size,int advice)590*d6b92ffaSHans Petter Selasky static int ibv_madvise_range(void *base, size_t size, int advice)
591*d6b92ffaSHans Petter Selasky {
592*d6b92ffaSHans Petter Selasky uintptr_t start, end;
593*d6b92ffaSHans Petter Selasky struct ibv_mem_node *node, *tmp;
594*d6b92ffaSHans Petter Selasky int inc;
595*d6b92ffaSHans Petter Selasky int rolling_back = 0;
596*d6b92ffaSHans Petter Selasky int ret = 0;
597*d6b92ffaSHans Petter Selasky unsigned long range_page_size;
598*d6b92ffaSHans Petter Selasky
599*d6b92ffaSHans Petter Selasky if (!size)
600*d6b92ffaSHans Petter Selasky return 0;
601*d6b92ffaSHans Petter Selasky
602*d6b92ffaSHans Petter Selasky if (huge_page_enabled)
603*d6b92ffaSHans Petter Selasky range_page_size = get_page_size(base);
604*d6b92ffaSHans Petter Selasky else
605*d6b92ffaSHans Petter Selasky range_page_size = page_size;
606*d6b92ffaSHans Petter Selasky
607*d6b92ffaSHans Petter Selasky start = (uintptr_t) base & ~(range_page_size - 1);
608*d6b92ffaSHans Petter Selasky end = ((uintptr_t) (base + size + range_page_size - 1) &
609*d6b92ffaSHans Petter Selasky ~(range_page_size - 1)) - 1;
610*d6b92ffaSHans Petter Selasky
611*d6b92ffaSHans Petter Selasky pthread_mutex_lock(&mm_mutex);
612*d6b92ffaSHans Petter Selasky again:
613*d6b92ffaSHans Petter Selasky inc = advice == MADV_DONTFORK ? 1 : -1;
614*d6b92ffaSHans Petter Selasky
615*d6b92ffaSHans Petter Selasky node = get_start_node(start, end, inc);
616*d6b92ffaSHans Petter Selasky if (!node) {
617*d6b92ffaSHans Petter Selasky ret = -1;
618*d6b92ffaSHans Petter Selasky goto out;
619*d6b92ffaSHans Petter Selasky }
620*d6b92ffaSHans Petter Selasky
621*d6b92ffaSHans Petter Selasky while (node && node->start <= end) {
622*d6b92ffaSHans Petter Selasky if (node->end > end) {
623*d6b92ffaSHans Petter Selasky if (!split_range(node, end + 1)) {
624*d6b92ffaSHans Petter Selasky ret = -1;
625*d6b92ffaSHans Petter Selasky goto out;
626*d6b92ffaSHans Petter Selasky }
627*d6b92ffaSHans Petter Selasky }
628*d6b92ffaSHans Petter Selasky
629*d6b92ffaSHans Petter Selasky if ((inc == -1 && node->refcnt == 1) ||
630*d6b92ffaSHans Petter Selasky (inc == 1 && node->refcnt == 0)) {
631*d6b92ffaSHans Petter Selasky /*
632*d6b92ffaSHans Petter Selasky * If this is the first time through the loop,
633*d6b92ffaSHans Petter Selasky * and we merged this node with the previous
634*d6b92ffaSHans Petter Selasky * one, then we only want to do the madvise()
635*d6b92ffaSHans Petter Selasky * on start ... node->end (rather than
636*d6b92ffaSHans Petter Selasky * starting at node->start).
637*d6b92ffaSHans Petter Selasky *
638*d6b92ffaSHans Petter Selasky * Otherwise we end up doing madvise() on
639*d6b92ffaSHans Petter Selasky * bigger region than we're being asked to,
640*d6b92ffaSHans Petter Selasky * and that may lead to a spurious failure.
641*d6b92ffaSHans Petter Selasky */
642*d6b92ffaSHans Petter Selasky if (start > node->start)
643*d6b92ffaSHans Petter Selasky ret = madvise((void *) start, node->end - start + 1,
644*d6b92ffaSHans Petter Selasky advice);
645*d6b92ffaSHans Petter Selasky else
646*d6b92ffaSHans Petter Selasky ret = madvise((void *) node->start,
647*d6b92ffaSHans Petter Selasky node->end - node->start + 1,
648*d6b92ffaSHans Petter Selasky advice);
649*d6b92ffaSHans Petter Selasky if (ret) {
650*d6b92ffaSHans Petter Selasky node = undo_node(node, start, inc);
651*d6b92ffaSHans Petter Selasky
652*d6b92ffaSHans Petter Selasky if (rolling_back || !node)
653*d6b92ffaSHans Petter Selasky goto out;
654*d6b92ffaSHans Petter Selasky
655*d6b92ffaSHans Petter Selasky /* madvise failed, roll back previous changes */
656*d6b92ffaSHans Petter Selasky rolling_back = 1;
657*d6b92ffaSHans Petter Selasky advice = advice == MADV_DONTFORK ?
658*d6b92ffaSHans Petter Selasky MADV_DOFORK : MADV_DONTFORK;
659*d6b92ffaSHans Petter Selasky tmp = __mm_prev(node);
660*d6b92ffaSHans Petter Selasky if (!tmp || start > tmp->end)
661*d6b92ffaSHans Petter Selasky goto out;
662*d6b92ffaSHans Petter Selasky end = tmp->end;
663*d6b92ffaSHans Petter Selasky goto again;
664*d6b92ffaSHans Petter Selasky }
665*d6b92ffaSHans Petter Selasky }
666*d6b92ffaSHans Petter Selasky
667*d6b92ffaSHans Petter Selasky node->refcnt += inc;
668*d6b92ffaSHans Petter Selasky node = __mm_next(node);
669*d6b92ffaSHans Petter Selasky }
670*d6b92ffaSHans Petter Selasky
671*d6b92ffaSHans Petter Selasky if (node) {
672*d6b92ffaSHans Petter Selasky tmp = __mm_prev(node);
673*d6b92ffaSHans Petter Selasky if (tmp && node->refcnt == tmp->refcnt)
674*d6b92ffaSHans Petter Selasky node = merge_ranges(node, tmp);
675*d6b92ffaSHans Petter Selasky }
676*d6b92ffaSHans Petter Selasky
677*d6b92ffaSHans Petter Selasky out:
678*d6b92ffaSHans Petter Selasky if (rolling_back)
679*d6b92ffaSHans Petter Selasky ret = -1;
680*d6b92ffaSHans Petter Selasky
681*d6b92ffaSHans Petter Selasky pthread_mutex_unlock(&mm_mutex);
682*d6b92ffaSHans Petter Selasky
683*d6b92ffaSHans Petter Selasky return ret;
684*d6b92ffaSHans Petter Selasky }
685*d6b92ffaSHans Petter Selasky
ibv_dontfork_range(void * base,size_t size)686*d6b92ffaSHans Petter Selasky int ibv_dontfork_range(void *base, size_t size)
687*d6b92ffaSHans Petter Selasky {
688*d6b92ffaSHans Petter Selasky if (mm_root)
689*d6b92ffaSHans Petter Selasky return ibv_madvise_range(base, size, MADV_DONTFORK);
690*d6b92ffaSHans Petter Selasky else {
691*d6b92ffaSHans Petter Selasky too_late = 1;
692*d6b92ffaSHans Petter Selasky return 0;
693*d6b92ffaSHans Petter Selasky }
694*d6b92ffaSHans Petter Selasky }
695*d6b92ffaSHans Petter Selasky
ibv_dofork_range(void * base,size_t size)696*d6b92ffaSHans Petter Selasky int ibv_dofork_range(void *base, size_t size)
697*d6b92ffaSHans Petter Selasky {
698*d6b92ffaSHans Petter Selasky if (mm_root)
699*d6b92ffaSHans Petter Selasky return ibv_madvise_range(base, size, MADV_DOFORK);
700*d6b92ffaSHans Petter Selasky else {
701*d6b92ffaSHans Petter Selasky too_late = 1;
702*d6b92ffaSHans Petter Selasky return 0;
703*d6b92ffaSHans Petter Selasky }
704*d6b92ffaSHans Petter Selasky }
705