/* libbdev - IPC and recovery functions */
2*433d6423SLionel Sambuc
3*433d6423SLionel Sambuc #include <minix/drivers.h>
4*433d6423SLionel Sambuc #include <minix/bdev.h>
5*433d6423SLionel Sambuc #include <assert.h>
6*433d6423SLionel Sambuc
7*433d6423SLionel Sambuc #include "const.h"
8*433d6423SLionel Sambuc #include "type.h"
9*433d6423SLionel Sambuc #include "proto.h"
10*433d6423SLionel Sambuc
bdev_cancel(dev_t dev)11*433d6423SLionel Sambuc static void bdev_cancel(dev_t dev)
12*433d6423SLionel Sambuc {
13*433d6423SLionel Sambuc /* Recovering the driver for the given device has failed repeatedly. Mark it as
14*433d6423SLionel Sambuc * permanently unusable, and clean up any associated calls and resources.
15*433d6423SLionel Sambuc */
16*433d6423SLionel Sambuc bdev_call_t *call, *next;
17*433d6423SLionel Sambuc
18*433d6423SLionel Sambuc printf("bdev: giving up on major %d\n", major(dev));
19*433d6423SLionel Sambuc
20*433d6423SLionel Sambuc /* Cancel all pending asynchronous requests. */
21*433d6423SLionel Sambuc call = NULL;
22*433d6423SLionel Sambuc
23*433d6423SLionel Sambuc while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL)
24*433d6423SLionel Sambuc bdev_callback_asyn(call, EDEADSRCDST);
25*433d6423SLionel Sambuc
26*433d6423SLionel Sambuc /* Mark the driver as unusable. */
27*433d6423SLionel Sambuc bdev_driver_clear(dev);
28*433d6423SLionel Sambuc }
29*433d6423SLionel Sambuc
bdev_recover(dev_t dev,int update_endpt)30*433d6423SLionel Sambuc static int bdev_recover(dev_t dev, int update_endpt)
31*433d6423SLionel Sambuc {
32*433d6423SLionel Sambuc /* The IPC subsystem has signaled an error communicating to the driver
33*433d6423SLionel Sambuc * associated with the given device. Try to recover. If 'update_endpt' is set,
34*433d6423SLionel Sambuc * we need to find the new endpoint of the driver first. Return TRUE iff
35*433d6423SLionel Sambuc * recovery has been successful.
36*433d6423SLionel Sambuc */
37*433d6423SLionel Sambuc bdev_call_t *call, *next;
38*433d6423SLionel Sambuc endpoint_t endpt;
39*433d6423SLionel Sambuc int r, active, nr_tries;
40*433d6423SLionel Sambuc
41*433d6423SLionel Sambuc /* Only print output if there is something to recover. Some drivers may be
42*433d6423SLionel Sambuc * shut down and later restarted legitimately, and if they were not in use
43*433d6423SLionel Sambuc * while that happened, there is no need to flood the console with messages.
44*433d6423SLionel Sambuc */
45*433d6423SLionel Sambuc active = bdev_minor_is_open(dev) || bdev_call_iter_maj(dev, NULL, &next);
46*433d6423SLionel Sambuc
47*433d6423SLionel Sambuc if (active)
48*433d6423SLionel Sambuc printf("bdev: recovering from a driver restart on major %d\n",
49*433d6423SLionel Sambuc major(dev));
50*433d6423SLionel Sambuc
51*433d6423SLionel Sambuc for (nr_tries = 0; nr_tries < RECOVER_TRIES; nr_tries++) {
52*433d6423SLionel Sambuc /* First update the endpoint, if necessary. */
53*433d6423SLionel Sambuc if (update_endpt)
54*433d6423SLionel Sambuc (void) bdev_driver_update(dev);
55*433d6423SLionel Sambuc
56*433d6423SLionel Sambuc if ((endpt = bdev_driver_get(dev)) == NONE)
57*433d6423SLionel Sambuc break;
58*433d6423SLionel Sambuc
59*433d6423SLionel Sambuc /* If anything goes wrong, update the endpoint again next time. */
60*433d6423SLionel Sambuc update_endpt = TRUE;
61*433d6423SLionel Sambuc
62*433d6423SLionel Sambuc /* Reopen all minor devices on the new driver. */
63*433d6423SLionel Sambuc if ((r = bdev_minor_reopen(dev)) != OK) {
64*433d6423SLionel Sambuc /* If the driver died again, we may give it another try. */
65*433d6423SLionel Sambuc if (r == EDEADSRCDST)
66*433d6423SLionel Sambuc continue;
67*433d6423SLionel Sambuc
68*433d6423SLionel Sambuc /* If another error occurred, we cannot continue using the
69*433d6423SLionel Sambuc * driver as is, but we also cannot force it to restart.
70*433d6423SLionel Sambuc */
71*433d6423SLionel Sambuc break;
72*433d6423SLionel Sambuc }
73*433d6423SLionel Sambuc
74*433d6423SLionel Sambuc /* Resend all asynchronous requests. */
75*433d6423SLionel Sambuc call = NULL;
76*433d6423SLionel Sambuc
77*433d6423SLionel Sambuc while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL) {
78*433d6423SLionel Sambuc /* It is not strictly necessary that we manage to reissue all
79*433d6423SLionel Sambuc * asynchronous requests successfully. We can fail them on an
80*433d6423SLionel Sambuc * individual basis here, without affecting the overall
81*433d6423SLionel Sambuc * recovery. Note that we will never get new IPC failures here.
82*433d6423SLionel Sambuc */
83*433d6423SLionel Sambuc if ((r = bdev_restart_asyn(call)) != OK)
84*433d6423SLionel Sambuc bdev_callback_asyn(call, r);
85*433d6423SLionel Sambuc }
86*433d6423SLionel Sambuc
87*433d6423SLionel Sambuc /* Recovery seems successful. We can now reissue the current
88*433d6423SLionel Sambuc * synchronous request (if any), and continue normal operation.
89*433d6423SLionel Sambuc */
90*433d6423SLionel Sambuc if (active)
91*433d6423SLionel Sambuc printf("bdev: recovery successful, new driver at %d\n", endpt);
92*433d6423SLionel Sambuc
93*433d6423SLionel Sambuc return TRUE;
94*433d6423SLionel Sambuc }
95*433d6423SLionel Sambuc
96*433d6423SLionel Sambuc /* Recovery failed repeatedly. Give up on this driver. */
97*433d6423SLionel Sambuc bdev_cancel(dev);
98*433d6423SLionel Sambuc
99*433d6423SLionel Sambuc return FALSE;
100*433d6423SLionel Sambuc }
101*433d6423SLionel Sambuc
bdev_update(dev_t dev,char * label)102*433d6423SLionel Sambuc void bdev_update(dev_t dev, char *label)
103*433d6423SLionel Sambuc {
104*433d6423SLionel Sambuc /* Set the endpoint for a driver. Perform recovery if necessary.
105*433d6423SLionel Sambuc */
106*433d6423SLionel Sambuc endpoint_t endpt, old_endpt;
107*433d6423SLionel Sambuc
108*433d6423SLionel Sambuc old_endpt = bdev_driver_get(dev);
109*433d6423SLionel Sambuc
110*433d6423SLionel Sambuc endpt = bdev_driver_set(dev, label);
111*433d6423SLionel Sambuc
112*433d6423SLionel Sambuc /* If updating the driver causes an endpoint change, we need to perform
113*433d6423SLionel Sambuc * recovery, but not update the endpoint yet again.
114*433d6423SLionel Sambuc */
115*433d6423SLionel Sambuc if (old_endpt != NONE && old_endpt != endpt)
116*433d6423SLionel Sambuc bdev_recover(dev, FALSE /*update_endpt*/);
117*433d6423SLionel Sambuc }
118*433d6423SLionel Sambuc
int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t id)
{
/* Send an asynchronous request for the given device. This function will never
 * get any new IPC errors sending to the driver. If sending an asynchronous
 * request fails, we will find out through other ways later.
 *
 * dev:    the device whose driver is to receive the request.
 * m_orig: the request message; it is copied, so the caller's copy is left
 *         untouched.
 * id:     the identifier stored in the copy's request ID field.
 *
 * Returns EDEADSRCDST if there is no usable driver endpoint, and the result
 * of asynsend() otherwise.
 */
  endpoint_t endpt;
  message m;
  int r;

  /* If we have no usable driver endpoint, fail instantly. */
  if ((endpt = bdev_driver_get(dev)) == NONE)
	return EDEADSRCDST;

  /* Work on a private copy, so that the ID can be filled in. */
  m = *m_orig;
  m.m_lbdev_lblockdriver_msg.id = id;

  r = asynsend(endpt, &m);

  if (r != OK)
	printf("bdev: asynsend to driver (%d) failed (%d)\n", endpt, r);

  return r;
}
143*433d6423SLionel Sambuc
bdev_sendrec(dev_t dev,const message * m_orig)144*433d6423SLionel Sambuc int bdev_sendrec(dev_t dev, const message *m_orig)
145*433d6423SLionel Sambuc {
146*433d6423SLionel Sambuc /* Send a synchronous request for the given device, and wait for the reply.
147*433d6423SLionel Sambuc * Return ERESTART if the caller should try to reissue the request.
148*433d6423SLionel Sambuc */
149*433d6423SLionel Sambuc endpoint_t endpt;
150*433d6423SLionel Sambuc message m;
151*433d6423SLionel Sambuc int r;
152*433d6423SLionel Sambuc
153*433d6423SLionel Sambuc /* If we have no usable driver endpoint, fail instantly. */
154*433d6423SLionel Sambuc if ((endpt = bdev_driver_get(dev)) == NONE)
155*433d6423SLionel Sambuc return EDEADSRCDST;
156*433d6423SLionel Sambuc
157*433d6423SLionel Sambuc /* Send the request and block until we receive a reply. */
158*433d6423SLionel Sambuc m = *m_orig;
159*433d6423SLionel Sambuc m.m_lbdev_lblockdriver_msg.id = NO_ID;
160*433d6423SLionel Sambuc
161*433d6423SLionel Sambuc r = ipc_sendrec(endpt, &m);
162*433d6423SLionel Sambuc
163*433d6423SLionel Sambuc /* If communication failed, the driver has died. We assume it will be
164*433d6423SLionel Sambuc * restarted soon after, so we attempt recovery. Upon success, we let the
165*433d6423SLionel Sambuc * caller reissue the synchronous request.
166*433d6423SLionel Sambuc */
167*433d6423SLionel Sambuc if (r == EDEADSRCDST) {
168*433d6423SLionel Sambuc if (!bdev_recover(dev, TRUE /*update_endpt*/))
169*433d6423SLionel Sambuc return EDEADSRCDST;
170*433d6423SLionel Sambuc
171*433d6423SLionel Sambuc return ERESTART;
172*433d6423SLionel Sambuc }
173*433d6423SLionel Sambuc
174*433d6423SLionel Sambuc if (r != OK) {
175*433d6423SLionel Sambuc printf("bdev: IPC to driver (%d) failed (%d)\n", endpt, r);
176*433d6423SLionel Sambuc return r;
177*433d6423SLionel Sambuc }
178*433d6423SLionel Sambuc
179*433d6423SLionel Sambuc if (m.m_type != BDEV_REPLY) {
180*433d6423SLionel Sambuc printf("bdev: driver (%d) sent weird response (%d)\n",
181*433d6423SLionel Sambuc endpt, m.m_type);
182*433d6423SLionel Sambuc return EINVAL;
183*433d6423SLionel Sambuc }
184*433d6423SLionel Sambuc
185*433d6423SLionel Sambuc /* The protocol contract states that no asynchronous reply can satisfy a
186*433d6423SLionel Sambuc * synchronous SENDREC call, so we can never get an asynchronous reply here.
187*433d6423SLionel Sambuc */
188*433d6423SLionel Sambuc if (m.m_lblockdriver_lbdev_reply.id != NO_ID) {
189*433d6423SLionel Sambuc printf("bdev: driver (%d) sent invalid ID (%d)\n", endpt,
190*433d6423SLionel Sambuc m.m_lblockdriver_lbdev_reply.id);
191*433d6423SLionel Sambuc return EINVAL;
192*433d6423SLionel Sambuc }
193*433d6423SLionel Sambuc
194*433d6423SLionel Sambuc /* Unless the caller is misusing libbdev, we will only get ERESTART if we
195*433d6423SLionel Sambuc * have managed to resend a raw block I/O request to the driver after a
196*433d6423SLionel Sambuc * restart, but before VFS has had a chance to reopen the associated device
197*433d6423SLionel Sambuc * first. This is highly exceptional, and hard to deal with correctly. We
198*433d6423SLionel Sambuc * take the easiest route: sleep for a while so that VFS can reopen the
199*433d6423SLionel Sambuc * device, and then resend the request. If the call keeps failing, the caller
200*433d6423SLionel Sambuc * will eventually give up.
201*433d6423SLionel Sambuc */
202*433d6423SLionel Sambuc if (m.m_lblockdriver_lbdev_reply.status == ERESTART) {
203*433d6423SLionel Sambuc printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
204*433d6423SLionel Sambuc endpt);
205*433d6423SLionel Sambuc
206*433d6423SLionel Sambuc micro_delay(1000);
207*433d6423SLionel Sambuc
208*433d6423SLionel Sambuc return ERESTART;
209*433d6423SLionel Sambuc }
210*433d6423SLionel Sambuc
211*433d6423SLionel Sambuc /* Return the result of our request. */
212*433d6423SLionel Sambuc return m.m_lblockdriver_lbdev_reply.status;
213*433d6423SLionel Sambuc }
214*433d6423SLionel Sambuc
static int bdev_receive(dev_t dev, message *m)
{
/* Receive one valid message.
 *
 * dev: the device whose driver endpoint is to be received from.
 * m:   buffer that receives the message.
 *
 * Returns:
 *   OK           a BDEV_REPLY message was stored in 'm'; the caller still has
 *                to check its ID and status;
 *   EDEADSRCDST  there is no usable endpoint, or recovery failed;
 *   ERESTART     receiving failed with EDEADSRCDST DRIVER_TRIES times while
 *                every recovery in between succeeded; no message was
 *                received, but the caller may call again;
 *   EINVAL       a message of a type other than BDEV_REPLY was received;
 *   otherwise    the error code returned by sef_receive().
 */
  endpoint_t endpt;
  int r, nr_tries = 0;

  for (;;) {
	/* Retrieve and check the driver endpoint on every try, as it will
	 * change with each driver restart.
	 */
	if ((endpt = bdev_driver_get(dev)) == NONE)
		return EDEADSRCDST;

	r = sef_receive(endpt, m);

	if (r == EDEADSRCDST) {
		/* If we reached the maximum number of retries, give up. */
		if (++nr_tries == DRIVER_TRIES)
			break;

		/* Attempt recovery. If successful, all asynchronous requests
		 * will have been resent, and we can retry receiving a reply.
		 */
		if (!bdev_recover(dev, TRUE /*update_endpt*/))
			return EDEADSRCDST;

		continue;
	}

	if (r != OK) {
		printf("bdev: IPC to driver (%d) failed (%d)\n", endpt, r);

		return r;
	}

	if (m->m_type != BDEV_REPLY) {
		printf("bdev: driver (%d) sent weird response (%d)\n",
			endpt, m->m_type);
		return EINVAL;
	}

	/* The caller is responsible for checking the ID and status. */
	return OK;
  }

  /* All tries failed, even though all recovery attempts succeeded. In this
   * case, we let the caller recheck whether it wants to keep calling us,
   * returning ERESTART to indicate we can be called again but did not actually
   * receive a message.
   */
  return ERESTART;
}
268*433d6423SLionel Sambuc
void bdev_reply_asyn(message *m)
{
/* A reply has come in from a disk driver.
 *
 * m: the received message; the caller must make sure that its type is
 *    BDEV_REPLY.
 *
 * The reply is matched to its asynchronous call by request ID. Replies with an
 * unknown ID, or from an endpoint other than the current driver of the call's
 * device, are dropped. An ERESTART status causes the request to be reissued
 * after a short delay; any other status is passed to the asynchronous
 * callback.
 */
  bdev_call_t *call;
  endpoint_t endpt;
  bdev_id_t id;
  int r;

  /* This is a requirement for the caller. */
  assert(m->m_type == BDEV_REPLY);

  /* Get the corresponding asynchronous call structure. */
  id = m->m_lblockdriver_lbdev_reply.id;

  if ((call = bdev_call_get(id)) == NULL) {
	printf("bdev: driver (%d) replied to unknown request (%d)\n",
		m->m_source, id);
	return;
  }

  /* Make sure the reply was sent from the right endpoint. */
  endpt = bdev_driver_get(call->dev);

  if (m->m_source != endpt) {
	/* If the endpoint is NONE, this may be a stray reply. */
	if (endpt != NONE)
		printf("bdev: driver (%d) replied to request not sent to it\n",
			m->m_source);
	return;
  }

  /* See the ERESTART comment in bdev_sendrec(). */
  if (m->m_lblockdriver_lbdev_reply.status == ERESTART) {
	printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
		endpt);

	micro_delay(1000);

	/* Reissue the request; if that fails, complete the call with the
	 * resulting error instead.
	 */
	if ((r = bdev_restart_asyn(call)) != OK)
		bdev_callback_asyn(call, r);

	return;
  }

  bdev_callback_asyn(call, m->m_lblockdriver_lbdev_reply.status);
}
316*433d6423SLionel Sambuc
int bdev_wait_asyn(bdev_id_t id)
{
/* Wait for an asynchronous request to complete.
 *
 * id: the identifier of the asynchronous call to wait for.
 *
 * Returns ENOENT if no call with this ID exists, OK once the call can no
 * longer be found, or the error from bdev_receive() if receiving fails with
 * anything other than ERESTART.
 */
  bdev_call_t *call;
  dev_t dev;
  message m;
  int r;

  if ((call = bdev_call_get(id)) == NULL)
	return ENOENT;

  /* Save the device now: 'call' is not dereferenced again below, where the
   * call structure may be released while replies are being processed.
   */
  dev = call->dev;

  do {
	if ((r = bdev_receive(dev, &m)) != OK && r != ERESTART)
		return r;

	/* Processing the reply will free up the call structure as a side
	 * effect. If we repeatedly get ERESTART, we will repeatedly resend the
	 * asynchronous request, which will then eventually hit the retry limit
	 * and we will break out of the loop.
	 */
	if (r == OK)
		bdev_reply_asyn(&m);

  } while (bdev_call_get(id) != NULL);

  return OK;
}
347