xref: /minix3/minix/servers/vfs/sdev.c (revision 27852ebe53d5bf221cf5058cb7e858fa8fa8895e)
1 /*
2  * This file implements the lower socket layer of VFS: communication with
3  * socket drivers.  Socket driver communication evolved out of character driver
4  * communication, and the two have many similarities between them.  Most
5  * importantly, socket driver communication also has the distinction between
6  * short-lived and long-lived requests.
7  *
8  * Short-lived requests are expected to be replied to by the socket driver
9  * immediately in all cases.  For such requests, VFS keeps the worker thread
10  * for the calling process alive until the reply arrives.  In contrast,
11  * long-lived requests may block.  For such requests, VFS suspends the calling
12  * process until a reply comes in, or until a signal interrupts the request.
13  * Both short-lived and long-lived requests may be aborted if VFS finds that
14  * the corresponding socket driver has died.  Even though long-lived requests
15  * may be marked as nonblocking, nonblocking calls are still handled as
16  * long-lived in terms of VFS processing.
17  *
18  * For an overview of the socket driver requests and replies, message layouts,
19  * and which requests are long-lived or short-lived (i.e. may suspend or not),
20  * please refer to the corresponding table in the libsockdriver source code.
21  *
22  * For most long-lived socket requests, the main VFS thread processes the reply
23  * from the socket driver.  This typically consists of waking up the user
24  * process that originally issued the system call on the socket by simply
25  * relaying the call's result code.  Some socket calls require a specific reply
26  * message and/or additional post-call actions; for those, resume_*() calls are
27  * made back into the upper socket layer.
28  *
29  * If a process is interrupted by a signal, any ongoing long-lived socket
30  * request must be canceled.  This is done by sending a one-way cancel request
31  * to the socket driver, and waiting for it to reply to the original request.
32  * In this case, the reply will be processed from the worker thread that is
33  * handling the cancel operation.  Canceling does not imply call failure: the
34  * cancellation may result in a partial I/O reply, and a successful reply may
35  * cross the cancel request.
36  *
37  * One main exception is the reply to an accept request.  Once a connection has
38  * been accepted, a new socket has to be created for it.  This requires actions
39  * that require the ability to block the current thread, and so, a worker
40  * thread is spawned for processing successful accept replies, unless the reply
41  * was received from a worker thread already (as may be the case if the accept
42  * request was being canceled).
43  */
44 
45 #include "fs.h"
46 #include <sys/socket.h>
47 #include <minix/callnr.h>
48 
49 /*
50  * Send a short-lived request message to the given socket driver, and suspend
51  * the current worker thread until a reply message has been received.  On
52  * success, the function will return OK, and the reply message will be stored
53  * in the message structure pointed to by 'm_ptr'.  The function may fail if
54  * the socket driver dies before sending a reply.  In that case, the function
55  * will return a negative error code, and also store the same negative error
56  * code in the m_type field of the 'm_ptr' message structure.
57  */
58 static int
59 sdev_sendrec(struct smap * sp, message * m_ptr)
60 {
61 	int r;
62 
63 	/* Send the request to the driver. */
64 	if ((r = asynsend3(sp->smap_endpt, m_ptr, AMF_NOREPLY)) != OK)
65 		panic("VFS: asynsend in sdev_sendrec failed: %d", r);
66 
67 	/* Suspend this thread until we have received the response. */
68 	self->w_task = sp->smap_endpt;
69 	self->w_drv_sendrec = m_ptr;
70 
71 	worker_wait();
72 
73 	self->w_task = NONE;
74 	assert(self->w_drv_sendrec == NULL);
75 
76 	return (!IS_SDEV_RS(m_ptr->m_type)) ? m_ptr->m_type : OK;
77 }
78 
79 /*
80  * Suspend the current process for later completion of its system call.
81  */
82 int
83 sdev_suspend(dev_t dev, cp_grant_id_t grant0, cp_grant_id_t grant1,
84 	cp_grant_id_t grant2, int fd, vir_bytes buf)
85 {
86 
87 	fp->fp_sdev.dev = dev;
88 	fp->fp_sdev.callnr = job_call_nr;
89 	fp->fp_sdev.grant[0] = grant0;
90 	fp->fp_sdev.grant[1] = grant1;
91 	fp->fp_sdev.grant[2] = grant2;
92 
93 	if (job_call_nr == VFS_ACCEPT) {
94 		assert(fd != -1);
95 		assert(buf == 0);
96 		fp->fp_sdev.aux.fd = fd;
97 	} else if (job_call_nr == VFS_RECVMSG) {
98 		assert(fd == -1);
99 		/*
100 		 * TODO: we are not yet consistent enough in dealing with
101 		 * mapped NULL pages to have an assert(buf != 0) here..
102 		 */
103 		fp->fp_sdev.aux.buf = buf;
104 	} else {
105 		assert(fd == -1);
106 		assert(buf == 0);
107 	}
108 
109 	suspend(FP_BLOCKED_ON_SDEV);
110 	return SUSPEND;
111 }
112 
113 /*
114  * Create a socket or socket pair.  Return OK on success, with the new socket
115  * device identifier(s) stored in the 'dev' array.  Return an error code upon
116  * failure.
117  */
118 int
119 sdev_socket(int domain, int type, int protocol, dev_t * dev, int pair)
120 {
121 	struct smap *sp;
122 	message m;
123 	sockid_t sock_id, sock_id2;
124 	int r;
125 
126 	/* We could return EAFNOSUPPORT, but the caller should have checked. */
127 	if ((sp = get_smap_by_domain(domain)) == NULL)
128 		panic("VFS: sdev_socket for unknown domain");
129 
130 	/* Prepare the request message. */
131 	memset(&m, 0, sizeof(m));
132 	m.m_type = pair ? SDEV_SOCKETPAIR : SDEV_SOCKET;
133 	m.m_vfs_lsockdriver_socket.req_id = (sockid_t)who_e;
134 	m.m_vfs_lsockdriver_socket.domain = domain;
135 	m.m_vfs_lsockdriver_socket.type = type;
136 	m.m_vfs_lsockdriver_socket.protocol = protocol;
137 	m.m_vfs_lsockdriver_socket.user_endpt = who_e;
138 
139 	/* Send the request, and wait for the reply. */
140 	if ((r = sdev_sendrec(sp, &m)) != OK)
141 		return r;	/* socket driver died */
142 
143 	/* Parse the reply message, and check for protocol errors. */
144 	if (m.m_type != SDEV_SOCKET_REPLY) {
145 		printf("VFS: %d sent bad reply type %d for call %d\n",
146 		    sp->smap_endpt, m.m_type, job_call_nr);
147 		return EIO;
148 	}
149 
150 	sock_id = m.m_lsockdriver_vfs_socket_reply.sock_id;
151 	sock_id2 = m.m_lsockdriver_vfs_socket_reply.sock_id2;
152 
153 	/* Check for regular errors.  Upon success, return the socket(s). */
154 	if (sock_id < 0)
155 		return sock_id;
156 
157 	dev[0] = make_smap_dev(sp, sock_id);
158 
159 	if (pair) {
160 		/* Okay, one more protocol error. */
161 		if (sock_id2 < 0) {
162 			printf("VFS: %d sent bad SOCKETPAIR socket ID %d\n",
163 			    sp->smap_endpt, sock_id2);
164 			(void)sdev_close(dev[0], FALSE /*may_suspend*/);
165 			return EIO;
166 		}
167 
168 		dev[1] = make_smap_dev(sp, sock_id2);
169 	}
170 
171 	return OK;
172 }
173 
174 /*
175  * Bind or connect a socket to a particular address.  These calls may block, so
176  * suspend the current process instead of making the thread wait for the reply.
177  */
178 static int
179 sdev_bindconn(dev_t dev, int type, vir_bytes addr, unsigned int addr_len,
180 	int filp_flags)
181 {
182 	struct smap *sp;
183 	sockid_t sock_id;
184 	cp_grant_id_t grant;
185 	message m;
186 	int r;
187 
188 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
189 		return EIO;
190 
191 	/* Allocate resources. */
192 	grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, addr_len,
193 	    CPF_READ);
194 	if (!GRANT_VALID(grant))
195 		panic("VFS: cpf_grant_magic failed");
196 
197 	/* Prepare the request message. */
198 	memset(&m, 0, sizeof(m));
199 	m.m_type = type;
200 	m.m_vfs_lsockdriver_addr.req_id = (sockid_t)who_e;
201 	m.m_vfs_lsockdriver_addr.sock_id = sock_id;
202 	m.m_vfs_lsockdriver_addr.grant = grant;
203 	m.m_vfs_lsockdriver_addr.len = addr_len;
204 	m.m_vfs_lsockdriver_addr.user_endpt = who_e;
205 	m.m_vfs_lsockdriver_addr.sflags =
206 	    (filp_flags & O_NONBLOCK) ? SDEV_NONBLOCK : 0;
207 
208 	/* Send the request to the driver. */
209 	if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK)
210 		panic("VFS: asynsend in sdev_bindconn failed: %d", r);
211 
212 	/* Suspend the process until the reply arrives. */
213 	return sdev_suspend(dev, grant, GRANT_INVALID, GRANT_INVALID, -1, 0);
214 }
215 
216 /*
217  * Bind a socket to a local address.
218  */
219 int
220 sdev_bind(dev_t dev, vir_bytes addr, unsigned int addr_len, int filp_flags)
221 {
222 
223 	return sdev_bindconn(dev, SDEV_BIND, addr, addr_len, filp_flags);
224 }
225 
226 /*
227  * Connect a socket to a remote address.
228  */
229 int
230 sdev_connect(dev_t dev, vir_bytes addr, unsigned int addr_len, int filp_flags)
231 {
232 
233 	return sdev_bindconn(dev, SDEV_CONNECT, addr, addr_len, filp_flags);
234 }
235 
236 /*
237  * Send and receive a "simple" request: listen, shutdown, or close.  Note that
238  * while cancel requests use the same request format, they require a different
239  * way of handling their replies.
240  */
241 static int
242 sdev_simple(dev_t dev, int type, int param)
243 {
244 	struct smap *sp;
245 	sockid_t sock_id;
246 	message m;
247 	int r;
248 
249 	assert(type == SDEV_LISTEN || type == SDEV_SHUTDOWN ||
250 	    type == SDEV_CLOSE);
251 
252 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
253 		return EIO;
254 
255 	/* Prepare the request message. */
256 	memset(&m, 0, sizeof(m));
257 	m.m_type = type;
258 	m.m_vfs_lsockdriver_simple.req_id = (sockid_t)who_e;
259 	m.m_vfs_lsockdriver_simple.sock_id = sock_id;
260 	m.m_vfs_lsockdriver_simple.param = param;
261 
262 	/* Send the request, and wait for the reply. */
263 	if ((r = sdev_sendrec(sp, &m)) != OK)
264 		return r;	/* socket driver died */
265 
266 	/* Parse and return the reply. */
267 	if (m.m_type != SDEV_REPLY) {
268 		printf("VFS: %d sent bad reply type %d for call %d\n",
269 		    sp->smap_endpt, m.m_type, job_call_nr);
270 		return EIO;
271 	}
272 
273 	return m.m_lsockdriver_vfs_reply.status;
274 }
275 
276 /*
277  * Put a socket in listening mode.
278  */
279 int
280 sdev_listen(dev_t dev, int backlog)
281 {
282 
283 	assert(backlog >= 0);
284 
285 	return sdev_simple(dev, SDEV_LISTEN, backlog);
286 }
287 
288 /*
289  * Accept a new connection on a socket.
290  */
291 int
292 sdev_accept(dev_t dev, vir_bytes addr, unsigned int addr_len, int filp_flags,
293 	int listen_fd)
294 {
295 	struct smap *sp;
296 	sockid_t sock_id;
297 	cp_grant_id_t grant;
298 	message m;
299 	int r;
300 
301 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
302 		return EIO;
303 
304 	/* Allocate resources. */
305 	if (addr != 0) {
306 		grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, addr_len,
307 		    CPF_WRITE);
308 		if (!GRANT_VALID(grant))
309 			panic("VFS: cpf_grant_magic failed");
310 	} else
311 		grant = GRANT_INVALID;
312 
313 	/* Prepare the request message. */
314 	memset(&m, 0, sizeof(m));
315 	m.m_type = SDEV_ACCEPT;
316 	m.m_vfs_lsockdriver_addr.req_id = (sockid_t)who_e;
317 	m.m_vfs_lsockdriver_addr.sock_id = sock_id;
318 	m.m_vfs_lsockdriver_addr.grant = grant;
319 	m.m_vfs_lsockdriver_addr.len = addr_len;
320 	m.m_vfs_lsockdriver_addr.user_endpt = who_e;
321 	m.m_vfs_lsockdriver_addr.sflags =
322 	    (filp_flags & O_NONBLOCK) ? SDEV_NONBLOCK : 0;
323 
324 	/* Send the request to the driver. */
325 	if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK)
326 		panic("VFS: asynsend in sdev_accept failed: %d", r);
327 
328 	/* Suspend the process until the reply arrives. */
329 	return sdev_suspend(dev, grant, GRANT_INVALID, GRANT_INVALID,
330 	    listen_fd, 0);
331 }
332 
333 /*
334  * Send or receive a message on a socket.  All read (read(2), recvfrom(2), and
335  * recvmsg(2)) and write (write(2), sendto(2), sendmsg(2)) system calls on
336  * sockets pass through this function.  The function is named sdev_readwrite
337  * rather than sdev_sendrecv to avoid confusion with sdev_sendrec.
338  */
339 int
340 sdev_readwrite(dev_t dev, vir_bytes data_buf, size_t data_len,
341 	vir_bytes ctl_buf, unsigned int ctl_len, vir_bytes addr_buf,
342 	unsigned int addr_len, int flags, int rw_flag, int filp_flags,
343 	vir_bytes user_buf)
344 {
345 	struct smap *sp;
346 	sockid_t sock_id;
347 	cp_grant_id_t data_grant, ctl_grant, addr_grant;
348 	message m;
349 	int r, bits;
350 
351 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
352 		return EIO;
353 
354 	/* Allocate resources. */
355 	data_grant = GRANT_INVALID;
356 	ctl_grant = GRANT_INVALID;
357 	addr_grant = GRANT_INVALID;
358 	bits = (rw_flag == WRITING) ? CPF_READ : CPF_WRITE;
359 
360 	/*
361 	 * Supposedly it is allowed to send or receive zero data bytes, even
362 	 * though it is a bad idea as the return value will then be zero, which
363 	 * may also indicate EOF (as per W. Richard Stevens).
364 	 */
365 	if (data_buf != 0) {
366 		data_grant = cpf_grant_magic(sp->smap_endpt, who_e, data_buf,
367 		    data_len, bits);
368 		if (!GRANT_VALID(data_grant))
369 			panic("VFS: cpf_grant_magic failed");
370 	}
371 
372 	if (ctl_buf != 0) {
373 		ctl_grant = cpf_grant_magic(sp->smap_endpt, who_e, ctl_buf,
374 		    ctl_len, bits);
375 		if (!GRANT_VALID(ctl_grant))
376 			panic("VFS: cpf_grant_magic failed");
377 	}
378 
379 	if (addr_buf != 0) {
380 		addr_grant = cpf_grant_magic(sp->smap_endpt, who_e, addr_buf,
381 		    addr_len, bits);
382 		if (!GRANT_VALID(addr_grant))
383 			panic("VFS: cpf_grant_magic failed");
384 	}
385 
386 	/* Prepare the request message. */
387 	memset(&m, 0, sizeof(m));
388 	m.m_type = (rw_flag == WRITING) ? SDEV_SEND : SDEV_RECV;
389 	m.m_vfs_lsockdriver_sendrecv.req_id = (sockid_t)who_e;
390 	m.m_vfs_lsockdriver_sendrecv.sock_id = sock_id;
391 	m.m_vfs_lsockdriver_sendrecv.data_grant = data_grant;
392 	m.m_vfs_lsockdriver_sendrecv.data_len = data_len;
393 	m.m_vfs_lsockdriver_sendrecv.ctl_grant = ctl_grant;
394 	m.m_vfs_lsockdriver_sendrecv.ctl_len = ctl_len;
395 	m.m_vfs_lsockdriver_sendrecv.addr_grant = addr_grant;
396 	m.m_vfs_lsockdriver_sendrecv.addr_len = addr_len;
397 	m.m_vfs_lsockdriver_sendrecv.user_endpt = who_e;
398 	m.m_vfs_lsockdriver_sendrecv.flags = flags;
399 	if (filp_flags & O_NONBLOCK)
400 		m.m_vfs_lsockdriver_sendrecv.flags |= MSG_DONTWAIT;
401 	if (rw_flag == WRITING && (filp_flags & O_NOSIGPIPE))
402 		m.m_vfs_lsockdriver_sendrecv.flags |= MSG_NOSIGNAL;
403 
404 	/* Send the request to the driver. */
405 	if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK)
406 		panic("VFS: asynsend in sdev_readwrite failed: %d", r);
407 
408 	/* Suspend the process until the reply arrives. */
409 	return sdev_suspend(dev, data_grant, ctl_grant, addr_grant, -1,
410 	    user_buf);
411 }
412 
413 /*
414  * Perform I/O control.
415  */
416 int
417 sdev_ioctl(dev_t dev, unsigned long request, vir_bytes buf, int filp_flags)
418 {
419 	struct smap *sp;
420 	sockid_t sock_id;
421 	cp_grant_id_t grant;
422 	message m;
423 	int r;
424 
425 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
426 		return EIO;
427 
428 	/* Allocate resources. */
429 	grant = make_ioctl_grant(sp->smap_endpt, who_e, buf, request);
430 
431 	/* Prepare the request message. */
432 	memset(&m, 0, sizeof(m));
433 	m.m_type = SDEV_IOCTL;
434 	m.m_vfs_lsockdriver_ioctl.req_id = (sockid_t)who_e;
435 	m.m_vfs_lsockdriver_ioctl.sock_id = sock_id;
436 	m.m_vfs_lsockdriver_ioctl.request = request;
437 	m.m_vfs_lsockdriver_ioctl.grant = grant;
438 	m.m_vfs_lsockdriver_ioctl.user_endpt = who_e;
439 	m.m_vfs_lsockdriver_ioctl.sflags =
440 	    (filp_flags & O_NONBLOCK) ? SDEV_NONBLOCK : 0;
441 
442 	/* Send the request to the driver. */
443 	if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK)
444 		panic("VFS: asynsend in sdev_ioctl failed: %d", r);
445 
446 	/* Suspend the process until the reply arrives. */
447 	return sdev_suspend(dev, grant, GRANT_INVALID, GRANT_INVALID, -1, 0);
448 }
449 
450 /*
451  * Set socket options.
452  */
453 int
454 sdev_setsockopt(dev_t dev, int level, int name, vir_bytes addr,
455 	unsigned int len)
456 {
457 	struct smap *sp;
458 	sockid_t sock_id;
459 	cp_grant_id_t grant;
460 	message m;
461 	int r;
462 
463 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
464 		return EIO;
465 
466 	/* Allocate resources. */
467 	grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, len, CPF_READ);
468 	if (!GRANT_VALID(grant))
469 		panic("VFS: cpf_grant_magic failed");
470 
471 	/* Prepare the request message. */
472 	memset(&m, 0, sizeof(m));
473 	m.m_type = SDEV_SETSOCKOPT;
474 	m.m_vfs_lsockdriver_getset.req_id = (sockid_t)who_e;
475 	m.m_vfs_lsockdriver_getset.sock_id = sock_id;
476 	m.m_vfs_lsockdriver_getset.level = level;
477 	m.m_vfs_lsockdriver_getset.name = name;
478 	m.m_vfs_lsockdriver_getset.grant = grant;
479 	m.m_vfs_lsockdriver_getset.len = len;
480 
481 	/* Send the request, and wait for the reply. */
482 	r = sdev_sendrec(sp, &m);
483 
484 	/* Free resources. */
485 	(void)cpf_revoke(grant);
486 
487 	if (r != OK)
488 		return r;	/* socket driver died */
489 
490 	/* Parse and return the reply. */
491 	if (m.m_type != SDEV_REPLY) {
492 		printf("VFS: %d sent bad reply type %d for call %d\n",
493 		    sp->smap_endpt, m.m_type, job_call_nr);
494 		return EIO;
495 	}
496 
497 	return m.m_lsockdriver_vfs_reply.status;
498 }
499 
500 /*
501  * Send and receive a "get" request: getsockopt, getsockname, or getpeername.
502  */
503 static int
504 sdev_get(dev_t dev, int type, int level, int name, vir_bytes addr,
505 	unsigned int * len)
506 {
507 	struct smap *sp;
508 	sockid_t sock_id;
509 	cp_grant_id_t grant;
510 	message m;
511 	int r;
512 
513 	assert(type == SDEV_GETSOCKOPT || type == SDEV_GETSOCKNAME ||
514 	    type == SDEV_GETPEERNAME);
515 
516 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
517 		return EIO;
518 
519 	/* Allocate resources. */
520 	grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, *len, CPF_WRITE);
521 	if (!GRANT_VALID(grant))
522 		panic("VFS: cpf_grant_magic failed");
523 
524 	/* Prepare the request message. */
525 	memset(&m, 0, sizeof(m));
526 	m.m_type = type;
527 	m.m_vfs_lsockdriver_getset.req_id = (sockid_t)who_e;
528 	m.m_vfs_lsockdriver_getset.sock_id = sock_id;
529 	m.m_vfs_lsockdriver_getset.level = level;
530 	m.m_vfs_lsockdriver_getset.name = name;
531 	m.m_vfs_lsockdriver_getset.grant = grant;
532 	m.m_vfs_lsockdriver_getset.len = *len;
533 
534 	/* Send the request, and wait for the reply. */
535 	r = sdev_sendrec(sp, &m);
536 
537 	/* Free resources. */
538 	(void)cpf_revoke(grant);
539 
540 	if (r != OK)
541 		return r;	/* socket driver died */
542 
543 	/* Parse and return the reply. */
544 	if (m.m_type != SDEV_REPLY) {
545 		printf("VFS: %d sent bad reply type %d for call %d\n",
546 		    sp->smap_endpt, m.m_type, job_call_nr);
547 		return EIO;
548 	}
549 
550 	if ((r = m.m_lsockdriver_vfs_reply.status) < 0)
551 		return r;
552 
553 	*len = (unsigned int)r;
554 	return OK;
555 }
556 
557 /*
558  * Get socket options.
559  */
560 int
561 sdev_getsockopt(dev_t dev, int level, int name, vir_bytes addr,
562 	unsigned int * len)
563 {
564 
565 	return sdev_get(dev, SDEV_GETSOCKOPT, level, name, addr, len);
566 }
567 
568 /*
569  * Get the local address of a socket.
570  */
571 int
572 sdev_getsockname(dev_t dev, vir_bytes addr, unsigned int * addr_len)
573 {
574 
575 	return sdev_get(dev, SDEV_GETSOCKNAME, 0, 0, addr, addr_len);
576 }
577 
578 /*
579  * Get the remote address of a socket.
580  */
581 int
582 sdev_getpeername(dev_t dev, vir_bytes addr, unsigned int * addr_len)
583 {
584 
585 	return sdev_get(dev, SDEV_GETPEERNAME, 0, 0, addr, addr_len);
586 }
587 
588 /*
589  * Shut down socket send and receive operations.
590  */
591 int
592 sdev_shutdown(dev_t dev, int how)
593 {
594 
595 	assert(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR);
596 
597 	return sdev_simple(dev, SDEV_SHUTDOWN, how);
598 }
599 
600 /*
601  * Close the socket identified by the given socket device number.
602  */
603 int
604 sdev_close(dev_t dev, int may_suspend)
605 {
606 	struct smap *sp;
607 	sockid_t sock_id;
608 	message m;
609 	int r;
610 
611 	/*
612 	 * Originally, all close requests were blocking the calling thread, but
613 	 * the new support for SO_LINGER has changed that.  In a very strictly
614 	 * limited subset of cases - namely, the user process calling close(2),
615 	 * we suspend the close request and handle it asynchronously.  In all
616 	 * other cases, including close-on-exit, close-on-exec, and even dup2,
617 	 * the close is issued as a thread-synchronous request instead.
618 	 */
619 	if (may_suspend) {
620 		if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
621 			return EIO;
622 
623 		/* Prepare the request message. */
624 		memset(&m, 0, sizeof(m));
625 		m.m_type = SDEV_CLOSE;
626 		m.m_vfs_lsockdriver_simple.req_id = (sockid_t)who_e;
627 		m.m_vfs_lsockdriver_simple.sock_id = sock_id;
628 		m.m_vfs_lsockdriver_simple.param = 0;
629 
630 		/* Send the request to the driver. */
631 		if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK)
632 			panic("VFS: asynsend in sdev_bindconn failed: %d", r);
633 
634 		/* Suspend the process until the reply arrives. */
635 		return sdev_suspend(dev, GRANT_INVALID, GRANT_INVALID,
636 		    GRANT_INVALID, -1, 0);
637 	} else
638 		/* Block the calling thread until the socket is closed. */
639 		return sdev_simple(dev, SDEV_CLOSE, SDEV_NONBLOCK);
640 }
641 
642 /*
643  * Initiate a select call on a socket device.  Return OK iff the request was
644  * sent, without suspending the process.
645  */
646 int
647 sdev_select(dev_t dev, int ops)
648 {
649 	struct smap *sp;
650 	sockid_t sock_id;
651 	message m;
652 	int r;
653 
654 	if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL)
655 		return EIO;
656 
657 	/* Prepare the request message. */
658 	memset(&m, 0, sizeof(m));
659 	m.m_type = SDEV_SELECT;
660 	m.m_vfs_lsockdriver_select.sock_id = sock_id;
661 	m.m_vfs_lsockdriver_select.ops = ops;
662 
663 	/* Send the request to the driver. */
664 	if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK)
665 		panic("VFS: asynsend in sdev_select failed: %d", r);
666 
667 	return OK;
668 }
669 
670 /*
671  * A reply has arrived for a previous socket accept request, and the reply
672  * indicates that a socket has been accepted.  A status is also returned;
673  * usually, this status is OK, but if not, the newly accepted socket must be
674  * closed immediately again.  Process the low-level aspects of the reply, and
675  * call resume_accept() to let the upper socket layer handle the rest.  This
676  * function is always called from a worker thread, and may thus block.
677  */
678 static void
679 sdev_finish_accept(struct fproc * rfp, message * m_ptr)
680 {
681 	struct smap *sp;
682 	sockid_t sock_id;
683 	dev_t dev;
684 	unsigned int len;
685 	int status;
686 
687 	assert(rfp->fp_sdev.callnr == VFS_ACCEPT);
688 	assert(m_ptr->m_type == SDEV_ACCEPT_REPLY);
689 	assert(m_ptr->m_lsockdriver_vfs_accept_reply.sock_id >= 0);
690 
691 	/* Free resources.  Accept requests use up to one grant. */
692 	if (GRANT_VALID(rfp->fp_sdev.grant[0]))
693 		cpf_revoke(rfp->fp_sdev.grant[0]);
694 	assert(!GRANT_VALID(rfp->fp_sdev.grant[1]));
695 	assert(!GRANT_VALID(rfp->fp_sdev.grant[2]));
696 
697 	sock_id = m_ptr->m_lsockdriver_vfs_accept_reply.sock_id;
698 	status = m_ptr->m_lsockdriver_vfs_accept_reply.status;
699 	len = m_ptr->m_lsockdriver_vfs_accept_reply.len;
700 
701 	/*
702 	 * We do not want the upper socket layer (socket.c) to deal with smap
703 	 * and socket ID details, so we construct the new socket device number
704 	 * here.  We won't use the saved listen FD to determine the smap entry
705 	 * here, since that involves file pointers and other upper-layer-only
706 	 * stuff.  So we have to look it up by the source endpoint.  As a
707 	 * result, we detect some driver deaths here (but not all: see below).
708 	 */
709 	if ((sp = get_smap_by_endpt(m_ptr->m_source)) != NULL) {
710 		/* Leave 'status' as is, regardless of whether it is OK. */
711 		dev = make_smap_dev(sp, sock_id);
712 	} else {
713 		/*
714 		 * The driver must have died while the thread was blocked on
715 		 * activation.  Extremely rare, but theoretically possible.
716 		 * Some driver deaths are indicated only by a driver-up
717 		 * announcement though; resume_accept() will detect this by
718 		 * checking that the listening socket has not been invalidated.
719 		 */
720 		status = EIO;
721 		dev = NO_DEV;
722 	}
723 
724 	/* Let the upper socket layer handle the rest. */
725 	resume_accept(rfp, status, dev, len, rfp->fp_sdev.aux.fd);
726 }
727 
728 /*
729  * Worker thread stub for finishing successful accept requests.
730  */
731 static void
732 do_accept_reply(void)
733 {
734 
735 	sdev_finish_accept(fp, &job_m_in);
736 }
737 
738 /*
739  * With the exception of successful accept requests, this function is called
740  * whenever a reply is received for a socket driver request for which the
741  * corresponding user process was suspended (as opposed to requests which just
742  * suspend the worker thread), i.e., for long-lasting socket calls.  This
743  * function is also called if the socket driver has died during a long-lasting
744  * socket call, in which case the given message's m_type is a negative error
745  * code.
746  *
747  * The division between the upper socket layer (socket.c) and the lower socket
748  * layer (this file) here is roughly: if resuming the system call involves no
749  * more than a simple replycode() call, do that here; otherwise call into the
750  * upper socket layer to handle the details.  In any case, do not ever let the
751  * upper socket layer deal with reply message parsing or suspension state.
752  *
753  * This function may or may not be called from a worker thread; as such, it
754  * MUST NOT block its calling thread.  This function is called for failed
755  * accept requests; successful accept requests have their replies routed
756  * through sdev_finish_accept() instead, because those require a worker thread.
757  */
758 static void
759 sdev_finish(struct fproc * rfp, message * m_ptr)
760 {
761 	unsigned int ctl_len, addr_len;
762 	int callnr, status, flags;
763 
764 	/* The suspension status must just have been cleared by the caller. */
765 	assert(rfp->fp_blocked_on == FP_BLOCKED_ON_NONE);
766 
767 	/*
768 	 * Free resources.  Every suspending call sets all grant fields, so we
769 	 * can safely revoke all of them without testing the original call.
770 	 */
771 	if (GRANT_VALID(rfp->fp_sdev.grant[0]))
772 		cpf_revoke(rfp->fp_sdev.grant[0]);
773 	if (GRANT_VALID(rfp->fp_sdev.grant[1]))
774 		cpf_revoke(rfp->fp_sdev.grant[1]);
775 	if (GRANT_VALID(rfp->fp_sdev.grant[2]))
776 		cpf_revoke(rfp->fp_sdev.grant[2]);
777 
778 	/*
779 	 * Now that the socket driver call has finished (or been stopped due to
780 	 * driver death), we need to finish the corresponding system call from
781 	 * the user process.  The action to take depends on the system call.
782 	 */
783 	callnr = rfp->fp_sdev.callnr;
784 
785 	switch (callnr) {
786 	case VFS_BIND:
787 	case VFS_CONNECT:
788 	case VFS_WRITE:
789 	case VFS_SENDTO:
790 	case VFS_SENDMSG:
791 	case VFS_IOCTL:
792 	case VFS_CLOSE:
793 		/*
794 		 * These calls all use the same SDEV_REPLY reply type and only
795 		 * need to reply an OK-or-error status code back to userland.
796 		 */
797 		if (m_ptr->m_type == SDEV_REPLY) {
798 			status = m_ptr->m_lsockdriver_vfs_reply.status;
799 
800 			/*
801 			 * For close(2) calls, the return value must indicate
802 			 * that the file descriptor has been closed.
803 			 */
804 			if (callnr == VFS_CLOSE &&
805 			    status != OK && status != EINPROGRESS)
806 				status = OK;
807 		} else if (m_ptr->m_type < 0) {
808 			status = m_ptr->m_type;
809 		} else {
810 			printf("VFS: %d sent bad reply type %d for call %d\n",
811 			    m_ptr->m_source, m_ptr->m_type, callnr);
812 			status = EIO;
813 		}
814 		replycode(rfp->fp_endpoint, status);
815 		break;
816 
817 	case VFS_READ:
818 	case VFS_RECVFROM:
819 	case VFS_RECVMSG:
820 		/*
821 		 * These calls use SDEV_RECV_REPLY.  The action to take depends
822 		 * on the exact call.
823 		 */
824 		ctl_len = addr_len = 0;
825 		flags = 0;
826 		if (m_ptr->m_type == SDEV_RECV_REPLY) {
827 			status = m_ptr->m_lsockdriver_vfs_recv_reply.status;
828 			ctl_len = m_ptr->m_lsockdriver_vfs_recv_reply.ctl_len;
829 			addr_len =
830 			    m_ptr->m_lsockdriver_vfs_recv_reply.addr_len;
831 			flags = m_ptr->m_lsockdriver_vfs_recv_reply.flags;
832 		} else if (m_ptr->m_type < 0) {
833 			status = m_ptr->m_type;
834 		} else {
835 			printf("VFS: %d sent bad reply type %d for call %d\n",
836 			    m_ptr->m_source, m_ptr->m_type, callnr);
837 			status = EIO;
838 		}
839 
840 		switch (callnr) {
841 		case VFS_READ:
842 			replycode(rfp->fp_endpoint, status);
843 			break;
844 		case VFS_RECVFROM:
845 			resume_recvfrom(rfp, status, addr_len);
846 			break;
847 		case VFS_RECVMSG:
848 			resume_recvmsg(rfp, status, ctl_len, addr_len, flags,
849 			    rfp->fp_sdev.aux.buf);
850 			break;
851 		}
852 		break;
853 
854 	case VFS_ACCEPT:
855 		/*
856 		 * This call uses SDEV_ACCEPT_REPLY.  We only get here if the
857 		 * accept call has failed without creating a new socket, in
858 		 * which case we can simply call replycode() with the error.
859 		 * For nothing other than consistency, we let resume_accept()
860 		 * handle this case too.
861 		 */
862 		addr_len = 0;
863 		if (m_ptr->m_type == SDEV_ACCEPT_REPLY) {
864 			assert(m_ptr->m_lsockdriver_vfs_accept_reply.sock_id <
865 			    0);
866 			status = m_ptr->m_lsockdriver_vfs_accept_reply.status;
867 			addr_len = m_ptr->m_lsockdriver_vfs_accept_reply.len;
868 		} else if (m_ptr->m_type < 0) {
869 			status = m_ptr->m_type;
870 		} else {
871 			printf("VFS: %d sent bad reply type %d for call %d\n",
872 			    m_ptr->m_source, m_ptr->m_type, callnr);
873 			status = EIO;
874 		}
875 		/*
876 		 * Quick rundown of m_lsockdriver_vfs_accept_reply cases:
877 		 *
878 		 * - sock_id >= 0, status == OK: new socket accepted
879 		 * - sock_id >= 0, status != OK: new socket must be closed
880 		 * - sock_id < 0, status != OK: failure accepting socket
881 		 * - sock_id < 0, status == OK: invalid, covered right here
882 		 *
883 		 * See libsockdriver for why there are two reply fields at all.
884 		 */
885 		if (status >= 0) {
886 			printf("VFS: %d sent bad status %d for call %d\n",
887 			    m_ptr->m_source, status, callnr);
888 			status = EIO;
889 		}
890 		resume_accept(rfp, status, NO_DEV, addr_len,
891 		    rfp->fp_sdev.aux.fd);
892 		break;
893 
894 	default:
895 		/*
896 		 * Ultimately, enumerating all system calls that may cause
897 		 * socket I/O may prove too cumbersome.  In that case, the
898 		 * callnr field could be replaced by a field that stores the
899 		 * combination of the expected reply type and the action to
900 		 * take, for example.
901 		 */
902 		panic("VFS: socket reply %d for unknown call %d from %d",
903 		    m_ptr->m_type, callnr, rfp->fp_endpoint);
904 	}
905 }
906 
907 /*
908  * Abort the suspended socket call for the given process, because the
909  * corresponding socket driver has died.
910  */
911 void
912 sdev_stop(struct fproc * rfp)
913 {
914 	message m;
915 
916 	assert(rfp->fp_blocked_on == FP_BLOCKED_ON_SDEV);
917 
918 	rfp->fp_blocked_on = FP_BLOCKED_ON_NONE;
919 
920 	/*
921 	 * We use one single approach both here and when stopping worker
922 	 * threads: the reply message's m_type is set to an error code (always
923 	 * EIO for now) instead of an actual SDEV_ reply code.  We test for
924 	 * this case in non-suspending calls as well as in sdev_finish().
925 	 */
926 	m.m_type = EIO;
927 	sdev_finish(rfp, &m);
928 }
929 
930 /*
931  * Cancel the ongoing long-lasting socket call, because the calling process has
932  * received a caught or terminating signal.  This function is always called
933  * from a worker thread (as part of PM) work, with 'fp' set to the process that
934  * issued the original system call.  The calling function has just unsuspended
935  * the process out of _SDEV blocking state.  The job of this function is to
936  * issue a cancel request and then block until a reply comes in; the reply may
937  * indicate success, in which case it must be handled accordingly.
938  */
939 void
940 sdev_cancel(void)
941 {
942 	struct smap *sp;
943 	message m;
944 	sockid_t sock_id;
945 
946 	/* The suspension status must just have been cleared by the caller. */
947 	assert(fp->fp_blocked_on == FP_BLOCKED_ON_NONE);
948 
949 	if ((sp = get_smap_by_dev(fp->fp_sdev.dev, &sock_id)) != NULL) {
950 		/* Prepare the request message. */
951 		memset(&m, 0, sizeof(m));
952 		m.m_type = SDEV_CANCEL;
953 		m.m_vfs_lsockdriver_simple.req_id = (sockid_t)who_e;
954 		m.m_vfs_lsockdriver_simple.sock_id = sock_id;
955 
956 		/*
957 		 * Send the cancel request, and wait for a reply.  The reply
958 		 * will be for the original request and must be processed
959 		 * accordingly.  It is possible that the original request
960 		 * actually succeeded, because 1) the cancel request resulted
961 		 * in partial success or 2) the original reply and the cancel
962 		 * request crossed each other.  It is because of the second
963 		 * case that a socket driver must not respond at all to a
964 		 * cancel operation for an unknown request.
965 		 */
966 		sdev_sendrec(sp, &m);
967 	} else
968 		m.m_type = EIO;
969 
970 	/*
971 	 * Successful accept requests require special processing, but since we
972 	 * are already operating from a working thread here, we need not spawn
973 	 * an additional worker thread for this case.
974 	 */
975 	if (m.m_type == SDEV_ACCEPT_REPLY &&
976 	    m.m_lsockdriver_vfs_accept_reply.sock_id >= 0)
977 		sdev_finish_accept(fp, &m);
978 	else
979 		sdev_finish(fp, &m);
980 }
981 
982 /*
983  * A socket driver has sent a reply to a socket request.  Process it, by either
984  * waking up an active worker thread, finishing the system call from here, or
985  * (in the exceptional case of accept calls) spawning a new worker thread to
986  * process the reply.  This function MUST NOT block its calling thread.
987  */
988 void
989 sdev_reply(void)
990 {
991 	struct fproc *rfp;
992 	struct smap *sp;
993 	struct worker_thread *wp;
994 	sockid_t req_id = -1;
995 	dev_t dev;
996 	int slot;
997 
998 	if ((sp = get_smap_by_endpt(who_e)) == NULL) {
999 		printf("VFS: ignoring sock dev reply from unknown driver %d\n",
1000 		    who_e);
1001 		return;
1002 	}
1003 
1004 	switch (call_nr) {
1005 	case SDEV_REPLY:
1006 		req_id = m_in.m_lsockdriver_vfs_reply.req_id;
1007 		break;
1008 	case SDEV_SOCKET_REPLY:
1009 		req_id = m_in.m_lsockdriver_vfs_socket_reply.req_id;
1010 		break;
1011 	case SDEV_ACCEPT_REPLY:
1012 		req_id = m_in.m_lsockdriver_vfs_accept_reply.req_id;
1013 		break;
1014 	case SDEV_RECV_REPLY:
1015 		req_id = m_in.m_lsockdriver_vfs_recv_reply.req_id;
1016 		break;
1017 	case SDEV_SELECT1_REPLY:
1018 		dev = make_smap_dev(sp,
1019 		    m_in.m_lsockdriver_vfs_select_reply.sock_id);
1020 		select_sdev_reply1(dev,
1021 		    m_in.m_lsockdriver_vfs_select_reply.status);
1022 		return;
1023 	case SDEV_SELECT2_REPLY:
1024 		dev = make_smap_dev(sp,
1025 		    m_in.m_lsockdriver_vfs_select_reply.sock_id);
1026 		select_sdev_reply2(dev,
1027 		    m_in.m_lsockdriver_vfs_select_reply.status);
1028 		return;
1029 	default:
1030 		printf("VFS: ignoring unknown sock dev reply %d from %d\n",
1031 		    call_nr, who_e);
1032 		return;
1033 	}
1034 
1035 	if (isokendpt((endpoint_t)req_id, &slot) != OK) {
1036 		printf("VFS: ignoring sock dev reply from %d for unknown %d\n",
1037 		    who_e, req_id);
1038 		return;
1039 	}
1040 
1041 	rfp = &fproc[slot];
1042 	wp = rfp->fp_worker;
1043 	if (wp != NULL && wp->w_task == who_e && wp->w_drv_sendrec != NULL) {
1044 		assert(!fp_is_blocked(rfp));
1045 		*wp->w_drv_sendrec = m_in;
1046 		wp->w_drv_sendrec = NULL;
1047 		worker_signal(wp);	/* resume suspended thread */
1048 		/*
1049 		 * It is up to the worker thread to 1) check that the reply is
1050 		 * of the right type for the request, and 2) keep in mind that
1051 		 * the reply type may be EIO in case the socket driver died.
1052 		 */
1053 	} else if (rfp->fp_blocked_on != FP_BLOCKED_ON_SDEV ||
1054 	    get_smap_by_dev(rfp->fp_sdev.dev, NULL) != sp) {
1055 		printf("VFS: ignoring sock dev reply, %d not blocked on %d\n",
1056 		    rfp->fp_endpoint, who_e);
1057 		return;
1058 	} else if (call_nr == SDEV_ACCEPT_REPLY &&
1059 	    m_in.m_lsockdriver_vfs_accept_reply.sock_id >= 0) {
1060 		/*
1061 		 * For accept replies that return a new socket, we need to
1062 		 * spawn a worker thread, because accept calls may block (so
1063 		 * there will no longer be a worker thread) and processing the
1064 		 * reply requires additional blocking calls (which we cannot
1065 		 * issue from the main thread).  This is tricky.  Under no
1066 		 * circumstances may we "lose" a legitimate reply, because this
1067 		 * would lead to resource leaks in the socket driver.  To this
1068 		 * end, we rely on the current worker thread model to
1069 		 * prioritize regular work over PM work.  Still, sdev_cancel()
1070 		 * may end up receiving the accept reply if it was already
1071 		 * blocked waiting for the reply message, and it must then
1072 		 * perform the same tasks.
1073 		 */
1074 		/*
1075 		 * It is possible that if all threads are in use, there is a
1076 		 * "gap" between starting the thread and its activation.  The
1077 		 * main problem for this case is that the socket driver dies
1078 		 * within that gap.  For accepts, we address this with no less
1079 		 * than two checks: 1) in this file, by looking up the smap
1080 		 * entry by the reply source endpoint again - if the entry is
1081 		 * no longer valid, the socket driver must have died; 2) in
1082 		 * socket.c, by revalidating the original listening socket - if
1083 		 * the listening socket has been invalidated, the driver died.
1084 		 *
1085 		 * Since we unsuspend the process now, a socket driver sending
1086 		 * two accept replies in a row may never cause VFS to attempt
1087 		 * spawning two threads; the second reply should be ignored.
1088 		 */
1089 		assert(fp->fp_func == NULL);
1090 
1091 		worker_start(rfp, do_accept_reply, &m_in, FALSE /*use_spare*/);
1092 
1093 		/*
1094 		 * TODO: I just introduced the notion of not using the fp_u
1095 		 * union across yields after unsuspension, but for socket calls
1096 		 * we have a lot of socket state to carry over, so I'm now
1097 		 * immediately violating my own rule again here.  Possible
1098 		 * solutions: 1) introduce another blocking state just to mark
1099 		 * the fp_u union in use (this has side effects though), 2)
1100 		 * introduce a pseudo message type which covers both the accept
1101 		 * reply fields and the fp_u state (do_pending_pipe does this),
1102 		 * or 3) add a fp_flags flag for this purpose.  In any case,
1103 		 * the whole point is that we catch any attempts to reuse fp_u
1104 		 * for other purposes and thus cause state corruption. This
1105 		 * should not happen anyway, but it's too dangerous to leave
1106 		 * entirely unchecked.  --dcvmoole
1107 		 */
1108 		rfp->fp_blocked_on = FP_BLOCKED_ON_NONE;
1109 	} else {
1110 		rfp->fp_blocked_on = FP_BLOCKED_ON_NONE;
1111 
1112 		sdev_finish(rfp, &m_in);
1113 	}
1114 }
1115