xref: /netbsd-src/sys/arch/x86/x86/intr.c (revision 86552e688512bf17cc3d11f6ecaf3a4f833d67a8)
1 /*	$NetBSD: intr.c,v 1.169 2024/09/11 05:17:45 mrg Exp $	*/
2 
3 /*
4  * Copyright (c) 2007, 2008, 2009, 2019 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright 2002 (c) Wasabi Systems, Inc.
34  * All rights reserved.
35  *
36  * Written by Frank van der Linden for Wasabi Systems, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *      This product includes software developed for the NetBSD Project by
49  *      Wasabi Systems, Inc.
50  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
51  *    or promote products derived from this software without specific prior
52  *    written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
57  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
58  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
61  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
62  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
63  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
64  * POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 /*-
68  * Copyright (c) 1991 The Regents of the University of California.
69  * All rights reserved.
70  *
71  * This code is derived from software contributed to Berkeley by
72  * William Jolitz.
73  *
74  * Redistribution and use in source and binary forms, with or without
75  * modification, are permitted provided that the following conditions
76  * are met:
77  * 1. Redistributions of source code must retain the above copyright
78  *    notice, this list of conditions and the following disclaimer.
79  * 2. Redistributions in binary form must reproduce the above copyright
80  *    notice, this list of conditions and the following disclaimer in the
81  *    documentation and/or other materials provided with the distribution.
82  * 3. Neither the name of the University nor the names of its contributors
83  *    may be used to endorse or promote products derived from this software
84  *    without specific prior written permission.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
87  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
88  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
89  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
90  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
91  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
92  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
93  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
94  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
95  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
96  * SUCH DAMAGE.
97  *
98  *	@(#)isa.c	7.2 (Berkeley) 5/13/91
99  */
100 
101 /*-
102  * Copyright (c) 1993, 1994 Charles Hannum.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *	This product includes software developed by the University of
115  *	California, Berkeley and its contributors.
116  * 4. Neither the name of the University nor the names of its contributors
117  *    may be used to endorse or promote products derived from this software
118  *    without specific prior written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
122  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
123  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
124  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
125  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
126  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
127  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
128  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
129  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
130  * SUCH DAMAGE.
131  *
132  *	@(#)isa.c	7.2 (Berkeley) 5/13/91
133  */
134 
135 #include <sys/cdefs.h>
136 __KERNEL_RCSID(0, "$NetBSD: intr.c,v 1.169 2024/09/11 05:17:45 mrg Exp $");
137 
138 #include "opt_acpi.h"
139 #include "opt_intrdebug.h"
140 #include "opt_multiprocessor.h"
141 #include "opt_pci.h"
142 
143 #include <sys/param.h>
144 #include <sys/systm.h>
145 #include <sys/kernel.h>
146 #include <sys/syslog.h>
147 #include <sys/device.h>
148 #include <sys/kmem.h>
149 #include <sys/proc.h>
150 #include <sys/errno.h>
151 #include <sys/intr.h>
152 #include <sys/cpu.h>
153 #include <sys/xcall.h>
154 #include <sys/interrupt.h>
155 #include <sys/reboot.h> /* for AB_VERBOSE */
156 #include <sys/sdt.h>
157 
158 #include <sys/kauth.h>
159 #include <sys/conf.h>
160 
161 #include <uvm/uvm_extern.h>
162 
163 #include <machine/i8259.h>
164 #include <machine/pio.h>
165 
166 #include <x86/intr_private.h>
167 
168 #include "ioapic.h"
169 #include "lapic.h"
170 #include "pci.h"
171 #include "acpica.h"
172 #ifndef XENPV
173 #include "hyperv.h"
174 #if NHYPERV > 0
175 #include <dev/hyperv/hypervvar.h>
176 
177 extern void Xresume_hyperv_hypercall(void);
178 extern void Xrecurse_hyperv_hypercall(void);
179 #endif
180 #endif
181 
182 #if NIOAPIC > 0 || NACPICA > 0
183 #include <machine/i82093var.h>
184 #include <machine/mpbiosvar.h>
185 #include <machine/mpacpi.h>
186 #endif
187 
188 #if NLAPIC > 0
189 #include <machine/i82489var.h>
190 #endif
191 
192 #if NPCI > 0
193 #include <dev/pci/ppbreg.h>
194 #endif
195 
196 #include <x86/pci/msipic.h>
197 #include <x86/pci/pci_msi_machdep.h>
198 
199 #if NPCI == 0 || !defined(__HAVE_PCI_MSI_MSIX)
200 #define msipic_is_msi_pic(PIC)	(false)
201 #endif
202 
203 #include <ddb/db_active.h>
204 
205 #ifdef DDB
206 #include <ddb/db_output.h>
207 #endif
208 
209 #ifdef INTRDEBUG
210 #define DPRINTF(msg) printf msg
211 #else
212 #define DPRINTF(msg)
213 #endif
214 
215 static SIMPLEQ_HEAD(, intrsource) io_interrupt_sources =
216 	SIMPLEQ_HEAD_INITIALIZER(io_interrupt_sources);
217 
218 static kmutex_t intr_distribute_lock;
219 
220 static int intr_allocate_slot_cpu(struct cpu_info *, struct pic *, int, int *,
221 				  struct intrsource *);
222 static int __noinline intr_allocate_slot(struct pic *, int, int,
223 					 struct cpu_info **, int *, int *,
224 					 struct intrsource *);
225 
226 static void intr_source_free(struct cpu_info *, int, struct pic *, int);
227 
228 static void intr_establish_xcall(void *, void *);
229 static void intr_disestablish_xcall(void *, void *);
230 
231 static const char *legacy_intr_string(int, char *, size_t, struct pic *);
232 
233 static const char *xen_intr_string(int, char *, size_t, struct pic *);
234 
235 #if defined(INTRSTACKSIZE)
236 static inline bool redzone_const_or_false(bool);
237 static inline int redzone_const_or_zero(int);
238 #endif
239 
240 static void intr_redistribute_xc_t(void *, void *);
241 static void intr_redistribute_xc_s1(void *, void *);
242 static void intr_redistribute_xc_s2(void *, void *);
243 static bool intr_redistribute(struct cpu_info *);
244 static struct intrsource *intr_get_io_intrsource(const char *);
245 static void intr_free_io_intrsource_direct(struct intrsource *);
246 static int intr_num_handlers(struct intrsource *);
247 static int intr_find_unused_slot(struct cpu_info *, int *);
248 static void intr_activate_xcall(void *, void *);
249 static void intr_deactivate_xcall(void *, void *);
250 static void intr_get_affinity(struct intrsource *, kcpuset_t *);
251 static int intr_set_affinity(struct intrsource *, const kcpuset_t *);
252 
253 SDT_PROBE_DEFINE3(sdt, kernel, intr, entry,
254     "int (*)(void *)"/*func*/,
255     "void *"/*arg*/,
256     "struct intrhand *"/*ih*/);
257 SDT_PROBE_DEFINE4(sdt, kernel, intr, return,
258     "int (*)(void *)"/*func*/,
259     "void *"/*arg*/,
260     "struct intrhand *"/*ih*/,
261     "int"/*handled*/);
262 
263 /*
264  * Fill in default interrupt table (in case of spurious interrupt
265  * during configuration of kernel), setup interrupt control unit
266  */
267 void
268 intr_default_setup(void)
269 {
270 	struct idt_vec *iv = &(cpu_info_primary.ci_idtvec);
271 	int i;
272 
273 	/* icu vectors */
274 	for (i = 0; i < NUM_LEGACY_IRQS; i++) {
275 		idt_vec_reserve(iv, ICU_OFFSET + i);
276 		idt_vec_set(iv, ICU_OFFSET + i, legacy_stubs[i].ist_entry);
277 	}
278 
279 	/*
280 	 * Eventually might want to check if it's actually there.
281 	 */
282 	i8259_default_setup();
283 
284 	mutex_init(&intr_distribute_lock, MUTEX_DEFAULT, IPL_NONE);
285 }
286 
287 /*
288  * Handle a NMI, possibly a machine check.
289  * return true to panic system, false to ignore.
290  */
291 void
292 x86_nmi(void)
293 {
294 
295 	log(LOG_CRIT, "NMI port 61 %x, port 70 %x\n", inb(0x61), inb(0x70));
296 }
297 
298 /*
299  * Create an interrupt id such as "ioapic0 pin 9". This interrupt id is used
300  * by MI code and intrctl(8).
301  */
302 const char *
303 intr_create_intrid(int legacy_irq, struct pic *pic, int pin, char *buf,
304     size_t len)
305 {
306 	int ih = 0;
307 
308 #if NPCI > 0
309 #if defined(__HAVE_PCI_MSI_MSIX)
310 	if ((pic->pic_type == PIC_MSI) || (pic->pic_type == PIC_MSIX)) {
311 		uint64_t pih;
312 		int dev, vec;
313 
314 		dev = msipic_get_devid(pic);
315 		vec = pin;
316 		pih = __SHIFTIN((uint64_t)dev, MSI_INT_DEV_MASK)
317 			| __SHIFTIN((uint64_t)vec, MSI_INT_VEC_MASK)
318 			| APIC_INT_VIA_MSI;
319 		if (pic->pic_type == PIC_MSI)
320 			MSI_INT_MAKE_MSI(pih);
321 		else if (pic->pic_type == PIC_MSIX)
322 			MSI_INT_MAKE_MSIX(pih);
323 
324 		return x86_pci_msi_string(NULL, pih, buf, len);
325 	}
326 #endif /* __HAVE_PCI_MSI_MSIX */
327 #endif
328 
329 	if (pic->pic_type == PIC_XEN) {
330 		ih = pin;	/* Port == pin */
331 		return xen_intr_string(pin, buf, len, pic);
332 	}
333 
334 	/*
335 	 * If the device is pci, "legacy_irq" is always -1. Least 8 bit of "ih"
336 	 * is only used in intr_string() to show the irq number.
337 	 * If the device is "legacy"(such as floppy), it should not use
338 	 * intr_string().
339 	 */
340 	if (pic->pic_type == PIC_I8259) {
341 		ih = legacy_irq;
342 		return legacy_intr_string(ih, buf, len, pic);
343 	}
344 
345 #if NIOAPIC > 0 || NACPICA > 0
346 	ih = ((pic->pic_apicid << APIC_INT_APIC_SHIFT) & APIC_INT_APIC_MASK)
347 	    | ((pin << APIC_INT_PIN_SHIFT) & APIC_INT_PIN_MASK);
348 	if (pic->pic_type == PIC_IOAPIC) {
349 		ih |= APIC_INT_VIA_APIC;
350 	}
351 	ih |= pin;
352 	return intr_string(ih, buf, len);
353 #endif
354 
355 	return NULL; /* No pic found! */
356 }
357 
358 /*
359  * Find intrsource from io_interrupt_sources list.
360  */
361 static struct intrsource *
362 intr_get_io_intrsource(const char *intrid)
363 {
364 	struct intrsource *isp;
365 
366 	KASSERT(mutex_owned(&cpu_lock));
367 
368 	SIMPLEQ_FOREACH(isp, &io_interrupt_sources, is_list) {
369 		KASSERT(isp->is_intrid != NULL);
370 		if (strncmp(intrid, isp->is_intrid, INTRIDBUF - 1) == 0)
371 			return isp;
372 	}
373 	return NULL;
374 }
375 
376 /*
377  * Allocate intrsource and add to io_interrupt_sources list.
378  */
379 struct intrsource *
380 intr_allocate_io_intrsource(const char *intrid)
381 {
382 	CPU_INFO_ITERATOR cii;
383 	struct cpu_info *ci;
384 	struct intrsource *isp;
385 	struct percpu_evcnt *pep;
386 
387 	KASSERT(mutex_owned(&cpu_lock));
388 
389 	if (intrid == NULL)
390 		return NULL;
391 
392 	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
393 	pep = kmem_zalloc(sizeof(*pep) * ncpu, KM_SLEEP);
394 	isp->is_saved_evcnt = pep;
395 	for (CPU_INFO_FOREACH(cii, ci)) {
396 		pep->cpuid = ci->ci_cpuid;
397 		pep++;
398 	}
399 	strlcpy(isp->is_intrid, intrid, sizeof(isp->is_intrid));
400 
401 	SIMPLEQ_INSERT_TAIL(&io_interrupt_sources, isp, is_list);
402 
403 	return isp;
404 }
405 
406 /*
407  * Remove from io_interrupt_sources list and free by the intrsource pointer.
408  */
409 static void
410 intr_free_io_intrsource_direct(struct intrsource *isp)
411 {
412 	KASSERT(mutex_owned(&cpu_lock));
413 
414 	SIMPLEQ_REMOVE(&io_interrupt_sources, isp, intrsource, is_list);
415 
416 	/* Is this interrupt established? */
417 	if (isp->is_evname[0] != '\0') {
418 		evcnt_detach(&isp->is_evcnt);
419 		isp->is_evname[0] = '\0';
420 	}
421 
422 	kmem_free(isp->is_saved_evcnt,
423 	    sizeof(*(isp->is_saved_evcnt)) * ncpu);
424 
425 	kmem_free(isp, sizeof(*isp));
426 }
427 
428 /*
429  * Remove from io_interrupt_sources list and free by the interrupt id.
430  * This function can be used by MI code.
431  */
432 void
433 intr_free_io_intrsource(const char *intrid)
434 {
435 	struct intrsource *isp;
436 
437 	KASSERT(mutex_owned(&cpu_lock));
438 
439 	if (intrid == NULL)
440 		return;
441 
442 	if ((isp = intr_get_io_intrsource(intrid)) == NULL) {
443 		return;
444 	}
445 
446 	/* If the interrupt uses shared IRQ, don't free yet. */
447 	if (isp->is_handlers != NULL) {
448 		return;
449 	}
450 
451 	intr_free_io_intrsource_direct(isp);
452 }
453 
454 static int
455 intr_allocate_slot_cpu(struct cpu_info *ci, struct pic *pic, int pin,
456 		       int *index, struct intrsource *chained)
457 {
458 	int slot, i;
459 	struct intrsource *isp;
460 
461 	KASSERT(mutex_owned(&cpu_lock));
462 
463 	if (pic == &i8259_pic) {
464 		KASSERT(CPU_IS_PRIMARY(ci));
465 		slot = pin;
466 	} else {
467 		int start = 0;
468 		int max = MAX_INTR_SOURCES;
469 		slot = -1;
470 
471 		/* avoid reserved slots for legacy interrupts. */
472 		if (CPU_IS_PRIMARY(ci) && msipic_is_msi_pic(pic))
473 			start = NUM_LEGACY_IRQS;
474 		/* don't step over Xen's slots */
475 		if (vm_guest == VM_GUEST_XENPVH)
476 			max = SIR_XENIPL_VM;
477 		/*
478 		 * intr_allocate_slot has checked for an existing mapping.
479 		 * Now look for a free slot.
480 		 */
481 		for (i = start; i < max ; i++) {
482 			if (ci->ci_isources[i] == NULL) {
483 				slot = i;
484 				break;
485 			}
486 		}
487 		if (slot == -1) {
488 			return EBUSY;
489 		}
490 	}
491 
492 	isp = ci->ci_isources[slot];
493 	if (isp == NULL) {
494 		const char *via;
495 
496 		isp = chained;
497 		KASSERT(isp != NULL);
498 		if (pic->pic_type == PIC_MSI || pic->pic_type == PIC_MSIX)
499 			via = "vec";
500 		else
501 			via = "pin";
502 		snprintf(isp->is_evname, sizeof (isp->is_evname),
503 		    "%s %d", via, pin);
504 		evcnt_attach_dynamic(&isp->is_evcnt, EVCNT_TYPE_INTR, NULL,
505 		    pic->pic_name, isp->is_evname);
506 		isp->is_active_cpu = ci->ci_cpuid;
507 		ci->ci_isources[slot] = isp;
508 	}
509 
510 	*index = slot;
511 	return 0;
512 }
513 
514 /*
515  * A simple round-robin allocator to assign interrupts to CPUs.
516  */
517 static int __noinline
518 intr_allocate_slot(struct pic *pic, int pin, int level,
519 		   struct cpu_info **cip, int *index, int *idt_slot,
520 		   struct intrsource *chained)
521 {
522 	CPU_INFO_ITERATOR cii;
523 	struct cpu_info *ci, *lci;
524 	struct intrsource *isp;
525 	int slot = 0, idtvec, error;
526 
527 	KASSERT(mutex_owned(&cpu_lock));
528 
529 	/* First check if this pin is already used by an interrupt vector. */
530 	for (CPU_INFO_FOREACH(cii, ci)) {
531 		for (slot = 0 ; slot < MAX_INTR_SOURCES ; slot++) {
532 			if ((isp = ci->ci_isources[slot]) == NULL) {
533 				continue;
534 			}
535 			if (isp->is_pic == pic &&
536 			    pin != -1 && isp->is_pin == pin) {
537 				*idt_slot = isp->is_idtvec;
538 				*index = slot;
539 				*cip = ci;
540 				return 0;
541 			}
542 		}
543 	}
544 
545 	/*
546 	 * The pic/pin combination doesn't have an existing mapping.
547 	 * Find a slot for a new interrupt source.  For the i8259 case,
548 	 * we always use reserved slots of the primary CPU.  Otherwise,
549 	 * we make an attempt to balance the interrupt load.
550 	 *
551 	 * PIC and APIC usage are essentially exclusive, so the reservation
552 	 * of the ISA slots is ignored when assigning IOAPIC slots.
553 	 */
554 	if (pic == &i8259_pic) {
555 		/*
556 		 * Must be directed to BP.
557 		 */
558 		ci = &cpu_info_primary;
559 		error = intr_allocate_slot_cpu(ci, pic, pin, &slot, chained);
560 	} else {
561 		/*
562 		 * Find least loaded AP/BP and try to allocate there.
563 		 */
564 		ci = NULL;
565 		for (CPU_INFO_FOREACH(cii, lci)) {
566 			if ((lci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
567 				continue;
568 			}
569 #if 0
570 			if (ci == NULL ||
571 			    ci->ci_nintrhand > lci->ci_nintrhand) {
572 				ci = lci;
573 			}
574 #else
575 			ci = &cpu_info_primary;
576 #endif
577 		}
578 		KASSERT(ci != NULL);
579 		error = intr_allocate_slot_cpu(ci, pic, pin, &slot, chained);
580 
581 		/*
582 		 * If that did not work, allocate anywhere.
583 		 */
584 		if (error != 0) {
585 			for (CPU_INFO_FOREACH(cii, ci)) {
586 				if ((ci->ci_schedstate.spc_flags &
587 				    SPCF_NOINTR) != 0) {
588 					continue;
589 				}
590 				error = intr_allocate_slot_cpu(ci, pic,
591 				    pin, &slot, chained);
592 				if (error == 0) {
593 					break;
594 				}
595 			}
596 		}
597 	}
598 	if (error != 0) {
599 		return error;
600 	}
601 	KASSERT(ci != NULL);
602 
603 	/*
604 	 * Now allocate an IDT vector.
605 	 * For the 8259 these are reserved up front.
606 	 */
607 	if (pic == &i8259_pic) {
608 		idtvec = ICU_OFFSET + pin;
609 	} else {
610 		/*
611 		 * TODO to support MSI (not MSI-X) multiple vectors
612 		 *
613 		 * PCI Local Bus Specification Revision 3.0 says the devices
614 		 * which use MSI multiple vectors increment the low order bits
615 		 * of MSI message data.
616 		 * On the other hand, Intel SDM "10.11.2 Message Data Register
617 		 * Format" says the 7:0 bits of MSI message data mean Interrupt
618 		 * Descriptor Table(IDT) vector.
619 		 * As the result of these two documents, the IDT vectors which
620 		 * are used by a device using MSI multiple vectors must be
621 		 * continuous.
622 		 */
623 		struct idt_vec *iv;
624 
625 		iv = idt_vec_ref(&ci->ci_idtvec);
626 		idtvec = idt_vec_alloc(iv, APIC_LEVEL(level), IDT_INTR_HIGH);
627 	}
628 	if (idtvec < 0) {
629 		evcnt_detach(&ci->ci_isources[slot]->is_evcnt);
630 		ci->ci_isources[slot]->is_evname[0] = '\0';
631 		ci->ci_isources[slot] = NULL;
632 		return EBUSY;
633 	}
634 	ci->ci_isources[slot]->is_idtvec = idtvec;
635 	*idt_slot = idtvec;
636 	*index = slot;
637 	*cip = ci;
638 	return 0;
639 }
640 
641 static void
642 intr_source_free(struct cpu_info *ci, int slot, struct pic *pic, int idtvec)
643 {
644 	struct intrsource *isp;
645 	struct idt_vec *iv;
646 
647 	isp = ci->ci_isources[slot];
648 	iv = idt_vec_ref(&ci->ci_idtvec);
649 
650 	if (isp->is_handlers != NULL)
651 		return;
652 	ci->ci_isources[slot] = NULL;
653 	if (pic != &i8259_pic)
654 		idt_vec_free(iv, idtvec);
655 
656 	isp->is_recurse = NULL;
657 	isp->is_resume = NULL;
658 }
659 
660 #ifdef MULTIPROCESSOR
661 static int intr_biglock_wrapper(void *);
662 static int intr_wrapper(void *);
663 
664 /*
665  * intr_wrapper: perform diagnostic checks before and after calling the
666  * real handler.
667  * intr_biglock_wrapper: grab biglock and call a real interrupt handler.
668  */
669 
670 static int
671 intr_wrapper(void *vp)
672 {
673 	struct intrhand *ih = vp;
674 	struct lwp *l = curlwp;
675 	int locks;
676 	int nopreempt;
677 	int ret;
678 
679 	locks = curcpu()->ci_biglock_count;
680 	nopreempt = l->l_nopreempt;
681 	SDT_PROBE3(sdt, kernel, intr, entry,
682 	    ih->ih_realfun, ih->ih_realarg, ih);
683 	ret = (*ih->ih_realfun)(ih->ih_realarg);
684 	SDT_PROBE4(sdt, kernel, intr, return,
685 	    ih->ih_realfun, ih->ih_realarg, ih, ret);
686 	KASSERTMSG(locks == curcpu()->ci_biglock_count,
687 	    "%s @ %p slipped locks %d -> %d",
688 	    ih->ih_xname, ih->ih_realfun, locks, curcpu()->ci_biglock_count);
689 	KASSERTMSG(nopreempt == l->l_nopreempt,
690 	    "%s @ %p slipped nopreempt %d -> %d lwp %p/%p func %p",
691 	    ih->ih_xname, ih->ih_realfun, nopreempt, l->l_nopreempt, l, curlwp,
692 	    ih->ih_realfun);
693 
694 	return ret;
695 }
696 
697 static int
698 intr_biglock_wrapper(void *vp)
699 {
700 	int ret;
701 
702 	KERNEL_LOCK(1, NULL);
703 
704 	ret = intr_wrapper(vp);
705 
706 	KERNEL_UNLOCK_ONE(NULL);
707 
708 	return ret;
709 }
710 
711 #endif /* MULTIPROCESSOR */
712 
713 #ifdef KDTRACE_HOOKS
714 static int
715 intr_kdtrace_wrapper(void *vp)
716 {
717 	struct intrhand *ih = vp;
718 	struct lwp *l = curlwp;
719 	int ret;
720 
721 	int nopreempt;
722 	nopreempt = l->l_nopreempt;
723 	SDT_PROBE3(sdt, kernel, intr, entry,
724 	    ih->ih_realfun, ih->ih_realarg, ih);
725 	ret = (*ih->ih_realfun)(ih->ih_realarg);
726 	SDT_PROBE4(sdt, kernel, intr, return,
727 	    ih->ih_realfun, ih->ih_realarg, ih, ret);
728 	KASSERTMSG(nopreempt == l->l_nopreempt,
729 	    "%s @ %p slipped nopreempt %d -> %d  lwp %p/%p",
730 	    ih->ih_xname, ih->ih_realfun, nopreempt, l->l_nopreempt, l, curlwp);
731 
732 	return ret;
733 }
734 #endif
735 
736 /*
737  * Append device name to intrsource. If device A and device B share IRQ number,
738  * the device name of the interrupt id is "device A, device B".
739  */
740 static void
741 intr_append_intrsource_xname(struct intrsource *isp, const char *xname)
742 {
743 
744 	if (isp->is_xname[0] != '\0')
745 		strlcat(isp->is_xname, ", ", sizeof(isp->is_xname));
746 	strlcat(isp->is_xname, xname, sizeof(isp->is_xname));
747 }
748 
749 /*
750  * Called on bound CPU to handle calling pic_hwunmask from contexts
751  * that are not already running on the bound CPU.
752  *
753  * => caller (on initiating CPU) holds cpu_lock on our behalf
754  * => arg1: struct intrhand *ih
755  */
756 static void
757 intr_hwunmask_xcall(void *arg1, void *arg2)
758 {
759 	struct intrhand * const ih = arg1;
760 	struct cpu_info * const ci = ih->ih_cpu;
761 
762 	KASSERT(ci == curcpu() || !mp_online);
763 
764 	const u_long psl = x86_read_psl();
765 	x86_disable_intr();
766 
767 	struct intrsource * const source = ci->ci_isources[ih->ih_slot];
768 	struct pic * const pic = source->is_pic;
769 
770 	if (source->is_mask_count == 0) {
771 		(*pic->pic_hwunmask)(pic, ih->ih_pin);
772 	}
773 
774 	x86_write_psl(psl);
775 }
776 
777 /*
778  * Handle per-CPU component of interrupt establish.
779  *
780  * => caller (on initiating CPU) holds cpu_lock on our behalf
781  * => arg1: struct intrhand *ih
782  * => arg2: int idt_vec
783  */
784 static void
785 intr_establish_xcall(void *arg1, void *arg2)
786 {
787 	struct idt_vec *iv;
788 	struct intrsource *source;
789 	struct intrstub *stubp;
790 	struct intrhand *ih;
791 	struct cpu_info *ci;
792 	int idt_vec;
793 	u_long psl;
794 
795 	ih = arg1;
796 
797 	KASSERT(ih->ih_cpu == curcpu() || !mp_online);
798 
799 	ci = ih->ih_cpu;
800 	source = ci->ci_isources[ih->ih_slot];
801 	idt_vec = (int)(intptr_t)arg2;
802 	iv = idt_vec_ref(&ci->ci_idtvec);
803 
804 	/* Disable interrupts locally. */
805 	psl = x86_read_psl();
806 	x86_disable_intr();
807 
808 	/* Link in the handler and re-calculate masks. */
809 	*(ih->ih_prevp) = ih;
810 	x86_intr_calculatemasks(ci);
811 
812 	/* Hook in new IDT vector and SPL state. */
813 	if (source->is_resume == NULL || source->is_idtvec != idt_vec) {
814 		if (source->is_idtvec != 0 && source->is_idtvec != idt_vec)
815 			idt_vec_free(iv, source->is_idtvec);
816 		source->is_idtvec = idt_vec;
817 		if (source->is_type == IST_LEVEL) {
818 			stubp = &source->is_pic->pic_level_stubs[ih->ih_slot];
819 		} else {
820 			stubp = &source->is_pic->pic_edge_stubs[ih->ih_slot];
821 		}
822 		source->is_resume = stubp->ist_resume;
823 		source->is_recurse = stubp->ist_recurse;
824 		idt_vec_set(iv, idt_vec, stubp->ist_entry);
825 	}
826 
827 	/* Re-enable interrupts locally. */
828 	x86_write_psl(psl);
829 }
830 
831 void *
832 intr_establish_xname(int legacy_irq, struct pic *pic, int pin, int type,
833 		     int level, int (*handler)(void *), void *arg,
834 		     bool known_mpsafe, const char *xname)
835 {
836 	struct intrhand **p, *q, *ih;
837 	struct cpu_info *ci;
838 	int slot, error, idt_vec;
839 	struct intrsource *chained, *source;
840 #ifdef MULTIPROCESSOR
841 	bool mpsafe = (known_mpsafe || level != IPL_VM);
842 #endif /* MULTIPROCESSOR */
843 	uint64_t where;
844 	const char *intrstr;
845 	char intrstr_buf[INTRIDBUF];
846 
847 	KASSERTMSG((legacy_irq == -1 || (0 <= legacy_irq && legacy_irq < 16)),
848 	    "bad legacy IRQ value: %d", legacy_irq);
849 	KASSERTMSG((legacy_irq != -1 || pic != &i8259_pic),
850 	    "non-legacy IRQ on i8259");
851 
852 	ih = kmem_alloc(sizeof(*ih), KM_SLEEP);
853 	intrstr = intr_create_intrid(legacy_irq, pic, pin, intrstr_buf,
854 	    sizeof(intrstr_buf));
855 	KASSERT(intrstr != NULL);
856 
857 	mutex_enter(&cpu_lock);
858 
859 	/* allocate intrsource pool, if not yet. */
860 	chained = intr_get_io_intrsource(intrstr);
861 	if (chained == NULL) {
862 		if (msipic_is_msi_pic(pic)) {
863 			mutex_exit(&cpu_lock);
864 			kmem_free(ih, sizeof(*ih));
865 			printf("%s: %s has no intrsource\n", __func__, intrstr);
866 			return NULL;
867 		}
868 		chained = intr_allocate_io_intrsource(intrstr);
869 		if (chained == NULL) {
870 			mutex_exit(&cpu_lock);
871 			kmem_free(ih, sizeof(*ih));
872 			printf("%s: can't allocate io_intersource\n", __func__);
873 			return NULL;
874 		}
875 	}
876 
877 	error = intr_allocate_slot(pic, pin, level, &ci, &slot, &idt_vec,
878 	    chained);
879 	if (error != 0) {
880 		intr_free_io_intrsource_direct(chained);
881 		mutex_exit(&cpu_lock);
882 		kmem_free(ih, sizeof(*ih));
883 		printf("failed to allocate interrupt slot for PIC %s pin %d\n",
884 		    pic->pic_name, pin);
885 		return NULL;
886 	}
887 
888 	source = ci->ci_isources[slot];
889 
890 	if (source->is_handlers != NULL &&
891 	    source->is_pic->pic_type != pic->pic_type) {
892 		intr_free_io_intrsource_direct(chained);
893 		mutex_exit(&cpu_lock);
894 		kmem_free(ih, sizeof(*ih));
895 		printf("%s: can't share intr source between "
896 		       "different PIC types (legacy_irq %d pin %d slot %d)\n",
897 		    __func__, legacy_irq, pin, slot);
898 		return NULL;
899 	}
900 
901 	source->is_pin = pin;
902 	source->is_pic = pic;
903 	intr_append_intrsource_xname(source, xname);
904 	switch (source->is_type) {
905 	case IST_NONE:
906 		source->is_type = type;
907 		break;
908 	case IST_EDGE:
909 	case IST_LEVEL:
910 		if (source->is_type == type)
911 			break;
912 		/* FALLTHROUGH */
913 	case IST_PULSE:
914 		if (type != IST_NONE) {
915 			int otype = source->is_type;
916 
917 			intr_source_free(ci, slot, pic, idt_vec);
918 			intr_free_io_intrsource_direct(chained);
919 			mutex_exit(&cpu_lock);
920 			kmem_free(ih, sizeof(*ih));
921 			printf("%s: pic %s pin %d: can't share "
922 			       "type %d with %d\n",
923 				__func__, pic->pic_name, pin,
924 				otype, type);
925 			return NULL;
926 		}
927 		break;
928 	default:
929 		panic("%s: bad intr type %d for pic %s pin %d\n",
930 		    __func__, source->is_type, pic->pic_name, pin);
931 		/* NOTREACHED */
932 	}
933 
934 	/*
935 	 * If the establishing interrupt uses shared IRQ, the interrupt uses
936 	 * "ci->ci_isources[slot]" instead of allocated by the establishing
937 	 * device's pci_intr_alloc() or this function.
938 	 */
939 	if (source->is_handlers != NULL) {
940 		struct intrsource *isp, *nisp;
941 
942 		SIMPLEQ_FOREACH_SAFE(isp, &io_interrupt_sources,
943 		    is_list, nisp) {
944 			if (strncmp(intrstr, isp->is_intrid, INTRIDBUF - 1) == 0
945 			    && isp->is_handlers == NULL)
946 				intr_free_io_intrsource_direct(isp);
947 		}
948 	}
949 
950 	/*
951 	 * We're now committed.  Mask the interrupt in hardware and
952 	 * count it for load distribution.
953 	 */
954 	(*pic->pic_hwmask)(pic, pin);
955 	(ci->ci_nintrhand)++;
956 
957 	/*
958 	 * Figure out where to put the handler.
959 	 * This is O(N^2), but we want to preserve the order, and N is
960 	 * generally small.
961 	 */
962 	for (p = &ci->ci_isources[slot]->is_handlers;
963 	     (q = *p) != NULL && q->ih_level > level;
964 	     p = &q->ih_next) {
965 		/* nothing */;
966 	}
967 
968 	ih->ih_pic = pic;
969 	ih->ih_fun = ih->ih_realfun = handler;
970 	ih->ih_arg = ih->ih_realarg = arg;
971 	ih->ih_prevp = p;
972 	ih->ih_next = *p;
973 	ih->ih_level = level;
974 	ih->ih_pin = pin;
975 	ih->ih_cpu = ci;
976 	ih->ih_slot = slot;
977 	strlcpy(ih->ih_xname, xname, sizeof(ih->ih_xname));
978 #ifdef KDTRACE_HOOKS
979 	/*
980 	 * XXX i8254_clockintr is special -- takes a magic extra
981 	 * argument.  This should be fixed properly in some way that
982 	 * doesn't involve sketchy function pointer casts.  See also
983 	 * the comments in x86/isa/clock.c.
984 	 */
985 	if (handler != __FPTRCAST(int (*)(void *), i8254_clockintr)) {
986 		ih->ih_fun = intr_kdtrace_wrapper;
987 		ih->ih_arg = ih;
988 	}
989 #endif
990 #ifdef MULTIPROCESSOR
991 	if (!mpsafe) {
992 		KASSERT(handler !=			/* XXX */
993 		    __FPTRCAST(int (*)(void *), i8254_clockintr));
994 		ih->ih_fun = intr_biglock_wrapper;
995 		ih->ih_arg = ih;
996 	} else {
997 		if (handler !=
998 		    __FPTRCAST(int (*)(void *), i8254_clockintr)) { /* XXX */
999 #ifdef DIAGNOSTIC
1000 			/* wrap all interrupts */
1001 			ih->ih_fun = intr_wrapper;
1002 			ih->ih_arg = ih;
1003 #endif
1004 		}
1005 	}
1006 #endif /* MULTIPROCESSOR */
1007 
1008 	/*
1009 	 * Call out to the remote CPU to update its interrupt state.
1010 	 * Only make RPCs if the APs are up and running.
1011 	 */
1012 	if (ci == curcpu() || !mp_online) {
1013 		intr_establish_xcall(ih, (void *)(intptr_t)idt_vec);
1014 	} else {
1015 		where = xc_unicast(0, intr_establish_xcall, ih,
1016 		    (void *)(intptr_t)idt_vec, ci);
1017 		xc_wait(where);
1018 	}
1019 
1020 	/* All set up, so add a route for the interrupt and unmask it. */
1021 	(*pic->pic_addroute)(pic, ci, pin, idt_vec, type);
1022 	if (ci == curcpu() || !mp_online) {
1023 		intr_hwunmask_xcall(ih, NULL);
1024 	} else {
1025 		where = xc_unicast(0, intr_hwunmask_xcall, ih, NULL, ci);
1026 		xc_wait(where);
1027 	}
1028 	mutex_exit(&cpu_lock);
1029 
1030 	if (bootverbose || cpu_index(ci) != 0)
1031 		aprint_verbose("allocated pic %s type %s pin %d level %d to "
1032 		    "%s slot %d idt entry %d\n",
1033 		    pic->pic_name, type == IST_EDGE ? "edge" : "level", pin,
1034 		    level, device_xname(ci->ci_dev), slot, idt_vec);
1035 
1036 	return ih;
1037 }
1038 
1039 void *
1040 intr_establish(int legacy_irq, struct pic *pic, int pin, int type, int level,
1041 	       int (*handler)(void *), void *arg, bool known_mpsafe)
1042 {
1043 
1044 	return intr_establish_xname(legacy_irq, pic, pin, type,
1045 	    level, handler, arg, known_mpsafe, "unknown");
1046 }
1047 
1048 /*
1049  * Called on bound CPU to handle intr_mask() / intr_unmask().
1050  *
1051  * => caller (on initiating CPU) holds cpu_lock on our behalf
1052  * => arg1: struct intrhand *ih
1053  * => arg2: true -> mask, false -> unmask.
1054  */
1055 static void
1056 intr_mask_xcall(void *arg1, void *arg2)
1057 {
1058 	struct intrhand * const ih = arg1;
1059 	const uintptr_t mask = (uintptr_t)arg2;
1060 	struct cpu_info * const ci = ih->ih_cpu;
1061 	bool force_pending = false;
1062 
1063 	KASSERT(ci == curcpu() || !mp_online);
1064 
1065 	/*
1066 	 * We need to disable interrupts to hold off the interrupt
1067 	 * vectors.
1068 	 */
1069 	const u_long psl = x86_read_psl();
1070 	x86_disable_intr();
1071 
1072 	struct intrsource * const source = ci->ci_isources[ih->ih_slot];
1073 	struct pic * const pic = source->is_pic;
1074 
1075 	if (mask) {
1076 		source->is_mask_count++;
1077 		KASSERT(source->is_mask_count != 0);
1078 		if (source->is_mask_count == 1) {
1079 			(*pic->pic_hwmask)(pic, ih->ih_pin);
1080 		}
1081 	} else {
1082 		KASSERT(source->is_mask_count != 0);
1083 		if (--source->is_mask_count == 0) {
1084 			/*
1085 			 * If this interrupt source is being moved, don't
1086 			 * unmask it at the hw.
1087 			 */
1088 			if (! source->is_distribute_pending) {
1089 				(*pic->pic_hwunmask)(pic, ih->ih_pin);
1090 			}
1091 
1092 			/*
1093 			 * For level-sensitive interrupts, the hardware
1094 			 * will let us know.  For everything else, we
1095 			 * need to explicitly handle interrupts that
1096 			 * happened when the source was masked.
1097 			 */
1098 			const uint64_t bit = (1U << ih->ih_slot);
1099 			if (ci->ci_imasked & bit) {
1100 				ci->ci_imasked &= ~bit;
1101 				if (source->is_type != IST_LEVEL) {
1102 					ci->ci_ipending |= bit;
1103 					force_pending = true;
1104 				}
1105 			}
1106 		}
1107 	}
1108 
1109 	/* Re-enable interrupts. */
1110 	x86_write_psl(psl);
1111 
1112 	if (force_pending) {
1113 		/* Force processing of any pending interrupts. */
1114 		splx(splhigh());
1115 	}
1116 }
1117 
1118 static void
1119 intr_mask_internal(struct intrhand * const ih, const bool mask)
1120 {
1121 
1122 	/*
1123 	 * Call out to the remote CPU to update its interrupt state.
1124 	 * Only make RPCs if the APs are up and running.
1125 	 */
1126 	mutex_enter(&cpu_lock);
1127 	struct cpu_info * const ci = ih->ih_cpu;
1128 	void * const mask_arg = (void *)(uintptr_t)mask;
1129 	if (ci == curcpu() || !mp_online) {
1130 		intr_mask_xcall(ih, mask_arg);
1131 	} else {
1132 		const uint64_t where =
1133 		    xc_unicast(0, intr_mask_xcall, ih, mask_arg, ci);
1134 		xc_wait(where);
1135 	}
1136 	mutex_exit(&cpu_lock);
1137 }
1138 
1139 void
1140 intr_mask(struct intrhand *ih)
1141 {
1142 
1143 	if (cpu_intr_p()) {
1144 		/*
1145 		 * Special case of calling intr_mask() from an interrupt
1146 		 * handler: we MUST be called from the bound CPU for this
1147 		 * interrupt (presumably from a handler we're about to
1148 		 * mask).
1149 		 *
1150 		 * We can't take the cpu_lock in this case, and we must
1151 		 * therefore be extra careful.
1152 		 */
1153 		KASSERT(ih->ih_cpu == curcpu() || !mp_online);
1154 		intr_mask_xcall(ih, (void *)(uintptr_t)true);
1155 		return;
1156 	}
1157 
1158 	intr_mask_internal(ih, true);
1159 }
1160 
1161 void
1162 intr_unmask(struct intrhand *ih)
1163 {
1164 
1165 	/*
1166 	 * This is not safe to call from an interrupt context because
1167 	 * we don't want to accidentally unmask an interrupt source
1168 	 * that's masked because it's being serviced.
1169 	 */
1170 	KASSERT(!cpu_intr_p());
1171 	intr_mask_internal(ih, false);
1172 }
1173 
1174 /*
1175  * Called on bound CPU to handle intr_disestablish().
1176  *
1177  * => caller (on initiating CPU) holds cpu_lock on our behalf
1178  * => arg1: struct intrhand *ih
1179  * => arg2: unused
1180  */
1181 static void
1182 intr_disestablish_xcall(void *arg1, void *arg2)
1183 {
1184 	struct intrhand **p, *q;
1185 	struct cpu_info *ci;
1186 	struct pic *pic;
1187 	struct intrsource *source;
1188 	struct intrhand *ih;
1189 	u_long psl;
1190 	int idtvec;
1191 
1192 	ih = arg1;
1193 	ci = ih->ih_cpu;
1194 
1195 	KASSERT(ci == curcpu() || !mp_online);
1196 
1197 	/* Disable interrupts locally. */
1198 	psl = x86_read_psl();
1199 	x86_disable_intr();
1200 
1201 	pic = ci->ci_isources[ih->ih_slot]->is_pic;
1202 	source = ci->ci_isources[ih->ih_slot];
1203 	idtvec = source->is_idtvec;
1204 
1205 	(*pic->pic_hwmask)(pic, ih->ih_pin);
1206 
1207 	/*
1208 	 * ci_pending is stable on the current CPU while interrupts are
1209 	 * blocked, and we only need to synchronize with interrupt
1210 	 * vectors on the same CPU, so no need for atomics or membars.
1211 	 */
1212 	ci->ci_ipending &= ~(1ULL << ih->ih_slot);
1213 
1214 	/*
1215 	 * Remove the handler from the chain.
1216 	 */
1217 	for (p = &source->is_handlers; (q = *p) != NULL && q != ih;
1218 	     p = &q->ih_next)
1219 		;
1220 	if (q == NULL) {
1221 		x86_write_psl(psl);
1222 		panic("%s: handler not registered", __func__);
1223 		/* NOTREACHED */
1224 	}
1225 
1226 	*p = q->ih_next;
1227 
1228 	x86_intr_calculatemasks(ci);
1229 	/*
1230 	 * If there is no any handler, 1) do delroute because it has no
1231 	 * any source and 2) dont' hwunmask to prevent spurious interrupt.
1232 	 *
1233 	 * If there is any handler, 1) don't delroute because it has source
1234 	 * and 2) do hwunmask to be able to get interrupt again.
1235 	 *
1236 	 */
1237 	if (source->is_handlers == NULL)
1238 		(*pic->pic_delroute)(pic, ci, ih->ih_pin, idtvec,
1239 		    source->is_type);
1240 	else if (source->is_mask_count == 0)
1241 		(*pic->pic_hwunmask)(pic, ih->ih_pin);
1242 
1243 	/* If the source is free we can drop it now. */
1244 	intr_source_free(ci, ih->ih_slot, pic, idtvec);
1245 
1246 	/* Re-enable interrupts. */
1247 	x86_write_psl(psl);
1248 
1249 	DPRINTF(("%s: remove slot %d (pic %s pin %d vec %d)\n",
1250 	    device_xname(ci->ci_dev), ih->ih_slot, pic->pic_name,
1251 	    ih->ih_pin, idtvec));
1252 }
1253 
1254 static int
1255 intr_num_handlers(struct intrsource *isp)
1256 {
1257 	struct intrhand *ih;
1258 	int num;
1259 
1260 	num = 0;
1261 	for (ih = isp->is_handlers; ih != NULL; ih = ih->ih_next)
1262 		num++;
1263 
1264 	return num;
1265 }
1266 
1267 /*
1268  * Deregister an interrupt handler.
1269  */
1270 void
1271 intr_disestablish(struct intrhand *ih)
1272 {
1273 	struct cpu_info *ci;
1274 	struct intrsource *isp;
1275 	uint64_t where;
1276 
1277 	/*
1278 	 * Count the removal for load balancing.
1279 	 * Call out to the remote CPU to update its interrupt state.
1280 	 * Only make RPCs if the APs are up and running.
1281 	 */
1282 	mutex_enter(&cpu_lock);
1283 	ci = ih->ih_cpu;
1284 	(ci->ci_nintrhand)--;
1285 	KASSERT(ci->ci_nintrhand >= 0);
1286 	isp = ci->ci_isources[ih->ih_slot];
1287 	if (ci == curcpu() || !mp_online) {
1288 		intr_disestablish_xcall(ih, NULL);
1289 	} else {
1290 		where = xc_unicast(0, intr_disestablish_xcall, ih, NULL, ci);
1291 		xc_wait(where);
1292 	}
1293 	if (!msipic_is_msi_pic(isp->is_pic) && intr_num_handlers(isp) < 1) {
1294 		intr_free_io_intrsource_direct(isp);
1295 	}
1296 	mutex_exit(&cpu_lock);
1297 	kmem_free(ih, sizeof(*ih));
1298 }
1299 
1300 static const char *
1301 xen_intr_string(int port, char *buf, size_t len, struct pic *pic)
1302 {
1303 	KASSERT(pic->pic_type == PIC_XEN);
1304 
1305 	KASSERT(port >= 0);
1306 
1307 	snprintf(buf, len, "%s chan %d", pic->pic_name, port);
1308 
1309 	return buf;
1310 }
1311 
1312 static const char *
1313 legacy_intr_string(int ih, char *buf, size_t len, struct pic *pic)
1314 {
1315 	int legacy_irq;
1316 
1317 	KASSERT(pic->pic_type == PIC_I8259);
1318 #if NLAPIC > 0
1319 	KASSERT(APIC_IRQ_ISLEGACY(ih));
1320 
1321 	legacy_irq = APIC_IRQ_LEGACY_IRQ(ih);
1322 #else
1323 	legacy_irq = ih;
1324 #endif
1325 	KASSERT(legacy_irq >= 0 && legacy_irq < 16);
1326 
1327 	snprintf(buf, len, "%s pin %d", pic->pic_name, legacy_irq);
1328 
1329 	return buf;
1330 }
1331 
1332 const char *
1333 intr_string(intr_handle_t ih, char *buf, size_t len)
1334 {
1335 #if NIOAPIC > 0
1336 	struct ioapic_softc *pic;
1337 #endif
1338 
1339 	if (ih == 0)
1340 		panic("%s: bogus handle 0x%" PRIx64, __func__, ih);
1341 
1342 #if NIOAPIC > 0
1343 	if (ih & APIC_INT_VIA_APIC) {
1344 		pic = ioapic_find(APIC_IRQ_APIC(ih));
1345 		if (pic != NULL) {
1346 			snprintf(buf, len, "%s pin %d",
1347 			    device_xname(pic->sc_dev), APIC_IRQ_PIN(ih));
1348 		} else {
1349 			snprintf(buf, len,
1350 			    "apic %d int %d (irq %d)",
1351 			    APIC_IRQ_APIC(ih),
1352 			    APIC_IRQ_PIN(ih),
1353 			    APIC_IRQ_LEGACY_IRQ(ih));
1354 		}
1355 	} else
1356 		snprintf(buf, len, "irq %d", APIC_IRQ_LEGACY_IRQ(ih));
1357 
1358 #elif NLAPIC > 0
1359 	snprintf(buf, len, "irq %d", APIC_IRQ_LEGACY_IRQ(ih));
1360 #else
1361 	snprintf(buf, len, "irq %d", (int) ih);
1362 #endif
1363 	return buf;
1364 
1365 }
1366 
1367 /*
1368  * Fake interrupt handler structures for the benefit of symmetry with
1369  * other interrupt sources, and the benefit of x86_intr_calculatemasks()
1370  */
1371 struct intrhand fake_timer_intrhand;
1372 struct intrhand fake_ipi_intrhand;
1373 #if NHYPERV > 0
1374 struct intrhand fake_hyperv_intrhand;
1375 #endif
1376 
1377 #if NLAPIC > 0 && defined(MULTIPROCESSOR)
1378 static const char *x86_ipi_names[X86_NIPI] = X86_IPI_NAMES;
1379 #endif
1380 
1381 #if defined(INTRSTACKSIZE)
1382 static inline bool
1383 redzone_const_or_false(bool x)
1384 {
1385 #ifdef DIAGNOSTIC
1386 	return x;
1387 #else
1388 	return false;
1389 #endif /* !DIAGNOSTIC */
1390 }
1391 
1392 static inline int
1393 redzone_const_or_zero(int x)
1394 {
1395 	return redzone_const_or_false(true) ? x : 0;
1396 }
1397 #endif
1398 
1399 /*
1400  * Initialize all handlers that aren't dynamically allocated, and exist
1401  * for each CPU.
1402  */
1403 void
1404 cpu_intr_init(struct cpu_info *ci)
1405 {
1406 #if NLAPIC > 0
1407 	struct intrsource *isp;
1408 	static int first = 1;
1409 #if defined(MULTIPROCESSOR)
1410 	int i;
1411 #endif
1412 
1413 	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
1414 	isp->is_recurse = Xrecurse_lapic_ltimer;
1415 	isp->is_resume = Xresume_lapic_ltimer;
1416 	fake_timer_intrhand.ih_pic = &local_pic;
1417 	fake_timer_intrhand.ih_level = IPL_CLOCK;
1418 	isp->is_handlers = &fake_timer_intrhand;
1419 	isp->is_pic = &local_pic;
1420 	ci->ci_isources[LIR_TIMER] = isp;
1421 	evcnt_attach_dynamic(&isp->is_evcnt,
1422 	    first ? EVCNT_TYPE_INTR : EVCNT_TYPE_MISC, NULL,
1423 	    device_xname(ci->ci_dev), "timer");
1424 	first = 0;
1425 
1426 #ifdef MULTIPROCESSOR
1427 	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
1428 	isp->is_recurse = Xrecurse_lapic_ipi;
1429 	isp->is_resume = Xresume_lapic_ipi;
1430 	fake_ipi_intrhand.ih_pic = &local_pic;
1431 	fake_ipi_intrhand.ih_level = IPL_HIGH;
1432 	isp->is_handlers = &fake_ipi_intrhand;
1433 	isp->is_pic = &local_pic;
1434 	ci->ci_isources[LIR_IPI] = isp;
1435 
1436 	for (i = 0; i < X86_NIPI; i++)
1437 		evcnt_attach_dynamic(&ci->ci_ipi_events[i], EVCNT_TYPE_MISC,
1438 		    NULL, device_xname(ci->ci_dev), x86_ipi_names[i]);
1439 #endif /* MULTIPROCESSOR */
1440 
1441 #if NHYPERV > 0
1442 	if (hyperv_hypercall_enabled()) {
1443 		isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
1444 		isp->is_recurse = Xrecurse_hyperv_hypercall;
1445 		isp->is_resume = Xresume_hyperv_hypercall;
1446 		fake_hyperv_intrhand.ih_level = IPL_NET;
1447 		isp->is_handlers = &fake_hyperv_intrhand;
1448 		isp->is_pic = &local_pic;
1449 		ci->ci_isources[LIR_HV] = isp;
1450 		evcnt_attach_dynamic(&isp->is_evcnt, EVCNT_TYPE_INTR, NULL,
1451 		    device_xname(ci->ci_dev), "Hyper-V hypercall");
1452 	}
1453 #endif /* NHYPERV > 0 */
1454 #endif /* NLAPIC > 0 */
1455 
1456 #if defined(__HAVE_PREEMPTION)
1457 	x86_init_preempt(ci);
1458 
1459 #endif
1460 	x86_intr_calculatemasks(ci);
1461 
1462 #if defined(INTRSTACKSIZE)
1463 	vaddr_t istack;
1464 
1465 	/*
1466 	 * If the red zone is activated, protect both the top and
1467 	 * the bottom of the stack with an unmapped page.
1468 	 */
1469 	istack = uvm_km_alloc(kernel_map,
1470 	    INTRSTACKSIZE + redzone_const_or_zero(2 * PAGE_SIZE), 0,
1471 	    UVM_KMF_WIRED | UVM_KMF_ZERO);
1472 	if (redzone_const_or_false(true)) {
1473 		pmap_kremove(istack, PAGE_SIZE);
1474 		pmap_kremove(istack + INTRSTACKSIZE + PAGE_SIZE, PAGE_SIZE);
1475 		pmap_update(pmap_kernel());
1476 	}
1477 
1478 	/*
1479 	 * 33 used to be 1.  Arbitrarily reserve 32 more register_t's
1480 	 * of space for ddb(4) to examine some subroutine arguments
1481 	 * and to hunt for the next stack frame.
1482 	 */
1483 	ci->ci_intrstack = (char *)istack + redzone_const_or_zero(PAGE_SIZE) +
1484 	    INTRSTACKSIZE - 33 * sizeof(register_t);
1485 #endif
1486 
1487 	ci->ci_idepth = -1;
1488 }
1489 
1490 #if defined(INTRDEBUG) || defined(DDB)
1491 
1492 void
1493 intr_printconfig(void)
1494 {
1495 	int i;
1496 	struct intrhand *ih;
1497 	struct intrsource *isp;
1498 	struct cpu_info *ci;
1499 	CPU_INFO_ITERATOR cii;
1500 	void (*pr)(const char *, ...);
1501 
1502 	pr = printf;
1503 #ifdef DDB
1504 	if (db_active) {
1505 		pr = db_printf;
1506 	}
1507 #endif
1508 
1509 	for (CPU_INFO_FOREACH(cii, ci)) {
1510 		(*pr)("%s: interrupt masks:\n", device_xname(ci->ci_dev));
1511 		for (i = 0; i < NIPL; i++)
1512 			(*pr)("IPL %d mask %016"PRIx64" unmask %016"PRIx64"\n",
1513 			    i, ci->ci_imask[i], ci->ci_iunmask[i]);
1514 		for (i = 0; i < MAX_INTR_SOURCES; i++) {
1515 			isp = ci->ci_isources[i];
1516 			if (isp == NULL)
1517 				continue;
1518 			(*pr)("%s source %d is pin %d from pic %s type %d "
1519 			    "maxlevel %d\n", device_xname(ci->ci_dev), i,
1520 			    isp->is_pin, isp->is_pic->pic_name, isp->is_type,
1521 			    isp->is_maxlevel);
1522 			for (ih = isp->is_handlers; ih != NULL;
1523 			     ih = ih->ih_next)
1524 				(*pr)("\thandler %p level %d\n",
1525 				    ih->ih_fun, ih->ih_level);
1526 #if NIOAPIC > 0
1527 			if (isp->is_pic->pic_type == PIC_IOAPIC) {
1528 				struct ioapic_softc *sc;
1529 				sc = isp->is_pic->pic_ioapic;
1530 				(*pr)("\tioapic redir 0x%x\n",
1531 				    sc->sc_pins[isp->is_pin].ip_map->redir);
1532 			}
1533 #endif
1534 
1535 		}
1536 	}
1537 }
1538 
1539 #endif
1540 
1541 /*
1542  * Save current affinitied cpu's interrupt count.
1543  */
1544 static void
1545 intr_save_evcnt(struct intrsource *source, cpuid_t cpuid)
1546 {
1547 	struct percpu_evcnt *pep;
1548 	uint64_t curcnt;
1549 	int i;
1550 
1551 	curcnt = source->is_evcnt.ev_count;
1552 	pep = source->is_saved_evcnt;
1553 
1554 	for (i = 0; i < ncpu; i++) {
1555 		if (pep[i].cpuid == cpuid) {
1556 			pep[i].count = curcnt;
1557 			break;
1558 		}
1559 	}
1560 }
1561 
1562 /*
1563  * Restore current affinitied cpu's interrupt count.
1564  */
1565 static void
1566 intr_restore_evcnt(struct intrsource *source, cpuid_t cpuid)
1567 {
1568 	struct percpu_evcnt *pep;
1569 	int i;
1570 
1571 	pep = source->is_saved_evcnt;
1572 
1573 	for (i = 0; i < ncpu; i++) {
1574 		if (pep[i].cpuid == cpuid) {
1575 			source->is_evcnt.ev_count = pep[i].count;
1576 			break;
1577 		}
1578 	}
1579 }
1580 
1581 static void
1582 intr_redistribute_xc_t(void *arg1, void *arg2)
1583 {
1584 	struct cpu_info *ci;
1585 	struct intrsource *isp;
1586 	int slot;
1587 	u_long psl;
1588 
1589 	ci = curcpu();
1590 	isp = arg1;
1591 	slot = (int)(intptr_t)arg2;
1592 
1593 	/* Disable interrupts locally. */
1594 	psl = x86_read_psl();
1595 	x86_disable_intr();
1596 
1597 	/* Hook it in and re-calculate masks. */
1598 	ci->ci_isources[slot] = isp;
1599 	x86_intr_calculatemasks(curcpu());
1600 
1601 	/* Re-enable interrupts locally. */
1602 	x86_write_psl(psl);
1603 }
1604 
1605 static void
1606 intr_redistribute_xc_s1(void *arg1, void *arg2)
1607 {
1608 	struct pic *pic;
1609 	struct intrsource *isp;
1610 	struct cpu_info *nci;
1611 	u_long psl;
1612 
1613 	isp = arg1;
1614 	nci = arg2;
1615 
1616 	/*
1617 	 * Disable interrupts on-chip and mask the pin.  Back out
1618 	 * and let the interrupt be processed if one is pending.
1619 	 */
1620 	pic = isp->is_pic;
1621 	for (;;) {
1622 		psl = x86_read_psl();
1623 		x86_disable_intr();
1624 		if ((*pic->pic_trymask)(pic, isp->is_pin)) {
1625 			break;
1626 		}
1627 		x86_write_psl(psl);
1628 		DELAY(1000);
1629 	}
1630 
1631 	/* pic_addroute will unmask the interrupt. */
1632 	(*pic->pic_addroute)(pic, nci, isp->is_pin, isp->is_idtvec,
1633 	    isp->is_type);
1634 	x86_write_psl(psl);
1635 }
1636 
1637 static void
1638 intr_redistribute_xc_s2(void *arg1, void *arg2)
1639 {
1640 	struct cpu_info *ci;
1641 	u_long psl;
1642 	int slot;
1643 
1644 	ci = curcpu();
1645 	slot = (int)(uintptr_t)arg1;
1646 
1647 	/* Disable interrupts locally. */
1648 	psl = x86_read_psl();
1649 	x86_disable_intr();
1650 
1651 	/* Patch out the source and re-calculate masks. */
1652 	ci->ci_isources[slot] = NULL;
1653 	x86_intr_calculatemasks(ci);
1654 
1655 	/* Re-enable interrupts locally. */
1656 	x86_write_psl(psl);
1657 }
1658 
1659 static bool
1660 intr_redistribute(struct cpu_info *oci)
1661 {
1662 	struct intrsource *isp;
1663 	struct intrhand *ih;
1664 	CPU_INFO_ITERATOR cii;
1665 	struct cpu_info *nci, *ici;
1666 	int oslot, nslot;
1667 	uint64_t where;
1668 
1669 	KASSERT(mutex_owned(&cpu_lock));
1670 
1671 	/* Look for an interrupt source that we can migrate. */
1672 	for (oslot = 0; oslot < MAX_INTR_SOURCES; oslot++) {
1673 		if ((isp = oci->ci_isources[oslot]) == NULL) {
1674 			continue;
1675 		}
1676 		if (isp->is_pic->pic_type == PIC_IOAPIC) {
1677 			break;
1678 		}
1679 	}
1680 	if (oslot == MAX_INTR_SOURCES) {
1681 		return false;
1682 	}
1683 
1684 	/* Find least loaded CPU and try to move there. */
1685 	nci = NULL;
1686 	for (CPU_INFO_FOREACH(cii, ici)) {
1687 		if ((ici->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
1688 			continue;
1689 		}
1690 		KASSERT(ici != oci);
1691 		if (nci == NULL || nci->ci_nintrhand > ici->ci_nintrhand) {
1692 			nci = ici;
1693 		}
1694 	}
1695 	if (nci == NULL) {
1696 		return false;
1697 	}
1698 	for (nslot = 0; nslot < MAX_INTR_SOURCES; nslot++) {
1699 		if (nci->ci_isources[nslot] == NULL) {
1700 			break;
1701 		}
1702 	}
1703 
1704 	/* If that did not work, allocate anywhere. */
1705 	if (nslot == MAX_INTR_SOURCES) {
1706 		for (CPU_INFO_FOREACH(cii, nci)) {
1707 			if ((nci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
1708 				continue;
1709 			}
1710 			KASSERT(nci != oci);
1711 			for (nslot = 0; nslot < MAX_INTR_SOURCES; nslot++) {
1712 				if (nci->ci_isources[nslot] == NULL) {
1713 					break;
1714 				}
1715 			}
1716 			if (nslot != MAX_INTR_SOURCES) {
1717 				break;
1718 			}
1719 		}
1720 	}
1721 	if (nslot == MAX_INTR_SOURCES) {
1722 		return false;
1723 	}
1724 
1725 	/*
1726 	 * Now we have new CPU and new slot.  Run a cross-call to set up
1727 	 * the new vector on the target CPU.
1728 	 */
1729 	where = xc_unicast(0, intr_redistribute_xc_t, isp,
1730 	    (void *)(intptr_t)nslot, nci);
1731 	xc_wait(where);
1732 
1733 	/*
1734 	 * We're ready to go on the target CPU.  Run a cross call to
1735 	 * reroute the interrupt away from the source CPU.
1736 	 */
1737 	where = xc_unicast(0, intr_redistribute_xc_s1, isp, nci, oci);
1738 	xc_wait(where);
1739 
1740 	/* Sleep for (at least) 10ms to allow the change to take hold. */
1741 	(void)kpause("intrdist", false, mstohz(10), NULL);
1742 
1743 	/* Complete removal from the source CPU. */
1744 	where = xc_unicast(0, intr_redistribute_xc_s2,
1745 	    (void *)(uintptr_t)oslot, NULL, oci);
1746 	xc_wait(where);
1747 
1748 	/* Finally, take care of book-keeping. */
1749 	for (ih = isp->is_handlers; ih != NULL; ih = ih->ih_next) {
1750 		oci->ci_nintrhand--;
1751 		nci->ci_nintrhand++;
1752 		ih->ih_cpu = nci;
1753 	}
1754 	intr_save_evcnt(isp, oci->ci_cpuid);
1755 	intr_restore_evcnt(isp, nci->ci_cpuid);
1756 	isp->is_active_cpu = nci->ci_cpuid;
1757 
1758 	return true;
1759 }
1760 
1761 void
1762 cpu_intr_redistribute(void)
1763 {
1764 	CPU_INFO_ITERATOR cii;
1765 	struct cpu_info *ci;
1766 
1767 	KASSERT(mutex_owned(&cpu_lock));
1768 	KASSERT(mp_online);
1769 
1770 	/* Direct interrupts away from shielded CPUs. */
1771 	for (CPU_INFO_FOREACH(cii, ci)) {
1772 		if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) == 0) {
1773 			continue;
1774 		}
1775 		while (intr_redistribute(ci)) {
1776 			/* nothing */
1777 		}
1778 	}
1779 
1780 	/* XXX should now re-balance */
1781 }
1782 
1783 u_int
1784 cpu_intr_count(struct cpu_info *ci)
1785 {
1786 
1787 	KASSERT(ci->ci_nintrhand >= 0);
1788 
1789 	return ci->ci_nintrhand;
1790 }
1791 
1792 static int
1793 intr_find_unused_slot(struct cpu_info *ci, int *index)
1794 {
1795 	int slot, i;
1796 
1797 	KASSERT(mutex_owned(&cpu_lock));
1798 
1799 	slot = -1;
1800 	for (i = 0; i < MAX_INTR_SOURCES ; i++) {
1801 		if (ci->ci_isources[i] == NULL) {
1802 			slot = i;
1803 			break;
1804 		}
1805 	}
1806 	if (slot == -1) {
1807 		DPRINTF(("cannot allocate ci_isources\n"));
1808 		return EBUSY;
1809 	}
1810 
1811 	*index = slot;
1812 	return 0;
1813 }
1814 
1815 /*
1816  * Let cpu_info ready to accept the interrupt.
1817  */
1818 static void
1819 intr_activate_xcall(void *arg1, void *arg2)
1820 {
1821 	struct cpu_info *ci;
1822 	struct intrsource *source;
1823 	struct intrstub *stubp;
1824 	struct intrhand *ih;
1825 	struct idt_vec *iv;
1826 	u_long psl;
1827 	int idt_vec;
1828 	int slot;
1829 
1830 	ih = arg1;
1831 
1832 	kpreempt_disable();
1833 
1834 	KASSERT(ih->ih_cpu == curcpu() || !mp_online);
1835 
1836 	ci = ih->ih_cpu;
1837 	slot = ih->ih_slot;
1838 	source = ci->ci_isources[slot];
1839 	idt_vec = source->is_idtvec;
1840 	iv = idt_vec_ref(&ci->ci_idtvec);
1841 
1842 	psl = x86_read_psl();
1843 	x86_disable_intr();
1844 
1845 	x86_intr_calculatemasks(ci);
1846 
1847 	if (source->is_type == IST_LEVEL) {
1848 		stubp = &source->is_pic->pic_level_stubs[slot];
1849 	} else {
1850 		stubp = &source->is_pic->pic_edge_stubs[slot];
1851 	}
1852 
1853 	source->is_resume = stubp->ist_resume;
1854 	source->is_recurse = stubp->ist_recurse;
1855 	idt_vec_set(iv, idt_vec, stubp->ist_entry);
1856 
1857 	x86_write_psl(psl);
1858 
1859 	kpreempt_enable();
1860 }
1861 
1862 /*
1863  * Let cpu_info not accept the interrupt.
1864  */
1865 static void
1866 intr_deactivate_xcall(void *arg1, void *arg2)
1867 {
1868 	struct cpu_info *ci;
1869 	struct intrhand *ih, *lih;
1870 	struct intrsource *isp;
1871 	u_long psl;
1872 	int idt_vec;
1873 	int slot;
1874 
1875 	ih = arg1;
1876 
1877 	kpreempt_disable();
1878 
1879 	KASSERT(ih->ih_cpu == curcpu() || !mp_online);
1880 
1881 	ci = ih->ih_cpu;
1882 	slot = ih->ih_slot;
1883 	isp = ci->ci_isources[slot];
1884 	idt_vec = isp->is_idtvec;
1885 
1886 	psl = x86_read_psl();
1887 	x86_disable_intr();
1888 
1889 	/* Move all devices sharing IRQ number. */
1890 	ci->ci_isources[slot] = NULL;
1891 	for (lih = ih; lih != NULL; lih = lih->ih_next) {
1892 		ci->ci_nintrhand--;
1893 	}
1894 
1895 	x86_intr_calculatemasks(ci);
1896 
1897 	if (idt_vec_is_pcpu()) {
1898 		idt_vec_free(&ci->ci_idtvec, idt_vec);
1899 	} else {
1900 		/*
1901 		 * Skip unsetgate(), because the same idt[] entry is
1902 		 * overwritten in intr_activate_xcall().
1903 		 */
1904 	}
1905 
1906 	x86_write_psl(psl);
1907 
1908 	kpreempt_enable();
1909 }
1910 
1911 static void
1912 intr_get_affinity(struct intrsource *isp, kcpuset_t *cpuset)
1913 {
1914 	struct cpu_info *ci;
1915 
1916 	KASSERT(mutex_owned(&cpu_lock));
1917 
1918 	if (isp == NULL) {
1919 		kcpuset_zero(cpuset);
1920 		return;
1921 	}
1922 
1923 	KASSERTMSG(isp->is_handlers != NULL,
1924 	    "Don't get affinity for the device which is not established.");
1925 
1926 	ci = isp->is_handlers->ih_cpu;
1927 	if (ci == NULL) {
1928 		kcpuset_zero(cpuset);
1929 		return;
1930 	}
1931 
1932 	kcpuset_set(cpuset, cpu_index(ci));
1933 	return;
1934 }
1935 
1936 static int
1937 intr_set_affinity(struct intrsource *isp, const kcpuset_t *cpuset)
1938 {
1939 	struct cpu_info *oldci, *newci;
1940 	struct intrhand *ih, *lih;
1941 	struct pic *pic;
1942 	u_int cpu_idx;
1943 	int old_idtvec, new_idtvec;
1944 	int oldslot, newslot;
1945 	int err;
1946 	int pin;
1947 
1948 	KASSERT(mutex_owned(&intr_distribute_lock));
1949 	KASSERT(mutex_owned(&cpu_lock));
1950 
1951 	/* XXX
1952 	 * logical destination mode is not supported, use lowest index cpu.
1953 	 */
1954 	cpu_idx = kcpuset_ffs(cpuset) - 1;
1955 	newci = cpu_lookup(cpu_idx);
1956 	if (newci == NULL) {
1957 		DPRINTF(("invalid cpu index: %u\n", cpu_idx));
1958 		return EINVAL;
1959 	}
1960 	if ((newci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
1961 		DPRINTF(("the cpu is set nointr shield. index:%u\n", cpu_idx));
1962 		return EINVAL;
1963 	}
1964 
1965 	if (isp == NULL) {
1966 		DPRINTF(("invalid intrctl handler\n"));
1967 		return EINVAL;
1968 	}
1969 
1970 	/* i8259_pic supports only primary cpu, see i8259.c. */
1971 	pic = isp->is_pic;
1972 	if (pic == &i8259_pic) {
1973 		DPRINTF(("i8259 pic does not support set_affinity\n"));
1974 		return ENOTSUP;
1975 	}
1976 
1977 	ih = isp->is_handlers;
1978 	KASSERTMSG(ih != NULL,
1979 	    "Don't set affinity for the device which is not established.");
1980 
1981 	oldci = ih->ih_cpu;
1982 	if (newci == oldci) /* nothing to do */
1983 		return 0;
1984 
1985 	oldslot = ih->ih_slot;
1986 
1987 	err = intr_find_unused_slot(newci, &newslot);
1988 	if (err) {
1989 		DPRINTF(("failed to allocate interrupt slot for PIC %s intrid "
1990 			"%s\n", isp->is_pic->pic_name, isp->is_intrid));
1991 		return err;
1992 	}
1993 
1994 	old_idtvec = isp->is_idtvec;
1995 
1996 	if (idt_vec_is_pcpu()) {
1997 		new_idtvec = idt_vec_alloc(&newci->ci_idtvec,
1998 		    APIC_LEVEL(ih->ih_level), IDT_INTR_HIGH);
1999 		if (new_idtvec == 0)
2000 			return EBUSY;
2001 		DPRINTF(("interrupt from cpu%d vec %d to cpu%d vec %d\n",
2002 		    cpu_index(oldci), old_idtvec, cpu_index(newci),
2003 			new_idtvec));
2004 	} else {
2005 		new_idtvec = isp->is_idtvec;
2006 	}
2007 
2008 	/* Prevent intr_unmask() from reenabling the source at the hw. */
2009 	isp->is_distribute_pending = true;
2010 
2011 	pin = isp->is_pin;
2012 	(*pic->pic_hwmask)(pic, pin); /* for ci_ipending check */
2013 	membar_sync();
2014 	while (oldci->ci_ipending & (1ULL << oldslot)) {
2015 		(void)kpause("intrdist", false, 1, &cpu_lock);
2016 		membar_sync();
2017 	}
2018 
2019 	kpreempt_disable();
2020 
2021 	/* deactivate old interrupt setting */
2022 	if (oldci == curcpu() || !mp_online) {
2023 		intr_deactivate_xcall(ih, NULL);
2024 	} else {
2025 		uint64_t where;
2026 		where = xc_unicast(0, intr_deactivate_xcall, ih,
2027 				   NULL, oldci);
2028 		xc_wait(where);
2029 	}
2030 	intr_save_evcnt(isp, oldci->ci_cpuid);
2031 	(*pic->pic_delroute)(pic, oldci, pin, old_idtvec, isp->is_type);
2032 
2033 	/* activate new interrupt setting */
2034 	isp->is_idtvec =  new_idtvec;
2035 	newci->ci_isources[newslot] = isp;
2036 	for (lih = ih; lih != NULL; lih = lih->ih_next) {
2037 		newci->ci_nintrhand++;
2038 		lih->ih_cpu = newci;
2039 		lih->ih_slot = newslot;
2040 	}
2041 	if (newci == curcpu() || !mp_online) {
2042 		intr_activate_xcall(ih, NULL);
2043 	} else {
2044 		uint64_t where;
2045 		where = xc_unicast(0, intr_activate_xcall, ih,
2046 				   NULL, newci);
2047 		xc_wait(where);
2048 	}
2049 	intr_restore_evcnt(isp, newci->ci_cpuid);
2050 	isp->is_active_cpu = newci->ci_cpuid;
2051 	(*pic->pic_addroute)(pic, newci, pin, new_idtvec, isp->is_type);
2052 
2053 	isp->is_distribute_pending = false;
2054 	if (newci == curcpu() || !mp_online) {
2055 		intr_hwunmask_xcall(ih, NULL);
2056 	} else {
2057 		uint64_t where;
2058 		where = xc_unicast(0, intr_hwunmask_xcall, ih, NULL, newci);
2059 		xc_wait(where);
2060 	}
2061 
2062 	kpreempt_enable();
2063 
2064 	return err;
2065 }
2066 
2067 static bool
2068 intr_is_affinity_intrsource(struct intrsource *isp, const kcpuset_t *cpuset)
2069 {
2070 	struct cpu_info *ci;
2071 
2072 	KASSERT(mutex_owned(&cpu_lock));
2073 
2074 	/*
2075 	 * The device is already pci_intr_alloc'ed, however it is not
2076 	 * established yet.
2077 	 */
2078 	if (isp->is_handlers == NULL)
2079 		return false;
2080 
2081 	ci = isp->is_handlers->ih_cpu;
2082 	KASSERT(ci != NULL);
2083 
2084 	return kcpuset_isset(cpuset, cpu_index(ci));
2085 }
2086 
2087 static struct intrhand *
2088 intr_get_handler(const char *intrid)
2089 {
2090 	struct intrsource *isp;
2091 
2092 	KASSERT(mutex_owned(&cpu_lock));
2093 
2094 	isp = intr_get_io_intrsource(intrid);
2095 	if (isp == NULL)
2096 		return NULL;
2097 
2098 	return isp->is_handlers;
2099 }
2100 
2101 uint64_t
2102 x86_intr_get_count(const char *intrid, u_int cpu_idx)
2103 {
2104 	struct cpu_info *ci;
2105 	struct intrsource *isp;
2106 	struct intrhand *ih;
2107 	struct percpu_evcnt pep;
2108 	cpuid_t cpuid;
2109 	int i, slot;
2110 	uint64_t count = 0;
2111 
2112 	KASSERT(mutex_owned(&cpu_lock));
2113 	ci = cpu_lookup(cpu_idx);
2114 	cpuid = ci->ci_cpuid;
2115 
2116 	ih = intr_get_handler(intrid);
2117 	if (ih == NULL) {
2118 		count = 0;
2119 		goto out;
2120 	}
2121 	slot = ih->ih_slot;
2122 	isp = ih->ih_cpu->ci_isources[slot];
2123 
2124 	for (i = 0; i < ncpu; i++) {
2125 		pep = isp->is_saved_evcnt[i];
2126 		if (cpuid == pep.cpuid) {
2127 			if (isp->is_active_cpu == pep.cpuid) {
2128 				count = isp->is_evcnt.ev_count;
2129 				goto out;
2130 			} else {
2131 				count = pep.count;
2132 				goto out;
2133 			}
2134 		}
2135 	}
2136 
2137  out:
2138 	return count;
2139 }
2140 
2141 void
2142 x86_intr_get_assigned(const char *intrid, kcpuset_t *cpuset)
2143 {
2144 	struct cpu_info *ci;
2145 	struct intrhand *ih;
2146 
2147 	KASSERT(mutex_owned(&cpu_lock));
2148 	kcpuset_zero(cpuset);
2149 
2150 	ih = intr_get_handler(intrid);
2151 	if (ih == NULL)
2152 		return;
2153 
2154 	ci = ih->ih_cpu;
2155 	kcpuset_set(cpuset, cpu_index(ci));
2156 }
2157 
2158 void
2159 x86_intr_get_devname(const char *intrid, char *buf, size_t len)
2160 {
2161 	struct intrsource *isp;
2162 	struct intrhand *ih;
2163 	int slot;
2164 
2165 	KASSERT(mutex_owned(&cpu_lock));
2166 
2167 	ih = intr_get_handler(intrid);
2168 	if (ih == NULL) {
2169 		buf[0] = '\0';
2170 		return;
2171 	}
2172 	slot = ih->ih_slot;
2173 	isp = ih->ih_cpu->ci_isources[slot];
2174 	strlcpy(buf, isp->is_xname, len);
2175 
2176 }
2177 
2178 /*
2179  * MI interface for subr_interrupt.c
2180  */
2181 uint64_t
2182 interrupt_get_count(const char *intrid, u_int cpu_idx)
2183 {
2184 	struct intrsource *isp;
2185 	uint64_t count = 0;
2186 
2187 	mutex_enter(&cpu_lock);
2188 	isp = intr_get_io_intrsource(intrid);
2189 	if (isp != NULL)
2190 		count = isp->is_pic->pic_intr_get_count(intrid, cpu_idx);
2191 	mutex_exit(&cpu_lock);
2192 	return count;
2193 }
2194 
2195 /*
2196  * MI interface for subr_interrupt.c
2197  */
2198 void
2199 interrupt_get_assigned(const char *intrid, kcpuset_t *cpuset)
2200 {
2201 	struct intrsource *isp;
2202 
2203 	mutex_enter(&cpu_lock);
2204 	isp = intr_get_io_intrsource(intrid);
2205 	if (isp != NULL)
2206 		isp->is_pic->pic_intr_get_assigned(intrid, cpuset);
2207 	mutex_exit(&cpu_lock);
2208 }
2209 
2210 /*
2211  * MI interface for subr_interrupt.c
2212  */
2213 void
2214 interrupt_get_available(kcpuset_t *cpuset)
2215 {
2216 	CPU_INFO_ITERATOR cii;
2217 	struct cpu_info *ci;
2218 
2219 	kcpuset_zero(cpuset);
2220 
2221 	mutex_enter(&cpu_lock);
2222 	for (CPU_INFO_FOREACH(cii, ci)) {
2223 		if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) == 0) {
2224 			kcpuset_set(cpuset, cpu_index(ci));
2225 		}
2226 	}
2227 	mutex_exit(&cpu_lock);
2228 }
2229 
2230 /*
2231  * MI interface for subr_interrupt.c
2232  */
2233 void
2234 interrupt_get_devname(const char *intrid, char *buf, size_t len)
2235 {
2236 	struct intrsource *isp;
2237 
2238 	mutex_enter(&cpu_lock);
2239 	isp = intr_get_io_intrsource(intrid);
2240 	if (isp != NULL) {
2241 		if (isp->is_pic->pic_intr_get_devname == NULL) {
2242 			printf("NULL get_devname intrid %s pic %s\n",
2243 			    intrid, isp->is_pic->pic_name);
2244 		} else {
2245 			isp->is_pic->pic_intr_get_devname(intrid, buf, len);
2246 		}
2247 	}
2248 	mutex_exit(&cpu_lock);
2249 }
2250 
2251 static int
2252 intr_distribute_locked(struct intrhand *ih, const kcpuset_t *newset,
2253     kcpuset_t *oldset)
2254 {
2255 	struct intrsource *isp;
2256 	int slot;
2257 
2258 	KASSERT(mutex_owned(&intr_distribute_lock));
2259 	KASSERT(mutex_owned(&cpu_lock));
2260 
2261 	if (ih == NULL)
2262 		return EINVAL;
2263 
2264 	slot = ih->ih_slot;
2265 	isp = ih->ih_cpu->ci_isources[slot];
2266 	KASSERT(isp != NULL);
2267 
2268 	if (oldset != NULL)
2269 		intr_get_affinity(isp, oldset);
2270 
2271 	return intr_set_affinity(isp, newset);
2272 }
2273 
2274 /*
2275  * MI interface for subr_interrupt.c
2276  */
2277 int
2278 interrupt_distribute(void *cookie, const kcpuset_t *newset, kcpuset_t *oldset)
2279 {
2280 	int error;
2281 	struct intrhand *ih = cookie;
2282 
2283 	mutex_enter(&intr_distribute_lock);
2284 	mutex_enter(&cpu_lock);
2285 	error = intr_distribute_locked(ih, newset, oldset);
2286 	mutex_exit(&cpu_lock);
2287 	mutex_exit(&intr_distribute_lock);
2288 
2289 	return error;
2290 }
2291 
2292 /*
2293  * MI interface for subr_interrupt.c
2294  */
2295 int
2296 interrupt_distribute_handler(const char *intrid, const kcpuset_t *newset,
2297     kcpuset_t *oldset)
2298 {
2299 	int error;
2300 	struct intrhand *ih;
2301 
2302 	mutex_enter(&intr_distribute_lock);
2303 	mutex_enter(&cpu_lock);
2304 
2305 	ih = intr_get_handler(intrid);
2306 	if (ih == NULL) {
2307 		error = ENOENT;
2308 		goto out;
2309 	}
2310 	error = intr_distribute_locked(ih, newset, oldset);
2311 
2312  out:
2313 	mutex_exit(&cpu_lock);
2314 	mutex_exit(&intr_distribute_lock);
2315 	return error;
2316 }
2317 
2318 /*
2319  * MI interface for subr_interrupt.c
2320  */
2321 struct intrids_handler *
2322 interrupt_construct_intrids(const kcpuset_t *cpuset)
2323 {
2324 	struct intrsource *isp;
2325 	struct intrids_handler *ii_handler;
2326 	intrid_t *ids;
2327 	int i, count;
2328 
2329 	if (kcpuset_iszero(cpuset))
2330 		return 0;
2331 
2332 	/*
2333 	 * Count the number of interrupts which affinity to any cpu of
2334 	 * "cpuset".
2335 	 */
2336 	count = 0;
2337 	mutex_enter(&cpu_lock);
2338 	SIMPLEQ_FOREACH(isp, &io_interrupt_sources, is_list) {
2339 		if (intr_is_affinity_intrsource(isp, cpuset))
2340 			count++;
2341 	}
2342 	mutex_exit(&cpu_lock);
2343 
2344 	ii_handler = kmem_zalloc(sizeof(int) + sizeof(intrid_t) * count,
2345 	    KM_SLEEP);
2346 	if (ii_handler == NULL)
2347 		return NULL;
2348 	ii_handler->iih_nids = count;
2349 	if (count == 0)
2350 		return ii_handler;
2351 
2352 	ids = ii_handler->iih_intrids;
2353 	i = 0;
2354 	mutex_enter(&cpu_lock);
2355 	SIMPLEQ_FOREACH(isp, &io_interrupt_sources, is_list) {
2356 		/* Ignore devices attached after counting "count". */
2357 		if (i >= count) {
2358 			DPRINTF(("New devices are attached after counting.\n"));
2359 			break;
2360 		}
2361 
2362 		if (!intr_is_affinity_intrsource(isp, cpuset))
2363 			continue;
2364 
2365 		strncpy(ids[i], isp->is_intrid, sizeof(intrid_t));
2366 		i++;
2367 	}
2368 	mutex_exit(&cpu_lock);
2369 
2370 	return ii_handler;
2371 }
2372 
2373 /*
2374  * MI interface for subr_interrupt.c
2375  */
2376 void
2377 interrupt_destruct_intrids(struct intrids_handler *ii_handler)
2378 {
2379 	size_t iih_size;
2380 
2381 	if (ii_handler == NULL)
2382 		return;
2383 
2384 	iih_size = sizeof(int) + sizeof(intrid_t) * ii_handler->iih_nids;
2385 	kmem_free(ii_handler, iih_size);
2386 }
2387