xref: /dflybsd-src/usr.sbin/powerd/powerd.c (revision 267c04fd19451e986d2f8130b7f155892c9527f8)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
/*
 * The powerd daemon:
 * - Monitors the cpu load and adjusts cpu and cpu power domain
 *   performance accordingly.
 * - Monitors battery life.  Raises alerts and shuts down the machine
 *   if battery life runs low.
 */
42 
43 #define _KERNEL_STRUCTURES
44 #include <sys/types.h>
45 #include <sys/sysctl.h>
46 #include <sys/kinfo.h>
47 #include <sys/file.h>
48 #include <sys/queue.h>
49 #include <sys/soundcard.h>
50 #include <sys/time.h>
51 #include <machine/cpufunc.h>
52 #include <err.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <unistd.h>
56 #include <string.h>
57 #include <syslog.h>
58 
59 #include "alert1.h"
60 
61 #define MAXDOM		MAXCPU	/* worst case, 1 cpu per domain */
62 
63 #define MAXFREQ		64
64 #define CST_STRLEN	16
65 
/*
 * One ACPI cpu power domain (hw.acpi.cpu.px_domN) and the cpus that
 * belong to it.
 */
struct cpu_pwrdom {
	TAILQ_ENTRY(cpu_pwrdom)	dom_link;	/* list linkage */
	int			dom_id;		/* N of px_domN */
	int			dom_ncpus;	/* # of member cpus */
	cpumask_t		dom_cpumask;	/* member cpus */
};
72 
/*
 * Load tracking state, kept once per cpu and once for the whole
 * machine ("global").
 */
struct cpu_state {
	double			cpu_qavg;	/* load of the last poll */
	double			cpu_uavg;	/* used for speeding up */
	double			cpu_davg;	/* used for slowing down */
	int			cpu_limit;	/* previously chosen state */
	int			cpu_count;	/* upper bound for the state */
	char			cpu_name[8];	/* label for debug output */
};
81 
82 static void usage(void);
83 static void get_ncpus(void);
84 
85 /* usched cpumask */
86 static void get_uschedcpus(void);
87 static void set_uschedcpus(void);
88 
89 /* perfbias(4) */
90 static int has_perfbias(void);
91 static void set_perfbias(int, int);
92 
93 /* acpi(4) P-state */
94 static void acpi_getcpufreq_str(int, int *, int *);
95 static int acpi_getcpufreq_bin(int, int *, int *);
96 static void acpi_get_cpufreq(int, int *, int *);
97 static void acpi_set_cpufreq(int, int);
98 static int acpi_get_cpupwrdom(void);
99 
100 /* mwait C-state hint */
101 static int probe_cstate(void);
102 static void set_cstate(int, int);
103 
104 /* Performance monitoring */
105 static void init_perf(void);
106 static void mon_perf(double);
107 static void adj_perf(cpumask_t, cpumask_t);
108 static void adj_cpu_pwrdom(int, int);
109 static void adj_cpu_perf(int, int);
110 static void get_cputime(double);
111 static int get_nstate(struct cpu_state *, double);
112 static void add_spare_cpus(const cpumask_t, int);
113 static void restore_perf(void);
114 
115 /* Battery monitoring */
116 static int has_battery(void);
117 static int mon_battery(void);
118 static void low_battery_alert(int);
119 
/* Runtime states for performance monitoring */
static int global_pcpu_limit;		/* sum of per-cpu states, last poll */
static struct cpu_state pcpu_state[MAXCPU];
static struct cpu_state global_cpu_state;
static cpumask_t cpu_used;		/* cpus w/ high perf */
static cpumask_t cpu_pwrdom_used;	/* cpu power domains w/ high perf */
static cpumask_t usched_cpu_used;	/* cpus for usched */

/* Constants */
static cpumask_t cpu_pwrdom_mask;	/* usable cpu power domains */
static int cpu2pwrdom[MAXCPU];		/* cpu to cpu power domain map */
static struct cpu_pwrdom *cpu_pwrdomain[MAXDOM];
static int NCpus;			/* # of cpus */
static char orig_global_cx[CST_STRLEN];	/* machdep.mwait.CX.idle at start */
static char cpu_perf_cx[CST_STRLEN];	/* C-state hint for busy cpus */
static int cpu_perf_cxlen;		/* strlen(cpu_perf_cx) + 1 */
static char cpu_idle_cx[CST_STRLEN];	/* C-state hint for idle cpus */
static int cpu_idle_cxlen;		/* strlen(cpu_idle_cx) + 1 */

static int DebugOpt;		/* -d: stay in foreground, print debug info */
static int TurboOpt = 1;	/* cleared by -t */
static int PowerFd;		/* fd of /var/run/powerd.pid */
static int Hysteresis = 10;	/* percentage */
static double TriggerUp = 0.25;	/* single-cpu load to force max freq */
static double TriggerDown;	/* load per cpu to force the min freq */
static int HasPerfbias = 0;	/* -e, then result of has_perfbias() */
static int AdjustCpuFreq = 1;	/* cleared by -f */
static int AdjustCstate = 0;	/* -c, then result of probe_cstate() */

/* Set by the signal handler to leave the monitoring loop */
static volatile int stopped;

/* Battery life monitoring */
static int BatLifeMin = 2;	/* shutdown the box, if low on battery life */
static struct timespec BatLifePrevT;	/* time of the last battery poll */
static int BatLifePollIntvl = 5; /* unit: sec */
static struct timespec BatShutdownStartT; /* when lingering started */
/* < 0: not lingering, > 0: lingering (sec), 0: linger time expired */
static int BatShutdownLinger = -1;
static int BatShutdownLingerSet = 60; /* unit: sec */
static int BatShutdownLingerCnt;	/* # of low battery alerts issued */
static int BatShutdownAudioAlert = 1;	/* cleared by -Q */

static void sigintr(int signo);
162 
163 int
164 main(int ac, char **av)
165 {
166 	double srt;
167 	double pollrate;
168 	int ch;
169 	char buf[64];
170 	int monbat;
171 
172 	srt = 8.0;	/* time for samples - 8 seconds */
173 	pollrate = 1.0;	/* polling rate in seconds */
174 
175 	while ((ch = getopt(ac, av, "cdefp:r:tu:B:L:P:QT:")) != -1) {
176 		switch(ch) {
177 		case 'c':
178 			AdjustCstate = 1;
179 			break;
180 		case 'd':
181 			DebugOpt = 1;
182 			break;
183 		case 'e':
184 			HasPerfbias = 1;
185 			break;
186 		case 'f':
187 			AdjustCpuFreq = 0;
188 			break;
189 		case 'p':
190 			Hysteresis = (int)strtol(optarg, NULL, 10);
191 			break;
192 		case 'r':
193 			pollrate = strtod(optarg, NULL);
194 			break;
195 		case 't':
196 			TurboOpt = 0;
197 			break;
198 		case 'u':
199 			TriggerUp = (double)strtol(optarg, NULL, 10) / 100;
200 			break;
201 		case 'B':
202 			BatLifeMin = strtol(optarg, NULL, 10);
203 			break;
204 		case 'L':
205 			BatShutdownLingerSet = strtol(optarg, NULL, 10);
206 			if (BatShutdownLingerSet < 0)
207 				BatShutdownLingerSet = 0;
208 			break;
209 		case 'P':
210 			BatLifePollIntvl = strtol(optarg, NULL, 10);
211 			break;
212 		case 'Q':
213 			BatShutdownAudioAlert = 0;
214 			break;
215 		case 'T':
216 			srt = strtod(optarg, NULL);
217 			break;
218 		default:
219 			usage();
220 			/* NOT REACHED */
221 		}
222 	}
223 	ac -= optind;
224 	av += optind;
225 
226 	setlinebuf(stdout);
227 
228 	/* Get number of cpus */
229 	get_ncpus();
230 
231 	if (0 > Hysteresis || Hysteresis > 99) {
232 		fprintf(stderr, "Invalid hysteresis value\n");
233 		exit(1);
234 	}
235 
236 	if (0 > TriggerUp || TriggerUp > 1) {
237 		fprintf(stderr, "Invalid load limit value\n");
238 		exit(1);
239 	}
240 
241 	TriggerDown = TriggerUp - (TriggerUp * (double) Hysteresis / 100);
242 
243 	/*
244 	 * Make sure powerd is not already running.
245 	 */
246 	PowerFd = open("/var/run/powerd.pid", O_CREAT|O_RDWR, 0644);
247 	if (PowerFd < 0) {
248 		fprintf(stderr,
249 			"Cannot create /var/run/powerd.pid, "
250 			"continuing anyway\n");
251 	} else {
252 		if (flock(PowerFd, LOCK_EX|LOCK_NB) < 0) {
253 			fprintf(stderr, "powerd is already running\n");
254 			exit(1);
255 		}
256 	}
257 
258 	/*
259 	 * Demonize and set pid
260 	 */
261 	if (DebugOpt == 0) {
262 		daemon(0, 0);
263 		openlog("powerd", LOG_CONS | LOG_PID, LOG_DAEMON);
264 	}
265 
266 	if (PowerFd >= 0) {
267 		ftruncate(PowerFd, 0);
268 		snprintf(buf, sizeof(buf), "%d\n", (int)getpid());
269 		write(PowerFd, buf, strlen(buf));
270 	}
271 
272 	/* Do we need to monitor battery life? */
273 	if (BatLifePollIntvl <= 0)
274 		monbat = 0;
275 	else
276 		monbat = has_battery();
277 
278 	/* Do we have perfbias(4)? */
279 	if (HasPerfbias)
280 		HasPerfbias = has_perfbias();
281 
282 	/* Could we adjust C-state? */
283 	if (AdjustCstate)
284 		AdjustCstate = probe_cstate();
285 
286 	/*
287 	 * Wait hw.acpi.cpu.px_dom* sysctl to be created by kernel.
288 	 *
289 	 * Since hw.acpi.cpu.px_dom* creation is queued into ACPI
290 	 * taskqueue and ACPI taskqueue is shared across various
291 	 * ACPI modules, any delay in other modules may cause
292 	 * hw.acpi.cpu.px_dom* to be created at quite a later time
293 	 * (e.g. cmbat module's task could take quite a lot of time).
294 	 */
295 	for (;;) {
296 		/* Prime delta cputime calculation. */
297 		get_cputime(pollrate);
298 
299 		/* Wait for all cpus to appear */
300 		if (acpi_get_cpupwrdom())
301 			break;
302 		usleep((int)(pollrate * 1000000.0));
303 	}
304 
305 	/*
306 	 * Catch some signals so that max performance could be restored.
307 	 */
308 	signal(SIGINT, sigintr);
309 	signal(SIGTERM, sigintr);
310 
311 	/* Initialize performance states */
312 	init_perf();
313 
314 	srt = srt / pollrate;	/* convert to sample count */
315 	if (DebugOpt)
316 		printf("samples for downgrading: %5.2f\n", srt);
317 
318 	/*
319 	 * Monitoring loop
320 	 */
321 	while (!stopped) {
322 		/*
323 		 * Monitor performance
324 		 */
325 		get_cputime(pollrate);
326 		mon_perf(srt);
327 
328 		/*
329 		 * Monitor battery
330 		 */
331 		if (monbat)
332 			monbat = mon_battery();
333 
334 		usleep((int)(pollrate * 1000000.0));
335 	}
336 
337 	/*
338 	 * Set to maximum performance if killed.
339 	 */
340 	syslog(LOG_INFO, "killed, setting max and exiting");
341 	restore_perf();
342 
343 	exit(0);
344 }
345 
/*
 * SIGINT/SIGTERM handler: only raises the 'stopped' flag; the
 * monitoring loop in main() notices it and performs the cleanup.
 */
static void
sigintr(int signo __unused)
{
	stopped = 1;
}
351 
352 /*
353  * Figure out the cpu power domains.
354  */
355 static int
356 acpi_get_cpupwrdom(void)
357 {
358 	struct cpu_pwrdom *dom;
359 	cpumask_t pwrdom_mask;
360 	char buf[64];
361 	char members[1024];
362 	char *str;
363 	size_t msize;
364 	int n, i, ncpu = 0, dom_id;
365 
366 	memset(cpu2pwrdom, 0, sizeof(cpu2pwrdom));
367 	memset(cpu_pwrdomain, 0, sizeof(cpu_pwrdomain));
368 	CPUMASK_ASSZERO(cpu_pwrdom_mask);
369 
370 	for (i = 0; i < MAXDOM; ++i) {
371 		snprintf(buf, sizeof(buf),
372 			 "hw.acpi.cpu.px_dom%d.available", i);
373 		if (sysctlbyname(buf, NULL, NULL, NULL, 0) < 0)
374 			continue;
375 
376 		dom = calloc(1, sizeof(*dom));
377 		dom->dom_id = i;
378 
379 		if (cpu_pwrdomain[i] != NULL) {
380 			fprintf(stderr, "cpu power domain %d exists\n", i);
381 			exit(1);
382 		}
383 		cpu_pwrdomain[i] = dom;
384 		CPUMASK_ORBIT(cpu_pwrdom_mask, i);
385 	}
386 	pwrdom_mask = cpu_pwrdom_mask;
387 
388 	while (CPUMASK_TESTNZERO(pwrdom_mask)) {
389 		dom_id = BSFCPUMASK(pwrdom_mask);
390 		CPUMASK_NANDBIT(pwrdom_mask, dom_id);
391 		dom = cpu_pwrdomain[dom_id];
392 
393 		CPUMASK_ASSZERO(dom->dom_cpumask);
394 
395 		snprintf(buf, sizeof(buf),
396 			 "hw.acpi.cpu.px_dom%d.members", dom->dom_id);
397 		msize = sizeof(members);
398 		if (sysctlbyname(buf, members, &msize, NULL, 0) < 0) {
399 			cpu_pwrdomain[dom_id] = NULL;
400 			free(dom);
401 			continue;
402 		}
403 
404 		members[msize] = 0;
405 		for (str = strtok(members, " "); str; str = strtok(NULL, " ")) {
406 			n = -1;
407 			sscanf(str, "cpu%d", &n);
408 			if (n >= 0) {
409 				++ncpu;
410 				++dom->dom_ncpus;
411 				CPUMASK_ORBIT(dom->dom_cpumask, n);
412 				cpu2pwrdom[n] = dom->dom_id;
413 			}
414 		}
415 		if (dom->dom_ncpus == 0) {
416 			cpu_pwrdomain[dom_id] = NULL;
417 			free(dom);
418 			continue;
419 		}
420 		if (DebugOpt) {
421 			printf("dom%d cpumask: ", dom->dom_id);
422 			for (i = 0; i < (int)NELEM(dom->dom_cpumask.ary); ++i) {
423 				printf("%jx ",
424 				    (uintmax_t)dom->dom_cpumask.ary[i]);
425 			}
426 			printf("\n");
427 		}
428 	}
429 
430 	if (ncpu != NCpus) {
431 		if (DebugOpt)
432 			printf("Found %d cpus, expecting %d\n", ncpu, NCpus);
433 
434 		pwrdom_mask = cpu_pwrdom_mask;
435 		while (CPUMASK_TESTNZERO(pwrdom_mask)) {
436 			dom_id = BSFCPUMASK(pwrdom_mask);
437 			CPUMASK_NANDBIT(pwrdom_mask, dom_id);
438 			dom = cpu_pwrdomain[dom_id];
439 			if (dom != NULL)
440 				free(dom);
441 		}
442 		return 0;
443 	}
444 	return 1;
445 }
446 
447 /*
448  * Save per-cpu load and sum of per-cpu load.
449  */
450 static void
451 get_cputime(double pollrate)
452 {
453 	static struct kinfo_cputime ocpu_time[MAXCPU];
454 	static struct kinfo_cputime ncpu_time[MAXCPU];
455 	size_t slen;
456 	int ncpu;
457 	int cpu;
458 	uint64_t delta;
459 
460 	bcopy(ncpu_time, ocpu_time, sizeof(struct kinfo_cputime) * NCpus);
461 
462 	slen = sizeof(ncpu_time);
463 	if (sysctlbyname("kern.cputime", &ncpu_time, &slen, NULL, 0) < 0) {
464 		fprintf(stderr, "kern.cputime sysctl not available\n");
465 		exit(1);
466 	}
467 	ncpu = slen / sizeof(ncpu_time[0]);
468 
469 	delta = 0;
470 	for (cpu = 0; cpu < ncpu; ++cpu) {
471 		uint64_t d;
472 
473 		d = (ncpu_time[cpu].cp_user + ncpu_time[cpu].cp_sys +
474 		     ncpu_time[cpu].cp_nice + ncpu_time[cpu].cp_intr) -
475 		    (ocpu_time[cpu].cp_user + ocpu_time[cpu].cp_sys +
476 		     ocpu_time[cpu].cp_nice + ocpu_time[cpu].cp_intr);
477 		pcpu_state[cpu].cpu_qavg = (double)d / (pollrate * 1000000.0);
478 
479 		delta += d;
480 	}
481 	global_cpu_state.cpu_qavg = (double)delta / (pollrate * 1000000.0);
482 }
483 
484 static void
485 acpi_getcpufreq_str(int dom_id, int *highest0, int *lowest0)
486 {
487 	char buf[256], sysid[64];
488 	size_t buflen;
489 	char *ptr;
490 	int v, highest, lowest;
491 
492 	/*
493 	 * Retrieve availability list
494 	 */
495 	snprintf(sysid, sizeof(sysid), "hw.acpi.cpu.px_dom%d.available",
496 	    dom_id);
497 	buflen = sizeof(buf) - 1;
498 	if (sysctlbyname(sysid, buf, &buflen, NULL, 0) < 0)
499 		return;
500 	buf[buflen] = 0;
501 
502 	/*
503 	 * Parse out the highest and lowest cpu frequencies
504 	 */
505 	ptr = buf;
506 	highest = lowest = 0;
507 	while (ptr && (v = strtol(ptr, &ptr, 10)) > 0) {
508 		if (lowest == 0 || lowest > v)
509 			lowest = v;
510 		if (highest == 0 || highest < v)
511 			highest = v;
512 		/*
513 		 * Detect turbo mode
514 		 */
515 		if (!TurboOpt && highest - v == 1)
516 			highest = v;
517 	}
518 
519 	*highest0 = highest;
520 	*lowest0 = lowest;
521 }
522 
523 static int
524 acpi_getcpufreq_bin(int dom_id, int *highest0, int *lowest0)
525 {
526 	char sysid[64];
527 	int freq[MAXFREQ];
528 	size_t freqlen;
529 	int freqcnt;
530 
531 	/*
532 	 * Retrieve availability list
533 	 */
534 	snprintf(sysid, sizeof(sysid), "hw.acpi.cpu.px_dom%d.avail", dom_id);
535 	freqlen = sizeof(freq);
536 	if (sysctlbyname(sysid, freq, &freqlen, NULL, 0) < 0)
537 		return 0;
538 
539 	freqcnt = freqlen / sizeof(freq[0]);
540 	if (freqcnt == 0)
541 		return 0;
542 
543 	*lowest0 = freq[freqcnt - 1];
544 
545 	*highest0 = freq[0];
546 	if (!TurboOpt && freqcnt > 1 && freq[0] - freq[1] == 1)
547 		*highest0 = freq[1];
548 	return 1;
549 }
550 
/*
 * Look up the frequency range of a power domain, preferring the
 * binary sysctl and falling back to the textual one.  Both outputs
 * are 0 if neither source yields anything.
 */
static void
acpi_get_cpufreq(int dom_id, int *highest, int *lowest)
{
	*lowest = 0;
	*highest = 0;

	if (!acpi_getcpufreq_bin(dom_id, highest, lowest))
		acpi_getcpufreq_str(dom_id, highest, lowest);
}
561 
/*
 * Print the command line synopsis to stderr and exit with status 1.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: powerd [-cdeftQ] [-p hysteresis] "
	    "[-r poll_interval] [-u trigger_up] "
	    "[-B min_battery_life] [-L low_battery_linger] "
	    "[-P battery_poll_interval] [-T sample_interval]\n";

	fputs(synopsis, stderr);
	exit(1);
}
572 
/*
 * Fallback used when the system headers do not provide timespecsub():
 * subtracts *uvp from *vvp in place, borrowing one second whenever the
 * nanosecond field goes negative.
 */
#ifndef timespecsub
#define timespecsub(vvp, uvp)						\
	do {								\
		(vvp)->tv_sec -= (uvp)->tv_sec;				\
		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
		if ((vvp)->tv_nsec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_nsec += 1000000000;			\
		}							\
	} while (0)
#endif

/*
 * Longest time a single battery related sysctl read may take before
 * has_battery() decides that battery monitoring is not worthwhile.
 */
#define BAT_SYSCTL_TIME_MAX	50000000 /* unit: nanosecond */
586 
587 static int
588 has_battery(void)
589 {
590 	struct timespec s, e;
591 	size_t len;
592 	int val;
593 
594 	clock_gettime(CLOCK_MONOTONIC_FAST, &s);
595 	BatLifePrevT = s;
596 
597 	len = sizeof(val);
598 	if (sysctlbyname("hw.acpi.acline", &val, &len, NULL, 0) < 0) {
599 		/* No AC line information */
600 		return 0;
601 	}
602 	clock_gettime(CLOCK_MONOTONIC_FAST, &e);
603 
604 	timespecsub(&e, &s);
605 	if (e.tv_sec > 0 || e.tv_nsec > BAT_SYSCTL_TIME_MAX) {
606 		/* hw.acpi.acline takes to long to be useful */
607 		syslog(LOG_NOTICE, "hw.acpi.acline takes too long");
608 		return 0;
609 	}
610 
611 	clock_gettime(CLOCK_MONOTONIC_FAST, &s);
612 	len = sizeof(val);
613 	if (sysctlbyname("hw.acpi.battery.life", &val, &len, NULL, 0) < 0) {
614 		/* No battery life */
615 		return 0;
616 	}
617 	clock_gettime(CLOCK_MONOTONIC_FAST, &e);
618 
619 	timespecsub(&e, &s);
620 	if (e.tv_sec > 0 || e.tv_nsec > BAT_SYSCTL_TIME_MAX) {
621 		/* hw.acpi.battery.life takes to long to be useful */
622 		syslog(LOG_NOTICE, "hw.acpi.battery.life takes too long");
623 		return 0;
624 	}
625 	return 1;
626 }
627 
628 static void
629 low_battery_alert(int life)
630 {
631 	int fmt, stereo, freq;
632 	int fd;
633 
634 	syslog(LOG_ALERT, "low battery life %d%%, please plugin AC line, #%d",
635 	    life, BatShutdownLingerCnt);
636 	++BatShutdownLingerCnt;
637 
638 	if (!BatShutdownAudioAlert)
639 		return;
640 
641 	fd = open("/dev/dsp", O_WRONLY);
642 	if (fd < 0)
643 		return;
644 
645 	fmt = AFMT_S16_LE;
646 	if (ioctl(fd, SNDCTL_DSP_SETFMT, &fmt, sizeof(fmt)) < 0)
647 		goto done;
648 
649 	stereo = 0;
650 	if (ioctl(fd, SNDCTL_DSP_STEREO, &stereo, sizeof(stereo)) < 0)
651 		goto done;
652 
653 	freq = 44100;
654 	if (ioctl(fd, SNDCTL_DSP_SPEED, &freq, sizeof(freq)) < 0)
655 		goto done;
656 
657 	write(fd, alert1, sizeof(alert1));
658 	write(fd, alert1, sizeof(alert1));
659 
660 done:
661 	close(fd);
662 }
663 
664 static int
665 mon_battery(void)
666 {
667 	struct timespec cur, ts;
668 	int acline, life;
669 	size_t len;
670 
671 	clock_gettime(CLOCK_MONOTONIC_FAST, &cur);
672 	ts = cur;
673 	timespecsub(&ts, &BatLifePrevT);
674 	if (ts.tv_sec < BatLifePollIntvl)
675 		return 1;
676 	BatLifePrevT = cur;
677 
678 	len = sizeof(acline);
679 	if (sysctlbyname("hw.acpi.acline", &acline, &len, NULL, 0) < 0)
680 		return 1;
681 	if (acline) {
682 		BatShutdownLinger = -1;
683 		BatShutdownLingerCnt = 0;
684 		return 1;
685 	}
686 
687 	len = sizeof(life);
688 	if (sysctlbyname("hw.acpi.battery.life", &life, &len, NULL, 0) < 0)
689 		return 1;
690 
691 	if (BatShutdownLinger > 0) {
692 		ts = cur;
693 		timespecsub(&ts, &BatShutdownStartT);
694 		if (ts.tv_sec > BatShutdownLinger)
695 			BatShutdownLinger = 0;
696 	}
697 
698 	if (life <= BatLifeMin) {
699 		if (BatShutdownLinger == 0 || BatShutdownLingerSet == 0) {
700 			syslog(LOG_ALERT, "low battery life %d%%, "
701 			    "shutting down", life);
702 			if (vfork() == 0)
703 				execlp("poweroff", "poweroff", NULL);
704 			return 0;
705 		} else if (BatShutdownLinger < 0) {
706 			BatShutdownLinger = BatShutdownLingerSet;
707 			BatShutdownStartT = cur;
708 		}
709 		low_battery_alert(life);
710 	}
711 	return 1;
712 }
713 
714 static void
715 get_ncpus(void)
716 {
717 	size_t slen;
718 
719 	slen = sizeof(NCpus);
720 	if (sysctlbyname("hw.ncpu", &NCpus, &slen, NULL, 0) < 0)
721 		err(1, "sysctlbyname hw.ncpu failed");
722 	if (DebugOpt)
723 		printf("hw.ncpu %d\n", NCpus);
724 }
725 
726 static void
727 get_uschedcpus(void)
728 {
729 	size_t slen;
730 
731 	slen = sizeof(usched_cpu_used);
732 	if (sysctlbyname("kern.usched_global_cpumask", &usched_cpu_used, &slen,
733 	    NULL, 0) < 0)
734 		err(1, "sysctlbyname kern.usched_global_cpumask failed");
735 	if (DebugOpt) {
736 		int i;
737 
738 		printf("usched cpumask was: ");
739 		for (i = 0; i < (int)NELEM(usched_cpu_used.ary); ++i)
740 			printf("%jx ", (uintmax_t)usched_cpu_used.ary[i]);
741 		printf("\n");
742 	}
743 }
744 
745 static void
746 set_uschedcpus(void)
747 {
748 	if (DebugOpt) {
749 		int i;
750 
751 		printf("usched cpumask: ");
752 		for (i = 0; i < (int)NELEM(usched_cpu_used.ary); ++i) {
753 			printf("%jx ",
754 			    (uintmax_t)usched_cpu_used.ary[i]);
755 		}
756 		printf("\n");
757 	}
758 	sysctlbyname("kern.usched_global_cpumask", NULL, 0,
759 	    &usched_cpu_used, sizeof(usched_cpu_used));
760 }
761 
/*
 * Report whether perfbias(4) is usable, judged by whether
 * machdep.perfbias0.hint can be read.  Returns 1 if so, 0 otherwise.
 */
static int
has_perfbias(void)
{
	int hint;
	size_t len = sizeof(hint);

	return (sysctlbyname("machdep.perfbias0.hint", &hint, &len,
	    NULL, 0) >= 0);
}
773 
774 static void
775 set_perfbias(int cpu, int inc)
776 {
777 	int hint = inc ? 0 : 15;
778 	char sysid[64];
779 
780 	if (DebugOpt)
781 		printf("cpu%d set perfbias hint %d\n", cpu, hint);
782 	snprintf(sysid, sizeof(sysid), "machdep.perfbias%d.hint", cpu);
783 	sysctlbyname(sysid, NULL, NULL, &hint, sizeof(hint));
784 }
785 
786 static void
787 init_perf(void)
788 {
789 	struct cpu_state *state;
790 	int cpu;
791 
792 	/* Get usched cpumask */
793 	get_uschedcpus();
794 
795 	/*
796 	 * Assume everything are used and are maxed out, before we
797 	 * start.
798 	 */
799 
800 	CPUMASK_ASSBMASK(cpu_used, NCpus);
801 	cpu_pwrdom_used = cpu_pwrdom_mask;
802 	global_pcpu_limit = NCpus;
803 
804 	for (cpu = 0; cpu < NCpus; ++cpu) {
805 		state = &pcpu_state[cpu];
806 
807 		state->cpu_uavg = 0.0;
808 		state->cpu_davg = 0.0;
809 		state->cpu_limit = 1;
810 		state->cpu_count = 1;
811 		snprintf(state->cpu_name, sizeof(state->cpu_name), "cpu%d",
812 		    cpu);
813 	}
814 
815 	state = &global_cpu_state;
816 	state->cpu_uavg = 0.0;
817 	state->cpu_davg = 0.0;
818 	state->cpu_limit = NCpus;
819 	state->cpu_count = NCpus;
820 	strlcpy(state->cpu_name, "global", sizeof(state->cpu_name));
821 }
822 
823 static int
824 get_nstate(struct cpu_state *state, double srt)
825 {
826 	int ustate, dstate, nstate;
827 
828 	/* speeding up */
829 	state->cpu_uavg = (state->cpu_uavg * 2.0 + state->cpu_qavg) / 3.0;
830 	/* slowing down */
831 	state->cpu_davg = (state->cpu_davg * srt + state->cpu_qavg) / (srt + 1);
832 	if (state->cpu_davg < state->cpu_uavg)
833 		state->cpu_davg = state->cpu_uavg;
834 
835 	ustate = state->cpu_uavg / TriggerUp;
836 	if (ustate < state->cpu_limit)
837 		ustate = state->cpu_uavg / TriggerDown;
838 	dstate = state->cpu_davg / TriggerUp;
839 	if (dstate < state->cpu_limit)
840 		dstate = state->cpu_davg / TriggerDown;
841 
842 	nstate = (ustate > dstate) ? ustate : dstate;
843 	if (nstate > state->cpu_count)
844 		nstate = state->cpu_count;
845 
846 	if (DebugOpt) {
847 		printf("%s qavg=%5.2f uavg=%5.2f davg=%5.2f "
848 		    "%2d ncpus=%d\n", state->cpu_name,
849 		    state->cpu_qavg, state->cpu_uavg, state->cpu_davg,
850 		    state->cpu_limit, nstate);
851 	}
852 	return nstate;
853 }
854 
855 static void
856 mon_perf(double srt)
857 {
858 	cpumask_t ocpu_used, ocpu_pwrdom_used;
859 	int pnstate = 0, nstate;
860 	int cpu;
861 
862 	/*
863 	 * Find cpus requiring performance and their cooresponding power
864 	 * domains.  Save the number of cpus requiring performance in
865 	 * pnstate.
866 	 */
867 	ocpu_used = cpu_used;
868 	ocpu_pwrdom_used = cpu_pwrdom_used;
869 
870 	CPUMASK_ASSZERO(cpu_used);
871 	CPUMASK_ASSZERO(cpu_pwrdom_used);
872 
873 	for (cpu = 0; cpu < NCpus; ++cpu) {
874 		struct cpu_state *state = &pcpu_state[cpu];
875 		int s;
876 
877 		s = get_nstate(state, srt);
878 		if (s) {
879 			CPUMASK_ORBIT(cpu_used, cpu);
880 			CPUMASK_ORBIT(cpu_pwrdom_used, cpu2pwrdom[cpu]);
881 		}
882 		pnstate += s;
883 
884 		state->cpu_limit = s;
885 	}
886 
887 	/*
888 	 * Calculate nstate, the number of cpus we wish to run at max
889 	 * performance.
890 	 */
891 	nstate = get_nstate(&global_cpu_state, srt);
892 
893 	if (nstate == global_cpu_state.cpu_limit &&
894 	    (pnstate == global_pcpu_limit || nstate > pnstate)) {
895 		/* Nothing changed; keep the sets */
896 		cpu_used = ocpu_used;
897 		cpu_pwrdom_used = ocpu_pwrdom_used;
898 
899 		global_pcpu_limit = pnstate;
900 		return;
901 	}
902 	global_pcpu_limit = pnstate;
903 
904 	if (nstate > pnstate) {
905 		/*
906 		 * Add spare cpus to meet global performance requirement.
907 		 */
908 		add_spare_cpus(ocpu_used, nstate - pnstate);
909 	}
910 
911 	global_cpu_state.cpu_limit = nstate;
912 
913 	/*
914 	 * Adjust cpu and cpu power domain performance
915 	 */
916 	adj_perf(ocpu_used, ocpu_pwrdom_used);
917 }
918 
919 static void
920 add_spare_cpus(const cpumask_t ocpu_used, int ncpu)
921 {
922 	cpumask_t saved_pwrdom, xcpu_used;
923 	int done = 0, cpu;
924 
925 	/*
926 	 * Find more cpus in the previous cpu set.
927 	 */
928 	xcpu_used = cpu_used;
929 	CPUMASK_XORMASK(xcpu_used, ocpu_used);
930 	while (CPUMASK_TESTNZERO(xcpu_used)) {
931 		cpu = BSFCPUMASK(xcpu_used);
932 		CPUMASK_NANDBIT(xcpu_used, cpu);
933 
934 		if (CPUMASK_TESTBIT(ocpu_used, cpu)) {
935 			CPUMASK_ORBIT(cpu_pwrdom_used, cpu2pwrdom[cpu]);
936 			CPUMASK_ORBIT(cpu_used, cpu);
937 			--ncpu;
938 			if (ncpu == 0)
939 				return;
940 		}
941 	}
942 
943 	/*
944 	 * Find more cpus in the used cpu power domains.
945 	 */
946 	saved_pwrdom = cpu_pwrdom_used;
947 again:
948 	while (CPUMASK_TESTNZERO(saved_pwrdom)) {
949 		cpumask_t unused_cpumask;
950 		int dom;
951 
952 		dom = BSFCPUMASK(saved_pwrdom);
953 		CPUMASK_NANDBIT(saved_pwrdom, dom);
954 
955 		unused_cpumask = cpu_pwrdomain[dom]->dom_cpumask;
956 		CPUMASK_NANDMASK(unused_cpumask, cpu_used);
957 
958 		while (CPUMASK_TESTNZERO(unused_cpumask)) {
959 			cpu = BSFCPUMASK(unused_cpumask);
960 			CPUMASK_NANDBIT(unused_cpumask, cpu);
961 
962 			CPUMASK_ORBIT(cpu_pwrdom_used, dom);
963 			CPUMASK_ORBIT(cpu_used, cpu);
964 			--ncpu;
965 			if (ncpu == 0)
966 				return;
967 		}
968 	}
969 	if (!done) {
970 		done = 1;
971 		/*
972 		 * Find more cpus in unused cpu power domains
973 		 */
974 		saved_pwrdom = cpu_pwrdom_mask;
975 		CPUMASK_NANDMASK(saved_pwrdom, cpu_pwrdom_used);
976 		goto again;
977 	}
978 	if (DebugOpt)
979 		printf("%d cpus not found\n", ncpu);
980 }
981 
982 static void
983 acpi_set_cpufreq(int dom, int inc)
984 {
985 	int lowest, highest, desired;
986 	char sysid[64];
987 
988 	acpi_get_cpufreq(dom, &highest, &lowest);
989 	if (highest == 0 || lowest == 0)
990 		return;
991 	desired = inc ? highest : lowest;
992 
993 	if (DebugOpt)
994 		printf("dom%d set frequency %d\n", dom, desired);
995 	snprintf(sysid, sizeof(sysid), "hw.acpi.cpu.px_dom%d.select", dom);
996 	sysctlbyname(sysid, NULL, NULL, &desired, sizeof(desired));
997 }
998 
999 static void
1000 adj_cpu_pwrdom(int dom, int inc)
1001 {
1002 	if (AdjustCpuFreq)
1003 		acpi_set_cpufreq(dom, inc);
1004 }
1005 
1006 static void
1007 adj_cpu_perf(int cpu, int inc)
1008 {
1009 	if (DebugOpt) {
1010 		if (inc)
1011 			printf("cpu%d increase perf\n", cpu);
1012 		else
1013 			printf("cpu%d decrease perf\n", cpu);
1014 	}
1015 
1016 	if (HasPerfbias)
1017 		set_perfbias(cpu, inc);
1018 	if (AdjustCstate)
1019 		set_cstate(cpu, inc);
1020 }
1021 
1022 static void
1023 adj_perf(cpumask_t xcpu_used, cpumask_t xcpu_pwrdom_used)
1024 {
1025 	cpumask_t old_usched_used;
1026 	int cpu, inc;
1027 
1028 	/*
1029 	 * Set cpus requiring performance to the userland process
1030 	 * scheduler.  Leave the rest of cpus unmapped.
1031 	 */
1032 	old_usched_used = usched_cpu_used;
1033 	usched_cpu_used = cpu_used;
1034 	if (CPUMASK_TESTZERO(usched_cpu_used))
1035 		CPUMASK_ORBIT(usched_cpu_used, 0);
1036 	if (CPUMASK_CMPMASKNEQ(usched_cpu_used, old_usched_used))
1037 		set_uschedcpus();
1038 
1039 	/*
1040 	 * Adjust per-cpu performance.
1041 	 */
1042 	CPUMASK_XORMASK(xcpu_used, cpu_used);
1043 	while (CPUMASK_TESTNZERO(xcpu_used)) {
1044 		cpu = BSFCPUMASK(xcpu_used);
1045 		CPUMASK_NANDBIT(xcpu_used, cpu);
1046 
1047 		if (CPUMASK_TESTBIT(cpu_used, cpu)) {
1048 			/* Increase cpu performance */
1049 			inc = 1;
1050 		} else {
1051 			/* Decrease cpu performance */
1052 			inc = 0;
1053 		}
1054 		adj_cpu_perf(cpu, inc);
1055 	}
1056 
1057 	/*
1058 	 * Adjust cpu power domain performance.  This could affect
1059 	 * a set of cpus.
1060 	 */
1061 	CPUMASK_XORMASK(xcpu_pwrdom_used, cpu_pwrdom_used);
1062 	while (CPUMASK_TESTNZERO(xcpu_pwrdom_used)) {
1063 		int dom;
1064 
1065 		dom = BSFCPUMASK(xcpu_pwrdom_used);
1066 		CPUMASK_NANDBIT(xcpu_pwrdom_used, dom);
1067 
1068 		if (CPUMASK_TESTBIT(cpu_pwrdom_used, dom)) {
1069 			/* Increase cpu power domain performance */
1070 			inc = 1;
1071 		} else {
1072 			/* Decrease cpu power domain performance */
1073 			inc = 0;
1074 		}
1075 		adj_cpu_pwrdom(dom, inc);
1076 	}
1077 }
1078 
1079 static void
1080 restore_perf(void)
1081 {
1082 	cpumask_t ocpu_used, ocpu_pwrdom_used;
1083 
1084 	ocpu_used = cpu_used;
1085 	ocpu_pwrdom_used = cpu_pwrdom_used;
1086 
1087 	/* Max out all cpus and cpu power domains performance */
1088 	CPUMASK_ASSBMASK(cpu_used, NCpus);
1089 	cpu_pwrdom_used = cpu_pwrdom_mask;
1090 
1091 	adj_perf(ocpu_used, ocpu_pwrdom_used);
1092 
1093 	if (AdjustCstate) {
1094 		/*
1095 		 * Restore the original mwait C-state
1096 		 */
1097 		if (DebugOpt)
1098 			printf("global set cstate %s\n", orig_global_cx);
1099 		sysctlbyname("machdep.mwait.CX.idle", NULL, NULL,
1100 		    orig_global_cx, strlen(orig_global_cx) + 1);
1101 	}
1102 }
1103 
1104 static int
1105 probe_cstate(void)
1106 {
1107 	char cx_supported[1024];
1108 	const char *target;
1109 	char *ptr;
1110 	int idle_hlt, deep = 1;
1111 	size_t len;
1112 
1113 	len = sizeof(idle_hlt);
1114 	if (sysctlbyname("machdep.cpu_idle_hlt", &idle_hlt, &len, NULL, 0) < 0)
1115 		return 0;
1116 	if (idle_hlt != 1)
1117 		return 0;
1118 
1119 	len = sizeof(cx_supported);
1120 	if (sysctlbyname("machdep.mwait.CX.supported", cx_supported, &len,
1121 	    NULL, 0) < 0)
1122 		return 0;
1123 
1124 	len = sizeof(orig_global_cx);
1125 	if (sysctlbyname("machdep.mwait.CX.idle", orig_global_cx, &len,
1126 	    NULL, 0) < 0)
1127 		return 0;
1128 
1129 	strlcpy(cpu_perf_cx, "AUTODEEP", sizeof(cpu_perf_cx));
1130 	cpu_perf_cxlen = strlen(cpu_perf_cx) + 1;
1131 	if (sysctlbyname("machdep.mwait.CX.idle", NULL, NULL,
1132 	    cpu_perf_cx, cpu_perf_cxlen) < 0) {
1133 		/* AUTODEEP is not supported; try AUTO */
1134 		deep = 0;
1135 		strlcpy(cpu_perf_cx, "AUTO", sizeof(cpu_perf_cx));
1136 		cpu_perf_cxlen = strlen(cpu_perf_cx) + 1;
1137 		if (sysctlbyname("machdep.mwait.CX.idle", NULL, NULL,
1138 		    cpu_perf_cx, cpu_perf_cxlen) < 0)
1139 			return 0;
1140 	}
1141 
1142 	if (!deep)
1143 		target = "C2/0";
1144 	else
1145 		target = NULL;
1146 	for (ptr = strtok(cx_supported, " "); ptr != NULL;
1147 	     ptr = strtok(NULL, " ")) {
1148 		if (target == NULL ||
1149 		    (target != NULL && strcmp(ptr, target) == 0)) {
1150 			strlcpy(cpu_idle_cx, ptr, sizeof(cpu_idle_cx));
1151 			cpu_idle_cxlen = strlen(cpu_idle_cx) + 1;
1152 			if (target != NULL)
1153 				break;
1154 		}
1155 	}
1156 	if (cpu_idle_cxlen == 0)
1157 		return 0;
1158 
1159 	if (DebugOpt) {
1160 		printf("cstate orig %s, perf %s, idle %s\n",
1161 		    orig_global_cx, cpu_perf_cx, cpu_idle_cx);
1162 	}
1163 	return 1;
1164 }
1165 
1166 static void
1167 set_cstate(int cpu, int inc)
1168 {
1169 	const char *cst;
1170 	char sysid[64];
1171 	size_t len;
1172 
1173 	if (inc) {
1174 		cst = cpu_perf_cx;
1175 		len = cpu_perf_cxlen;
1176 	} else {
1177 		cst = cpu_idle_cx;
1178 		len = cpu_idle_cxlen;
1179 	}
1180 
1181 	if (DebugOpt)
1182 		printf("cpu%d set cstate %s\n", cpu, cst);
1183 	snprintf(sysid, sizeof(sysid), "machdep.mwait.CX.idle%d", cpu);
1184 	sysctlbyname(sysid, NULL, NULL, cst, len);
1185 }
1186