1 /*
2  * Copyright (c) 2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Sepherosa Ziehau <sepherosa@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/kernel.h>
38 #include <sys/lock.h>
39 #include <sys/malloc.h>
40 #include <sys/module.h>
41 #include <sys/sensors.h>
42 #include <sys/sysctl.h>
43 #include <sys/systm.h>
44 
45 #include <dev/misc/dimm/dimm.h>
46 
/*
 * Default temperature alarm thresholds assigned to every new DIMM; the
 * sysctl help text for these values gives the unit as C.
 */
#define DIMM_TEMP_HIWAT_DEFAULT	85
#define DIMM_TEMP_LOWAT_DEFAULT	75

/* Default ECC error count threshold assigned to every new DIMM. */
#define DIMM_ECC_THRESH_DEFAULT	5
51 
52 struct dimm_softc {
53 	TAILQ_ENTRY(dimm_softc) dimm_link;
54 	int			dimm_node;
55 	int			dimm_chan;
56 	int			dimm_slot;
57 	int			dimm_temp_hiwat;
58 	int			dimm_temp_lowat;
59 	int			dimm_id;
60 	int			dimm_ref;
61 	int			dimm_ecc_cnt;
62 	int			dimm_ecc_thresh;
63 
64 	struct ksensordev	dimm_sensdev;
65 	uint32_t		dimm_sens_taskflags;	/* DIMM_SENS_TF_ */
66 
67 	struct sysctl_ctx_list	dimm_sysctl_ctx;
68 	struct sysctl_oid	*dimm_sysctl_tree;
69 };
70 TAILQ_HEAD(dimm_softc_list, dimm_softc);
71 
/* Bits of dimm_softc.dimm_sens_taskflags: alarms that are currently raised. */
#define DIMM_SENS_TF_TEMP_CRIT		0x1
#define DIMM_SENS_TF_ECC_CRIT		0x2

static void	dimm_mod_unload(void);
static void	dimm_sensor_ecc(struct dimm_softc *, struct ksensor *,
		    boolean_t);

/*
 * All existing DIMMs, in the ascending order of dimm_softc.dimm_id.
 * dimm_create() and dimm_destroy() modify the list under SYSCTL_XLOCK.
 */
static struct dimm_softc_list	dimm_softc_list;

/* hw.dimminfo: parent of the per-DIMM sysctl subtrees. */
static SYSCTL_NODE(_hw, OID_AUTO, dimminfo, CTLFLAG_RD, NULL,
    "DIMM information");
84 
85 struct dimm_softc *
dimm_create(int node,int chan,int slot)86 dimm_create(int node, int chan, int slot)
87 {
88 	struct dimm_softc *sc, *after = NULL;
89 	int dimm_id = 0;
90 
91 	SYSCTL_XLOCK();
92 
93 	TAILQ_FOREACH(sc, &dimm_softc_list, dimm_link) {
94 		/*
95 		 * Already exists; done.
96 		 */
97 		if (sc->dimm_node == node && sc->dimm_chan == chan &&
98 		    sc->dimm_slot == slot) {
99 			KASSERT(sc->dimm_ref > 0, ("invalid dimm reference %d",
100 			    sc->dimm_ref));
101 			sc->dimm_ref++;
102 			SYSCTL_XUNLOCK();
103 			return sc;
104 		}
105 
106 		/*
107 		 * Find the lowest usable id.
108 		 */
109 		if (sc->dimm_id == dimm_id) {
110 			++dimm_id;
111 			after = sc;
112 		}
113 	}
114 
115 	sc = kmalloc(sizeof(*sc), M_DEVBUF, M_WAITOK | M_ZERO);
116 	sc->dimm_node = node;
117 	sc->dimm_chan = chan;
118 	sc->dimm_slot = slot;
119 	sc->dimm_id = dimm_id;
120 	sc->dimm_ref = 1;
121 	sc->dimm_temp_hiwat = DIMM_TEMP_HIWAT_DEFAULT;
122 	sc->dimm_temp_lowat = DIMM_TEMP_LOWAT_DEFAULT;
123 	sc->dimm_ecc_thresh = DIMM_ECC_THRESH_DEFAULT;
124 
125 	ksnprintf(sc->dimm_sensdev.xname, sizeof(sc->dimm_sensdev.xname),
126 	    "dimm%d", sc->dimm_id);
127 
128 	/*
129 	 * Create sysctl tree for the location information.  Use
130 	 * same name as the sensor device.
131 	 */
132 	sysctl_ctx_init(&sc->dimm_sysctl_ctx);
133 	sc->dimm_sysctl_tree = SYSCTL_ADD_NODE(&sc->dimm_sysctl_ctx,
134 	    SYSCTL_STATIC_CHILDREN(_hw_dimminfo), OID_AUTO,
135 	    sc->dimm_sensdev.xname, CTLFLAG_RD, 0, "");
136 	if (sc->dimm_sysctl_tree != NULL) {
137 		SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
138 		    SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
139 		    "node", CTLFLAG_RD, &sc->dimm_node, 0,
140 		    "CPU node of this DIMM");
141 		SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
142 		    SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
143 		    "chan", CTLFLAG_RD, &sc->dimm_chan, 0,
144 		    "channel of this DIMM");
145 		SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
146 		    SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
147 		    "slot", CTLFLAG_RD, &sc->dimm_slot, 0,
148 		    "slot of this DIMM");
149 		SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
150 		    SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
151 		    "temp_hiwat", CTLFLAG_RW, &sc->dimm_temp_hiwat, 0,
152 		    "Raise alarm once DIMM temperature is above this value "
153 		    "(unit: C)");
154 		SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
155 		    SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
156 		    "temp_lowat", CTLFLAG_RW, &sc->dimm_temp_lowat, 0,
157 		    "Cancel alarm once DIMM temperature is below this value "
158 		    "(unit: C)");
159 		SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
160 		    SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
161 		    "ecc_thresh", CTLFLAG_RW, &sc->dimm_ecc_thresh, 0,
162 		    "Raise alarm once number ECC errors go above this value");
163 	}
164 
165 	if (after == NULL) {
166 		KKASSERT(sc->dimm_id == 0);
167 		TAILQ_INSERT_HEAD(&dimm_softc_list, sc, dimm_link);
168 	} else {
169 		TAILQ_INSERT_AFTER(&dimm_softc_list, after, sc, dimm_link);
170 	}
171 
172 	sensordev_install(&sc->dimm_sensdev);
173 
174 	SYSCTL_XUNLOCK();
175 	return sc;
176 }
177 
178 int
dimm_destroy(struct dimm_softc * sc)179 dimm_destroy(struct dimm_softc *sc)
180 {
181 	SYSCTL_XLOCK();
182 
183 	KASSERT(sc->dimm_ref > 0, ("invalid dimm reference %d", sc->dimm_ref));
184 	sc->dimm_ref--;
185 	if (sc->dimm_ref > 0) {
186 		SYSCTL_XUNLOCK();
187 		return EAGAIN;
188 	}
189 
190 	sensordev_deinstall(&sc->dimm_sensdev);
191 
192 	TAILQ_REMOVE(&dimm_softc_list, sc, dimm_link);
193 	if (sc->dimm_sysctl_tree != NULL)
194 		sysctl_ctx_free(&sc->dimm_sysctl_ctx);
195 	kfree(sc, M_DEVBUF);
196 
197 	SYSCTL_XUNLOCK();
198 	return 0;
199 }
200 
201 void
dimm_sensor_attach(struct dimm_softc * sc,struct ksensor * sens)202 dimm_sensor_attach(struct dimm_softc *sc, struct ksensor *sens)
203 {
204 	sensor_attach(&sc->dimm_sensdev, sens);
205 }
206 
207 void
dimm_sensor_detach(struct dimm_softc * sc,struct ksensor * sens)208 dimm_sensor_detach(struct dimm_softc *sc, struct ksensor *sens)
209 {
210 	sensor_detach(&sc->dimm_sensdev, sens);
211 }
212 
213 void
dimm_set_temp_thresh(struct dimm_softc * sc,int hiwat,int lowat)214 dimm_set_temp_thresh(struct dimm_softc *sc, int hiwat, int lowat)
215 {
216 	sc->dimm_temp_hiwat = hiwat;
217 	sc->dimm_temp_lowat = lowat;
218 }
219 
/*
 * Replace the ECC error count threshold of a DIMM.  dimm_sensor_ecc()
 * raises the ECC alarm once the error count is at or above 'thresh'.
 */
void
dimm_set_ecc_thresh(struct dimm_softc *sc, int thresh)
{
	sc->dimm_ecc_thresh = thresh;
}
225 
226 void
dimm_sensor_temp(struct dimm_softc * sc,struct ksensor * sens,int temp)227 dimm_sensor_temp(struct dimm_softc *sc, struct ksensor *sens, int temp)
228 {
229 	enum sensor_status status;
230 
231 	if (temp >= sc->dimm_temp_hiwat &&
232 	    (sc->dimm_sens_taskflags & DIMM_SENS_TF_TEMP_CRIT) == 0) {
233 		char temp_str[16], data[64];
234 
235 		ksnprintf(temp_str, sizeof(temp_str), "%d", temp);
236 		ksnprintf(data, sizeof(data), "node=%d channel=%d dimm=%d",
237 		    sc->dimm_node, sc->dimm_chan, sc->dimm_slot);
238 		devctl_notify("memtemp", "Thermal", temp_str, data);
239 
240 		kprintf("dimm%d: node%d channel%d DIMM%d "
241 		    "temperature (%dC) is too high (>= %dC)\n",
242 		    sc->dimm_id, sc->dimm_node, sc->dimm_chan, sc->dimm_slot,
243 		    temp, sc->dimm_temp_hiwat);
244 
245 		sc->dimm_sens_taskflags |= DIMM_SENS_TF_TEMP_CRIT;
246 	} else if ((sc->dimm_sens_taskflags & DIMM_SENS_TF_TEMP_CRIT) &&
247 	     temp < sc->dimm_temp_lowat) {
248 		sc->dimm_sens_taskflags &= ~DIMM_SENS_TF_TEMP_CRIT;
249 	}
250 
251 	if (sc->dimm_sens_taskflags & DIMM_SENS_TF_TEMP_CRIT)
252 		status = SENSOR_S_CRIT;
253 	else
254 		status = SENSOR_S_OK;
255 	sensor_set_temp_degc(sens, temp, status);
256 }
257 
/*
 * Set the ECC error count of a DIMM to 'ecc_cnt', then update 'sens' and
 * the ECC alarm through dimm_sensor_ecc().  'crit' is passed on to
 * dimm_sensor_ecc(), where TRUE requests the alarm regardless of the
 * threshold.
 */
void
dimm_sensor_ecc_set(struct dimm_softc *sc, struct ksensor *sens,
    int ecc_cnt, boolean_t crit)
{
	/* The count must be stored first; dimm_sensor_ecc() reads it. */
	sc->dimm_ecc_cnt = ecc_cnt;
	dimm_sensor_ecc(sc, sens, crit);
}
265 
266 void
dimm_sensor_ecc_add(struct dimm_softc * sc,struct ksensor * sens,int ecc_cnt,boolean_t crit)267 dimm_sensor_ecc_add(struct dimm_softc *sc, struct ksensor *sens,
268     int ecc_cnt, boolean_t crit)
269 {
270 	sc->dimm_ecc_cnt += ecc_cnt;
271 	dimm_sensor_ecc(sc, sens, crit);
272 }
273 
274 static void
dimm_sensor_ecc(struct dimm_softc * sc,struct ksensor * sens,boolean_t crit)275 dimm_sensor_ecc(struct dimm_softc *sc, struct ksensor *sens, boolean_t crit)
276 {
277 	enum sensor_status status;
278 
279 	if (!crit && sc->dimm_ecc_cnt >= sc->dimm_ecc_thresh)
280 		crit = TRUE;
281 
282 	if (crit && (sc->dimm_sens_taskflags & DIMM_SENS_TF_ECC_CRIT) == 0) {
283 		char ecc_str[16], data[64];
284 
285 		ksnprintf(ecc_str, sizeof(ecc_str), "%d", sc->dimm_ecc_cnt);
286 		ksnprintf(data, sizeof(data), "node=%d channel=%d dimm=%d",
287 		    sc->dimm_node, sc->dimm_chan, sc->dimm_slot);
288 		devctl_notify("ecc", "ECC", ecc_str, data);
289 
290 		kprintf("dimm%d: node%d channel%d DIMM%d "
291 		    "too many ECC errors %d\n",
292 		    sc->dimm_id, sc->dimm_node, sc->dimm_chan, sc->dimm_slot,
293 		    sc->dimm_ecc_cnt);
294 
295 		sc->dimm_sens_taskflags |= DIMM_SENS_TF_ECC_CRIT;
296 	}
297 
298 	if (sc->dimm_sens_taskflags & DIMM_SENS_TF_ECC_CRIT)
299 		status = SENSOR_S_CRIT;
300 	else
301 		status = SENSOR_S_OK;
302 	sensor_set(sens, sc->dimm_ecc_cnt, status);
303 }
304 
305 static void
dimm_mod_unload(void)306 dimm_mod_unload(void)
307 {
308 	struct dimm_softc *sc;
309 
310 	SYSCTL_XLOCK();
311 
312 	while ((sc = TAILQ_FIRST(&dimm_softc_list)) != NULL) {
313 		int error;
314 
315 		error = dimm_destroy(sc);
316 		KASSERT(!error, ("dimm%d is still referenced, ref %d",
317 		    sc->dimm_id, sc->dimm_ref));
318 	}
319 
320 	SYSCTL_XUNLOCK();
321 }
322 
323 static int
dimm_mod_event(module_t mod,int type,void * unused)324 dimm_mod_event(module_t mod, int type, void *unused)
325 {
326 	switch (type) {
327 	case MOD_LOAD:
328 		TAILQ_INIT(&dimm_softc_list);
329 		return 0;
330 
331 	case MOD_UNLOAD:
332 		dimm_mod_unload();
333 		return 0;
334 
335 	default:
336 		return 0;
337 	}
338 }
339 
/*
 * Module glue: registers the module "dimm" with dimm_mod_event() as its
 * event handler and declares module version 1.
 */
static moduledata_t dimm_mod = {
	"dimm",
	dimm_mod_event,
	0
};
DECLARE_MODULE(dimm, dimm_mod, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY);
MODULE_VERSION(dimm, 1);
347