# xref: /spdk/test/scheduler/cgroups.sh (revision 79c52a64269385b9a8b53c2fe03e2343160d38b3)
1#  SPDX-License-Identifier: BSD-3-Clause
2#  Copyright (C) 2021 Intel Corporation.
3#  All rights reserved.
4
# Detect which cgroup hierarchy is available under $sysfs_cgroup.
# Globals:  sysfs_cgroup (read)
# Outputs:  "2" when cgroup.controllers exists and lists cpuset, "1" when the
#           cpuset/tasks interface exists instead, nothing when neither
#           interface is present.
# Returns:  1 when cgroup.controllers exists but does not list cpuset (or the
#           version can't be printed), 0 otherwise - including the case where
#           no known interface was found.
check_cgroup() {
	local version

	# Work with both cgroup-v1 and cgroup-v2: each is recognized by an
	# interface file specific to it.
	if [[ -e $sysfs_cgroup/cgroup.controllers ]]; then
		# cgroup2 is mounted, but it's of use only with the cpuset controller
		[[ $(< "$sysfs_cgroup/cgroup.controllers") == *cpuset* ]] || return 1
		version=2
	elif [[ -e $sysfs_cgroup/cpuset/tasks ]]; then
		# cgroup's cpuset subsystem is mounted
		version=1
	else
		# Nothing recognized: succeed without reporting a version
		return 0
	fi

	echo "$version" || return 1
}
16
# Make sure the current environment ends up inside the /cpuset cgroup.
# Globals:  cgroup_version (read)
# Returns:  status of the last cgroup operation performed, 0 when
#           cgroup_version is neither 1 nor 2.
# Note:     the /proc/+([0-9]) pattern below needs extglob to be enabled
#           before this file is parsed.
init_cpuset_cgroup() {
	local cgroup pid
	local -A cgroups=()

	# With cgroup-v1 the cpuset hierarchy is already there, just join it
	if ((cgroup_version == 1)); then
		set_cgroup_attr /cpuset cgroup.procs "$$"
		return
	fi

	((cgroup_version == 2)) || return 0

	# For cgroup-v2 the cpuset subsystem has to be prepared by hand
	set_cgroup_attr / cgroup.subtree_control "+cpuset"
	create_cgroup /cpuset
	set_cgroup_attr /cpuset cgroup.subtree_control "+cpuset"

	# On distros which use cgroup-v2 under systemd, each process is kept
	# under a separate, pre-configured subtree. Together with the "internal
	# processes are not permitted" rule this means we are not sitting under
	# the subsystem's root but at the bottom of the cgroup maintaining the
	# user's session. To recreate the simple /cpuset setup known from v1,
	# collect every cgroup currently in use, move its threads to the top
	# cgroup / and then migrate all of them to the /cpuset created above.
	for pid in /proc/+([0-9]); do
		# Processes may vanish while we scan; the root cgroup needs no move
		if cgroup=$(get_cgroup "${pid##*/}") && [[ $cgroup != / ]]; then
			cgroups["$cgroup"]=$cgroup
		fi
	done 2> /dev/null

	for cgroup in "${!cgroups[@]}"; do
		move_cgroup_procs "$cgroup" /
	done

	# Now, move all the threads to the cpuset
	move_cgroup_procs / /cpuset
}
47
# Tell whether the given cgroup is of the "threaded" type.
# Globals:   sysfs_cgroup (read)
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
# Returns:   0 when the cgroup.type file exists and reads exactly "threaded",
#            1 otherwise
is_cgroup_threaded() {
	local type_file=$sysfs_cgroup/$1/cgroup.type

	if [[ ! -e $type_file ]]; then
		return 1
	fi
	[[ $(< "$type_file") == threaded ]]
}
52
# Migrate everything listed in one cgroup into another one.
# Globals:   sysfs_cgroup, cgroup_version (read)
# Arguments: $1 - cgroup to drain, $2 - cgroup to populate (both relative
#            to $sysfs_cgroup)
# Outputs:   summary of moved/failed entries on stderr
# Returns:   0 right away when either cgroup doesn't exist
move_cgroup_procs() {
	local old_cgroup=$1 new_cgroup=$2
	local old_proc_interface=cgroup.procs new_proc_interface=cgroup.procs
	local proc procs

	# If either of the cgroups doesn't exist then there's nothing to do.
	if [[ ! -e $sysfs_cgroup/$old_cgroup || ! -e $sysfs_cgroup/$new_cgroup ]]; then
		return 0
	fi

	if ((cgroup_version == 2)); then
		# Threaded cgroups are accessed through cgroup.threads instead
		! is_cgroup_threaded "$new_cgroup" || new_proc_interface=cgroup.threads
		! is_cgroup_threaded "$old_cgroup" || old_proc_interface=cgroup.threads
	fi

	# NOTE(review): fold_list_onto_array is defined outside of this file;
	# presumably it stores each listed id in procs[] under an index equal
	# to the id itself - confirm against its definition.
	fold_list_onto_array procs $(< "$sysfs_cgroup/$old_cgroup/$old_proc_interface")

	local moved=0
	for proc in "${!procs[@]}"; do
		# Not every kernel thread can be moved around and any process may
		# exit at any point, so failed writes are tolerated here. They are
		# only accounted for in the summary below for debugging purposes.
		move_proc "$proc" "$new_cgroup" "$old_cgroup" "$new_proc_interface" \
			&& moved=$((moved + 1))
	done
	echo "Moved $moved processes, failed $((${#procs[@]} - moved))" >&2
}
87
# Move a single process (or thread) into a cgroup, logging the attempt.
# Globals:   SILENT_CGROUP_DEBUG (read) - when non-empty no messages are logged
# Arguments: $1 - id to move
#            $2 - target cgroup
#            $3 - cgroup the id comes from, used for logging only (default: N/A)
#            $4 - interface file of the target cgroup to write the id to
# Outputs:   progress and failure messages on stderr
# Returns:   0 when the id was written to the target cgroup, 1 otherwise
move_proc() {
	local proc=$1 new_cgroup=$2 old_cgroup=${3:-N/A} attr=$4 write_fail

	# Messages go to the already open stderr descriptor (>&2) instead of
	# re-opening /dev/stderr by path. Opening the path truncates stderr when
	# it is a regular file (e.g. a log) and fails when it is a socket.
	if [[ -z ${SILENT_CGROUP_DEBUG:-} ]]; then
		echo "Moving $proc ($(id_proc "$proc" 2>&1)) to $new_cgroup from $old_cgroup" >&2
	fi

	if ! write_fail=$(set_cgroup_attr "$new_cgroup" "$attr" "$proc" 2>&1); then
		if [[ -z ${SILENT_CGROUP_DEBUG:-} ]]; then
			# Keep only the trailing part of the error, i.e. its reason
			echo "Moving $proc failed: ${write_fail##*: }" >&2
		fi
		return 1
	fi
}
99
# Write a value to an interface file of a cgroup.
# Globals:   sysfs_cgroup (read)
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
#            $2 - name of the interface file
#            $3 - value to write; nothing is written when it's empty
# Returns:   1 when the interface file doesn't exist, otherwise 0 or the
#            status of the write
set_cgroup_attr() {
	local cgroup=$1 attr=$2 val=$3
	local attr_file=$sysfs_cgroup/$cgroup/$attr

	if [[ ! -e $attr_file ]]; then
		return 1
	fi

	[[ -z $val ]] || echo "$val" > "$attr_file"
}
111
# Create a cgroup unless it is already there.
# Globals:   sysfs_cgroup, cgroup_version (read)
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
# Returns:   0 when the cgroup already exists; otherwise the status of the
#            last step performed
create_cgroup() {
	local cgroup_dir=$sysfs_cgroup/$1

	if [[ -e $cgroup_dir ]]; then
		return 0
	fi

	mkdir "$cgroup_dir"
	# Under cgroup-v2 the freshly created cgroup is switched to threaded type
	((cgroup_version != 2)) || echo "threaded" > "$cgroup_dir/cgroup.type"
}
119
# Remove a cgroup after handing its processes over to the parent cgroup.
# Globals:   sysfs_cgroup (read)
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
# Returns:   0 when the cgroup doesn't exist, status of rmdir otherwise
remove_cgroup() {
	local root_cgroup

	if [[ ! -e $sysfs_cgroup/$1 ]]; then
		return 0
	fi

	# The cgroup directory can't be removed while it's still populated
	root_cgroup=$(dirname "$1")
	move_cgroup_procs "$1" "$root_cgroup"
	rmdir "$sysfs_cgroup/$1"
}
128
# Join a cgroup and replace the current shell with the given command.
# Globals:   cgroup_version (read)
# Arguments: $1 - cgroup to join, remaining arguments - command to execute
# Returns:   1 when called without any argument; doesn't return once the
#            command got executed
exec_in_cgroup() {
	# Meant to be run as a background job. It is kept as a {} function,
	# rather than being declared as a subshell, so no extra bash fork is
	# left hanging around - note the exec call at the end.

	local cgroup=$1 proc_interface

	shift || return 1

	if ((cgroup_version == 2)) && is_cgroup_threaded "$cgroup"; then
		proc_interface=cgroup.threads
	else
		proc_interface=cgroup.procs
	fi

	# $BASHPID, not $$ - this may be running inside of a subshell
	set_cgroup_attr "$cgroup" "$proc_interface" "$BASHPID"
	exec "$@"
}
145
# Signal (plain kill) processes which belong to the given cgroup.
# Globals:   sysfs_cgroup, cgroup_version (read)
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
#            $2 - optional pid; when given only this pid is signalled and
#                 only if it is found in the cgroup, otherwise everything
#                 found in the cgroup is signalled
# Returns:   0 when there was nothing to signal, status of kill otherwise
kill_in_cgroup() {
	local cgroup=$1 pid=$2
	local proc_interface=cgroup.procs
	local cgroup_pids

	if ((cgroup_version == 2)) && is_cgroup_threaded "$cgroup"; then
		proc_interface=cgroup.threads
	fi

	# NOTE(review): fold_list_onto_array is defined outside of this file;
	# the lookup below assumes it stores each id under an index equal to
	# the id itself - confirm against its definition.
	fold_list_onto_array cgroup_pids $(< "$sysfs_cgroup/$cgroup/$proc_interface")

	if [[ -z $pid ]]; then
		# No specific pid requested, go after the whole cgroup
		if ((${#cgroup_pids[@]} > 0)); then
			kill "${cgroup_pids[@]}"
		fi
	elif [[ -n ${cgroup_pids[pid]} ]]; then
		kill "$pid"
	fi
}
168
# Tear down the /cpuset cgroup; this is done only under cgroup-v2.
# Globals:  cgroup_version (read)
# Returns:  0 when there's nothing to do, status of remove_cgroup otherwise
remove_cpuset_cgroup() {
	((cgroup_version != 2)) || remove_cgroup /cpuset
}
174
# Print the cgroup a process belongs to, as found in /proc/<pid>/cgroup.
# Arguments: $1 - pid to look up (default: self)
# Outputs:   whatever follows the last ':' in the cgroup file
# Returns:   1 when the process has no cgroup file
get_cgroup() {
	local pid=${1:-self} cgroup
	local cgroup_file=/proc/$pid/cgroup

	if [[ ! -e $cgroup_file ]]; then
		return 1
	fi

	cgroup=$(< "$cgroup_file")
	# Strip everything up to, and including, the last colon
	echo "${cgroup##*:}"
}
182
# Print the path of a process' cgroup under $sysfs_cgroup.
# Globals:   sysfs_cgroup (read)
# Arguments: $1 - pid, handed over to get_cgroup as is
# Returns:   1 when get_cgroup fails
get_cgroup_path() {
	local cgroup

	if ! cgroup=$(get_cgroup "$1"); then
		return 1
	fi
	echo "$sysfs_cgroup$cgroup"
}
189
# Write a value to the same attribute of a cgroup and of its ancestors,
# starting with the topmost ancestor and finishing with the cgroup itself.
# Arguments: $1 - full path to the cgroup directory
#            $2 - name of the attribute file
#            $3 - value to write
# The climb up stops at the first parent directory which has no such
# attribute. A directory is written to only if the attribute exists in it at
# the time of the write.
_set_cgroup_attr_top_bottom() {
	local cgroup_path=$1 attr=$2 val=$3
	local parent_path=${cgroup_path%/*}

	# Take care of the ancestors first so the writes go top to bottom
	[[ ! -e $parent_path/$attr ]] \
		|| _set_cgroup_attr_top_bottom "$parent_path" "$attr" "$val"

	[[ ! -e $cgroup_path/$attr ]] || echo "$val" > "$cgroup_path/$attr"
}
201
# Set an attribute across the cgroup chain a given process belongs to.
# Arguments: $1 - pid used to look up the cgroup path
#            $2 - name of the attribute file
#            $3 - value to write
set_cgroup_attr_top_bottom() {
	local pid=$1 attr=$2 val=$3

	_set_cgroup_attr_top_bottom "$(get_cgroup_path "$pid")" "$attr" "$val"
}
205
# Describe a process by the PF_* flags found in its /proc/<pid>/stat entry.
# Arguments: $1 - pid to inspect
#            $2 - name of a single PF_* flag to look for (default: all)
# Outputs:   comma-separated names of the matching flags which are set.
#            Note they are written to stderr, so a caller interested in the
#            text has to capture it with 2>&1.
# Returns:   0 when at least one of the requested flags is set, 1 otherwise
#            or when the stat file is missing or can't be read
id_proc() {
	local pid=$1 flag_to_check=${2:-all}
	# flag is the loop variable below - kept local so calling this function
	# doesn't clobber a variable of the same name in the caller
	local flag flags flags_map=() comm stats tflags

	[[ -e /proc/$pid/stat ]] || return 1
	# Comm is wrapped in () but the name of the thread itself may include "()", giving in result
	# something similar to: ((sd-pam))
	comm=$(< "/proc/$pid/stat") || return 1

	# Drop everything up to the last ") " so the fields following comm can
	# be split on whitespace; the flags are the 7th of the remaining fields.
	stats=(${comm/*) /})
	tflags=${stats[6]}

	# include/linux/sched.h
	flags_map[0x1]=PF_VCPU
	flags_map[0x2]=PF_IDLE
	flags_map[0x4]=PF_EXITING
	flags_map[0x8]=PF_POSTCOREDUMP
	flags_map[0x10]=PF_IO_WORKER
	flags_map[0x20]=PF_WQ_WORKER
	flags_map[0x40]=PF_FORK_NO_EXEC
	flags_map[0x80]=PF_MCE_PROCESS
	flags_map[0x100]=PF_SUPERPRIV
	flags_map[0x200]=PF_DUMPCORE
	flags_map[0x400]=PF_SIGNALED
	flags_map[0x800]=PF_MEMALLOC
	flags_map[0x1000]=PF_NPROC_EXCEEDED
	flags_map[0x2000]=PF_USED_MATH
	flags_map[0x4000]=PF_USER_WORKER
	flags_map[0x8000]=PF_NOFREEZE
	flags_map[0x20000]=PF_KSWAPD
	flags_map[0x40000]=PF_MEMALLOC_NOFS
	flags_map[0x80000]=PF_MEMALLOC_NOIO
	flags_map[0x100000]=PF_LOCAL_THROTTLE
	flags_map[0x00200000]=PF_KTHREAD
	flags_map[0x00400000]=PF_RANDOMIZE
	flags_map[0x04000000]=PF_NO_SETAFFINITY
	flags_map[0x08000000]=PF_MCE_EARLY
	flags_map[0x10000000]=PF_MEMALLOC_PIN
	flags_map[0x80000000]=PF_SUSPEND_TASK

	# The indexes of flags_map are the bit masks themselves
	for flag in "${!flags_map[@]}"; do
		[[ $flag_to_check == "${flags_map[flag]}" || $flag_to_check == all ]] || continue
		((tflags & flag)) && flags=${flags:+$flags,}"${flags_map[flag]}"
	done
	if [[ -n $flags ]]; then
		echo "$flags" >&2
		return 0
	fi
	return 1
}
255
# Root of the cgroup filesystem every helper above works against. Kept
# read-only so none of them can redirect it by accident.
declare -r sysfs_cgroup="/sys/fs/cgroup"
# Version of the cgroup hierarchy in use as reported by check_cgroup: 1, 2
# or empty when no supported interface was found.
cgroup_version="$(check_cgroup)"
258