xref: /spdk/test/scheduler/cgroups.sh (revision 1e3d25b901a6b9d2dce4999e2ecbc02f98d79f05)
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2021 Intel Corporation.
#  All rights reserved.
4
# Detect which cgroup hierarchy is available under $sysfs_cgroup.
# Globals:   sysfs_cgroup (read)
# Outputs:   "2" when cgroup-v2 is mounted and lists the cpuset controller,
#            "1" when the cgroup-v1 cpuset subsystem is mounted.
# Returns:   0 when a version was printed, 1 otherwise (nothing is printed).
check_cgroup() {
	# Try to work with both, cgroup-v1 and cgroup-v2. Verify which version is
	# in use by looking up interfaces common for either of the versions.
	if [[ -e $sysfs_cgroup/cgroup.controllers ]]; then
		# cgroup2 is mounted, check if cpuset controller is available
		[[ $(< "$sysfs_cgroup/cgroup.controllers") == *cpuset* ]] || return 1
		echo 2
	elif [[ -e $sysfs_cgroup/cpuset/tasks ]]; then
		# cgroup's cpuset subsystem is mounted
		echo 1
	else
		# Neither interface is present. An if-statement which executes no
		# branch yields 0, so the failure has to be reported explicitly.
		return 1
	fi
}
16
# Prepare the /cpuset cgroup and place the running processes in it.
# Globals:   cgroup_version (read)
# Calls:     set_cgroup_attr, create_cgroup, get_cgroup, move_cgroup_procs
# For cgroup-v1 only the current shell ($$) is written to /cpuset.
init_cpuset_cgroup() {
	local cgroup pid
	local -A cgroups=()

	# For cgroup-v2 we need to prepare cpuset subsystem on our own
	if ((cgroup_version == 2)); then
		set_cgroup_attr / cgroup.subtree_control "+cpuset"
		create_cgroup /cpuset
		set_cgroup_attr /cpuset cgroup.subtree_control "+cpuset"
		# On distros which use cgroup-v2 under systemd, each process is
		# maintained under separate, pre-configured subtree. With the rule of
		# "internal processes are not permitted" this means that we won't find
		# ourselves under subsystem's root, rather on the bottom of the cgroup
		# maintaining user's session. To recreate the simple /cpuset setup from
		# v1, move all the threads from all the existing cgroups to the top
		# cgroup / and then migrate it to the /cpuset we created above.
		#
		# A plain glob plus a numeric filter is used here rather than the
		# extglob +([0-9]) pattern so this function parses regardless of
		# whether the sourcing script enabled extglob.
		for pid in /proc/[0-9]*; do
			pid=${pid##*/}
			[[ $pid != *[!0-9]* ]] || continue
			cgroup=$(get_cgroup "$pid") || continue
			# An empty result (e.g. the process went away while being
			# read) can't be used as an array key, skip it as well.
			[[ -n $cgroup && $cgroup != / ]] || continue
			cgroups["$cgroup"]=$cgroup
		done 2> /dev/null
		for cgroup in "${!cgroups[@]}"; do
			move_cgroup_procs "$cgroup" /
		done
		# Now, move all the threads to the cpuset
		move_cgroup_procs / /cpuset
	elif ((cgroup_version == 1)); then
		set_cgroup_attr /cpuset cgroup.procs "$$"
	fi
}
47
# Tell whether the cgroup given as $1 (relative to $sysfs_cgroup) reports
# "threaded" in its cgroup.type file. Returns 1 if the file doesn't exist.
is_cgroup_threaded() {
	local type_file=$sysfs_cgroup/$1/cgroup.type

	if [[ ! -e $type_file ]]; then
		return 1
	fi
	[[ $(< "$type_file") == threaded ]]
}
52
# Migrate every process (or thread) listed in cgroup $1 to cgroup $2.
# Globals:   sysfs_cgroup, cgroup_version (read)
# Calls:     is_cgroup_threaded, fold_list_onto_array, move_proc
# Outputs:   summary of moved/failed entries on stderr
move_cgroup_procs() {
	local old_cgroup=$1 new_cgroup=$2
	local old_proc_interface=cgroup.procs new_proc_interface=cgroup.procs
	local proc procs
	local moved=0

	# If target cgroups don't exist then there's nothing to do.
	if [[ ! -e $sysfs_cgroup/$old_cgroup || ! -e $sysfs_cgroup/$new_cgroup ]]; then
		return 0
	fi

	# Under cgroup-v2 a threaded cgroup is accessed via cgroup.threads.
	if ((cgroup_version == 2)); then
		is_cgroup_threaded "$new_cgroup" && new_proc_interface=cgroup.threads
		is_cgroup_threaded "$old_cgroup" && old_proc_interface=cgroup.threads
	fi

	# Word splitting of the list is intended - each id becomes an argument.
	fold_list_onto_array procs $(< "$sysfs_cgroup/$old_cgroup/$old_proc_interface")

	for proc in "${!procs[@]}"; do
		# Kernel threads may refuse to be moved and any process may exit
		# at any time, hence failures are merely counted, never fatal.
		move_proc "$proc" "$new_cgroup" "$old_cgroup" "$new_proc_interface" || continue
		moved=$((moved + 1))
	done
	echo "Moved $moved processes, failed $((${#procs[@]} - moved))" >&2
}
87
# Write a single process/thread id into the given interface of a cgroup.
# Arguments: $1 - id to move, $2 - destination cgroup,
#            $3 - source cgroup (used for logging only, defaults to N/A),
#            $4 - interface file to write the id to
# Outputs:   progress and failure reason on stderr
# Returns:   0 when the write succeeded, 1 otherwise
move_proc() {
	local proc=$1 new_cgroup=$2 old_cgroup=${3:-N/A} attr=$4
	local write_fail

	echo "Moving $proc ($(id_proc "$proc" 2>&1)) to $new_cgroup from $old_cgroup" >&2

	write_fail=$(set_cgroup_attr "$new_cgroup" "$attr" "$proc" 2>&1) && return 0

	# Report just the trailing part of the captured error message.
	echo "Moving $proc failed: ${write_fail##*: }" >&2
	return 1
}
97
# Write value $3 to attribute $2 of cgroup $1 (relative to $sysfs_cgroup).
# Returns 1 when the attribute doesn't exist. An empty value is not written.
set_cgroup_attr() {
	local target=$sysfs_cgroup/$1/$2
	local value=$3

	[[ -e $target ]] || return 1

	if [[ -z $value ]]; then
		return 0
	fi
	echo "$value" > "$target"
}
109
# Create cgroup $1 under $sysfs_cgroup unless it's already there. Under
# cgroup-v2 the new cgroup's cgroup.type is set to "threaded".
create_cgroup() {
	local cgroup_dir=$sysfs_cgroup/$1

	if [[ -e $cgroup_dir ]]; then
		return 0
	fi

	mkdir "$cgroup_dir"
	((cgroup_version == 2)) || return 0
	echo "threaded" > "$cgroup_dir/cgroup.type"
}
117
# Remove cgroup $1: first hand its processes over to the parent cgroup
# (dirname of $1), then delete the directory. No-op if $1 doesn't exist.
remove_cgroup() {
	local root_cgroup

	root_cgroup=$(dirname "$1")

	if [[ ! -e $sysfs_cgroup/$1 ]]; then
		return 0
	fi

	move_cgroup_procs "$1" "$root_cgroup"
	rmdir "$sysfs_cgroup/$1"
}
126
# Place the current shell ($BASHPID) in cgroup $1 and replace it with the
# command given by the remaining arguments.
# Meant to be started as a background job. It's kept as a {} function rather
# than a subshell one so no extra bash fork sticks around - see the exec.
# Returns 1 when called without any arguments.
exec_in_cgroup() {
	local cgroup=$1
	local proc_interface

	shift || return 1

	proc_interface=cgroup.procs
	if ((cgroup_version == 2)); then
		if is_cgroup_threaded "$cgroup"; then
			proc_interface=cgroup.threads
		fi
	fi

	set_cgroup_attr "$cgroup" "$proc_interface" "$BASHPID"
	exec "$@"
}
143
# Signal processes which belong to cgroup $1. With pid $2 given, only that
# pid is killed and only if the cgroup lists it; without it, every listed
# pid is killed.
# Globals:   sysfs_cgroup, cgroup_version (read)
# Calls:     is_cgroup_threaded, fold_list_onto_array
kill_in_cgroup() {
	local cgroup=$1 pid=$2
	local proc_interface=cgroup.procs
	local cgroup_pids

	if ((cgroup_version == 2)) && is_cgroup_threaded "$cgroup"; then
		proc_interface=cgroup.threads
	fi

	# Word splitting of the list is intended - each id becomes an argument.
	fold_list_onto_array cgroup_pids $(< "$sysfs_cgroup/$cgroup/$proc_interface")

	if [[ -z $pid ]]; then
		if ((${#cgroup_pids[@]} > 0)); then
			kill "${cgroup_pids[@]}"
		fi
	elif [[ -n ${cgroup_pids[pid]} ]]; then
		kill "$pid"
	fi
}
166
# Tear down the /cpuset cgroup. Only done for cgroup-v2, otherwise no-op.
remove_cpuset_cgroup() {
	((cgroup_version == 2)) || return 0
	remove_cgroup /cpuset
}
172
# Print the cgroup of the given pid (defaults to "self"), i.e. whatever
# follows the last ":" in /proc/<pid>/cgroup.
# Returns 1 when /proc/<pid>/cgroup doesn't exist.
get_cgroup() {
	local pid=${1:-self}
	local cgroup_file=/proc/$pid/cgroup
	local cgroup

	if [[ ! -e $cgroup_file ]]; then
		return 1
	fi

	cgroup=$(< "$cgroup_file")
	echo "${cgroup##*:}"
}
180
# Print the path under $sysfs_cgroup of the cgroup which pid $1 belongs to.
# Returns 1 when the cgroup of the pid can't be determined.
get_cgroup_path() {
	local cgroup

	if ! cgroup=$(get_cgroup "$1"); then
		return 1
	fi
	echo "$sysfs_cgroup$cgroup"
}
187
# Write value $3 to attribute $2 of the cgroup directory $1 and of each of
# its consecutive ancestors exposing that attribute, starting from the
# topmost such ancestor and finishing at $1 itself.
# Arguments: $1 - path of the cgroup directory
#            $2 - name of the attribute file
#            $3 - value to write
_set_cgroup_attr_top_bottom() {
	local cgroup_path=$1 attr=$2 val=$3
	local parent=${cgroup_path%/*}

	# ${cgroup_path%/*} yields the very same string when there's no "/"
	# left to strip (empty or single-component relative path). Recursing
	# in such a case would never terminate, so climb up only when the
	# parent really differs.
	if [[ $parent != "$cgroup_path" && -e $parent/$attr ]]; then
		_set_cgroup_attr_top_bottom "$parent" "$attr" "$val"
	fi

	if [[ -e $cgroup_path/$attr ]]; then
		echo "$val" > "$cgroup_path/$attr"
	fi
}
199
# Resolve the cgroup path of pid $1 and write value $3 to attribute $2
# along that path via _set_cgroup_attr_top_bottom().
set_cgroup_attr_top_bottom() {
	local cgroup_path

	# A failed lookup is not treated as an error here, whatever got
	# printed (possibly nothing) is handed over as the path.
	cgroup_path=$(get_cgroup_path "$1") || true
	_set_cgroup_attr_top_bottom "$cgroup_path" "$2" "$3"
}
203
# Decode the task flags of a process as read from /proc/<pid>/stat.
# Arguments: $1 - pid to look up
#            $2 - name of a single PF_* flag to check for; defaults to "all"
# Outputs:   comma-separated names of the matching flags which are set,
#            written to stderr
# Returns:   0 when at least one matching flag is set, 1 otherwise or when
#            the stat file can't be read
id_proc() {
	local pid=$1 flag_to_check=${2:-all}
	# The loop variable is declared local so it doesn't leak to the caller.
	local flag flags flags_map=() comm stats tflags

	[[ -e /proc/$pid/stat ]] || return 1
	# Comm is wrapped in () but the name of the thread itself may include "()", giving in result
	# something similar to: ((sd-pam))
	comm=$(< "/proc/$pid/stat") || return 1

	# Drop everything up to the last ") " so the array starts with the
	# field following comm - the flags are then the 7th element.
	stats=(${comm/*) /})
	tflags=${stats[6]}

	# include/linux/sched.h
	flags_map[0x1]=PF_VCPU
	flags_map[0x2]=PF_IDLE
	flags_map[0x4]=PF_EXITING
	flags_map[0x8]=PF_POSTCOREDUMP
	flags_map[0x10]=PF_IO_WORKER
	flags_map[0x20]=PF_WQ_WORKER
	flags_map[0x40]=PF_FORK_NO_EXEC
	flags_map[0x80]=PF_MCE_PROCESS
	flags_map[0x100]=PF_SUPERPRIV
	flags_map[0x200]=PF_DUMPCORE
	flags_map[0x400]=PF_SIGNALED
	flags_map[0x800]=PF_MEMALLOC
	flags_map[0x1000]=PF_NPROC_EXCEEDED
	flags_map[0x2000]=PF_USED_MATH
	flags_map[0x4000]=PF_USER_WORKER
	flags_map[0x8000]=PF_NOFREEZE
	flags_map[0x20000]=PF_KSWAPD
	flags_map[0x40000]=PF_MEMALLOC_NOFS
	flags_map[0x80000]=PF_MEMALLOC_NOIO
	flags_map[0x100000]=PF_LOCAL_THROTTLE
	flags_map[0x00200000]=PF_KTHREAD
	flags_map[0x00400000]=PF_RANDOMIZE
	flags_map[0x04000000]=PF_NO_SETAFFINITY
	flags_map[0x08000000]=PF_MCE_EARLY
	flags_map[0x10000000]=PF_MEMALLOC_PIN
	flags_map[0x80000000]=PF_SUSPEND_TASK

	# Indexes of the map are the flag bits, visited in ascending order.
	for flag in "${!flags_map[@]}"; do
		[[ $flag_to_check == "${flags_map[flag]}" || $flag_to_check == all ]] || continue
		((tflags & flag)) && flags=${flags:+$flags,}"${flags_map[flag]}"
	done
	if [[ -n $flags ]]; then
		echo "$flags" >&2
		return 0
	fi
	return 1
}
253
# Root of the cgroup filesystem, every helper in this file resolves the
# cgroups relative to it.
declare -r sysfs_cgroup="/sys/fs/cgroup"
# Version as reported by check_cgroup(); left empty if it printed nothing.
cgroup_version="$(check_cgroup)"
256