#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2021 Intel Corporation.
#  All rights reserved.

# Helpers for driving the cpuset controller through the cgroup filesystem.
# Globals used throughout:
#   sysfs_cgroup   - root of the cgroup filesystem (read-only, set below)
#   cgroup_version - output of check_cgroup(), set at the bottom of this file

# Detect which cgroup flavour exposes cpuset.
# Outputs: "2" when cgroup-v2 is mounted and lists the cpuset controller,
#          "1" when the cgroup-v1 cpuset hierarchy is mounted.
# Returns: 1 when cgroup-v2 is mounted but cpuset is not listed. When neither
#          interface exists nothing is printed and the status is 0.
check_cgroup() {
	# Try to work with both, cgroup-v1 and cgroup-v2. Verify which version is
	# in use by looking up interfaces common for either of the versions.
	if [[ -e $sysfs_cgroup/cgroup.controllers ]]; then
		# cgroup2 is mounted, check if cpuset controller is available
		[[ $(< "$sysfs_cgroup/cgroup.controllers") == *cpuset* ]] && echo 2
	elif [[ -e $sysfs_cgroup/cpuset/tasks ]]; then
		# cgroup's cpuset subsystem is mounted
		echo 1
	fi || return 1
}

# Prepare the /cpuset cgroup and place processes in it.
# Globals: cgroup_version (read)
# For v2 this creates /cpuset and migrates every process found under /proc
# into it; for v1 only the current shell ($$) is written to /cpuset.
init_cpuset_cgroup() {
	local cgroup pid
	local -A cgroups=()

	# For cgroup-v2 we need to prepare cpuset subsystem on our own
	if ((cgroup_version == 2)); then
		set_cgroup_attr / cgroup.subtree_control "+cpuset"
		create_cgroup /cpuset
		set_cgroup_attr /cpuset cgroup.subtree_control "+cpuset"
		# On distros which use cgroup-v2 under systemd, each process is
		# maintained under separate, pre-configured subtree. With the rule of
		# "internal processes are not permitted" this means that we won't find
		# ourselves under subsystem's root, rather on the bottom of the cgroup
		# maintaining user's session. To recreate the simple /cpuset setup from
		# v1, move all the threads from all the existing cgroups to the top
		# cgroup / and then migrate it to the /cpuset we created above.
		#
		# A plain glob plus a digits-only filter is used here so this file
		# parses regardless of whether the sourcing script enabled extglob.
		for pid in /proc/[0-9]*; do
			pid=${pid##*/}
			[[ $pid != *[!0-9]* ]] || continue
			cgroup=$(get_cgroup "$pid") || continue
			[[ $cgroup != / ]] || continue
			# The associative array de-duplicates the cgroup paths.
			cgroups["$cgroup"]=$cgroup
		done 2> /dev/null
		for cgroup in "${!cgroups[@]}"; do
			move_cgroup_procs "$cgroup" /
		done
		# Now, move all the threads to the cpuset
		move_cgroup_procs / /cpuset
	elif ((cgroup_version == 1)); then
		set_cgroup_attr /cpuset cgroup.procs "$$"
	fi
}

# Check whether a cgroup is of the "threaded" type.
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
# Returns:   0 if cgroup.type exists and reads "threaded", non-zero otherwise
is_cgroup_threaded() {
	[[ -e $sysfs_cgroup/$1/cgroup.type ]] || return 1
	[[ $(< "$sysfs_cgroup/$1/cgroup.type") == threaded ]]
}

# Move every process (or thread) listed in one cgroup to another.
# Arguments: $1 - source cgroup, $2 - destination cgroup
# Outputs:   a "Moved N processes, failed M" summary on stderr
# Returns:   0 immediately when either cgroup does not exist
move_cgroup_procs() {
	local old_cgroup=$1
	local new_cgroup=$2
	local proc procs old_proc_interface new_proc_interface

	# If target cgroups don't exist then there's nothing to do.
	[[ -e $sysfs_cgroup/$old_cgroup ]] || return 0
	[[ -e $sysfs_cgroup/$new_cgroup ]] || return 0

	old_proc_interface=cgroup.procs
	new_proc_interface=cgroup.procs
	if ((cgroup_version == 2)); then
		# Threaded cgroups are handled through cgroup.threads instead.
		if is_cgroup_threaded "$new_cgroup"; then
			new_proc_interface=cgroup.threads
		fi
		if is_cgroup_threaded "$old_cgroup"; then
			old_proc_interface=cgroup.threads
		fi
	fi

	# fold_list_onto_array is defined outside of this file; presumably it
	# stores each id under its own index, as the loop below walks the
	# indices of procs - confirm against its definition.
	# Word splitting of the id list is intentional here.
	fold_list_onto_array procs $(< "$sysfs_cgroup/$old_cgroup/$old_proc_interface")

	local moved=0
	for proc in "${!procs[@]}"; do
		# We can't move every kernel thread around and every process can
		# exit at any point so ignore any failures upon writing the
		# processes out but keep count of any failed attempts for debugging
		# purposes.
		if move_proc "$proc" "$new_cgroup" "$old_cgroup" "$new_proc_interface"; then
			((++moved))
		fi
	done
	echo "Moved $moved processes, failed $((${#procs[@]} - moved))" >&2
}

# Move a single process/thread into a cgroup, logging the attempt.
# Globals:   SILENT_CGROUP_DEBUG (read) - when non-empty, log lines are dropped
# Arguments: $1 - id to move, $2 - destination cgroup,
#            $3 - source cgroup (used for logging only, defaults to N/A),
#            $4 - attribute to write the id to (e.g. cgroup.procs)
# Returns:   1 when writing the attribute failed
move_proc() {
	local proc=$1 new_cgroup=$2 old_cgroup=${3:-N/A} attr=$4 write_fail out=/dev/stderr

	[[ -n $SILENT_CGROUP_DEBUG ]] && out=/dev/null

	# id_proc reports on stderr, hence the 2>&1 inside the substitution.
	echo "Moving $proc ($(id_proc "$proc" 2>&1)) to $new_cgroup from $old_cgroup" > "$out"
	if ! write_fail=$(set_cgroup_attr "$new_cgroup" "$attr" "$proc" 2>&1); then
		# Keep only the text following the last ": " of the error message.
		echo "Moving $proc failed: ${write_fail##*: }" > "$out"
		return 1
	fi
}

# Write a value to a cgroup attribute.
# Arguments: $1 - cgroup, $2 - attribute name, $3 - value (nothing is written
#            when empty)
# Returns:   1 when the attribute file does not exist, otherwise the status of
#            the write
set_cgroup_attr() {
	local cgroup=$1
	local attr=$2
	local val=$3

	[[ -e $sysfs_cgroup/$cgroup/$attr ]] || return 1

	if [[ -n $val ]]; then
		echo "$val" > "$sysfs_cgroup/$cgroup/$attr"
	fi
}

# Create a cgroup unless it already exists; under cgroup-v2 the new cgroup is
# switched to the "threaded" type.
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
create_cgroup() {
	[[ ! -e $sysfs_cgroup/$1 ]] || return 0
	mkdir "$sysfs_cgroup/$1"
	if ((cgroup_version == 2)); then
		echo "threaded" > "$sysfs_cgroup/$1/cgroup.type"
	fi
}

# Remove a cgroup after moving its members to the parent cgroup.
# Arguments: $1 - cgroup path relative to $sysfs_cgroup
# Returns:   0 when the cgroup does not exist, otherwise the status of rmdir
remove_cgroup() {
	local root_cgroup
	root_cgroup=$(dirname "$1")

	[[ -e $sysfs_cgroup/$1 ]] || return 0
	move_cgroup_procs "$1" "$root_cgroup"
	rmdir "$sysfs_cgroup/$1"
}

# Put the current process into a cgroup and exec the given command.
# Arguments: $1 - cgroup, remaining arguments - command line to exec
# Returns:   1 when called without any arguments
exec_in_cgroup() {
	# Run this function as a background job - the reason why it remains {} instead
	# of being declared as a subshell is to avoid having an extra bash fork around
	# - note the exec call.

	local cgroup=$1
	local proc_interface=cgroup.procs

	shift || return 1

	if ((cgroup_version == 2)) && is_cgroup_threaded "$cgroup"; then
		proc_interface=cgroup.threads
	fi
	set_cgroup_attr "$cgroup" "$proc_interface" "$BASHPID"
	exec "$@"
}

# Kill members of a cgroup.
# Arguments: $1 - cgroup,
#            $2 - optional pid; when given, only that pid is killed and only
#                 if it is listed in the cgroup. Without it every listed id
#                 is killed.
kill_in_cgroup() {
	local cgroup=$1
	local pid=$2
	local proc_interface=cgroup.procs
	local cgroup_pids

	if ((cgroup_version == 2)) && is_cgroup_threaded "$cgroup"; then
		proc_interface=cgroup.threads
	fi

	# Word splitting of the id list is intentional here.
	fold_list_onto_array \
		cgroup_pids \
		$(< "$sysfs_cgroup/$cgroup/$proc_interface")

	if [[ -n $pid ]]; then
		if [[ -n ${cgroup_pids[pid]} ]]; then
			kill "$pid"
		fi
	elif ((${#cgroup_pids[@]} > 0)); then
		kill "${cgroup_pids[@]}"
	fi
}

# Tear down the /cpuset cgroup; only done for cgroup-v2.
remove_cpuset_cgroup() {
	if ((cgroup_version == 2)); then
		remove_cgroup /cpuset
	fi
}

# Print the cgroup path a process belongs to.
# Arguments: $1 - pid (defaults to "self")
# Outputs:   the text following the last ':' in /proc/<pid>/cgroup
# Returns:   1 when /proc/<pid>/cgroup is missing or cannot be read
get_cgroup() {
	local pid=${1:-self} cgroup

	[[ -e /proc/$pid/cgroup ]] || return 1
	# The process may exit between the check above and this read.
	cgroup=$(< "/proc/$pid/cgroup") || return 1
	echo "${cgroup##*:}"
}

# Print the path under $sysfs_cgroup of the cgroup a process belongs to.
# Arguments: $1 - pid, passed to get_cgroup
# Returns:   1 when get_cgroup fails
get_cgroup_path() {
	local cgroup

	cgroup=$(get_cgroup "$1") || return 1
	echo "$sysfs_cgroup$cgroup"
}

# Recursive worker for set_cgroup_attr_top_bottom.
# Arguments: $1 - cgroup directory, $2 - attribute, $3 - value
# Recurses into the parent directory first for as long as the parent exposes
# the attribute, so the writes happen from the topmost cgroup downwards.
_set_cgroup_attr_top_bottom() {
	local cgroup_path=$1 attr=$2 val=$3

	if [[ -e ${cgroup_path%/*}/$attr ]]; then
		_set_cgroup_attr_top_bottom "${cgroup_path%/*}" "$attr" "$val"
	fi

	if [[ -e $cgroup_path/$attr ]]; then
		echo "$val" > "$cgroup_path/$attr"
	fi
}

# Write an attribute in the cgroup of a process and in each of its ancestors
# exposing that attribute, topmost first.
# Arguments: $1 - pid, $2 - attribute, $3 - value
set_cgroup_attr_top_bottom() {
	_set_cgroup_attr_top_bottom "$(get_cgroup_path "$1")" "$2" "$3"
}

# Decode the task flags of a process into PF_* names.
# Arguments: $1 - pid,
#            $2 - a single PF_* name to look for (defaults to "all")
# Outputs:   comma-separated list of the matching flags which are set, on
#            stderr (move_proc captures it with 2>&1)
# Returns:   0 when at least one matching flag is set; 1 otherwise or when
#            /proc/<pid>/stat is missing or unreadable
id_proc() {
	local pid=$1 flag_to_check=${2:-all}
	local flag flags flags_map=() comm stats tflags

	[[ -e /proc/$pid/stat ]] || return 1
	# Comm is wrapped in () but the name of the thread itself may include "()", giving in result
	# something similar to: ((sd-pam))
	comm=$(< "/proc/$pid/stat") || return 1

	# Drop everything through the last ") " so the split starts right after
	# comm; word splitting is intentional.
	stats=(${comm/*) /})
	# Element 6 of the remaining fields holds the task flags.
	tflags=${stats[6]}

	# include/linux/sched.h
	flags_map[0x1]=PF_VCPU
	flags_map[0x2]=PF_IDLE
	flags_map[0x4]=PF_EXITING
	flags_map[0x8]=PF_POSTCOREDUMP
	flags_map[0x10]=PF_IO_WORKER
	flags_map[0x20]=PF_WQ_WORKER
	flags_map[0x40]=PF_FORK_NO_EXEC
	flags_map[0x80]=PF_MCE_PROCESS
	flags_map[0x100]=PF_SUPERPRIV
	flags_map[0x200]=PF_DUMPCORE
	flags_map[0x400]=PF_SIGNALED
	flags_map[0x800]=PF_MEMALLOC
	flags_map[0x1000]=PF_NPROC_EXCEEDED
	flags_map[0x2000]=PF_USED_MATH
	flags_map[0x4000]=PF_USER_WORKER
	flags_map[0x8000]=PF_NOFREEZE
	flags_map[0x20000]=PF_KSWAPD
	flags_map[0x40000]=PF_MEMALLOC_NOFS
	flags_map[0x80000]=PF_MEMALLOC_NOIO
	flags_map[0x100000]=PF_LOCAL_THROTTLE
	flags_map[0x00200000]=PF_KTHREAD
	flags_map[0x00400000]=PF_RANDOMIZE
	flags_map[0x04000000]=PF_NO_SETAFFINITY
	flags_map[0x08000000]=PF_MCE_EARLY
	flags_map[0x10000000]=PF_MEMALLOC_PIN
	flags_map[0x80000000]=PF_SUSPEND_TASK

	# The array indices are the flag bit masks themselves.
	for flag in "${!flags_map[@]}"; do
		[[ $flag_to_check == "${flags_map[flag]}" || $flag_to_check == all ]] || continue
		((tflags & flag)) && flags=${flags:+$flags,}"${flags_map[flag]}"
	done
	if [[ -n $flags ]]; then
		echo "$flags" >&2
		return 0
	fi
	return 1
}

declare -r sysfs_cgroup=/sys/fs/cgroup
cgroup_version=$(check_cgroup)