Skip to content

Commit fc5de01

Browse files
committed
dump and restore cpu affinity of each thread.
Add a thread_allowedcpus_entry field to thread_core_entry to save each thread's CPU affinity mask. The saved mask is restored after the threads have been restored but before they resume running. Add the --with-cpu-affinity option to enable this feature at dump and restore. Signed-off-by: hdzhoujie <[email protected]> Signed-off-by: He jingxian <[email protected]> Signed-off-by: Sang Yan <[email protected]>
1 parent 50db2be commit fc5de01

File tree

15 files changed

+126
-1
lines changed

15 files changed

+126
-1
lines changed

compel/arch/arm/plugins/std/syscalls/syscall.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ fsopen 430 430 (char *fsname, unsigned int flags)
118118
fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux)
119119
fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags)
120120
clone3 435 435 (struct clone_args *uargs, size_t size)
121+
sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask)
121122
pidfd_open 434 434 (pid_t pid, unsigned int flags)
122123
openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size)
123124
pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags)

compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
114114
__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
115115
__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
116116
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
117+
__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
117118
__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags)
118119
__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size)
119120
__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)

compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
114114
__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
115115
__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
116116
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
117+
__NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
117118
__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags)
118119
__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size)
119120
__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)

compel/arch/x86/plugins/std/syscalls/syscall_32.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *
6363
__NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior)
6464
__NR_gettid 224 sys_gettid (void)
6565
__NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3)
66+
__NR_sched_setaffinity 241 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
6667
__NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
6768
__NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
6869
__NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)

compel/arch/x86/plugins/std/syscalls/syscall_64.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsign
7373
__NR_umount2 166 sys_umount2 (char *name, int flags)
7474
__NR_gettid 186 sys_gettid (void)
7575
__NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3)
76+
__NR_sched_setaffinity 203 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
7677
__NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
7778
__NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
7879
__NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)

criu/config.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
697697
{ "cgroup-yard", required_argument, 0, 1096 },
698698
{ "pre-dump-mode", required_argument, 0, 1097 },
699699
{ "file-validation", required_argument, 0, 1098 },
700+
BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity),
700701
BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check),
701702
{ "lsm-mount-context", required_argument, 0, 1099 },
702703
{ "network-lock", required_argument, 0, 1100 },

criu/cr-dump.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc)
140140
{
141141
int ret;
142142
struct sched_param sp;
143+
cpu_set_t cpumask;
143144

144145
BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */
145146

@@ -185,6 +186,17 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc)
185186
tc->has_sched_nice = true;
186187
tc->sched_nice = ret;
187188

189+
if (opts.with_cpu_affinity) {
190+
pr_info("\tdumping allowed cpus for %d\n", pid);
191+
ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask);
192+
if (ret < 0) {
193+
pr_perror("Can't get sched affinity for %d", pid);
194+
return -1;
195+
}
196+
tc->allowed_cpus->has_cpumask = true;
197+
memcpy(tc->allowed_cpus->cpumask, &cpumask, sizeof(cpu_set_t));
198+
}
199+
188200
return 0;
189201
}
190202

criu/cr-restore.c

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ static int prepare_restorer_blob(void);
120120
static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core);
121121
static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core);
122122
static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core);
123+
static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core);
123124

124125
/*
125126
* Architectures can overwrite this function to restore registers that are not
@@ -918,6 +919,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
918919
if (prepare_signals(pid, ta, core))
919920
return -1;
920921

922+
if (prepare_allowed_cpus(pid, ta, core))
923+
return -1;
924+
921925
if (prepare_posix_timers(pid, ta, core))
922926
return -1;
923927

@@ -3290,6 +3294,34 @@ static int prepare_signals(int pid, struct task_restore_args *ta, CoreEntry *lea
32903294
return ret;
32913295
}
32923296

3297+
/*
 * Pack the saved CPU affinity of every thread of @current into RM_PRIVATE
 * restorer memory and record the start of that area in @ta->allowed_cpus.
 *
 * One record is emitted per thread, in thread order:
 *
 *	bool		has_cpumask;
 *	cpu_set_t	cpumask;	(only when has_cpumask is true)
 *
 * restore_cpu_affinity() in the restorer blob walks the very same layout,
 * so the record format must not change here without changing it there.
 *
 * @pid and @leader_core are not used; they are kept so the signature
 * matches the other prepare_*() helpers called next to this one.
 *
 * Returns 0 on success (also when --with-cpu-affinity is not set) and
 * -1 when restorer memory cannot be allocated.
 */
static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core)
{
	int i;

	if (!opts.with_cpu_affinity)
		return 0;

	ta->allowed_cpus = (char *)rst_mem_align_cpos(RM_PRIVATE);
	for (i = 0; i < current->nr_threads; i++) {
		/*
		 * allowed_cpus is an optional field of thread_core_entry, so
		 * it is NULL when the image does not carry it. Treat that the
		 * same way as "no mask saved" instead of dereferencing NULL.
		 */
		ThreadAllowedcpusEntry *entry = current->core[i]->thread_core->allowed_cpus;
		cpu_set_t *cpumask;
		bool *has_cpumask;
		size_t avail;

		has_cpumask = rst_mem_alloc(sizeof(bool), RM_PRIVATE);
		if (!has_cpumask)
			return -1;

		/*
		 * Convert by value: the protobuf-c boolean is wider than a
		 * C bool, so copying only its first byte yields 0 on
		 * big-endian hosts even when the flag is set.
		 */
		*has_cpumask = entry && entry->has_cpumask;
		if (!(*has_cpumask))
			continue;

		cpumask = rst_mem_alloc(sizeof(cpu_set_t), RM_PRIVATE);
		if (!cpumask)
			return -1;

		/*
		 * The mask is a repeated field, so its length comes from the
		 * image. Never read past what was actually decoded; zero-fill
		 * whatever a short mask leaves uncovered.
		 */
		avail = entry->n_cpumask * sizeof(entry->cpumask[0]);
		if (avail > sizeof(cpu_set_t))
			avail = sizeof(cpu_set_t);

		memset(cpumask, 0, sizeof(cpu_set_t));
		if (avail)
			memcpy(cpumask, entry->cpumask, avail);
	}

	return 0;
}
3324+
32933325
extern void __gcov_flush(void) __attribute__((weak));
32943326
void __gcov_flush(void)
32953327
{
@@ -3740,6 +3772,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
37403772
RST_MEM_FIXUP_PPTR(task_args->timerfd);
37413773
RST_MEM_FIXUP_PPTR(task_args->posix_timers);
37423774
RST_MEM_FIXUP_PPTR(task_args->siginfo);
3775+
RST_MEM_FIXUP_PPTR(task_args->allowed_cpus);
37433776
RST_MEM_FIXUP_PPTR(task_args->rlims);
37443777
RST_MEM_FIXUP_PPTR(task_args->helpers);
37453778
RST_MEM_FIXUP_PPTR(task_args->zombies);
@@ -3900,7 +3933,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
39003933
task_args->thread_args = thread_args;
39013934

39023935
task_args->auto_dedup = opts.auto_dedup;
3903-
3936+
task_args->with_cpu_affinity = opts.with_cpu_affinity;
39043937
/*
39053938
* In the restorer we need to know if it is SELinux or not. For SELinux
39063939
* we must change the process context before creating threads. For

criu/crtools.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,8 @@ int main(int argc, char *argv[], char *envp[])
509509
" --file-validation METHOD\n"
510510
" pass the validation method to be used; argument\n"
511511
" can be 'filesize' or 'buildid' (default).\n"
512+
" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n"
513+
" same cpu quantity.\n"
512514
" --skip-file-rwx-check\n"
513515
" Skip checking file permissions\n"
514516
" (r/w/x for u/g/o) on restore.\n"

criu/include/cr_options.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,9 @@ struct cr_options {
236236
* explicitly request it as it comes with many limitations.
237237
*/
238238
int unprivileged;
239+
240+
/* restore cpu affinity */
241+
int with_cpu_affinity;
239242
};
240243

241244
extern struct cr_options opts;

criu/include/restorer.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <signal.h>
55
#include <limits.h>
6+
#include <sched.h>
67
#include <sys/resource.h>
78
#include <linux/filter.h>
89

@@ -171,6 +172,8 @@ struct task_restore_args {
171172
siginfo_t *siginfo;
172173
unsigned int siginfo_n;
173174

175+
char *allowed_cpus;
176+
174177
struct rst_tcp_sock *tcp_socks;
175178
unsigned int tcp_socks_n;
176179

@@ -240,6 +243,8 @@ struct task_restore_args {
240243

241244
uid_t uid;
242245
u32 cap_eff[CR_CAP_SIZE];
246+
247+
bool with_cpu_affinity;
243248
} __aligned(64);
244249

245250
/*

criu/pie/restorer.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,40 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
430430
return 0;
431431
}
432432

433+
static int restore_cpu_affinity(struct task_restore_args *args)
434+
{
435+
int i;
436+
int pid;
437+
int ret;
438+
cpu_set_t *cpumask;
439+
char *allowed_cpus;
440+
bool *has_cpumask;
441+
442+
if (!args->with_cpu_affinity) {
443+
return 0;
444+
}
445+
446+
allowed_cpus = args->allowed_cpus;
447+
for (i = 0; i < args->nr_threads; i++) {
448+
has_cpumask = (bool *)allowed_cpus;
449+
allowed_cpus += sizeof(bool);
450+
if (!(*has_cpumask)) {
451+
continue;
452+
}
453+
454+
pid = args->thread_args[i].pid;
455+
cpumask = (cpu_set_t *)allowed_cpus;
456+
ret = sys_sched_setaffinity(pid, sizeof(cpu_set_t), cpumask);
457+
if (ret) {
458+
pr_err("\t Restore %d cpumask failed.\n", pid);
459+
return ret;
460+
}
461+
allowed_cpus += sizeof(cpu_set_t);
462+
}
463+
464+
return 0;
465+
}
466+
433467
static int restore_rseq(struct rst_rseq_param *rseq)
434468
{
435469
int ret;
@@ -1968,6 +2002,10 @@ long __export_restore_task(struct task_restore_args *args)
19682002

19692003
pr_info("%ld: Restored\n", sys_getpid());
19702004

2005+
ret = restore_cpu_affinity(args);
2006+
if (ret)
2007+
goto core_restore_end;
2008+
19712009
restore_finish_stage(task_entries_local, CR_STATE_RESTORE);
19722010

19732011
if (wait_helpers(args) < 0)

criu/pstree.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,13 @@ CoreEntry *core_entry_alloc(int th, int tsk)
5858
CredsEntry *ce = NULL;
5959

6060
sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry);
61+
sz += sizeof(ThreadAllowedcpusEntry);
6162

6263
sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]);
6364
sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]);
6465
sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]);
6566
sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]);
67+
sz += sizeof(cpu_set_t);
6668
/*
6769
* @groups are dynamic and allocated
6870
* on demand.
@@ -127,6 +129,11 @@ CoreEntry *core_entry_alloc(int th, int tsk)
127129
ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0]));
128130
ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0]));
129131

132+
core->thread_core->allowed_cpus = xptr_pull(&m, ThreadAllowedcpusEntry);
133+
thread_allowedcpus_entry__init(core->thread_core->allowed_cpus);
134+
core->thread_core->allowed_cpus->n_cpumask = sizeof(cpu_set_t) / sizeof(uint64_t);
135+
core->thread_core->allowed_cpus->cpumask = xptr_pull_s(&m, sizeof(cpu_set_t));
136+
130137
if (arch_alloc_thread_info(core)) {
131138
xfree(core);
132139
core = NULL;
@@ -278,6 +285,7 @@ int dump_pstree(struct pstree_item *root_item)
278285
PstreeEntry e = PSTREE_ENTRY__INIT;
279286
int ret = -1, i;
280287
struct cr_img *img;
288+
unsigned int nr_cpus;
281289

282290
pr_info("\n");
283291
pr_info("Dumping pstree (pid: %d)\n", root_item->pid->real);
@@ -301,6 +309,7 @@ int dump_pstree(struct pstree_item *root_item)
301309
}
302310
}
303311

312+
nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
304313
img = open_image(CR_FD_PSTREE, O_DUMP);
305314
if (!img)
306315
return -1;
@@ -313,6 +322,7 @@ int dump_pstree(struct pstree_item *root_item)
313322
e.pgid = item->pgid;
314323
e.sid = item->sid;
315324
e.n_threads = item->nr_threads;
325+
e.nr_cpus = nr_cpus;
316326

317327
e.threads = xmalloc(sizeof(e.threads[0]) * e.n_threads);
318328
if (!e.threads)
@@ -532,6 +542,7 @@ static int read_one_pstree_item(struct cr_img *img, pid_t *pid_max)
532542
struct pstree_item *pi;
533543
PstreeEntry *e;
534544
int ret, i;
545+
unsigned int nr_cpus;
535546

536547
ret = pb_read_one_eof(img, &e, PB_PSTREE);
537548
if (ret <= 0)
@@ -543,6 +554,14 @@ static int read_one_pstree_item(struct cr_img *img, pid_t *pid_max)
543554
goto err;
544555
BUG_ON(pi->pid->state != TASK_UNDEF);
545556

557+
if (opts.with_cpu_affinity) {
558+
nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
559+
if (e->nr_cpus > nr_cpus) {
560+
pr_err("different number of cpus in cpu affinity restore\n");
561+
goto err;
562+
}
563+
}
564+
546565
/*
547566
* All pids should be added in the tree to be able to find
548567
* free pid-s for helpers. pstree_item for these pid-s will

images/core.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ message thread_sas_entry {
8787
required uint32 ss_flags = 3;
8888
}
8989

90+
message thread_allowedcpus_entry {
91+
required bool has_cpumask = 1 [default=false];
92+
repeated uint64 cpumask = 2 [packed=true];
93+
}
94+
9095
message thread_core_entry {
9196
required uint64 futex_rla = 1;
9297
required uint32 futex_rla_len = 2;
@@ -107,6 +112,7 @@ message thread_core_entry {
107112
optional uint64 blk_sigset_extended = 14;
108113
optional rseq_entry rseq_entry = 15;
109114
required uint32 cg_set = 16;
115+
optional thread_allowedcpus_entry allowed_cpus = 17;
110116
}
111117

112118
message task_rlimits_entry {

images/pstree.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ message pstree_entry {
88
required uint32 pgid = 3;
99
required uint32 sid = 4;
1010
repeated uint32 threads = 5;
11+
optional uint32 nr_cpus = 6;
1112
}

0 commit comments

Comments
 (0)