Commit bfa9428

page-cache: use the PAGEMAP_SCAN ioctl when it is available
Signed-off-by: Andrei Vagin <[email protected]>
1 parent 615e45e commit bfa9428

6 files changed: 173 additions, 72 deletions

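Background on the ioctl this commit adopts: PAGEMAP_SCAN operates on /proc/<pid>/pagemap (merged in Linux 6.7, with the soft-dirty category exposed in later kernels) and returns ranges of pages whose categories match the requested masks, rather than one 64-bit entry per page as read() on pagemap does. CRIU carries a copy of the UAPI definitions in its pagemap_scan.h header. Below is a minimal, self-contained sketch of a scan; the mask choice and error handling are illustrative, not what CRIU does verbatim.

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h> /* struct pm_scan_arg, struct page_region, PAGEMAP_SCAN */

/* Print the sub-ranges of [start, end) that contain present or swapped
 * pages; pagemap_fd is an open /proc/<pid>/pagemap descriptor. */
static int scan_range(int pagemap_fd, __u64 start, __u64 end)
{
	struct page_region regions[256];
	struct pm_scan_arg arg = {
		.size = sizeof(arg),
		.start = start,
		.end = end,
		.vec = (__u64)(uintptr_t)regions,
		.vec_len = 256,
		/* Match ranges with at least one of these categories set... */
		.category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED,
		/* ...and report these categories for every matched range. */
		.return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY,
	};
	int i, ret;

	/* On success the ioctl returns the number of page_region entries it
	 * filled; arg.walk_end reports how far the walk actually went. */
	ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
	if (ret < 0)
		return -1;
	for (i = 0; i < ret; i++)
		printf("%llx-%llx categories %llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end,
		       (unsigned long long)regions[i].categories);
	return 0;
}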

criu/include/mem.h

Lines changed: 3 additions & 1 deletion
@@ -7,6 +7,7 @@
 #include "pid.h"
 #include "proc_parse.h"
 #include "inventory.pb-c.h"
+#include "pagemap-cache.h"
 
 struct parasite_ctl;
 struct vm_area_list;
@@ -47,5 +48,6 @@ int open_vmas(struct pstree_item *t);
 int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta);
 int unmap_guard_pages(struct pstree_item *t);
 int prepare_mappings(struct pstree_item *t);
-bool should_dump_page(VmaEntry *vmae, u64 pme);
+
+u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty);
 #endif /* __CR_MEM_H__ */

criu/include/pagemap-cache.h

Lines changed: 11 additions & 2 deletions
@@ -1,10 +1,12 @@
 #ifndef __CR_PAGEMAP_H__
 #define __CR_PAGEMAP_H__
 
+#include <stdbool.h>
 #include <sys/types.h>
 #include "int.h"
 
 #include "common/list.h"
+#include "pagemap_scan.h"
 
 struct vma_area;
 
@@ -15,9 +17,15 @@ typedef struct {
 	unsigned long start; /* start of area */
 	unsigned long end; /* end of area */
 	const struct list_head *vma_head; /* list head of VMAs we're serving */
+	int fd; /* file to read PMs from */
+
 	u64 *map; /* local buffer */
 	size_t map_len; /* length of a buffer */
-	int fd; /* file to read PMs from */
+
+	struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */
+	size_t regs_len; /* actual length of regs */
+	size_t regs_max_len; /* maximum length of regs */
+	size_t regs_idx; /* current index in the regs array */
 } pmc_t;
 
 #define PMC_INIT \
@@ -26,7 +34,8 @@ typedef struct {
 }
 
 extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size);
-extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma);
+extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma);
 extern void pmc_fini(pmc_t *pmc);
+extern int pmc_fill(pmc_t *pmc, u64 start, u64 end);
 
 #endif /* __CR_PAGEMAP_H__ */
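
The cache now carries both modes side by side: map/map_len back the old pagemap read() path, while regs/regs_len/regs_max_len/regs_idx hold PAGEMAP_SCAN results. The scan itself is implemented in criu/pagemap-cache.c, one of the six changed files but not shown in this excerpt; the following is a hypothetical sketch of what its pmc_fill() must do on the PAGEMAP_SCAN path to satisfy the consumer in criu/mem.c below (the masks are illustrative):

/* Hypothetical sketch only; the real pmc_fill() lives in
 * criu/pagemap-cache.c, which this excerpt does not include. */
static int pmc_fill_sketch(pmc_t *pmc, u64 start, u64 end)
{
	struct pm_scan_arg arg = {
		.size = sizeof(arg),
		.start = start,
		.end = end,
		.vec = (u64)(uintptr_t)pmc->regs,
		.vec_len = pmc->regs_max_len,
		/* Illustrative masks: which pages count as having content,
		 * and which categories the caller wants reported back. */
		.category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED,
		.return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY,
	};
	int ret;

	ret = ioctl(pmc->fd, PAGEMAP_SCAN, &arg);
	if (ret < 0)
		return -1;

	pmc->regs_len = ret;	 /* number of filled page_region entries */
	pmc->regs_idx = 0;
	pmc->start = start;
	pmc->end = arg.walk_end; /* the walk may stop before `end` */
	return 0;
}

Setting pmc->end from walk_end is what lets should_dump_page() in criu/mem.c hand back pmc->end as the next address to inspect once the regs array is exhausted.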

criu/include/shmem.h

Lines changed: 2 additions & 1 deletion
@@ -4,13 +4,14 @@
 #include "int.h"
 #include "common/lock.h"
 #include "images/vma.pb-c.h"
+#include "pagemap-cache.h"
 
 struct vma_area;
 
 extern int collect_shmem(int pid, struct vma_area *vma);
 extern int collect_sysv_shmem(unsigned long shmid, unsigned long size);
 extern int cr_dump_shmem(void);
-extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map);
+extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc);
 extern int fixup_sysv_shmems(void);
 extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size);
 extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid);

criu/mem.c

Lines changed: 73 additions & 39 deletions
@@ -99,38 +99,61 @@ static inline bool __page_in_parent(bool dirty)
 	return opts.track_mem && opts.img_parent && !dirty;
 }
 
-bool should_dump_page(VmaEntry *vmae, u64 pme)
+static bool should_dump_entire_vma(VmaEntry *vmae)
 {
 	/*
 	 * vDSO area must be always dumped because on restore
 	 * we might need to generate a proxy.
 	 */
 	if (vma_entry_is(vmae, VMA_AREA_VDSO))
 		return true;
-	/*
-	 * In turn VVAR area is special and referenced from
-	 * vDSO area by IP addressing (at least on x86) thus
-	 * never ever dump its content but always use one provided
-	 * by the kernel on restore, ie runtime VVAR area must
-	 * be remapped into proper place..
-	 */
-	if (vma_entry_is(vmae, VMA_AREA_VVAR))
-		return false;
-
-	/*
-	 * Optimisation for private mapping pages, that haven't
-	 * yet being COW-ed
-	 */
-	if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
-		return false;
 	if (vma_entry_is(vmae, VMA_AREA_AIORING))
 		return true;
-	if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme))
-		return true;
 
 	return false;
 }
 
+/*
+ * should_dump_page returns vaddr if an addressed page has to be dumped.
+ * Otherwise, it returns an address that has to be inspected next.
+ */
+u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty)
+{
+	if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end))
+		return -1;
+
+	if (pmc->regs) {
+		while (1) {
+			if (pmc->regs_idx == pmc->regs_len)
+				return pmc->end;
+			if (vaddr < pmc->regs[pmc->regs_idx].end)
+				break;
+			pmc->regs_idx++;
+		}
+		if (vaddr < pmc->regs[pmc->regs_idx].start)
+			return pmc->regs[pmc->regs_idx].start;
+		if (softdirty)
+			*softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY;
+		return vaddr;
+	} else {
+		u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)];
+
+		/*
+		 * Optimisation for private mapping pages, that haven't
+		 * yet being COW-ed
+		 */
+		if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
+			return vaddr + PAGE_SIZE;
+		if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) {
+			if (softdirty)
+				*softdirty = pme & PME_SOFT_DIRTY;
+			return vaddr;
+		}
+
+		return vaddr + PAGE_SIZE;
+	}
+}
+
 bool page_is_zero(u64 pme)
 {
 	return __page_is_zero(pme);
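
To make the new contract concrete, here is a hypothetical, simplified walk over one VMA (dump_page() is a made-up helper; generate_iovs() below is the real consumer and additionally handles lazy pages and parent holes):

	u64 vaddr = vma->e->start;

	while (vaddr < vma->e->end) {
		bool softdirty = false;
		u64 next = should_dump_page(pmc, vma->e, vaddr, &softdirty);

		if (next == (u64)-1)
			return -1;	/* pmc_fill() failed */
		if (next == vaddr) {
			dump_page(vaddr, softdirty); /* hypothetical helper */
			vaddr += PAGE_SIZE;
		} else {
			vaddr = next;	/* nothing to dump before next */
		}
	}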
@@ -164,25 +187,30 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr)
  * the memory contents is present in the parent image set.
  */
 
-static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off,
+static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr,
 			 bool has_parent)
 {
-	u64 *at = &map[PAGE_PFN(*off)];
-	unsigned long pfn, nr_to_scan;
+	unsigned long nr_scanned;
 	unsigned long pages[3] = {};
+	unsigned long vaddr;
+	bool dump_all_pages;
 	int ret = 0;
 
-	nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
+	dump_all_pages = should_dump_entire_vma(vma->e);
 
-	for (pfn = 0; pfn < nr_to_scan; pfn++) {
-		unsigned long vaddr;
+	nr_scanned = 0;
+	for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) {
 		unsigned int ppb_flags = 0;
+		bool softdirty = false;
+		u64 next;
 		int st;
 
-		if (!should_dump_page(vma->e, at[pfn]))
+		/* If dump_all_pages is true, should_dump_page is called to get pme. */
+		next = should_dump_page(pmc, vma->e, vaddr, &softdirty);
+		if (!dump_all_pages && next != vaddr) {
+			vaddr = next - PAGE_SIZE;
 			continue;
-
-		vaddr = vma->e->start + *off + pfn * PAGE_SIZE;
+		}
 
 		if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr))
 			ppb_flags |= PPB_LAZY;
@@ -194,7 +222,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct
 		 * page. The latter would be checked in page-xfer.
 		 */
 
-		if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) {
+		if (has_parent && page_in_parent(softdirty)) {
 			ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT);
 			st = 0;
 		} else {
@@ -214,9 +242,8 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct
 		pages[st]++;
 	}
 
-	*off += pfn * PAGE_SIZE;
-
-	cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
+	*pvaddr = vaddr;
+	cnt_add(CNT_PAGES_SCANNED, nr_scanned);
 	cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
 	cnt_add(CNT_PAGES_LAZY, pages[1]);
 	cnt_add(CNT_PAGES_WRITTEN, pages[2]);
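
(The softdirty flag fed into page_in_parent() above is what drives incremental dumps: per __page_in_parent() at the top of this file, a page goes into a parent-image hole only when memory tracking is enabled, a parent image exists, and the page is not soft-dirty, i.e. unchanged since the previous dump.)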
@@ -356,12 +383,20 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str
 			    struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl,
 			    pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode)
 {
-	u64 off = 0;
-	u64 *map;
+	u64 vaddr;
 	int ret;
 
 	if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED))
 		return 0;
+	/*
+	 * In turn VVAR area is special and referenced from
+	 * vDSO area by IP addressing (at least on x86) thus
+	 * never ever dump its content but always use one provided
+	 * by the kernel on restore, ie runtime VVAR area must
+	 * be remapped into proper place..
+	 */
+	if (vma_entry_is(vma->e, VMA_AREA_VVAR))
+		return 0;
 
 	/*
 	 * To facilitate any combination of pre-dump modes to run after
@@ -421,15 +456,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str
 		has_parent = false;
 	}
 
-	map = pmc_get_map(pmc, vma);
-	if (!map)
+	if (pmc_get_map(pmc, vma))
 		return -1;
 
 	if (vma_area_is(vma, VMA_ANON_SHARED))
-		return add_shmem_area(item->pid->real, vma->e, map);
-
+		return add_shmem_area(item->pid->real, vma->e, pmc);
+	vaddr = vma->e->start;
 again:
-	ret = generate_iovs(item, vma, pp, map, &off, has_parent);
+	ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent);
 	if (ret == -EAGAIN) {
 		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
 