Linux kernel source code analysis 04 – Creating page tables for user processes


A process is an independent resource space: every process has its own private page tables.

Page tables for a user process are set up at three points in time:

  • when the process is created with fork;
  • when a page fault occurs;
  • when the process is switched in.

1. Process creation: fork

Core call chain:

__do_fork()
-->copy_process()
-->dup_mm()

1.1 The dup_mm() function

static struct mm_struct *dup_mm(struct task_struct *tsk,
				struct mm_struct *oldmm)
{
	struct mm_struct *mm;
	int err;

	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm));

	if (!mm_init(mm, tsk, mm->user_ns))	/* allocates the private pgd page */
		goto fail_nomem;

	err = dup_mmap(mm, oldmm);		/* copies the parent's page tables */
	...
}

1.2 Allocating the pgd physical page

1.2.1 The pgd_alloc() function

mm_init()->mm_alloc_pgd()->pgd_alloc()
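
For reference, mm_alloc_pgd() is just a thin wrapper that stores the newly allocated top-level table in mm->pgd (a simplified sketch based on kernel/fork.c):

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);	/* allocate this process's private top-level table */
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}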

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	gfp_t gfp = GFP_PGTABLE_USER;

	if (PGD_SIZE == PAGE_SIZE)
		return (pgd_t *)__get_free_page(gfp);	/* pgd is a full page: take it from the buddy system */
	else
		return kmem_cache_alloc(pgd_cache, gfp);	/* pgd is smaller than a page: take it from the pgd_cache slab */
}
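
A quick back-of-the-envelope check on the first branch (a user-space sketch, assuming 4 KB pages, 48-bit virtual addresses and 4 translation levels, so each level resolves 9 bits of the address):

#include <stdio.h>

int main(void)
{
	/* 48-bit VA = 12-bit page offset + 4 levels x 9 bits each */
	unsigned int entries = 1u << 9;			/* entries per table level */
	unsigned long pgd_size = entries * 8UL;		/* 8-byte descriptors */

	/* prints "pgd entries = 512, pgd size = 4096 bytes":
	 * PGD_SIZE == PAGE_SIZE, so the buddy-system branch is taken */
	printf("pgd entries = %u, pgd size = %lu bytes\n", entries, pgd_size);
	return 0;
}

With other granule/VA-bit combinations (for example 16 KB or 64 KB pages) the top-level table is smaller than a page, and the else branch takes it from the dedicated pgd_cache slab instead.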

1.3 Copying the parent's page tables

1.3.1 Copying the VMAs

Call chain:

copy_mm()
->dup_mm()
->dup_mmap()
->copy_page_range()

int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	...
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
					    addr, next))) {	/* walk down and copy the lower-level tables */
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
	...
}
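
The do/while above is the standard page-table walking pattern: pgd_addr_end() returns either the next PGD boundary or end, whichever comes first, so the last chunk is clamped to the range being copied. A stand-alone sketch of that pattern (user-space, assuming a 512 GB PGDIR_SIZE, i.e. 4 KB pages with 48-bit VAs; the addresses are made up for illustration):

#include <stdio.h>

#define PGDIR_SHIFT	39
#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
#define PGDIR_MASK	(~(PGDIR_SIZE - 1))

/* mirrors the kernel's pgd_addr_end(): next PGD boundary, clamped to 'end' */
static unsigned long pgd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long addr = 0x0000007f00000000UL;
	unsigned long end  = 0x0000018000000000UL;
	unsigned long next;

	do {
		next = pgd_addr_end(addr, end);
		printf("copy pgd entry covering [%#lx, %#lx)\n", addr, next);
		addr = next;
	} while (addr != end);
	return 0;
}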

1.3.2 Copying the PTEs

Call chain:

copy_p4d_range()
->copy_pud_range()
->copy_pmd_range()
->copy_pte_range()
->copy_present_pte()

/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
 * is required to copy this pte.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
		 struct page **prealloc)
{
	...
	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {	/* COW mapping: make the page read-only in both parent and child */
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}
	...
	/* install the PTE into the child's dst_pte */
	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
}

With this, all page tables have been copied. When either process later writes to one of these read-only COW pages, a page fault is triggered, and the fault handler performs the actual copy-on-write and allocates the real physical page.
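
This copy-on-write behaviour can be observed directly from user space. A minimal demo (an illustrative sketch, not kernel code): the buffer is written before fork(), so afterwards parent and child map the same physical page read-only; the child's write then takes a minor fault that is resolved by the COW handler, and the parent's copy stays untouched.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* private anonymous mapping: subject to copy-on-write across fork() */
	char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	strcpy(buf, "parent data");	/* fault the page in before fork() */

	pid_t pid = fork();
	if (pid == 0) {
		struct rusage before, after;

		getrusage(RUSAGE_SELF, &before);
		strcpy(buf, "child data");	/* write-protect fault -> COW copy */
		getrusage(RUSAGE_SELF, &after);

		printf("child sees  : %s (minor faults taken: %ld)\n",
		       buf, after.ru_minflt - before.ru_minflt);
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("parent sees : %s\n", buf);	/* unchanged: the child wrote its own copy */
	return 0;
}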

2. Page fault handling

2.1 wp_page_copy

The core handler for a page fault caused by copy-on-write:

static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	struct mmu_notifier_range range;

	if (unlikely(anon_vma_prepare(vma)))	/* make sure the VMA has its RMAP (anon_vma) set up */
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {	/* PTE maps the shared zero page: allocate a zero-filled page */
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {	/* allocate a new physical page and copy old_page into new_page */
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
					  vmf->address);
		if (!new_page)
			goto oom;

		if (!cow_user_page(new_page, old_page, vmf)) {
			/*
			 * COW failed, if the fault was solved by other,
			 * it's fine. If not, userspace would re-fault on
			 * the same address and we will handle the fault
			 * from the second attempt.
			 */
			put_page(new_page);
			if (old_page)
				put_page(old_page);
			return 0;
		}
	}

	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	__SetPageUptodate(new_page);	/* set PG_uptodate: the contents are valid */

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock
	 */	/* re-read the PTE and check whether it changed in the meantime */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);	/* build the new PTE */

		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry, to keep TLBs on different CPUs in
		 * sync. This code used to set the new PTE then flush TLBs, but
		 * that left a window where the new PTE could be loaded into
		 * some TLBs while the old PTE remains in others.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);	/* flush the TLB entry for this page */
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);	/* add new_page to the RMAP system */
		lru_cache_add_inactive_or_unevictable(new_page, vma);	/* add new_page to the LRU lists */
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);	/* install the new pte into the page table */
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page, false);
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}

3. Process switch

When a user process is switched in, the memory-management side does two main things:

  1. write the process's ASID into ttbr1_el1;
  2. write mm->pgd into ttbr0_el1, which switches the address space.

Call chain:

context_switch()
->switch_mm_irqs_off()
->switch_mm()
->__switch_mm()
->check_and_switch_context()
->cpu_switch_mm()
->cpu_do_switch_mm()

void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm)
{
	unsigned long ttbr1 = read_sysreg(ttbr1_el1);
	unsigned long asid = ASID(mm);
	unsigned long ttbr0 = phys_to_ttbr(pgd_phys);

	/* Skip CNP for the reserved ASID */
	if (system_supports_cnp() && asid)
		ttbr0 |= TTBR_CNP_BIT;

	/* SW PAN needs a copy of the ASID in TTBR0 for entry */
	if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN))
		ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid);

	/* Set ASID in TTBR1 since TCR.A1 is set */
	ttbr1 &= ~TTBR_ASID_MASK;
	ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid);

	write_sysreg(ttbr1, ttbr1_el1);	/* the ASID goes into ttbr1_el1 */
	isb();
	write_sysreg(ttbr0, ttbr0_el1);	/* the new process's pgd base goes into ttbr0_el1 */
	isb();
	post_ttbr_update_workaround();
}
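
To make the register layout concrete, here is a stand-alone sketch (user-space, not kernel code) of how the ASID is folded into the TTBR values; it assumes the ASID field occupies bits [63:48] of TTBR0/TTBR1, which is where the ARMv8 hardware ASID lives, and uses made-up example values:

#include <stdint.h>
#include <stdio.h>

#define TTBR_ASID_SHIFT	48
#define TTBR_ASID_MASK	(0xffffULL << TTBR_ASID_SHIFT)	/* bits [63:48] */

int main(void)
{
	uint64_t pgd_phys = 0x40a1c000;		/* example: physical address of the new mm->pgd */
	uint64_t asid     = 0x2a;		/* example: ASID allocated for this mm */
	uint64_t ttbr1    = 0x40000000;		/* example: current kernel table base */
	uint64_t ttbr0    = pgd_phys;		/* user page-table base for the new process */

	ttbr1 &= ~TTBR_ASID_MASK;				/* clear the old process's ASID */
	ttbr1 |= (asid << TTBR_ASID_SHIFT) & TTBR_ASID_MASK;	/* insert the new ASID (TCR.A1 = 1) */

	printf("ttbr0_el1 = %#018llx\n", (unsigned long long)ttbr0);
	printf("ttbr1_el1 = %#018llx\n", (unsigned long long)ttbr1);
	return 0;
}

The two isb() barriers in cpu_do_switch_mm() ensure that each system-register write has taken effect before the following instructions execute.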