linux内核源码解析03–启动代码分析之主内核页表创建

banner

Linux 初始化过程,会依次建立如下页表映射:

  1. 恒等映射:页表基地址 idmap_pg_dir;
  2. 粗粒度内核镜像映射:页表基地址 init_pg_dir;
  3. fixmap 映射:页表基地址为 init_pg_dir, 待 paging_init 之后为 swapper_pg_dir;
  4. 细粒度内核镜像映射:页表基地址为 swapper_pg_dir;
  5. 线性映射:页表基地址为 swapper_pg_dir;
  6. 用户空间页表映射:页表基地址 task->mm->pgd;

上篇解析了 “fixmap 映射”, 这里来解析主内核页表的创建, 包括 “细粒度内核镜像映射” 和 “线性映射”;

创建完固定映射后,会初始化早期物理内存分配器 memblock (伙伴系统此时尚未初始化);有了 memblock 分配物理页面,内核主页表就可以建立动态映射页表:

1
2
3
4
5
6
7
/// Tidy up the memory regions tracked by memblock.
arm64_memblock_init();

/// At this point physical memory has been registered with the system through
/// memblock, but only the two physical ranges holding the dtb and the kernel
/// Image are accessible.
// The remaining physical memory has no mapping yet: it can be allocated with
// memblock_alloc but cannot be accessed.
// paging_init() next builds the page tables for those not-yet-accessible ranges.
//
// paging_init() is the core step of memory initialization: it creates the
// fine-grained kernel image mapping (each segment mapped separately) and the
// linear mapping (so the kernel can access all of physical memory).
paging_init(); /// build the dynamically allocated page tables

页面分配器这里略去,先来看主内核页表的建立,分两部分:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * paging_init - populate swapper_pg_dir and make it the kernel page table.
 *
 * swapper_pg_dir is reached through a temporary fixmap mapping, filled in by
 * map_kernel() and map_mem(), installed with cpu_replace_ttbr1() and recorded
 * in init_mm.pgd. The memory occupied by init_pg_dir is then handed back to
 * memblock.
 */
void __init paging_init(void)
{
pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); /// access swapper_pg_dir through a fixmap mapping

map_kernel(pgdp); /// build the fine-grained kernel mapping (one dynamic mapping per kernel segment)
/// map the memory regions that were added through the memblock subsystem
map_mem(pgdp); /// build the linear mapping of physical memory (covers the valid memblock regions)

/// tear down the fixmap virtual mapping of the pgd
pgd_clear_fixmap();

/// switch the active pgd over to swapper_pg_dir
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
init_mm.pgd = swapper_pg_dir; /// point init_mm at the new pgd

/// release the physical memory backing init_pg_dir
memblock_free(__pa_symbol(init_pg_dir),
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));

memblock_allow_resize();
}

一、建立内核的细粒度映射

1.1 map_kernel() 函数

将内核的每个段,分别建立页表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/*
* Create fine-grained mappings for the kernel.
*
* Each kernel image segment (text, rodata, inittext, initdata, data) is
* mapped into @pgdp by its own map_kernel_segment() call. Afterwards the
* fixmap is hooked into @pgdp and kasan_copy_shadow() is invoked on it.
*/
static void __init map_kernel(pgd_t *pgdp)
{
/* static: these vm_structs are passed by address to map_kernel_segment() */
static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
vmlinux_initdata, vmlinux_data;

/*
* External debuggers may need to write directly to the text
* mapping to install SW breakpoints. Allow this (only) when
* explicitly requested with rodata=off.
*/
pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;

/*
* If we have a CPU that supports BTI and a kernel built for
* BTI then mark the kernel executable text as guarded pages
* now so we don't have to rewrite the page tables later.
*/
if (arm64_early_this_cpu_has_bti())
text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);

/*
* Only rodata will be remapped with different permissions later on,
* all other segments are allowed to use contiguous mappings.
*/
map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0,
VM_NO_GUARD);
map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
&vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
&vmlinux_inittext, 0, VM_NO_GUARD);
map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
&vmlinux_initdata, 0, VM_NO_GUARD);
/* the last segment passes vm_flags == 0, i.e. without VM_NO_GUARD */
map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);

/* an empty pgd entry at FIXADDR_START means the fixmap has a pgd slot of its own */
if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdp, FIXADDR_START)))) {
/*
* The fixmap falls in a separate pgd to the kernel, and doesn't
* live in the carveout for the swapper_pg_dir. We can simply
* re-use the existing dir for the fixmap.
*/
set_pgd(pgd_offset_pgd(pgdp, FIXADDR_START), /// sync the init_pg_dir entry into swapper_pg_dir
READ_ONCE(*pgd_offset_k(FIXADDR_START)));
} else if (CONFIG_PGTABLE_LEVELS > 3) {
pgd_t *bm_pgdp;
p4d_t *bm_p4dp;
pud_t *bm_pudp;
/*
* The fixmap shares its top level pgd entry with the kernel
* mapping. This can really only occur when we are running
* with 16k/4 levels, so we can simply reuse the pud level
* entry instead.
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
bm_pgdp = pgd_offset_pgd(pgdp, FIXADDR_START);
bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START);
bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START);
pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
pud_clear_fixmap();} else {BUG();
}

kasan_copy_shadow(pgdp);
}

1.2 map_kernel_segment 函数

为内核的段建立动态映射

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Map one kernel image segment [va_start, va_end) into @pgdp with protection
 * @prot, then describe the range in @vma and register it through
 * vm_area_add_early().
 *
 * @flags is forwarded to __create_pgd_mapping(). @vm_flags is OR-ed into the
 * vm_struct flags; unless it contains VM_NO_GUARD the recorded size is one
 * page larger than the mapped size.
 */
static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
				      pgprot_t prot, struct vm_struct *vma,
				      int flags, unsigned long vm_flags)
{
	phys_addr_t phys = __pa_symbol(va_start);
	unsigned long len = va_end - va_start;
	unsigned long virt = (unsigned long)va_start;

	/* Both the physical base and the length must be page aligned. */
	BUG_ON(!PAGE_ALIGNED(phys));
	BUG_ON(!PAGE_ALIGNED(len));

	/* Any page-table pages needed are obtained via early_pgtable_alloc. */
	__create_pgd_mapping(pgdp, phys, virt, len, prot,
			     early_pgtable_alloc, flags);

	vma->addr = va_start;
	vma->phys_addr = phys;
	/* Account for one extra guard page unless the caller opted out. */
	vma->size = (vm_flags & VM_NO_GUARD) ? len : len + PAGE_SIZE;
	vma->flags = VM_MAP | vm_flags;
	vma->caller = __builtin_return_address(0);

	vm_area_add_early(vma);
}

1.3 __create_pgd_mapping 函数

建立页表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
 * Walk the pgd entries of @pgdir that cover [virt, virt + size) and let
 * alloc_init_pud() build the lower levels for each of them, mapping the
 * range to physical memory starting at @phys with protection @prot.
 *
 * @pgtable_alloc and @flags are passed through to alloc_init_pud().
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
				 unsigned long virt, phys_addr_t size,
				 pgprot_t prot,
				 phys_addr_t (*pgtable_alloc)(int),
				 int flags)
{
	pgd_t *entry = pgd_offset_pgd(pgdir, virt);
	unsigned long cur, stop, limit;

	/*
	 * The region cannot be mapped as the caller expects when the virtual
	 * and physical addresses differ in their offset within a page.
	 */
	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
		return;

	phys &= PAGE_MASK;
	cur = virt & PAGE_MASK;
	limit = PAGE_ALIGN(virt + size);

	/* The body always runs at least once, one pgd entry per iteration. */
	for (;;) {
		stop = pgd_addr_end(cur, limit);
		alloc_init_pud(entry, cur, stop, phys, prot, pgtable_alloc,
			       flags);
		phys += stop - cur;

		entry++;
		cur = stop;
		if (cur == limit)
			break;
	}
}

1.4 动态分配页表

页表建立过程很简单,就不过多啰嗦了,这里标记两点:

  1. 由于 memblock 早期内存分配器已经初始化完,这里可以通过 early_pgtable_alloc 动态分配页表;(内核启动到这里之前,都是静态页表,即页表都是固定页面);
  2. 动态分配的页表,拿到的是物理地址,要继续向下一级页表遍历,必须将物理地址转化为虚拟地址, CPU 才能正确访问;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/*
 * Fill in the pud level below @pgdp for the range [addr, end), mapping it to
 * physical memory starting at @phys with protection @prot.
 *
 * If the p4d entry is empty, a pud table is obtained from @pgtable_alloc and
 * installed first. Each pud entry then either becomes a block mapping (when
 * use_1G_block() allows it and NO_BLOCK_MAPPINGS is not set in @flags) or is
 * handed down to alloc_init_cont_pmd().
 */
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(int),
int flags)
{
unsigned long next;
pud_t *pudp;
p4d_t *p4dp = p4d_offset(pgdp, addr);
p4d_t p4d = READ_ONCE(*p4dp);

if (p4d_none(p4d)) {
p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
phys_addr_t pud_phys;

if (flags & NO_EXEC_MAPPINGS)
p4dval |= P4D_TABLE_PXN;
BUG_ON(!pgtable_alloc);
pud_phys = pgtable_alloc(PUD_SHIFT); /// dynamically allocate a pud table and use it to fill the pgd entry
__p4d_populate(p4dp, pud_phys, p4dval);
p4d = READ_ONCE(*p4dp);
}
BUG_ON(p4d_bad(p4d));

pudp = pud_set_fixmap_offset(p4dp, addr); /// the pgd entry holds the pud's physical address; it must first be converted to a virtual address before the CPU can access it
do {pud_t old_pud = READ_ONCE(*pudp);

next = pud_addr_end(addr, end);

/*
* For 4K granule only, attempt to put down a 1GB block
*/
if (use_1G_block(addr, next, phys) &&
(flags & NO_BLOCK_MAPPINGS) == 0) {pud_set_huge(pudp, phys, prot);

/*
* After the PUD entry has been populated once, we
* only allow updates to the permission attributes.
*/
BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
READ_ONCE(pud_val(*pudp))));
} else {
alloc_init_cont_pmd(pudp, addr, next, phys, prot,
pgtable_alloc, flags);

/* a previously populated pud entry must not have been changed */
BUG_ON(pud_val(old_pud) != 0 &&
pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
}
phys += next - addr;
} while (pudp++, addr = next, addr != end);

/* drop the fixmap mapping set up by pud_set_fixmap_offset() above */
pud_clear_fixmap();}

这样,内核镜像的各个段,就全部做了动态映射,后面访问,就不再依赖于固定映射;

但是 pgd 一级页表基地址,还是用的固定地址 swapper_pg_dir, 内核页表建立后,需要将页表基地址更新到 init 进程的 mm_struct 结构体;

现在内核镜像本身可以自由访问了,但物理内存的其他区域,依然无法访问,为方便内核自由访问所有物理内存,Linux 做了一个线性映射,

二、线性映射

将物理内存全部线性映射到虚拟地址段 (仅做一个偏移),后续在内核空间可以直接用偏移地址访问整个物理内存;

2.1 线性映射核心函数 map_mem()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/*
 * map_mem - create the linear mapping in @pgdp for every memory range
 * reported by for_each_mem_range().
 *
 * The kernel range [_stext, __init_begin) is temporarily marked NOMAP so the
 * loop skips it, and is then mapped separately with NO_CONT_MAPPINGS.
 */
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); /// original note: compute the virtual and physical addresses that need linear mapping
phys_addr_t kernel_start = __pa_symbol(_stext);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
int flags = NO_EXEC_MAPPINGS;
u64 i;

/*
* Setting hierarchical PXNTable attributes on table entries covering
* the linear region is only possible if it is guaranteed that no table
* entries at any level are being shared between the linear region and
* the vmalloc region. Check whether this is true for the PGD level, in
* which case it is guaranteed to be true for all other levels as well.
*/
BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));

/* any of these settings rules out block and contiguous mappings */
if (rodata_full || crash_mem_map || debug_pagealloc_enabled())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

/*
* Take care not to create a writable alias for the
* read-only text and rodata sections of the kernel image.
* So temporarily mark them as NOMAP to skip mappings in
* the following for-loop
*/
memblock_mark_nomap(kernel_start, kernel_end - kernel_start); /// the device tree can also define nomap regions; nomap ranges are not mapped

/* map all the memory banks */
for_each_mem_range(i, &start, &end) {if (start >= end)
break;
/*
* The linear map must allow allocation tags reading/writing
* if MTE is present. Otherwise, it has the same attributes as
* PAGE_KERNEL.
*/
__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
flags);
}

/*
* Map the linear alias of the [_stext, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
* of the region accessible to subsystems such as hibernate,
* but protects it from inadvertent modification or execution.
* Note that contiguous mappings cannot be remapped in this way,
* so we should avoid them here.
*/
__map_memblock(pgdp, kernel_start, kernel_end,
PAGE_KERNEL, NO_CONT_MAPPINGS);
/* undo the temporary NOMAP marking applied before the loop */
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
}

2.2 __map_memblock

实际建立页表映射的过程与细粒度映射大致相似

1
2
3
4
5
/*
 * Map the physical range [start, end) into @pgdp at the virtual address given
 * by __phys_to_virt(start), using protection @prot and mapping @flags.
 * Page-table pages are obtained through early_pgtable_alloc.
 */
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
				  phys_addr_t end, pgprot_t prot, int flags)
{
	unsigned long virt = __phys_to_virt(start);
	phys_addr_t size = end - start;

	__create_pgd_mapping(pgdp, start, virt, size, prot,
			     early_pgtable_alloc, flags);
}

至此,Linux 内核主页表创建完毕。

三、buddy系统初始化

到目前为止,内核完成了如下工作

memblock已经通过arm64_memblock_init完成了初始化, 至此系统中的内存可以通过memblock分配了

paging_init完成了分页机制的初始化, 至此内核已经布局了一套完整的虚拟内存空间

稀疏内存管理将整个物理地址空间划分为section
对于ARM64,一般支持48bit物理地址(256T),section为1G物理块,可以划分为256K个section;

每个在位的section在软件上抽象为一个struct mem_section结构体;

对于每个section又可以分为若干Pageblock,每个pageblock的状态由4bit来描述

3.1 启动过程期间的内存管理–bootmem分配器

在启动过程期间, 尽管内存管理尚未初始化, 但是内核仍然需要分配内存以创建各种数据结构, 早期的内核中负责初始化阶段的内存分配器称为引导内存分配器(boot memory allocator–bootmem分配器), 在耳熟能详的伙伴系统建立前内存都是利用该分配器来分配的,伙伴系统框架建立起来后,bootmem会过渡到伙伴系统. 显然, 对该内存分配器的需求集中于简单性方面, 而不是性能和通用性, 它仅用于初始化阶段. 因此内核开发者决定实现一个最先适配(first-fit)分配器用于在启动阶段管理内存. 这是可能想到的最简单的方式.

**引导内存分配器(boot memory allocator–bootmem分配器)**基于最先适配(first-fit)分配器的原理(这也是很多系统的内存分配所使用的原理), 使用一个位图来管理页, 以位图代替原来的空闲链表结构来表示存储空间, 位图的比特位的数目与系统中物理内存页面数目相同. 若位图中某一位是1, 则标识该页面已经被分配(已用页), 否则表示未被占有(未用页).

在需要分配内存时, 分配器逐位地扫描位图, 直至找到一个能提供足够连续页的位置, 即所谓的最先适配(first-fit)位置. 该分配机制通过记录上一次分配的页面帧号(PFN)结束时的偏移量来实现分配大小小于一页的空间, 连续的小的空闲空间将被合并存储在一页上.

即使是初始化用的最先适配分配器也必须使用一些数据结构, 内核为系统中每一个结点都提供了一个struct bootmem_data结构的实例, 用于bootmem的内存管理. 它含有引导内存分配器给结点分配内存时所需的信息. 当然, 这时候内存管理还没有初始化, 因而该结构所需的内存是无法动态分配的, 必须在编译时分配给内核。

3.1 bootmem_init

在paging_init之后, 系统的页帧已经建立起来, 然后在bootmem_init中, 系统开始完成bootmem的初始化工作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
 * bootmem_init - record the PFN limits of DRAM and run the early memory
 * setup steps in order: optional memtest, NUMA init, fixed reservations,
 * sparse memory model init, zone sizing, then the CMA and crashkernel
 * reservations, finishing with a dump of the memblock state.
 */
void __init bootmem_init(void)
{
unsigned long min, max;

// obtain the lowest and highest page frame numbers
min = PFN_UP(memblock_start_of_DRAM());
max = PFN_DOWN(memblock_end_of_DRAM());

/// if memtest is enabled, the kernel runs memtest over the unused free memory to detect faulty dram
// such dram is set aside via reserve_bad_mem and left unused, so the system can still boot normally
early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

max_pfn = max_low_pfn = max;
min_low_pfn = min;

/// NUMA-related initialization
arch_numa_init();

/*
* must be done after arch_numa_init() which calls numa_init() to
* initialize node_online_map that gets used in hugetlb_cma_reserve()
* while allocating required CMA size across online nodes.
*/
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
arm64_hugetlb_cma_reserve();
#endif

dma_pernuma_cma_reserve();

kvm_hyp_reserve();

/*
* sparse_init() tries to allocate memory from memblock, so must be
* done after the fixed reservations
*/
/// initialize the sparse memory model
sparse_init();

/// initialize the zone data structures
zone_sizes_init(min, max);

/*
* Reserve the CMA area after arm64_dma_phys_limit was initialised.
*/
dma_contiguous_reserve(arm64_dma_phys_limit);

/*
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
reserve_crashkernel();

memblock_dump_all();
}

3.2 sparse_init

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * sparse_init - walk the present memory sections and initialise them in
 * batches, one sparse_init_nid() call per run of consecutive present
 * sections that report the same early node id.
 */
void __init sparse_init(void)
{
	unsigned long run_start, cur;
	unsigned long run_len = 1;
	int run_nid;

	/* Presumably populates mem_section from memblock.memory. */
	memblocks_present();

	/* The first present section opens the first run. */
	run_start = first_present_section_nr();
	run_nid = sparse_early_nid(__nr_to_section(run_start));

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(run_start + 1, cur) {
		int nid = sparse_early_nid(__nr_to_section(cur));

		if (nid != run_nid) {
			/* Node changed: init sections in [run_start, cur). */
			sparse_init_nid(run_nid, run_start, cur, run_len);
			run_nid = nid;
			run_start = cur;
			run_len = 1;
		} else {
			/* Same node: the current run grows by one section. */
			run_len++;
		}
	}

	/* cover the last node */
	sparse_init_nid(run_nid, run_start, cur, run_len);
	vmemmap_populate_print_last();
}