Linux Source Code Analysis 06: How the Common Memory Allocation Functions kmalloc, vmalloc, malloc, and mmap Are Implemented


1. The kmalloc function

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
unsigned int index;
#endif
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
index = kmalloc_index(size); /// find which kmalloc slab cache to use

if (!index)
return ZERO_SIZE_PTR;

return kmem_cache_alloc_trace( /// allocate from that slab cache
kmalloc_caches[kmalloc_type(flags)][index],
flags, size);
#endif
}
return __kmalloc(size, flags);
}

The kmem_cache_alloc_trace() allocation function:

void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
void *ret;

ret = slab_alloc(cachep, flags, size, _RET_IP_); /// allocate an object from the slab cache

ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
return ret;
}

As we can see, kmalloc() is built on the slab allocator, so the memory it returns is always physically contiguous.
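As a quick usage illustration (not part of the kernel source quoted above), here is a minimal, hypothetical sketch of how driver code typically pairs kmalloc() with kfree(); the structure and function names are invented for the example:

#include <linux/slab.h>
#include <linux/types.h>

/* Hypothetical example structure, invented for illustration. */
struct demo_buf {
	size_t len;
	u8 data[64];
};

static struct demo_buf *demo_alloc(void)
{
	/* Physically contiguous memory; GFP_KERNEL may sleep,
	 * use GFP_ATOMIC in atomic context instead. */
	struct demo_buf *buf = kmalloc(sizeof(*buf), GFP_KERNEL);

	if (!buf)
		return NULL;	/* allocation can fail, always check */

	buf->len = sizeof(buf->data);
	return buf;
}

static void demo_free(struct demo_buf *buf)
{
	kfree(buf);		/* kfree(NULL) is a no-op */
}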

2. The vmalloc function

vmalloc()
->__vmalloc_node_flags()
->__vmalloc_node()
->__vmalloc_node_range()

2.1 The core function __vmalloc_node_range

__vmalloc_node_range() allocates a vm_struct area from the vmalloc region and then calls __vmalloc_area_node(), shown below, to allocate the physical pages and map them into that area:

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, unsigned int page_shift,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area); /// size of the vm_struct area, i.e. how many pages it spans
unsigned long array_size;
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
struct page **pages;
unsigned int i;

array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;

/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp, node,
area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}

if (!pages) {
free_vm_area(area);
warn_alloc(gfp_mask, NULL,
"vmalloc size %lu allocation failure: "
"page array size %lu allocation failed",
nr_small_pages * PAGE_SIZE, array_size);
return NULL;
}

area->pages = pages; /// save the pointers to the page structures of the allocated pages
area->nr_pages = nr_small_pages;
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);

page_order = vm_area_page_order(area);

/*
* Careful, we allocate and map page_order pages, but tracking is done
* per PAGE_SIZE page so as to keep the vm_struct APIs independent of
* the physical/mapped size.
*/
for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page;
int p;

/* Compound pages required for remap_vmalloc_page */
page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); /// allocate physical pages
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
warn_alloc(gfp_mask, NULL,
"vmalloc size %lu allocation failure: "
"page order %u allocation failed",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}

for (p = 0; p < (1U << page_order); p++)
area->pages[i + p] = page + p;

if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { /// map the physical pages into the vmalloc area
warn_alloc(gfp_mask, NULL,
"vmalloc size %lu allocation failure: "
"failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
}

return area->addr;

fail:
__vfree(area->addr);
return NULL;
}

As we can see, vmalloc() allocates a vma from the vmalloc region on demand, allocates physical pages, and maps them into it. Since it always allocates whole physical pages (at least one 4 KB page), vmalloc is best suited to larger allocations, and the underlying physical memory is not necessarily contiguous.
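As a usage illustration (again a hypothetical sketch with invented names), a large table that only needs to be virtually contiguous is a typical vmalloc user; vzalloc() is the zeroing variant:

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/vmalloc.h>

/* Hypothetical example: a large lookup table that does not need to be
 * physically contiguous, so vmalloc()/vzalloc() is preferred over kmalloc(). */
static u32 *demo_table;

static int demo_table_init(size_t nr_entries)
{
	demo_table = vzalloc(nr_entries * sizeof(*demo_table)); /* zeroed, virtually contiguous */
	if (!demo_table)
		return -ENOMEM;
	return 0;
}

static void demo_table_exit(void)
{
	vfree(demo_table);	/* vfree(NULL) is safe */
	demo_table = NULL;
}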

3. The malloc function

malloc is implemented in the C library. The C library maintains its own cache: when that cache has enough free memory, malloc allocates from it directly; only when the cache runs out does the library ask the kernel for more memory through the brk system call, which extends the heap with a vma.
Flow chart of the malloc implementation:

https://hexoimg.oss-cn-shanghai.aliyuncs.com/blog/24/11/image_da6f20ca93a7925a9c4b41f86a07bdd9.png

3.1 The __do_sys_brk function

After the architecture-specific glue, malloc eventually enters the brk system call; the SYSCALL_DEFINE1(brk, ...) macro expands to the __do_sys_brk function:

SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long retval;
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);

if (down_write_killable(&mm->mmap_sem)) /// take the mmap_sem read-write semaphore for writing
return -EINTR;

origbrk = mm->brk; /// mm->brk records the current end of the heap (the program break)

#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
if (current->brk_randomized)
min_brk = mm->start_brk;
else
min_brk = mm->end_data;
#else
min_brk = mm->start_brk;
#endif
if (brk < min_brk)
goto out;

/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
mm->end_data, mm->start_data))
goto out;

newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk) {
mm->brk = brk;
goto success;
}

/*
* Always allow shrinking brk.
* __do_munmap() may downgrade mmap_sem to read.
*/
if (brk <= mm->brk) { /// the request shrinks the heap, i.e. releases space
int ret;

/*
* mm->brk must to be protected by write mmap_sem so update it
* before downgrading mmap_sem. When __do_munmap() fails,
* mm->brk will be restored from origbrk.
*/
mm->brk = brk;
ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
if (ret < 0) {
mm->brk = origbrk;
goto out;
} else if (ret == 1) {
downgraded = true;
}
goto success;
}

/* Check against existing mmap mappings. */
next = find_vma(mm, oldbrk);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) /// the expanded range overlaps the next mapping, give up
goto out;

/* Ok, looks good - let it rip. */
if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) /// no overlap, allocate a new vma
goto out;
mm->brk = brk; /// update the brk address

success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
if (downgraded)
up_read(&mm->mmap_sem);
else
up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate) /// if mlockall() was called, mm_populate() allocates the physical memory right away
mm_populate(oldbrk, newbrk - oldbrk);
return brk;

out:
retval = origbrk;
up_write(&mm->mmap_sem);
return retval;
}

To summarize what __do_sys_brk() does:
(1) Starting from the old brk boundary, it checks whether the enlarged range would collide with an existing vma; if the range overlaps the next mapping, the request is rejected;
(2) If there is no overlap, do_brk_flags() allocates a new vma (or extends an existing one);
(3) If the application has called mlockall(), which locks the whole virtual address space of the process so it cannot be swapped out, physical memory is allocated immediately; otherwise the physical pages are only allocated later, via page faults, when the memory is first used.
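To watch the brk boundary move from user space, here is a small, hypothetical C sketch using the sbrk()/brk() wrappers around this system call; note that it only moves the program break, the pages behind it are still populated lazily:

/* Hypothetical sketch: watch the program break move as the heap is
 * grown and shrunk with sbrk()/brk(). */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	void *start = sbrk(0);			/* current program break */
	printf("brk before      : %p\n", start);

	if (sbrk(4 * 4096) == (void *)-1)	/* grow the heap by four pages */
		return 1;
	printf("brk after grow  : %p\n", sbrk(0));

	brk(start);				/* shrink back to the old break */
	printf("brk after shrink: %p\n", sbrk(0));
	return 0;
}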

3.2 The do_brk_flags function

What the function does:
(1) Find a usable linear address range;
(2) Find the most suitable insertion point in the red-black tree;
(3) Check whether the found range can be merged into an existing vma; if not, create a new vma;
(4) Insert the new vma into the mmap list and the red-black tree.

/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
unsigned long mapped_addr;

/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; /// default attributes: readable and writable

mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); /// return the start of an unused, unmapped linear address range
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;

error = mlock_future_check(mm, mm->def_flags, len);
if (error)
return error;

/* Clear old maps, set up prev, rb_link, rb_parent, and uf */
if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) /// clear any old overlapping mapping and find the red-black tree insertion point
return -ENOMEM;

/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;

if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;

if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;

/* Can we just expand an old private anonymous mapping? */ /// check whether addr can be merged into a neighbouring vma; if not, a new vma has to be created
vma = vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;

/*
* create a vma struct for an anonymous mapping
*/
vma = vm_area_alloc(mm);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}

vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent); /// add the new vma to the mmap list and the red-black tree
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
return 0;
}

3.3 The mm_populate() function

Call chain:

mm_populate()
->__mm_populate()
->populate_vma_page_range()
->__get_user_pages()

When the VM_LOCKED flag is set, it means physical pages should be allocated immediately and mapped into the vma;
otherwise nothing is done here, and the physical pages are only allocated and mapped later, when the vma is accessed and a page fault is triggered.
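A hedged user-space analogy of the same behaviour: mlock() (like mlockall()) forces the range to be populated immediately, while an untouched allocation stays unpopulated until it faults. This is only an illustrative sketch, not kernel code:

/* Hypothetical sketch: mlock() forces immediate population of the range,
 * much like mm_populate() does for VM_LOCKED vmas. */
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *p = malloc(len);		/* virtual space only, no physical pages yet */

	if (!p)
		return 1;
	if (mlock(p, len) == 0) {	/* faults the pages in and pins them */
		memset(p, 0, len);	/* already resident, no further faults needed */
		munlock(p, len);
	}
	free(p);
	return 0;
}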

3.3.1 The __get_user_pages() function

static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
struct follow_page_context ctx = {NULL};

if (!nr_pages)
return 0;

start = untagged_addr(start);

VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
* fault information is unrelated to the reference behaviour of a task
* using the address space
*/
if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA;

do { /// handle each page in turn
struct page *page;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;

/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
vma = find_extend_vma(mm, start); /// check whether the vma can be extended to cover start
if (!vma && in_gate_area(mm, start)) {
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
goto out;
ctx.page_mask = 0;
goto next_page;
}

if (!vma) {
ret = -EFAULT;
goto out;
}
ret = check_vma_flags(vma, gup_flags);
if (ret)
goto out;

if (is_vm_hugetlb_page(vma)) { /// huge pages are handled separately
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags, locked);
if (locked && *locked == 0) {
/*
* We've got a VM_FAULT_RETRY
* and we've lost mmap_lock.
* We must stop here.
*/
BUG_ON(gup_flags & FOLL_NOWAIT);
BUG_ON(ret != 0);
goto out;
}
continue;
}
}
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (fatal_signal_pending(current)) { /// if the current task has received SIGKILL, bail out right away
ret = -EINTR;
goto out;
}
cond_resched(); // reschedule if needed; used all over the kernel to keep latency down

page = follow_page_mask(vma, start, foll_flags, &ctx); /// check whether the virtual page already has physical memory behind it; returns the page if it is already mapped
if (!page) {
ret = faultin_page(vma, start, &foll_flags, locked); /// not mapped yet: fault the page in to create the virtual-to-physical mapping
switch (ret) {
case 0:
goto retry;
case -EBUSY:
ret = 0;
fallthrough;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
goto out;
case -ENOENT:
goto next_page;
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
* struct page.
*/
goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start); /// flush the caches once the physical page is in place
flush_dcache_page(page);
ctx.page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
ctx.page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
out:
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
return i ? i : ret;
}

follow_page_mask() returns the page of an already-mapped virtual page; it eventually calls follow_page_pte(), whose implementation is shown below:

3.3.1.1 The follow_page_pte function
static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags,
struct dev_pagemap **pgmap)
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
int ret;

/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);

ptep = pte_offset_map_lock(mm, pmd, address, &ptl); /// get the pte and take the page-table lock
pte = *ptep;
if (!pte_present(pte)) { /// the page is not present in memory; handle that as follows
swp_entry_t entry;
/*
* KSM's break_ksm() relies upon recognizing a ksm page
* even while it is being migrated, so for that case we
* need migration_entry_wait().
*/
if (likely(!(flags & FOLL_MIGRATION)))
goto no_page;
if (pte_none(pte))
goto no_page;
entry = pte_to_swp_entry(pte);
if (!is_migration_entry(entry))
goto no_page;
pte_unmap_unlock(ptep, ptl);
migration_entry_wait(mm, pmd, address); /// wait for the page migration to finish, then retry
goto retry;
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}

page = vm_normal_page(vma, address, pte); /// return the struct page for this pte (only normal pages; special pages are not managed by the memory manager)
if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /// handle device-mapped pages
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
* case since they are only valid while holding the pgmap
* reference.
*/
*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
if (*pgmap)
page = pte_page(pte);
else
goto no_page;
} else if (unlikely(!page)) { /// handle the case where vm_normal_page() did not return a valid page
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
goto out;
}

if (is_zero_pfn(pte_pfn(pte))) { /// the system zero page; not treated as an error
page = pte_page(pte);
} else {
ret = follow_pfn_pte(vma, address, ptep, flags);
page = ERR_PTR(ret);
goto out;
}
}

/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
page = ERR_PTR(-ENOMEM);
goto out;
}
/*
* We need to make the page accessible if and only if we are going
* to access its content (the FOLL_PIN case). Please see
* Documentation/core-api/pin_user_pages.rst for details.
*/
if (flags & FOLL_PIN) {
ret = arch_make_page_accessible(page);
if (ret) {
unpin_user_page(page);
page = ERR_PTR(ret);
goto out;
}
}
if (flags & FOLL_TOUCH) { /// FOLL_TOUCH: mark the page as accessed
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
set_page_dirty(page);
/*
* pte_mkyoung() would be more correct here, but atomic care
* is needed to avoid losing the dirty bit: it is easier to use
* mark_page_accessed().
*/
mark_page_accessed(page);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/* Do not mlock pte-mapped THP */
if (PageTransCompound(page))
goto out;

/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
* which might bounce very badly if there is contention.
*
* If the page is already locked, we don't need to
* handle it now - vmscan will handle it later if and
* when it attempts to reclaim the page.
*/
if (page->mapping && trylock_page(page)) {
lru_add_drain(); /* push cached pages to LRU */
/*
* Because we lock page here, and migration is
* blocked by the pte's page reference, and we
* know the page is still mapped, we don't even
* need to check for file-cache page truncation.
*/
mlock_vma_page(page);
unlock_page(page);
}
}
out:
pte_unmap_unlock(ptep, ptl);
return page;
no_page:
pte_unmap_unlock(ptep, ptl);
if (!pte_none(pte))
return NULL;
return no_page_table(vma, flags);
}

Summary:

  1. malloc allocates from the C library's cache, so an allocation or free does not necessarily take effect in the kernel right away;
  2. The actual allocation of physical memory happens either because the application calls mlockall(), which deliberately populates the pages up front, or because a page fault is triggered when the memory is accessed;
  3. For the virtual memory returned by malloc(), there are three cases:
    • If the memory is read first, the kernel takes a page fault and do_anonymous_page() maps the zero page, with a read-only PTE;
    • If the memory is read and then written, the first page fault maps the zero page, and the second fault triggers copy-on-write;
    • If the memory is written directly, the kernel takes an anonymous page fault and calls alloc_zeroed_user_highpage_movable() to allocate a fresh page, with a writable PTE;
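These cases can be observed from user space by counting minor page faults around the first read and the first write of a malloc()'d region. The following is a hypothetical sketch for illustration only; the exact fault counts depend on glibc and kernel behaviour:

/* Hypothetical sketch: count minor page faults around the first read and
 * the first write of a malloc()'d region, illustrating zero-page mapping
 * followed by copy-on-write. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

static long minor_faults(void)
{
	struct rusage ru;

	getrusage(RUSAGE_SELF, &ru);
	return ru.ru_minflt;
}

int main(void)
{
	size_t len = 64 * 4096, i;
	volatile char *p = malloc(len);
	long before;

	if (!p)
		return 1;

	before = minor_faults();
	for (i = 0; i < len; i += 4096)
		(void)p[i];			/* first read: zero page, read-only PTE */
	printf("faults after read : %ld\n", minor_faults() - before);

	before = minor_faults();
	for (i = 0; i < len; i += 4096)
		p[i] = 1;			/* first write: copy-on-write of the zero page */
	printf("faults after write: %ld\n", minor_faults() - before);

	free((void *)p);
	return 0;
}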

4. The mmap function

mmap is commonly used by user programs to allocate memory, read and write large files, map dynamic libraries, and share memory between processes.
Flow chart of the implementation:

https://hexoimg.oss-cn-shanghai.aliyuncs.com/blog/24/11/image_86e517bb28833ed6586e12be6b8c525a.png
Depending on whether the mapping is backed by a file and whether it is shared, mmap mappings fall into four categories:
1. Private anonymous mapping
With fd = -1 and flags = MAP_ANONYMOUS | MAP_PRIVATE, mmap creates a private anonymous mapping.
This is what glibc uses for large allocations: when the requested size is larger than MMAP_THRESHOLD (128 KB by default), glibc allocates with mmap instead of brk.

2. Shared anonymous mapping
With fd = -1 and flags = MAP_ANONYMOUS | MAP_SHARED.
Commonly used for parent-child process communication through a shared memory region.
The path is do_mmap_pgoff() -> mmap_region(), which eventually calls shmem_zero_setup() to open the /dev/zero device file.
Alternatively, opening /dev/zero directly and creating the mmap from that file descriptor also ends up in the shmem module and creates a shared anonymous mapping.

3. Private file mapping
flags = MAP_PRIVATE.
The typical use case is loading dynamic shared libraries.

4. Shared file mapping
flags = MAP_SHARED. There are two use cases:
(1) Reading and writing files: the kernel's writeback mechanism synchronizes the in-memory data to disk;
(2) Inter-process communication: several independent processes map the same file and can see each other's updates, which implements multi-process communication. A user-space sketch of two of these mapping types follows.
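Below is a hypothetical user-space sketch of a private anonymous mapping and a shared file mapping; the file path is made up for the example:

/* Hypothetical sketch of two of the four mapping types. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 1 << 20;

	/* 1. Private anonymous mapping: what glibc uses for large malloc()s. */
	char *anon = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (anon == MAP_FAILED)
		return 1;

	/* 4. Shared file mapping: writes are written back to the file and are
	 * visible to other processes mapping the same file. */
	int fd = open("/tmp/demo_file", O_RDWR | O_CREAT, 0644);
	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;
	char *shared = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
	if (shared == MAP_FAILED)
		return 1;

	memcpy(shared, "hello", 6);
	munmap(shared, len);
	munmap(anon, len);
	close(fd);
	return 0;
}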
On the kernel side, the core function is mmap_region(), shown below:

unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;

/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;

/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
nr_pages = count_vma_pages_range(mm, addr, addr + len);

if (!may_expand_vm(mm, vm_flags,
(len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}

/* Clear old maps, set up prev, rb_link, rb_parent, and uf */
if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}

/*
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags, /// try to merge with an existing vma
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;

/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm); /// allocate a new vma
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}

vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;

if (file) { /// file-backed mapping
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
}
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto allow_write_and_free_vma;
}

/* ->mmap() can change vma->vm_file, but must guarantee that
* vma_link() below can deny write-access if VM_DENYWRITE is set
* and map writably if VM_SHARED is set. This usually means the
* new file must not have been exposed to user-space, yet.
*/
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;

/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);

addr = vma->vm_start;

/* If vm_flags changed after call_mmap(), we should try merge vma again
* as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
if (merge) {
/* ->mmap() can change vma->vm_file and fput the original file. So
* fput the vma->vm_file here or we would add an extra fput for file
* and cause general protection fault ultimately.
*/
fput(vma->vm_file);
vm_area_free(vma);
vma = merge;
/* Update vm_flags to pick up the change. */
vm_flags = vma->vm_flags;
goto unmap_writable;
}
}

vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) { /// shared mapping
error = shmem_zero_setup(vma); /// set up a shared anonymous mapping via shmem
if (error)
goto free_vma;
} else {
vma_set_anonymous(vma); /// private anonymous mapping
}

/* Allow architectures to sanity-check the vm_flags */
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
if (file)
goto unmap_and_free_vma;
else
goto free_vma;
}

vma_link(mm, vma, prev, rb_link, rb_parent); /// link the vma into the mm's list and red-black tree
/* Once vma denies write, undo our temporary denial count */
if (file) {
unmap_writable:
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
}
file = vma->vm_file;
out:
perf_event_mmap(vma);

vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
mm->locked_vm += (len >> PAGE_SHIFT);
}

if (file)
uprobe_mmap(vma);

/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vma->vm_flags |= VM_SOFTDIRTY;

vma_set_page_prot(vma);

return addr;

unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;

/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
free_vma:
vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}

Summary:

Unless told otherwise, malloc and mmap by default only set up virtual address space; they do not establish the mapping from virtual addresses to physical memory.
When an unmapped virtual address is accessed, a page fault is triggered; the Linux page-fault handler then allocates a physical page and establishes the mapping from the virtual address to that page.

Two additional questions:

  1. Why does mmap not fail when the same address is requested repeatedly?
    find_vma_links() walks all the vmas of the process; when it finds that the region to be mapped overlaps an existing vma, the old mapping is destroyed first and the region is mapped anew, so the second request does not return an error.
  2. Why does mmap stutter when several files are mapped, for example when playing a video?
    mmap only creates the vma; it does not actually allocate physical pages or read the file contents. When the player really reads the file, page faults are triggered frequently and the data has to be read from disk into the page cache, so the effective disk read performance is poor.

madvise(addr, len, MADV_WILLNEED | MADV_SEQUENTIAL) asks the kernel to read the file content ahead of time and to optimize for sequential access; a usage sketch follows below.

However, the kernel's default readahead already covers this, and madvise is not a good fit for streaming media; it mainly helps random-read workloads.

An effective way to improve the I/O performance of a streaming service is to enlarge the kernel's default readahead window; the default is 128 KB and it can be changed with the "blockdev --setra" command.
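As referenced above, here is a hypothetical sketch of using madvise() on a file mapping; the file path is made up, and whether the hints actually help depends on the workload, as noted:

/* Hypothetical sketch: hint readahead and sequential access on a mapped file. */
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/media_file", O_RDONLY);
	struct stat st;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;

	char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Ask the kernel to prefetch the range and optimize for sequential reads. */
	madvise(p, st.st_size, MADV_WILLNEED);
	madvise(p, st.st_size, MADV_SEQUENTIAL);

	/* ... read p[0 .. st.st_size) sequentially ... */

	munmap(p, st.st_size);
	close(fd);
	return 0;
}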