    /*
     * Careful, we allocate and map page_order pages, but tracking is done
     * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
     * the physical/mapped size.
     */
    for (i = 0; i < area->nr_pages; i += 1U << page_order) {
        struct page *page;
        int p;
        /* Compound pages required for remap_vmalloc_page */
        page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); /// allocate the physical pages
        if (unlikely(!page)) {
            /* Successfully allocated i pages, free them in __vfree() */
            area->nr_pages = i;
            atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
            warn_alloc(gfp_mask, NULL, "vmalloc size %lu allocation failure: "
                   "page order %u allocation failed",
                   area->nr_pages * PAGE_SIZE, page_order);
            goto fail;
        }
        for (p = 0; p < (1U << page_order); p++)
            area->pages[i + p] = page + p;
        if (gfpflags_allow_blocking(gfp_mask))
            cond_resched();
    }
    atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
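To see this allocation loop from the caller's side, here is a minimal module sketch (the names vmalloc_demo and DEMO_SIZE are made up for illustration, not from the source above) that allocates and frees a multi-page buffer; vmalloc() is what drives the per-page allocation shown above, and vfree() releases the pages tracked in area->pages:

#include <linux/module.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

#define DEMO_SIZE (4UL << 20)   /* 4 MiB, spans many PAGE_SIZE pages */

static void *demo_buf;

static int __init vmalloc_demo_init(void)
{
    /* vmalloc() allocates page by page and maps the pages into a
     * contiguous kernel virtual range. */
    demo_buf = vmalloc(DEMO_SIZE);
    if (!demo_buf)
        return -ENOMEM;
    memset(demo_buf, 0, DEMO_SIZE);
    return 0;
}

static void __exit vmalloc_demo_exit(void)
{
    vfree(demo_buf);    /* frees the individual pages behind the mapping */
}

module_init(vmalloc_demo_init);
module_exit(vmalloc_demo_exit);
MODULE_LICENSE("GPL");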
    if (down_write_killable(&mm->mmap_sem))    /// take mmap_sem (the mm read-write semaphore) for writing
        return -EINTR;
    origbrk = mm->brk;    /// mm->brk records the current program break, i.e. the top of the heap
#ifdef CONFIG_COMPAT_BRK
    /*
     * CONFIG_COMPAT_BRK can still be overridden by setting
     * randomize_va_space to 2, which will still cause mm->start_brk
     * to be arbitrarily shifted
     */
    if (current->brk_randomized)
        min_brk = mm->start_brk;
    else
        min_brk = mm->end_data;
#else
    min_brk = mm->start_brk;
#endif
    if (brk < min_brk)
        goto out;
    /*
     * Check against rlimit here. If this check is done later after the test
     * of oldbrk with newbrk then it can escape the test and let the data
     * segment grow beyond its set limit in the case where the limit is
     * not page aligned -Ram Gupta
     */
    if (check_data_rlimit(rlimit(RLIMIT_DATA), brk,
                  mm->start_brk, mm->end_data, mm->start_data))
        goto out;
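As a user-space aside (a hedged sketch, not kernel code), the effect of check_data_rlimit() can be observed by lowering RLIMIT_DATA and then asking for more heap than the limit allows:

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
    /* Cap the data segment at 1 MiB; brk growth past this point
     * should be rejected by the rlimit check above. */
    struct rlimit rl = { .rlim_cur = 1 << 20, .rlim_max = 1 << 20 };
    if (setrlimit(RLIMIT_DATA, &rl) != 0) {
        perror("setrlimit");
        return 1;
    }

    if (sbrk(8 << 20) == (void *)-1)    /* try to move the break up by 8 MiB */
        perror("sbrk");                 /* expected to fail with ENOMEM */
    return 0;
}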
    /*
     * Always allow shrinking brk.
     * __do_munmap() may downgrade mmap_sem to read.
     */
    if (brk <= mm->brk) {    /// the request shrinks the heap, so release the space
        int ret;

        /*
         * mm->brk must be protected by the write mmap_sem, so update it
         * before downgrading mmap_sem. When __do_munmap() fails,
         * mm->brk will be restored from origbrk.
         */
        mm->brk = brk;
        ret = __do_munmap(mm, newbrk, oldbrk - newbrk, &uf, true);
        if (ret < 0) {
            mm->brk = origbrk;
            goto out;
        } else if (ret == 1) {
            downgraded = true;
        }
        goto success;
    }
    /* Check against existing mmap mappings. */
    next = find_vma(mm, oldbrk);
    if (next && newbrk + PAGE_SIZE > vm_start_gap(next))    /// the grown range would overlap an existing mapping, so give up
        goto out;

    /* Ok, looks good - let it rip. */
    if (do_brk_flags(oldbrk, newbrk - oldbrk, 0, &uf) < 0)    /// no overlap: extend the heap via do_brk_flags()
        goto out;
    mm->brk = brk;    /// record the new brk address
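For reference, a small user-space sketch that drives both branches shown above: growing the break takes the do_brk_flags() path, shrinking takes the __do_munmap() path (glibc's sbrk() is a thin wrapper around the brk syscall):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    void *start = sbrk(0);      /* read the current program break */

    if (sbrk(64 * 1024) == (void *)-1) {    /* grow: do_brk_flags() path */
        perror("sbrk grow");
        return 1;
    }
    printf("break moved from %p to %p\n", start, sbrk(0));

    if (sbrk(-64 * 1024) == (void *)-1)     /* shrink: __do_munmap() path */
        perror("sbrk shrink");
    return 0;
}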
/*
 * this is really a simplified "do_mmap". it only handles
 * anonymous maps. eventually we may be able to do some
 * brk-specific accounting here.
 */
static int do_brk_flags(unsigned long addr, unsigned long len,
            unsigned long flags, struct list_head *uf)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma, *prev;
    struct rb_node **rb_link, *rb_parent;
    pgoff_t pgoff = addr >> PAGE_SHIFT;
    int error;
    unsigned long mapped_addr;
    /* Until we need other flags, refuse anything except VM_EXEC. */
    if ((flags & (~VM_EXEC)) != 0)
        return -EINVAL;
    flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;    /// default data attributes: readable and writable

    error = mlock_future_check(mm, mm->def_flags, len);
    if (error)
        return error;

    /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
    if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))    /// also finds the rb-tree slot where the new vma will be linked
        return -ENOMEM;
    /* Check against address space limits *after* clearing old maps... */
    if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
        return -ENOMEM;

    if (mm->map_count > sysctl_max_map_count)
        return -ENOMEM;

    if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
        return -ENOMEM;
    /* Can we just expand an old private anonymous mapping? */
    /// check whether the range can be merged into a neighbouring vma; if not, a new vma must be created
    vma = vma_merge(mm, prev, addr, addr + len, flags,
            NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
    if (vma)
        goto out;
    /*
     * create a vma struct for an anonymous mapping
     */
    vma = vm_area_alloc(mm);
    if (!vma) {
        vm_unacct_memory(len >> PAGE_SHIFT);
        return -ENOMEM;
    }
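The rest of do_brk_flags() is not quoted here; as a rough, paraphrased sketch of the same kernel series (not verbatim from the excerpt above), the freshly allocated vma is then initialized and linked into the mm roughly like this:

    vma_set_anonymous(vma);                 /* anonymous mapping: no vm_ops */
    vma->vm_start = addr;
    vma->vm_end = addr + len;
    vma->vm_pgoff = pgoff;
    vma->vm_flags = flags;
    vma->vm_page_prot = vm_get_page_prot(flags);
    vma_link(mm, vma, prev, rb_link, rb_parent);    /* insert into the rb-tree and vma list */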
    /*
     * If FOLL_FORCE is set then do not force a full fault as the hinting
     * fault information is unrelated to the reference behaviour of a task
     * using the address space
     */
    if (!(gup_flags & FOLL_FORCE))
        gup_flags |= FOLL_NUMA;
    /* FOLL_GET and FOLL_PIN are mutually exclusive. */
    if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
             (FOLL_PIN | FOLL_GET)))
        return ERR_PTR(-EINVAL);
retry:
    if (unlikely(pmd_bad(*pmd)))
        return no_page_table(vma, flags);
    ptep = pte_offset_map_lock(mm, pmd, address, &ptl);    /// map the pte and take the page-table lock
    pte = *ptep;
    if (!pte_present(pte)) {    /// the page is not resident in memory; handle that case below
        swp_entry_t entry;
        /*
         * KSM's break_ksm() relies upon recognizing a ksm page
         * even while it is being migrated, so for that case we
         * need migration_entry_wait().
         */
        if (likely(!(flags & FOLL_MIGRATION)))
            goto no_page;
        if (pte_none(pte))
            goto no_page;
        entry = pte_to_swp_entry(pte);
        if (!is_migration_entry(entry))
            goto no_page;
        pte_unmap_unlock(ptep, ptl);
        migration_entry_wait(mm, pmd, address);    /// wait for the page migration to finish, then retry
        goto retry;
    }
    if ((flags & FOLL_NUMA) && pte_protnone(pte))
        goto no_page;
    if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
        pte_unmap_unlock(ptep, ptl);
        return NULL;
    }
    page = vm_normal_page(vma, address, pte);    /// look up the struct page for this pte (only normal pages; special pages are not subject to normal memory management)
    if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {    /// handle device (pfn-mapped) pages
        /*
         * Only return device mapping pages in the FOLL_GET or FOLL_PIN
         * case since they are only valid while holding the pgmap
         * reference.
         */
        *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
        if (*pgmap)
            page = pte_page(pte);
        else
            goto no_page;
    } else if (unlikely(!page)) {    /// vm_normal_page() returned no valid page
        if (flags & FOLL_DUMP) {
            /* Avoid special (like zero) pages in core dumps */
            page = ERR_PTR(-EFAULT);
            goto out;
        }
        /* zero-page and pfn-mapping handling elided in this excerpt */
    }
    /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
    if (unlikely(!try_grab_page(page, flags))) {
        page = ERR_PTR(-ENOMEM);
        goto out;
    }
    /*
     * We need to make the page accessible if and only if we are going
     * to access its content (the FOLL_PIN case). Please see
     * Documentation/core-api/pin_user_pages.rst for details.
     */
    if (flags & FOLL_PIN) {
        ret = arch_make_page_accessible(page);
        if (ret) {
            unpin_user_page(page);
            page = ERR_PTR(ret);
            goto out;
        }
    }
    if (flags & FOLL_TOUCH) {    /// FOLL_TOUCH: mark the page as recently accessed
        if ((flags & FOLL_WRITE) &&
            !pte_dirty(pte) && !PageDirty(page))
            set_page_dirty(page);
        /*
         * pte_mkyoung() would be more correct here, but atomic care
         * is needed to avoid losing the dirty bit: it is easier to use
         * mark_page_accessed().
         */
        mark_page_accessed(page);
    }
    if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
        /* Do not mlock pte-mapped THP */
        if (PageTransCompound(page))
            goto out;
        /*
         * The preliminary mapping check is mainly to avoid the
         * pointless overhead of lock_page on the ZERO_PAGE
         * which might bounce very badly if there is contention.
         *
         * If the page is already locked, we don't need to
         * handle it now - vmscan will handle it later if and
         * when it attempts to reclaim the page.
         */
        if (page->mapping && trylock_page(page)) {
            lru_add_drain();    /* push cached pages to LRU */
            /*
             * Because we lock page here, and migration is
             * blocked by the pte's page reference, and we
             * know the page is still mapped, we don't even
             * need to check for file-cache page truncation.
             */
            mlock_vma_page(page);
            unlock_page(page);
        }
    }
out:
    pte_unmap_unlock(ptep, ptl);
    return page;
no_page:
    pte_unmap_unlock(ptep, ptl);
    if (!pte_none(pte))
        return NULL;
    return no_page_table(vma, flags);
}
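As a hedged illustration of how this walk gets exercised, a driver-style helper (gup_demo is a made-up name, not from the source above) can pin a user page through get_user_pages_fast(); when the lockless fast path cannot make progress, GUP falls back to page-table walks like follow_page_pte():

#include <linux/mm.h>

static int gup_demo(unsigned long uaddr)
{
    struct page *page;
    int ret;

    /* Pin exactly one page of the caller's address space for writing. */
    ret = get_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
    if (ret != 1)
        return ret < 0 ? ret : -EFAULT;

    /* ... access the page contents, e.g. via kmap() ... */

    put_page(page);    /* drop the reference taken by GUP (FOLL_GET semantics) */
    return 0;
}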
    /* Check against address space limit. */
    if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
        unsigned long nr_pages;
        /*
         * MAP_FIXED may remove pages of mappings that intersect with the
         * requested mapping. Account for the pages it would unmap.
         */
        nr_pages = count_vma_pages_range(mm, addr, addr + len);
    /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
    if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
        return -ENOMEM;
    /*
     * Private writable mapping: check memory availability
     */
    if (accountable_mapping(file, vm_flags)) {
        charged = len >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            return -ENOMEM;
        vm_flags |= VM_ACCOUNT;
    }
    /*
     * Can we just expand an old mapping?
     */
    vma = vma_merge(mm, prev, addr, addr + len, vm_flags,    /// try to merge into an existing vma
            NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
    if (vma)
        goto out;
    /*
     * Determine the object being mapped and call the appropriate
     * specific mapper. the address has already been validated, but
     * not unmapped, but the maps are removed from the list.
     */
    vma = vm_area_alloc(mm);    /// allocate a new vma
    if (!vma) {
        error = -ENOMEM;
        goto unacct_error;
    }
    if (file) {    /// file-backed mapping
        if (vm_flags & VM_DENYWRITE) {
            error = deny_write_access(file);
            if (error)
                goto free_vma;
        }
        if (vm_flags & VM_SHARED) {
            error = mapping_map_writable(file->f_mapping);
            if (error)
                goto allow_write_and_free_vma;
        }
        /* ->mmap() can change vma->vm_file, but must guarantee that
         * vma_link() below can deny write-access if VM_DENYWRITE is set
         * and map writably if VM_SHARED is set. This usually means the
         * new file must not have been exposed to user-space, yet.
         */
        vma->vm_file = get_file(file);
        error = call_mmap(file, vma);
        if (error)
            goto unmap_and_free_vma;
        /* Can addr have changed??
         *
         * Answer: Yes, several device drivers can do it in their
         *         f_op->mmap method. -DaveM
         * Bug: If addr is changed, prev, rb_link, rb_parent should
         *      be updated for vma_link()
         */
        WARN_ON_ONCE(addr != vma->vm_start);
        addr = vma->vm_start;
        /* If vm_flags changed after call_mmap(), we should try merge vma again
         * as we may succeed this time.
         */
        if (unlikely(vm_flags != vma->vm_flags && prev)) {
            merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
                NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
            if (merge) {
                /* ->mmap() can change vma->vm_file and fput the original
                 * file. So fput the vma->vm_file here or we would add an
                 * extra fput for file and cause general protection fault
                 * ultimately.
                 */
                fput(vma->vm_file);
                vm_area_free(vma);
                vma = merge;
                /* Update vm_flags to pick up the change. */
                vm_flags = vma->vm_flags;
                goto unmap_writable;
            }
        }
    /*
     * New (or expanded) vmas always get the soft-dirty status.
     * Otherwise the user-space soft-dirty page tracker won't
     * be able to distinguish the situation where a vma is unmapped
     * and then a new one is mapped in place (which must be treated
     * as a completely new data area).
     */
    vma->vm_flags |= VM_SOFTDIRTY;
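Finally, a user-space sketch (the file name demo.txt is a placeholder) that exercises the file-backed branch of mmap_region() shown above, where call_mmap() hands the new vma to the filesystem's ->mmap() implementation:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("demo.txt", O_RDWR | O_CREAT, 0644);
    if (fd < 0 || ftruncate(fd, 4096) != 0) {
        perror("open/ftruncate");
        return 1;
    }

    /* PROT_WRITE + MAP_SHARED corresponds to the VM_SHARED handling
     * and call_mmap() in the file branch above. */
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    strcpy(p, "written through the shared mapping");
    munmap(p, 4096);
    close(fd);
    return 0;
}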