struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
                            nodemask_t *nodemask)
{
    struct page *page;
    ///Allocation is allowed down to the low watermark
    unsigned int alloc_flags = ALLOC_WMARK_LOW;
    gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
    ///ac holds the parameters the buddy system uses for this allocation
    struct alloc_context ac = { };
    /*
     * There are several places where we assume that the order value is sane
     * so bail out early if the request is out of bound.
     */
    ///The largest block the buddy system hands out is 2^(MAX_ORDER-1) pages, 4MB by default
    if (unlikely(order >= MAX_ORDER)) {
        WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
        return NULL;
    }

    // gfp_allowed_mask is a global mask of the gfp flags currently allowed
    gfp &= gfp_allowed_mask;
    /*
     * Apply scoped allocation constraints. This is mainly about GFP_NOFS
     * resp. GFP_NOIO which has to be inherited for all allocation requests
     * from a particular context which has been marked by
     * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
     * movable zones are not used during allocation.
     */
    // Make sure constraints from the current task context are inherited correctly
    gfp = current_gfp_context(gfp);
    alloc_gfp = gfp;
    // Prepare the allocation context: fill in ac and alloc_flags, adjusting gfp as needed
    if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask,
            &ac, &alloc_gfp, &alloc_flags))
        return NULL;
    /*
     * Forbid the first pass from falling back to types that fragment
     * memory until all local zones are considered.
     */
    ///Anti-fragmentation optimization: prefer allocating from higher zones first
    alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
    /* First allocation attempt */
    ///Try to grab the pages from the buddy free lists
    page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
    if (likely(page))
        // Success: jump to out and return; the fast path ends here
        goto out;
    alloc_gfp = gfp;
    ac.spread_dirty_pages = false;
    /*
     * Restore the original nodemask if it was potentially replaced with
     * &cpuset_current_mems_allowed to optimize the fast-path attempt.
     */
    ac.nodemask = nodemask;
The allocation context itself is filled in by prepare_alloc_pages():

    ///Derive the migrate type from the gfp flags (examples follow this snippet)
    ac->migratetype = gfp_migratetype(gfp_mask);

    // With cpusets enabled, add __GFP_HARDWALL to alloc_gfp. cpusets are a Linux
    // resource-isolation mechanism; __GFP_HARDWALL means the allocation is strictly
    // confined to the nodes allowed by the cpuset.
    if (cpusets_enabled()) {
        *alloc_gfp |= __GFP_HARDWALL;
        /*
         * When we are in the interrupt context, it is irrelevant
         * to the current task context. It means that any node ok.
         */
        if (in_task() && !ac->nodemask)
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    // Acquire and immediately release the fs-reclaim context for this gfp_mask;
    // this annotates that the allocation could enter filesystem reclaim.
    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    // If gfp_mask contains __GFP_DIRECT_RECLAIM, the allocation may enter direct
    // reclaim and therefore sleep, so warn if sleeping is not allowed here.
    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
    /* Dirty zone balancing only done in the fast path */
    // Whether this request may dirty the allocated pages (__GFP_WRITE);
    // the flag is only consulted on the fast path
    ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
    /*
     * The preferred zone is used for statistics but crucially it is
     * also used as the starting point for the zonelist iterator. It
     * may get reset for allocations that ignore memory policies.
     */
    // Record the preferred zoneref for this allocation. Based on gfp_mask, the
    // highest usable zone index (highest_zoneidx) is determined first, and the
    // preferred zoneref is then picked from the zonelist accordingly.
    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->highest_zoneidx, ac->nodemask);
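As a concrete example of the migrate-type derivation above (based on how gfp_migratetype() masks out the mobility bits of the gfp flags): a plain GFP_KERNEL request is classified as MIGRATE_UNMOVABLE, a request carrying __GFP_RECLAIMABLE (typical for reclaimable slab caches) becomes MIGRATE_RECLAIMABLE, and user or page-cache allocations carrying __GFP_MOVABLE (e.g. GFP_HIGHUSER_MOVABLE) become MIGRATE_MOVABLE.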
retry:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
     */
    ///ALLOC_NOFRAGMENT: an optimization to avoid fragmenting memory
    no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
    ///z starts at the preferred zone
    z = ac->preferred_zoneref;
    ///Walk every zone in the zonelist, starting from the preferred one
    for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
                    ac->nodemask) {
        struct page *page;
        unsigned long mark;
        if (cpusets_enabled() &&
            (alloc_flags & ALLOC_CPUSET) &&
            !__cpuset_zone_allowed(zone, gfp_mask))
                continue;
        /*
         * When allocating a page cache page for writing, we
         * want to get it from a node that is within its dirty
         * limit, such that no single node holds more than its
         * proportional share of globally allowed dirty pages.
         * The dirty limits take into account the node's
         * lowmem reserves and high watermark so that kswapd
         * should be able to balance it without having to
         * write pages from its LRU list.
         *
         * XXX: For now, allow allocations to potentially
         * exceed the per-node dirty limit in the slowpath
         * (spread_dirty_pages unset) before going into reclaim,
         * which is important when on a NUMA setup the allowed
         * nodes are together not big enough to reach the
         * global limit. The proper fix for these situations
         * will require awareness of nodes in the
         * dirty-throttling and the flusher threads.
         */
        if (ac->spread_dirty_pages) {
            if (last_pgdat_dirty_limit == zone->zone_pgdat)
                continue;
            if (!node_dirty_ok(zone->zone_pgdat)) {
                last_pgdat_dirty_limit = zone->zone_pgdat;
                continue;
            }
        }
        ///On NUMA systems, node locality matters more than avoiding fragmentation,
        ///because local memory is much faster than remote memory
        if (no_fallback && nr_online_nodes > 1 &&
            zone != ac->preferred_zoneref->zone) {
            int local_nid;
            /*
             * If moving to a remote node, retry but allow
             * fragmenting fallbacks. Locality is more important
             * than fragmentation avoidance.
             */
            local_nid = zone_to_nid(ac->preferred_zoneref->zone);
            ///This zone belongs to a remote node: drop ALLOC_NOFRAGMENT and retry
            if (zone_to_nid(zone) != local_nid) {
                alloc_flags &= ~ALLOC_NOFRAGMENT;
                goto retry;
            }
        }
        ///When external fragmentation is detected, the low watermark is temporarily
        ///boosted so that kswapd is woken early to reclaim memory and kcompactd then
        ///compacts it, which helps larger contiguous allocations succeed.
        ///(Having to steal pages from another migrate type because no large enough
        ///contiguous block exists is treated as a sign of fragmentation.)
        mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
        ///zone_watermark_fast() returns true if the zone sits above the requested
        ///watermark and can satisfy the request; the branch below handles the
        ///case where that check fails
        if (!zone_watermark_fast(zone, order, mark,
                       ac->highest_zoneidx, alloc_flags,
                       gfp_mask)) {
            int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
            /*
             * Watermark failed for this zone, but see if we can
             * grow this zone if it contains deferred pages.
             */
            if (static_branch_unlikely(&deferred_pages)) {
                if (_deferred_grow_zone(zone, order))
                    goto try_this_zone;
            }
#endif
            /* Checked here to keep the fast path fast */
            BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                goto try_this_zone;
            ///Try to reclaim some memory from this node
            ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
            switch (ret) {
            case NODE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case NODE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                    ac->highest_zoneidx, alloc_flags))
                    goto try_this_zone;

                continue;
            }
        }
try_this_zone:
        ///Actually take pages from this zone's buddy free lists
        page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                gfp_mask, alloc_flags, ac->migratetype);
        if (page) {
            prep_new_page(page, order, gfp_mask, alloc_flags);

            /*
             * If this is a high-order atomic allocation then check
             * if the pageblock should be reserved for the future
             */
            if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
                reserve_highatomic_pageblock(page, zone, order);
            return page;
        } else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
            /* Try again if zone has deferred pages */
            if (static_branch_unlikely(&deferred_pages)) {
                if (_deferred_grow_zone(zone, order))
                    goto try_this_zone;
            }
#endif
        }
    }
    /*
     * It's possible on a UMA machine to get through all zones that are
     * fragmented. If avoiding fragmentation, reset and try again.
     */
    if (no_fallback) {
        ///Every zone has been tried without success; external fragmentation may be
        ///the cause, so drop ALLOC_NOFRAGMENT and retry once more
        alloc_flags &= ~ALLOC_NOFRAGMENT;
        goto retry;
    }
The check itself lives in zone_watermark_fast():

    /*
     * Fast check for order-0 only. If this fails then the reserves
     * need to be calculated.
     */
    ///Fast path for single-page (order-0) requests.
    ///lowmem_reserve is memory each zone holds back so that, under pressure,
    ///higher zones cannot exhaust the memory of lower zones.
    if (!order) {
        long fast_free;
        fast_free = free_pages;
        fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
        if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
            return true;
    }
    ///Full check
    if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
                    free_pages))
        return true;
    /*
     * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
     * when checking the min watermark. The min watermark is the
     * point where boosting is ignored so that kswapd is woken up
     * when below the low watermark.
     */
    ///For order-0 atomic requests, check once more while ignoring the temporary
    ///watermark boost
    if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
        && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
        mark = z->_watermark[WMARK_MIN];
        return __zone_watermark_ok(z, order, mark, highest_zoneidx,
                    alloc_flags, free_pages);
    }
If __zone_watermark_ok() fails, the watermark requirement essentially cannot be met. The kernel does give order-0 requests one more try, but only under fairly strict conditions.
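To make that last check concrete, an illustrative example (the numbers are invented): suppose z->_watermark[WMARK_MIN] is 1024 pages and watermark_boost is 512, so the mark computed earlier via wmark_pages() was 1536. A GFP_ATOMIC order-0 request that was checked against WMARK_MIN gets re-tested against the unboosted 1024 pages and may still succeed even though the boosted check failed; every other request simply fails at this point.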
2.4.1 zone_page_state
static inline unsigned long zone_page_state(struct zone *zone,
                    enum zone_stat_item item)
{
    long x = atomic_long_read(&zone->vm_stat[item]);
#ifdef CONFIG_SMP
    if (x < 0)
        x = 0;
#endif
    return x;
}
Every zone has a vm_stat array that holds per-zone page statistics, including the number of free pages (NR_FREE_PAGES) as well as counters such as NR_ZONE_INACTIVE_ANON, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE and NR_ZONE_ACTIVE_FILE.
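As a small illustration of how these counters feed the watermark logic (a minimal sketch, not kernel code: the helper name and the can_use_cma flag are made up, while zone_page_state(), NR_FREE_PAGES and NR_FREE_CMA_PAGES are real symbols):

/* Sketch: the "usable" free-page count the watermark checks reason about. */
static inline unsigned long sketch_zone_usable_free(struct zone *zone, bool can_use_cma)
{
    unsigned long free = zone_page_state(zone, NR_FREE_PAGES);

    /* Free CMA pages are included in NR_FREE_PAGES, but callers without
     * ALLOC_CMA may not touch them, mirroring the CMA handling in
     * __zone_watermark_unusable_free() below. */
    if (!can_use_cma)
        free -= zone_page_state(zone, NR_FREE_CMA_PAGES);
    return free;
}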
The pages a caller may not touch are computed by __zone_watermark_unusable_free():

    /*
     * If the caller does not have rights to ALLOC_HARDER then subtract
     * the high-atomic reserves. This will over-estimate the size of the
     * atomic reserve but it avoids a search.
     */
    if (likely(!alloc_harder))
        unusable_free += z->nr_reserved_highatomic;
#ifdef CONFIG_CMA
    /* If allocation can't use CMA areas don't use free CMA pages */
    if (!(alloc_flags & ALLOC_CMA))
        unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
             int highest_zoneidx, unsigned int alloc_flags,
             long free_pages)
{
    long min = mark;
    int o;
    const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
    /* free_pages may go negative - that's OK */
    free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (unlikely(alloc_harder)) {
        /*
         * OOM victims can try even harder than normal ALLOC_HARDER
         * users on the grounds that it's definitely going to be in
         * the exit path shortly and free memory. Any allocation it
         * makes during the free path will be small and short-lived.
         */
        if (alloc_flags & ALLOC_OOM)
            min -= min / 2;
        else
            min -= min / 4;
    }
    /*
     * Check watermarks for an order-0 allocation request. If these
     * are not met, then a high-order request also cannot go ahead
     * even if a suitable page happened to be free.
     */
    if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
        return false;
    /* If this is an order-0 request then the watermark is fine */
    if (!order)
        return true;
    /* For a high-order request, check at least one suitable page is free */
    ///Check whether a free block large enough for this order exists
    for (o = order; o < MAX_ORDER; o++) {
        struct free_area *area = &z->free_area[o];
        int mt;
        if (!area->nr_free)
            continue;
        ///A block of the required order on any list from MIGRATE_UNMOVABLE through
        ///MIGRATE_RECLAIMABLE is treated as good enough: pages can later be borrowed
        ///across migrate types if needed
        for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
            if (!free_area_empty(area, mt))
                return true;
        }

#ifdef CONFIG_CMA
        if ((alloc_flags & ALLOC_CMA) &&
            !free_area_empty(area, MIGRATE_CMA)) {
            return true;
        }
#endif
        if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
            return true;
    }
    return false;
}
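To make the watermark arithmetic above concrete, an invented example: mark = 4096 pages, alloc_flags = ALLOC_HIGH | ALLOC_HARDER (no ALLOC_OOM), lowmem_reserve[highest_zoneidx] = 1000. ALLOC_HIGH halves min to 2048 and ALLOC_HARDER subtracts another quarter, leaving 1536; after the unusable pages (high-atomic reserves, possibly free CMA) are deducted from free_pages, the order-0 gate requires free_pages > 1536 + 1000 = 2536. Only when that holds does the per-order free_area scan decide whether a high-order request can go ahead.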
static inline
struct page *rmqueue(struct zone *preferred_zone,
            struct zone *zone, unsigned int order,
            gfp_t gfp_flags, unsigned int alloc_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    ///Fast path for orders allowed on the per-CPU lists: every zone has a per-CPU
    ///per_cpu_pages structure with its own free lists, so such requests can be
    ///served without taking the zone lock (pcp_allowed_order() is summarized in a
    ///note after this snippet).
    if (likely(pcp_allowed_order(order))) {
        /*
         * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
         * we need to skip it when CMA area isn't allowed.
         */
        if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
                migratetype != MIGRATE_MOVABLE) {
            page = rmqueue_pcplist(preferred_zone, zone, order,
                    gfp_flags, migratetype, alloc_flags);
            goto out;
        }
    }
    /*
     * We most definitely don't want callers attempting to
     * allocate greater than order-1 page units with __GFP_NOFAIL.
     */
    WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
    do {
        page = NULL;
        /*
         * order-0 request can reach here when the pcplist is skipped
         * due to non-CMA allocation context. HIGHATOMIC area is
         * reserved for high-order atomic allocation, so order-0
         * request should skip it.
         */
        if (order > 0 && alloc_flags & ALLOC_HARDER) {
            ///First try the MIGRATE_HIGHATOMIC free list of the requested order
            page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
            if (page)
                trace_mm_page_alloc_zone_locked(page, order, migratetype);
        }
        if (!page)
            ///Otherwise go through __rmqueue(), which may borrow higher-order blocks
            ///from other migrate types (movable/reclaimable)
            page = __rmqueue(zone, order, migratetype, alloc_flags);
    } while (page && check_new_pages(page, order)); ///Verify the allocated pages are sane
    if (!page)
        goto failed;
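A note on pcp_allowed_order() used at the top of rmqueue(): in this kernel series it roughly accepts any order up to PAGE_ALLOC_COSTLY_ORDER (3, i.e. blocks of up to 8 pages) and, with CONFIG_TRANSPARENT_HUGEPAGE, also pageblock_order, so THP-sized allocations can use the per-CPU lists as well.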
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
            struct zone *zone, unsigned int order,
            gfp_t gfp_flags, int migratetype,
            unsigned int alloc_flags)
{
    struct per_cpu_pages *pcp;
    struct list_head *list;
    struct page *page;
    unsigned long flags;
local_lock_irqsave(&pagesets.lock, flags);
    /*
     * On allocation, reduce the number of pages that are batch freed.
     * See nr_pcp_free() where free_factor is increased for subsequent
     * frees.
     */
    pcp = this_cpu_ptr(zone->per_cpu_pageset);
    pcp->free_factor >>= 1;
    list = &pcp->lists[order_to_pindex(migratetype, order)];
    page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
    local_unlock_irqrestore(&pagesets.lock, flags);
    if (page) {
        __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
        zone_statistics(preferred_zone, zone, 1);
    }
    return page;
}
pcp: the per_cpu_pageset of the current CPU; every CPU has its own page pool.
list: the free list for the requested order and migratetype.
page: the page that gets allocated.
flags: saves the interrupt/lock state.
rmqueue_pcplist() satisfies the request by taking a page off the current CPU's page pool. While doing so it holds the CPU-local pagesets lock and halves free_factor so that fewer pages are batch-freed back later, which keeps allocation efficient. On success it updates the allocation statistics and returns the page.
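To see the shape of this fast path without any kernel plumbing, here is a tiny userspace model of per-CPU page lists (all names are invented; it mirrors only the list-pop logic, with no locking, statistics, or buddy fallback):

#include <stdio.h>

#define NR_CPUS    4
#define NR_ORDERS  4

struct toy_page { struct toy_page *next; };

struct toy_pcp {
    struct toy_page *lists[NR_ORDERS];   /* one LIFO freelist per order */
    int count;                           /* pages currently cached on this CPU */
};

static struct toy_pcp pcp_of_cpu[NR_CPUS];

/* Pop a block from this CPU's list for the given order. In the kernel a
 * miss would refill the list from the buddy freelists under the zone
 * lock; here we simply report the miss. */
static struct toy_page *toy_rmqueue_pcplist(int cpu, unsigned int order)
{
    struct toy_pcp *pcp = &pcp_of_cpu[cpu];
    struct toy_page *page = pcp->lists[order];

    if (!page)
        return NULL;                     /* would fall back to the buddy path */
    pcp->lists[order] = page->next;
    pcp->count -= 1 << order;
    return page;
}

int main(void)
{
    static struct toy_page p;

    /* Seed CPU 0 with a single cached order-0 page. */
    pcp_of_cpu[0].lists[0] = &p;
    pcp_of_cpu[0].count = 1;

    printf("hit:  %p\n", (void *)toy_rmqueue_pcplist(0, 0));
    printf("miss: %p\n", (void *)toy_rmqueue_pcplist(0, 0));
    return 0;
}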
3.2 Handling allocations with order > 0
rmqueue() can take one of four paths, as the loop quoted below shows:
When pcp_allowed_order(order) is true (order 0 and other small orders), it enters the pcp allocation flow;
When order > 0 and the pcp path is not taken, it enters the buddy flow below:
If alloc_flags has ALLOC_HARDER set, the migrate type is switched to MIGRATE_HIGHATOMIC and __rmqueue_smallest() is tried first;
If that fails or does not apply, __rmqueue() is called with the original migratetype and may borrow blocks from other migrate types;
If no acceptable page is found (check_new_pages() keeps rejecting candidates), rmqueue() takes the failed path.
    do {
        page = NULL;
        /*
         * order-0 request can reach here when the pcplist is skipped
         * due to non-CMA allocation context. HIGHATOMIC area is
         * reserved for high-order atomic allocation, so order-0
         * request should skip it.
         */
        if (order > 0 && alloc_flags & ALLOC_HARDER) {
            ///First try the MIGRATE_HIGHATOMIC free list of the requested order
            page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
            if (page)
                trace_mm_page_alloc_zone_locked(page, order, migratetype);
        }
        if (!page)
            ///Otherwise go through __rmqueue(), which may borrow higher-order blocks
            ///from other migrate types (movable/reclaimable)
            page = __rmqueue(zone, order, migratetype, alloc_flags);
    } while (page && check_new_pages(page, order)); ///Verify the allocated pages are sane
///Put the surplus back on the buddy free lists (high > low)
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        high--;
        size >>= 1;
        VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

        /*
         * Mark as guard pages (or page), that will allow to
         * merge back to allocator when buddy will be freed.
         * Corresponding page table entries will not be touched,
         * pages will stay not present in virtual address space
         */
        ///Debug-pagealloc guard pages are not put on the free lists
        if (set_page_guard(zone, &page[size], high, migratetype))
            continue;

        add_to_free_list(&page[size], zone, high, migratetype);
        set_buddy_order(&page[size], high);
    }
}
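A worked example of the split (illustrative numbers): suppose an order-2 request (low = 2) can only be satisfied from an order-4 free block (high = 4, i.e. 16 pages). expand() first returns the upper half, pages [8..15], to the order-3 free list, then pages [4..7] to the order-2 free list, and stops once high == low; the caller keeps pages [0..3] as its order-2 allocation.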