关键词:warn_alloc()、__GFP_XXX、order、CMA等等。

在内存申请的时候经常会遇到类似“ xxx: page allocation failure: order:10...”类型的问题,这是warn_alloc()的输出。

warn_alloc()被如下函数调用:__alloc_pages_slowpath()、__vmalloc_area_node()、__vmalloc_node_range()。

下面分三部分了解这种问题的来龙去脉:

  • 什么情况会导致warn_alloc()?
  • warn_alloc()都做了哪些事情?
  • 结合实际问题分析问题原因。

1.触发warn_alloc()情况

要了解什么情况下会导致warn_alloc(),就需要分析它在何种情况下会被调用。

__alloc_pages_slowpath()表示页面申请进入了slowpath,那相对就有fastpath。

从__alloc_pages_nodemask()中可知,这个fastpath就是get_page_from_freelist()。只有fastpath分配失败时才会进入__alloc_pages_slowpath(),它是分配页面的后备选择。

/*
 * Slow path of the page allocator.
 *
 * Repeatedly tries the freelists, direct compaction, direct reclaim and
 * finally the OOM killer. Every "goto nopage" below ends in warn_alloc()
 * printing "page allocation failure: order:N" (unless the cpuset changed
 * underneath us, in which case the whole attempt restarts).
 *
 * @gfp_mask: allocation flags of the request
 * @order:    log2 of the number of contiguous pages requested
 * @ac:       allocation context (zonelist, nodemask, preferred zone, ...)
 *
 * Returns the allocated page, or NULL on failure.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
						struct alloc_context *ac)
{
	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
	struct page *page = NULL;
	unsigned int alloc_flags;
	unsigned long did_some_progress;
	enum compact_priority compact_priority;
	enum compact_result compact_result;
	int compaction_retries;
	int no_progress_loops;
	unsigned long alloc_start = jiffies;
	/* First stall warning after 10 seconds, then every further 10 seconds. */
	unsigned int stall_timeout = 10 * HZ;
	unsigned int cpuset_mems_cookie;

	/* Requests of MAX_ORDER or more can never succeed; warn unless silenced. */
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	/* __GFP_ATOMIC combined with direct reclaim: warn once and drop __GFP_ATOMIC. */
	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
		gfp_mask &= ~__GFP_ATOMIC;

retry_cpuset:
	compaction_retries = 0;
	no_progress_loops = 0;
	compact_priority = DEF_COMPACT_PRIORITY;
	cpuset_mems_cookie = read_mems_allowed_begin();

	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	/* No suitable zone could be found: go to the nopage handling. */
	if (!ac->preferred_zoneref->zone)
		goto nopage;

	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);

	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/*
	 * Costly request (order above PAGE_ALLOC_COSTLY_ORDER; the original
	 * annotation gives this as order > 3, i.e. more than 8 pages) from a
	 * caller that may direct-reclaim and is not allowed to use the
	 * memalloc reserves: try direct compaction first.
	 */
	if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
		!gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						alloc_flags, ac,
						INIT_COMPACT_PRIORITY,
						&compact_result);
		if (page)
			goto got_pg;

		/* The caller asked not to retry. */
		if (gfp_mask & __GFP_NORETRY) {
			/* Compaction was deferred (did not succeed): give up. */
			if (compact_result == COMPACT_DEFERRED)
				goto nopage;

			compact_priority = INIT_COMPACT_PRIORITY;
		}
	}

retry:
	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);	/* keep the kswapd threads working */

	if (gfp_pfmemalloc_allowed(gfp_mask))
		alloc_flags = ALLOC_NO_WATERMARKS;

	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	}

	/* Attempt with potentially adjusted zonelist and alloc_flags */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;	/* success: hand the struct page back */

	/*
	 * Caller is not willing to reclaim, we can't balance anything.
	 * The freelists had nothing and neither direct reclaim nor direct
	 * compaction may be used, so this ends in the nopage path.
	 */
	if (!can_direct_reclaim) {
		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
		goto nopage;
	}

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC) {
		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
			cond_resched();
			goto retry;
		}
		goto nopage;
	}

	/* Avoid allocations with no watermarks from looping endlessly */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
	if (page)
		goto got_pg;

	/* Try direct compaction and then allocating */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					compact_priority, &compact_result);
	if (page)
		goto got_pg;

	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY)
		goto nopage;

	/*
	 * Do not retry costly high order allocations unless they are
	 * __GFP_REPEAT
	 */
	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
		goto nopage;

	/* Make sure we know about allocations which stall for too long */
	if (time_after(jiffies, alloc_start + stall_timeout)) {
		warn_alloc(gfp_mask,
			"page allocation stalls for %ums, order:%u",
			jiffies_to_msecs(jiffies-alloc_start), order);
		stall_timeout += 10 * HZ;
	}

	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry;

	if (did_some_progress > 0 &&
			should_compact_retry(ac, order, alloc_flags,
				compact_result, &compact_priority,
				&compaction_retries))
		goto retry;

	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/*
	 * Reclaim has failed us, start killing things.
	 * Allocates a page and decides whether the OOM killer must run;
	 * progress reported through did_some_progress causes a retry.
	 * NOTE(review): the original annotation says orders below 3 never
	 * reach the OOM killer; upstream __alloc_pages_may_oom() appears to
	 * skip OOM for orders above PAGE_ALLOC_COSTLY_ORDER instead — confirm.
	 */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

	/* Retry as long as the OOM killer is making progress */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

nopage:
	/* The allowed memory nodes changed meanwhile: redo the whole attempt. */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/* The request for 2^order contiguous pages cannot be satisfied. */
	warn_alloc(gfp_mask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}

下面两个函数都是vmalloc相关,__vmalloc_area_node()在分配失败之后进入fail,调用warn_alloc()输出log。

/*
 * Populate a vmalloc area with individually allocated pages and map them.
 *
 * @area:     vm_struct describing the virtual range to back
 * @gfp_mask: allocation flags for the backing pages
 * @prot:     page protection used for the mapping
 * @node:     preferred NUMA node, or NUMA_NO_NODE
 *
 * Returns area->addr on success. On failure the area is released, a
 * "vmalloc: allocation failure" warning is emitted through warn_alloc()
 * (except when the page-pointer array itself cannot be allocated) and NULL
 * is returned.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node)
{
	struct page **pages;
	unsigned int nr_pages, array_size, i;
	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
	/* Per-page failures are reported once below, not by the page allocator. */
	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;

	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {
		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
				PAGE_KERNEL, node, area->caller);
	} else {
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	if (!area->pages) {
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;

		if (node == NUMA_NO_NODE)
			page = alloc_page(alloc_mask);
		else
			page = alloc_pages_node(node, alloc_mask, 0);

		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
		if (gfpflags_allow_blocking(gfp_mask))
			cond_resched();
	}

	if (map_vm_area(area, prot, pages))
		goto fail;
	return area->addr;

fail:
	warn_alloc(gfp_mask,
			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
			  (area->nr_pages*PAGE_SIZE), area->size);
	vfree(area->addr);
	return NULL;
}
/*
 * Allocate virtually contiguous memory inside [start, end).
 *
 * Reserves a vm area and backs it through __vmalloc_area_node(). A zero or
 * oversized request, or failure to reserve the area, is reported here via
 * warn_alloc(); a failure while backing the area is reported by
 * __vmalloc_area_node() itself, so that path just returns NULL.
 *
 * Returns the start address of the mapping, or NULL on failure.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, unsigned long vm_flags, int node,
			const void *caller)
{
	struct vm_struct *area;
	void *addr;
	/* Keep the caller's size for kmemleak and for the warning text. */
	unsigned long real_size = size;

	size = PAGE_ALIGN(size);
	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
		goto fail;

	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
				vm_flags, start, end, node, gfp_mask, caller);
	if (!area)
		goto fail;

	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
	if (!addr)
		return NULL;

	clear_vm_uninitialized_flag(area);

	/*
	 * NOTE(review): the min_count argument was lost in the original text;
	 * 2 is the value used by the upstream 4.9 sources — confirm.
	 */
	kmemleak_alloc(addr, real_size, 2, gfp_mask);

	return addr;

fail:
	warn_alloc(gfp_mask,
			  "vmalloc: allocation failure: %lu bytes", real_size);
	return NULL;
}

2. warn_alloc()解析

warn_alloc()首先显示相关进程和内存分配gfp_mask信息,然后打印栈信息,最后调用show_mem()显示详细的内存信息。

/*
 * Report a failed or stalled allocation.
 *
 * Prints "<comm>: <formatted message>, mode:<gfp_mask>", then a stack dump
 * and, unless suppressed, the memory state via show_mem().
 *
 * Nothing is printed when __GFP_NOWARN is set, when the nopage_rs rate
 * limit is exceeded, or when debug_guardpage_minorder() is non-zero.
 *
 * @gfp_mask: allocation flags of the failing request
 * @fmt:      printf-style message followed by its arguments
 */
void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
{
	unsigned int filter = SHOW_MEM_FILTER_NODES;
	struct va_format vaf;
	va_list args;

	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
	    debug_guardpage_minorder() > 0)
		return;

	/* In these contexts the node filter is dropped and all nodes are shown. */
	if (!(gfp_mask & __GFP_NOMEMALLOC))
		if (test_thread_flag(TIF_MEMDIE) ||
		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
			filter &= ~SHOW_MEM_FILTER_NODES;
	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
		filter &= ~SHOW_MEM_FILTER_NODES;

	/* Name of the process whose allocation failed. */
	pr_warn("%s: ", current->comm);

	/* The message and arguments passed in by the caller. */
	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_cont("%pV", &vaf);
	va_end(args);

	/* The gfp_mask, both as a number and decoded into flag names. */
	pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);

	/* Stack trace of the allocating context. */
	dump_stack();
	/* Memory statistics: the most informative part of the report. */
	if (!should_suppress_show_mem())
		show_mem(filter);
}

show_mem()显示详细的内存信息。

/*
 * Print the "Mem-Info:" report: the free-area details from
 * show_free_areas() followed by platform-wide page totals.
 *
 * @filter: SHOW_MEM_FILTER_* flags, passed on to show_free_areas()
 */
void show_mem(unsigned int filter)
{
	pg_data_t *pgdat;
	unsigned long total = 0, reserved = 0, highmem = 0;

	printk("Mem-Info:\n");
	show_free_areas(filter);

	/* Sum present, reserved and highmem pages over every populated zone. */
	for_each_online_pgdat(pgdat) {
		unsigned long flags;
		int zoneid;

		pgdat_resize_lock(pgdat, &flags);
		for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
			struct zone *zone = &pgdat->node_zones[zoneid];
			if (!populated_zone(zone))
				continue;

			total += zone->present_pages;
			/* Pages present but not managed by the allocator. */
			reserved += zone->present_pages - zone->managed_pages;

			if (is_highmem_idx(zoneid))
				highmem += zone->present_pages;
		}
		pgdat_resize_unlock(pgdat, &flags);
	}

	/* Platform-wide page statistics: all pages, reserved, CMA, ... */
	printk("%lu pages RAM\n", total);
	printk("%lu pages HighMem/MovableOnly\n", highmem);
	printk("%lu pages reserved\n", reserved);
#ifdef CONFIG_CMA
	printk("%lu pages cma reserved\n", totalcma_pages);
#endif
#ifdef CONFIG_QUICKLIST
	printk("%lu pages in pagetable cache\n",
		quicklist_total_size());
#endif
#ifdef CONFIG_MEMORY_FAILURE
	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
}

show_free_areas()从所有node、不同node、不同zone、同一zone下不同order分别显示空闲页面信息。

void show_free_areas(unsigned int filter)
{
unsigned long free_pcp = ;
int cpu;
struct zone *zone;
pg_data_t *pgdat; for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue; for_each_online_cpu(cpu)
free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
} printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"-----------------显示所有node的统计信息。
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
global_node_page_state(NR_INACTIVE_ANON),
global_node_page_state(NR_ISOLATED_ANON),
global_node_page_state(NR_ACTIVE_FILE),
global_node_page_state(NR_INACTIVE_FILE),
global_node_page_state(NR_ISOLATED_FILE),
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
global_page_state(NR_FREE_PAGES),
free_pcp,
global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) {-------------------------------------------------分别显示不同node的统计信息。
printk("Node %d"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" isolated(anon):%lukB"
" isolated(file):%lukB"
" mapped:%lukB"
" dirty:%lukB"
" writeback:%lukB"
" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
" shmem_thp: %lukB"
" shmem_pmdmapped: %lukB"
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
" unstable:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_ACTIVE_ANON)),
K(node_page_state(pgdat, NR_INACTIVE_ANON)),
K(node_page_state(pgdat, NR_ACTIVE_FILE)),
K(node_page_state(pgdat, NR_INACTIVE_FILE)),
K(node_page_state(pgdat, NR_UNEVICTABLE)),
K(node_page_state(pgdat, NR_ISOLATED_ANON)),
K(node_page_state(pgdat, NR_ISOLATED_FILE)),
K(node_page_state(pgdat, NR_FILE_MAPPED)),
K(node_page_state(pgdat, NR_FILE_DIRTY)),
K(node_page_state(pgdat, NR_WRITEBACK)),
K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
node_page_state(pgdat, NR_PAGES_SCANNED),
!pgdat_reclaimable(pgdat) ? "yes" : "no");
} for_each_populated_zone(zone) {----------------------------------------------分别显示所有zone的统计信息。
int i; if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue; free_pcp = ;
for_each_online_cpu(cpu)
free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; show_node(zone);
printk(KERN_CONT
"%s"
" free:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" writepending:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
" slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
" free_cma:%lukB"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = ; i < MAX_NR_ZONES; i++)
printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
printk(KERN_CONT "\n");
} for_each_populated_zone(zone) {-------------------------------------------显示所有zone下不同order空闲数目统计信息。
unsigned int order;
unsigned long nr[MAX_ORDER], flags, total = ;
unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
show_node(zone);
printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags);
for (order = ; order < MAX_ORDER; order++) {-------------------------遍历当前zone的不同order,不同order区域数目存在nr[]中,total是总的页面数目。
struct free_area *area = &zone->free_area[order];
int type; nr[order] = area->nr_free;
total += nr[order] << order; types[order] = ;
for (type = ; type < MIGRATE_TYPES; type++) {
if (!list_empty(&area->free_list[type]))
types[order] |= << type;--------------------------------记录order区域中页面类型。
}
}
spin_unlock_irqrestore(&zone->lock, flags);
for (order = ; order < MAX_ORDER; order++) {
printk(KERN_CONT "%lu*%lukB ",
nr[order], K(1UL) << order);-------------------------------输出不同order区域数量和区域大小。
if (nr[order])
show_migration_types(types[order]);---------------------------输出页面类型。
}
printk(KERN_CONT "= %lukB\n", K(total));------------------------------显示总大小。
} hugetlb_show_meminfo();---------------------------------------------------显示huge page统计信息。 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));---总的文件缓存页面数量。 show_swap_cache_info();----------------------------------------------------显示swap cache统计信息。
}

不同的页面有不同的属性,在warn_alloc()输出的字母对应了页面的属性。主要有M、U、E、C。

/*
 * Print the migrate types present in a free area as a string of letters,
 * e.g. "(UEC) ".
 *
 * @type: bitmask with bit i set when migrate type i has free blocks
 */
static void show_migration_types(unsigned char type)
{
	static const char types[MIGRATE_TYPES] = {
		[MIGRATE_UNMOVABLE]	= 'U',	/* unmovable */
		[MIGRATE_MOVABLE]	= 'M',	/* movable */
		[MIGRATE_RECLAIMABLE]	= 'E',	/* reclaimable */
		[MIGRATE_HIGHATOMIC]	= 'H',	/* same value as MIGRATE_PCPTYPES */
#ifdef CONFIG_CMA
		[MIGRATE_CMA]		= 'C',	/* pages of a CMA region */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
		[MIGRATE_ISOLATE]	= 'I',
#endif
	};
	/* One letter per migrate type plus the terminating NUL. */
	char tmp[MIGRATE_TYPES + 1];
	char *p = tmp;
	int i;

	for (i = 0; i < MIGRATE_TYPES; i++) {
		if (type & (1 << i))
			*p++ = types[i];
	}

	*p = '\0';
	printk(KERN_CONT "(%s) ", tmp);
}

经过上面的分析,基本上明白每一行的输出的来源。具体每个字段表示的内存含义,还需要结合代码阅读。

3. 实例解析

下面结合实际问题log输出来分析问题,进而解决问题。

下面的log表示进程xxxx在分配order为10(即2^10个连续物理页面,4KB页面下共4MB)的内存时失败,mode表示内存分配的gfp标志,具体在include/linux/gfp.h中定义。

内存碎片会导致page分配失败,即使还有很多空闲page。当order=0时,表示系统当前已经完全OOM。

[ 2161.623563] xxxx: page allocation failure: order:10, mode:0x2084020(GFP_ATOMIC|__GFP_COMP)-----------------warn_alloc(),从这里可以知道是哪个进程页面分配失败,并且有对应的gfp_mask。
[ 2161.632085] CPU: PID: Comm: AiApp Not tainted 4.9. #53---------------------------------------------dump_stack(),栈信息指出了更详细的调用路径。
[ 2161.637947]
Call Trace:
[<802f63f2>] dump_stack+0x1e/0x3c
[<800f6cf4>] warn_alloc+0x100/0x148
[<800f709c>] __alloc_pages_nodemask+0x2bc/0xb5c
[<801120fe>] kmalloc_order+0x26/0x48
[<>] kmalloc_order_trace+0x38/0x98
[<8012c5d8>] __kmalloc+0xf4/0x12c
[<8048ac78>] alloc_ep_req+0x5c/0x98
[<8048f232>] source_sink_recv+0x2a/0xe0
[<8048f35e>] usb_sourcesink_bulk_read+0x76/0x1c8
[<8048f770>] usb_sourcesink_read+0xfc/0x2c8
[<80134d58>] __vfs_read+0x30/0x108
[<80135c14>] vfs_read+0x94/0x128
[<80136d12>] SyS_read+0x52/0xd4
[<8004a246>] csky_systemcall+0x96/0xe0
[ 2161.689204] Mem-Info:--------------------------------------------------------------show_mem()
[ 2161.691518] active_anon: inactive_anon: isolated_anon:0-----------------------所有node统计信息。
[ 2161.691518] active_file: inactive_file: isolated_file:
[ 2161.691518] unevictable: dirty: writeback: unstable:
[ 2161.691518] slab_reclaimable: slab_unreclaimable:
[ 2161.691518] mapped: shmem: pagetables: bounce:
[ 2161.691518] free: free_pcp: free_cma:60234
--------------------------------------------------------------------------------------只有一个node,输出node 0统计信息。
[ 2161.724334] Node 0 active_anon:13072kB inactive_anon:8kB active_file:5084kB inactive_file:357144kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:17128kB dirty:1372kB writeback:0kB shmem:16kB writeback_tmp:0kB unstable:0kB pages_scanned: all_unreclaimable? no
--------------------------------------------------------------------------------------输出Normal zone统计信息。
[ 2161.748626] Normal free:248344kB min:2444kB low:3052kB high:3660kB active_anon:13072kB inactive_anon:8kB active_file:5084kB inactive_file:357144kB unevictable:0kB writepending:1372kB present:1048572kB managed:734568kB mlocked:0kB slab_reclaimable:8076kB slab_unreclaimable:2576kB kernel_stack:608kB pagetables:236kB bounce:0kB free_pcp:796kB local_pcp:796kB free_cma:240936kB
[ 2161.781670] lowmem_reserve[]: 0
---------------------------------------------------------------------------------------输出Normal zone下不同order的空闲情况,包括其中页面属性。
[ 2161.785225] Normal: *4kB (UEC) *8kB (EC) *16kB (UEC) *32kB (UE) *64kB (UE) *128kB (UE) *256kB (EC) *512kB (E) *1024kB (UEC) *2048kB (UEC) 58*4096kB (C) = 248344kB
total pagecache pages
---------------------------------------------------------------------------------------整个平台页面统计信息。
[ 2161.803526] pages RAM
[ 2161.806410] pages HighMem/MovableOnly
[ 2161.810264] pages reserved
[ 2161.813509] pages cma reserved

从stack信息可以得知,alloc_ep_req()是分配内存的起点。

/*
 * Allocate a usb_request for @ep together with a data buffer.
 *
 * For an OUT endpoint the buffer length is @len passed through
 * usb_ep_align(); otherwise it is @len unchanged. Both the request and the
 * buffer are allocated with GFP_ATOMIC.
 *
 * Returns the request, or NULL when either allocation fails (the request
 * is freed again if only the buffer allocation failed).
 */
struct usb_request *alloc_ep_req(struct usb_ep *ep, size_t len)
{
	struct usb_request *request = usb_ep_alloc_request(ep, GFP_ATOMIC);

	if (!request)
		return NULL;

	if (usb_endpoint_dir_out(ep->desc))
		request->length = usb_ep_align(ep, len);
	else
		request->length = len;

	request->buf = kmalloc(request->length, GFP_ATOMIC);
	if (request->buf)
		return request;

	/* No buffer: release the request so the caller sees a clean failure. */
	usb_ep_free_request(ep, request);
	return NULL;
}

3.1 GFP_ATOMIC和__GFP_COMP:页面分配标志

从代码可知此时gfp_mask为GFP_ATOMIC,这种情况是不允许__GFP_DIRECT_RECLAIM页面直接回收的。

#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
/* Marks the request as higher priority. */
#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
/*
 * The caller can neither reclaim pages nor sleep and is high priority;
 * typically used from interrupt handlers.
 */
#define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
/* Actively wake the kswapd thread while allocating. */
#define __GFP_KSWAPD_RECLAIM	((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
/* Compound page flag: treat two or more pages as a single page. */
#define __GFP_COMP	((__force gfp_t)___GFP_COMP)

GFP位掩码定义如下:

/*
 * Raw bit values of the individual GFP flags; a gfp_mask is the bitwise OR
 * of these. Example: the mode 0x2084020 in the sample log decodes to
 * ___GFP_HIGH (0x20) | ___GFP_COMP (0x4000) | ___GFP_ATOMIC (0x80000) |
 * ___GFP_KSWAPD_RECLAIM (0x2000000).
 */
#define ___GFP_DMA        0x01u
#define ___GFP_HIGHMEM 0x02u
#define ___GFP_DMA32 0x04u
#define ___GFP_MOVABLE 0x08u
#define ___GFP_RECLAIMABLE 0x10u
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_COLD 0x100u
#define ___GFP_NOWARN 0x200u
#define ___GFP_REPEAT 0x400u
#define ___GFP_NOFAIL 0x800u
#define ___GFP_NORETRY 0x1000u
#define ___GFP_MEMALLOC 0x2000u
#define ___GFP_COMP 0x4000u
#define ___GFP_ZERO 0x8000u
#define ___GFP_NOMEMALLOC 0x10000u
#define ___GFP_HARDWALL 0x20000u
#define ___GFP_THISNODE 0x40000u
#define ___GFP_ATOMIC 0x80000u
#define ___GFP_ACCOUNT 0x100000u
#define ___GFP_NOTRACK 0x200000u
#define ___GFP_DIRECT_RECLAIM 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
#define ___GFP_WRITE 0x1000000u
#define ___GFP_KSWAPD_RECLAIM 0x2000000u

3.2 gfp和migrate转换,进而alloc_flags:为什么不能使用CMA区域?

gfp_mask决定了申请页面的migratetype,然后在CMA存在的情况下根据migratetype决定是否可用CMA区域。

static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);------------------------------__GFP_HIGH到ALLOC_HIGH转换。 if (gfp_mask & __GFP_ATOMIC) { if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER; alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER; #ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)---------------------------将gfp_mask转换到migratetype,判断是否是MIGRATE_MOVABLE。如果是,则可以在CMA去榆中分配。也就是说必须gfp_flags中包含__GFP_MOVABLE才可以在CMA中分配。
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
} #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)------------------------------___GFP_MOVABLE为0x08,___GFP_RECLAIMABLE为0x10。
#define GFP_MOVABLE_SHIFT 3 static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); if (unlikely(page_group_by_mobility_disabled))
return MIGRATE_UNMOVABLE; /* Group based on mobility */
return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;--------------------------这里面只会与__GFP_RECLAIMABLE|__GFP_MOVABLE,然后右移3bit,就将___GFP_MOVABLE转换到MIGRATE_MOVABLE,将__GFP_RECLAIMABLE转换到MIGRATE_RECLAIMABLE。
}

由此次申请的gfp_mask可知其中没有___GFP_MOVABLE,所以alloc_flags不会包括ALLOC_CMA。反之,如果要复用CMA进行内存申请,需要在gfp_mask中包括__GFP_MOVABLE。

从Normal区域空闲页面可以看出,有58个4MB空闲,但是属于CMA区域。所以申请不成功。

3.3 问题的根源

结合warn_alloc()和实例归纳如下:

1. 虽然存在很多空闲内存,但是alloc_ep_req()无法使用

由于alloc_ep_req()申请内存的gfp_mask为GFP_ATOMIC|__GFP_COMP。

由于不具备__GFP_MOVABLE,所以即使存在很多空闲4MB连续页面,也无法使用,因为这些4MB页面都是CMA的。

[ 2161.785225] Normal: *4kB (UEC) *8kB (EC) *16kB (UEC) *32kB (UE) *64kB (UE) *128kB (UE) *256kB (EC) *512kB (E) *1024kB (UEC) *2048kB (UEC) 58*4096kB (C) = 248344kB-----光4MB CMA就达到了232M,其他只有16MB。

2. 为什么剩下的内存绝大部分是CMA?

从Normal区域空闲页面情况看,绝大部分都是CMA的。但是初始化的时候存在很多其他类型的页面。

通过cat /proc/pagetypeinfo查看前后对比,可以发现Movable类型的页面基本被申请完。

所以这里怀疑是内存泄漏,通过下面脚本跟踪MemFree。

# Print MemFree repeatedly to watch for a steady decline (memory leak).
# NOTE(review): the sleep interval was lost in the original text; 1 second is assumed.
while true; do cat /proc/meminfo | grep MemFree; sleep 1; done

发现内存在不停的下降,达到260M左右的时候出现warn_alloc()。

所以问题的根源在内存泄漏。

3. 如何降低内存碎片?

对内存碎片,可以通过页面规整来解决。请参考《Linux内存管理 (16)内存规整》。

4. 调整/proc/sys/vm/min_free_kbytes

warn_alloc():page allocation failure问题分析的更多相关文章

  1. kernel: swapper: page allocation failure. order:1, mode:0x20

    场景:领导电话通知,我们的主站宕机了,到家后从另外一台机器上ssh一直处于等待状态,开始怀疑机器的负载比较高,后查看监控机器,发现网卡.cpu.nginx连接数.....通通都没有数据了,显然不是负载 ...

  2. Mongodb 故障分享 初始化时"errmsg" : "exception: new file allocation failure" 并且长时间处于STARTUP2

    Hello,大家下午好. 近几天的项目有点赶,所以耽误了更新.现在给大家分享下,在安装mongodb的过程中,遇到的故障一则.其实很小白的问题,当时遇到这个问题的时候比较心慌,浪费了很多时间,跟大家分 ...

  3. Allocation Failure

    up vote 8 down vote accepted "Allocation Failure" is a cause of GC cycle to kick. "Al ...

  4. mongodb new file allocation failure

    话说那天正在向mongodb中写入数据,突然就蹦出了 new file allocation failure ,以为是数据有错误,就检查了一番,可没问题啊,看着像是mongo自己的问题,于是百度了一番 ...

  5. linux page allocation and deallocation

      All of the physical pages in the system are described by the mem_map  data structure which is a li ...

  6. System and method to prioritize large memory page allocation in virtualized systems

    The prioritization of large memory page mapping is a function of the access bits in the L1 page tabl ...

  7. Operating System Memory Management、Page Fault Exception、Cache Replacement Strategy Learning、LRU Algorithm

    目录 . 引言 . 页表 . 结构化内存管理 . 物理内存的管理 . SLAB分配器 . 处理器高速缓存和TLB控制 . 内存管理的概念 . 内存覆盖与内存交换 . 内存连续分配管理方式 . 内存非连 ...

  8. [转]Android WiFi 掉线原因分析

    看到一个比较详细的分析wifi断开的文章.收藏一下. 原文: http://blog.csdn.net/chi_wy/article/details/50963279 原因1 .从Log分析来看,这个 ...

  9. 【转】Android WiFi 经常掉线出现的几个原因分析!

    原因1.从Log分析来看,这个是由于Dhcp request fail 导致最终disconnect . Log 分析如下: 16:53:31.659 958 6525 D NetUtils: dhc ...

随机推荐

  1. leetcode — longest-consecutive-sequence

    import java.util.HashSet; import java.util.Set; /** * Source : https://oj.leetcode.com/problems/long ...

  2. JDK源码分析(8)之 Reference 完全解读

    在阅读本文之前最好对 Reference 框架有一个整体的把握,可以参考我上一篇博客 Reference 框架概览 :本文主要讲了 Reference 的子类实现和应用(SoftReference,W ...

  3. javascript基础修炼(7)——Promise,异步,可靠性

    开发者的javascript造诣取决于对[动态]和[异步]这两个词的理解水平. 一. 别人是开发者,你也是 Promise技术是[javascript异步编程]这个话题中非常重要的,它一度让我感到熟悉 ...

  4. mongodb学习(入门。。。。。)

    db.xs.insert({name:zhangsan})   db:当前数据库  xs:学生集合(没有的话自动创建) show collections   显示当前数据库的集合名字 show dbs ...

  5. Lambda表达式资料整理

    重温委托,匿名方法,Lambda,泛型委托,表达式树   第一:委托 有些教材,博客说到委托都会提到事件,虽然事件是委托的一个实例,但是为了理解起来更简单,今天只谈委托不谈事件.先上一段代码: 下边的 ...

  6. Java 学习笔记 线程控制

    题目一 本质上来说,线程是不可控制的,线程的执行是由CPU资源分配决定的,我们无法干预系统CPU的资源分配,但我们可以增加条件来让线程按照我们的预想顺序来执行. 比如.如果当前的执行的线程不满足我们所 ...

  7. Maven(十五)Maven 聚合

    聚合解决的问题: 解决每个模块之间都要一个一个安装,一键安装各个模块工程 尤其时在配置继承后要先安装子模块在安装父,模块. 配置方式 自己找一个工程作为聚合工程,配置好后在聚合工程上运行Maven i ...

  8. 聊聊我的 Java 自学之路

    最近经常在知乎收到类似『没基础,java 如何自学』.『怎么才能掌握编程』等等问题,再加上发现高中同学也在自学.有感而发,讲讲我的自学之路. 1.1. 大学 高考没正常发挥,考入一所二流的学校,当时分 ...

  9. 高效开发者是如何个性化VS Code插件与配置的?

    2年之前,我放弃了Sublime Text,选择了Visual Studio Code作为代码编辑器. 我每天花在VS Code上的时间长达5~6个小时,因此按照我的需求优化VS Code配置十分必要 ...

  10. mac下 将python2.7改为python3

    1.查看当前电脑python版本 python -V // 显示2.7.x 2.用brew升级python brew update python 3.如果安装成功,去系统目录下回看到两个版本的pyth ...