Linux内核情景分析的alloc_pages
==================== mm/numa.c 43 43 ====================
43 #ifdef CONFIG_DISCONTIGMEM
==================== mm/numa.c 91 128 ====================
91 /*
92 * This can be refined. Currently, tries to do round robin, instead
93 * should do concentratic circle search, starting from current node.
94 */
//分配策略, 所需物理块的大小,2的order次方
95 struct page * alloc_pages(int gfp_mask, unsigned long order)
96 {
97 struct page *ret = 0;
98 pg_data_t *start, *temp;
99 #ifndef CONFIG_NUMA
100 unsigned long flags;
101 static pg_data_t *next = 0;
102 #endif
103
104 if (order >= MAX_ORDER)
105 return NULL;
106 #ifdef CONFIG_NUMA//NUMA结构
107 temp = NODE_DATA(numa_node_id());//可以通过宏操作找到cpu的节数据结构队列
108 #else
109 spin_lock_irqsave(&node_lock, flags);
110 if (!next) next = pgdat_list;
111 temp = next;
112 next = next->node_next;
113 spin_unlock_irqrestore(&node_lock, flags);
114 #endif
/*
函数主要操作2个循环,一个从temp到队列末尾,一个从队头到temp,扫描所有节,直到某节点内存分配成功
*/
115 start = temp;
116 while (temp) {
117 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))//接下来解析此函数
118 return(ret);
119 temp = temp->node_next;
120 }
121 temp = pgdat_list;
122 while (temp != start) {
123 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
124 return(ret);
125 temp = temp->node_next;
126 }
127 return(0);
128 }
85 static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
86 unsigned long order)
87 { //node_zonelist决定分配策略数组
88 return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
89 }
==================== include/linux/mm.h 343 352 ====================
343 #ifndef CONFIG_DISCONTIGMEM//只有这个无定义,才使用uma的__alloc_pages
344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
83
345 {
346 /*
347 * Gets optimized away by the compiler.
348 */
349 if (order >= MAX_ORDER)
350 return NULL;
351 return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);
352 }
如果只分配一个页面,而且要等待完成分配,又不适用于管理的目的
把direct_reclaim设置为1,表示可以从相应的管理区的不活跃干净页面缓冲队列中回收
84发现空闲页面短缺,唤醒以下2个进程,试图腾出一些页面出来
[alloc_pages()>__alloc_pages()]
270 /*
271 * This is the 'heart' of the zoned buddy allocator:
272 */
273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
274 {
275 zone_t **zone;
276 int direct_reclaim = 0;
277 unsigned int gfp_mask = zonelist->gfp_mask;//获取具体的分配策略
278 struct page * page;
279
280 /*
281 * Allocations put pressure on the VM subsystem.
282 */
283 memory_pressure++;//表示内存管理所承受的压力,分配++,归还--
284
285 /*
286 * (If anyone calls gfp from interrupts nonatomically then it
287 * will sooner or later tripped up by a schedule().)
288 *
289 * We are falling back to lower-level zones if allocation
290 * in a higher zone fails.
291 */
292
293 /*
294 如果只分配一个页面,而且要等待完成分配,又不适用于管理的目的
把direct_reclaim设置为1,表示可以从相应的管理区的不活跃干净页面缓冲队列中回收
296 */
297 if (order == 0 && (gfp_mask & __GFP_WAIT) &&
298 !(current->flags & PF_MEMALLOC))
299 direct_reclaim = 1;
300
301 /*
302 * If we are about to get low on free pages and we also have
303 * an inactive page shortage, wake up kswapd.
84发现空闲页面短缺,唤醒以下2个进程,试图腾出一些页面出来
304 */
305 if (inactive_shortage() > inactive_target / 2 && free_shortage())
306 wakeup_kswapd(0);
307 /*
308 * If we are about to get low on free pages and cleaning
309 * the inactive_dirty pages would fix the situation,
310 * wake up bdflush.
311 */
312 else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
313 && nr_inactive_dirty_pages >= freepages.high)
314 wakeup_bdflush(0);
315
//否则有进程(内核线程kreclaimd)在等待队列睡眠,把它唤醒,用于回收一些页面,备用
==================== mm/page_alloc.c 316 340 ====================
[alloc_pages()>__alloc_pages()]
316 try_again:
317 /*
318 * First, see if we have any zones with lots of free memory.
319 *
320 * We allocate free memory first because it doesn't contain
321 * any data ... DUH!
322 */
323 zone = zonelist->zones;//获取管理区指针
324 for (;;) {
325 zone_t *z = *(zone++);//管理区
326 if (!z)
327 break;
328 if (!z->size)
329 BUG();
330//如果管理区的空闲页面大于其最低标准
331 if (z->free_pages >= z->pages_low) {
332 page = rmqueue(z, order);//分配内存,接下来分析此函数
333 if (page)
334 return page;
335 }
//否则有进程(内核线程kreclaimd)在等待队列睡眠,把它唤醒,用于回收一些页面,备用
else if (z->free_pages < z->pages_min &&
336 waitqueue_active(&kreclaimd_wait)) {
85
337 wake_up_interruptible(&kreclaimd_wait);
338 }
339 }
340
[alloc_pages()>__alloc_pages()>rmqueue()]
172 static struct page * rmqueue(zone_t *zone, unsigned long order)
173 {
174 free_area_t * area = zone->free_area + order;//获取其数组对应的元素
175 unsigned long curr_order = order;
176 struct list_head *head, *curr;
177 unsigned long flags;
178 struct page *page;
179
180 spin_lock_irqsave(&zone->lock, flags);//相应管理区加锁
181 do {
182 head = &area->free_list;//头
183 curr = memlist_next(head);//头的下一个节点
184
185 if (curr != head) {//不等于空,说明有物理页块
186 unsigned int index;
187//从非空队列中取出第一个结构page元素
188 page = memlist_entry(curr, struct page, list);
189 if (BAD_RANGE(zone,page))
190 BUG();
191 memlist_del(curr);//删除队列中的元素
192 index = (page - mem_map) - zone->offset;//偏移
193 MARK_USED(index, curr_order, area);//将相应位图设置为1
194 zone->free_pages -= 1 << order;
195//分配成功,把大块剩余的部分分解为小块,链入相应的队列
196 page = expand(zone, page, index, order, curr_order, area);
197 spin_unlock_irqrestore(&zone->lock, flags);
198
199 set_page_count(page, 1);
200 if (BAD_RANGE(zone,page))
201 BUG();
202 DEBUG_ADD_PAGE
203 return page;
204 }
205 curr_order++;
206 area++;
86
207 } while (curr_order < MAX_ORDER);
208 spin_unlock_irqrestore(&zone->lock, flags);
209
210 return NULL;
211 }
[alloc_pages()>__alloc_pages()>rmqueue()>expand()]
/*
low表示所需块大小,high表示实际大小
*/
150 static inline struct page * expand (zone_t *zone, struct page *page,
151 unsigned long index, int low, int high, free_area_t * area)
152 {
153 unsigned long size = 1 << high;
154
155 while (high > low) {
156 if (BAD_RANGE(zone,page))
157 BUG();
158 area--;
159 high--;
160 size >>= 1;//每次减少2的n次方
161 memlist_add_head(&(page)->list, &(area)->free_list);
162 MARK_USED(index, high, area);//标记位图
//处理更低一档的空闲块队列
163 index += size;
164 page += size;
165 }
166 if (BAD_RANGE(zone,page))
167 BUG();
168 return page;
169 }
[alloc_pages()>__alloc_pages()]
341 /*
342 * Try to allocate a page from a zone with a HIGH
343 * amount of free + inactive_clean pages.
344 *
345 * If there is a lot of activity, inactive_target
346 * will be high and we'll have a good chance of
347 * finding a page using the HIGH limit.
348 */
//先用page_high,如果不行再用page_low
349 page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
350 if (page)
351 return page;
352
353 /*
354 * Then try to allocate a page from a zone with more
355 * than zone->pages_low free + inactive_clean pages.
356 *
357 * When the working set is very large and VM activity
358 * is low, we're most likely to have our allocation
359 * succeed here.
360 */
361 page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
362 if (page)
363 return page;
364
[alloc_pages()>__alloc_pages()>__alloc_pages_limit()]
213 #define PAGES_MIN 0
214 #define PAGES_LOW 1
215 #define PAGES_HIGH 2
88
216
217 /*
218 * This function does the dirty work for __alloc_pages
219 * and is separated out to keep the code size smaller.
220 * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
221 */
222 static struct page * __alloc_pages_limit(zonelist_t *zonelist,
223 unsigned long order, int limit, int direct_reclaim)
224 {
225 zone_t **zone = zonelist->zones;
226
227 for (;;) {
228 zone_t *z = *(zone++);
229 unsigned long water_mark;
230
231 if (!z)
232 break;
233 if (!z->size)
234 BUG();
235
236 /*
237 * We allocate if the number of free + inactive_clean
238 * pages is above the watermark.
239 */
240 switch (limit) {
241 default:
242 case PAGES_MIN://通过分配策略,改变水位
243 water_mark = z->pages_min;
244 break;
245 case PAGES_LOW:
246 water_mark = z->pages_low;
247 break;
248 case PAGES_HIGH:
249 water_mark = z->pages_high;
250 }
251//如果空闲页面+干净回收页面大于最低水位
252 if (z->free_pages + z->inactive_clean_pages > water_mark) {
253 struct page *page = NULL;
254 /* 如果空闲页面小于最低水位+8,那就回收. */
255 if (direct_reclaim && z->free_pages < z->pages_min + 8)
256 page = reclaim_page(z);//把inactive_clean_list队列回收页面
257 /* If that fails, fall back to rmqueue. */
258 if (!page)
259 page = rmqueue(z, order);
260 if (page)
261 return page;
262 }
263 }
264
89
265 /* Found nothing. */
266 return NULL;
267 }
[alloc_pages()>__alloc_pages()]
365 /*
366 * OK, none of the zones on our zonelist has lots
367 * of pages free.
368 *
369 * We wake up kswapd, in the hope that kswapd will
370 * resolve this situation before memory gets tight.
371 *
372 * We also yield the CPU, because that:
373 * - gives kswapd a chance to do something
374 * - slows down allocations, in particular the
375 * allocations from the fast allocator that's
376 * causing the problems ...
377 * - ... which minimises the impact the "bad guys"
378 * have on the rest of the system
379 * - if we don't have __GFP_IO set, kswapd may be
380 * able to free some memory we can't free ourselves
381 */
382 wakeup_kswapd(0);//唤醒内核线程,想办法换出一些页面
383 if (gfp_mask & __GFP_WAIT) {//要求必须获取页面,分配不到时等待,那就让系统再调用一次(目的为了调度kswapd线程)
//以此获取一些页面
384 __set_current_state(TASK_RUNNING);
385 current->policy |= SCHED_YIELD;
386 schedule();
387 }
388
389 /*
390 * After waking up kswapd, we try to allocate a page
391 * from any zone which isn't critical yet.
392 *
393 * Kswapd should, in most situations, bring the situation
394 * back to normal in no time.
395 */
/*
如果不允许等待,那就用pages_min再调用一次__alloc_pages_limit
*/
396 page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
397 if (page)
398 return page;
399
==================== mm/page_alloc.c 400 477 ====================
[alloc_pages()>__alloc_pages()]
400 /*
401 * Damn, we didn't succeed.
402 *
403 * This can be due to 2 reasons:
404 * - we're doing a higher-order allocation
405 * --> move pages to the free list until we succeed
406 * - we're /really/ tight on memory
407 * --> wait on the kswapd waitqueue until memory is freed
408 */
409 if (!(current->flags & PF_MEMALLOC)) {
410 /*
411 * Are we dealing with a higher order allocation?
412 *
413 * Move pages from the inactive_clean to the free list
414 * in the hope of creating a large, physically contiguous
415 * piece of free memory.
416 */
417 if (order > 0 && (gfp_mask & __GFP_WAIT)) {
418 zone = zonelist->zones;
419 /* First, clean some dirty pages. */
420 current->flags |= PF_MEMALLOC;
421 page_launder(gfp_mask, 1);//把脏页洗干净(页面的定期换出)
422 current->flags &= ~PF_MEMALLOC;
423 for (;;) {
424 zone_t *z = *(zone++);//通过一个for循环把干净页面等待队列的页面回收
425 if (!z)
426 break;
427 if (!z->size)
428 continue;
//是否有干净页面
429 while (z->inactive_clean_pages) {
430 struct page * page;
431 /* Move one page to the free list. */
432 page = reclaim_page(z);//回收干净页面等待队列
433 if (!page)
434 break;
91
435 __free_page(page);//通过__free_page释放页面的同时,把空闲页面拼接成大的页面块
436 /* Try if the allocation succeeds. */
437 page = rmqueue(z, order);//试图再次请求成功
438 if (page)
439 return page;
440 }
441 }
442 }
443 /*
444 * When we arrive here, we are really tight on memory.
445 *
446 * We wake up kswapd and sleep until kswapd wakes us
447 * up again. After that we loop back to the start.
448 *
449 * We have to do this because something else might eat
450 * the memory kswapd frees for us and we need to be
451 * reliable. Note that we don't loop back for higher
452 * order allocations since it is possible that kswapd
453 * simply cannot free a large enough contiguous area
454 * of memory *ever*.
455 */
/*
如果依旧失败,而且必须要求分配到页面,那就等待,进程睡眠
*/
456 if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
457 wakeup_kswapd(1);//唤醒kswaped,要求分配页面进程睡眠,等待kswapd完成一轮运行再唤醒需要页面的进程
458 memory_pressure++;
459 if (!order)//如果要求分配的是1个页面,跳到try_again
460 goto try_again;
461 /*
462 * If __GFP_IO isn't set, we can't wait on kswapd because
463 * kswapd just might need some IO locks /we/ are holding ...
464 *
465 * SUBTLE: The scheduling point above makes sure that
466 * kswapd does get the chance to free memory we can't
467 * free ourselves...
468 */
469 } else if (gfp_mask & __GFP_WAIT) {
470 try_to_free_pages(gfp_mask);//另外一种方案...直接调用此函数获取页面(本来就是kswaped函数调用的)
471 memory_pressure++;
472 if (!order)
473 goto try_again;
474 }
475
476 }
477
[alloc_pages()>__alloc_pages()]
478 /*
479 * Final phase: allocate anything we can!
480 *
481 * Higher order allocations, GFP_ATOMIC allocations and
482 * recursive allocations (PF_MEMALLOC) end up here.
483 *
484 * Only recursive allocations can use the very last pages
485 * in the system, otherwise it would be just too easy to
486 * deadlock the system...
487 */
488 zone = zonelist->zones;
489 for (;;) {
490 zone_t *z = *(zone++);
491 struct page * page = NULL;
492 if (!z)
493 break;
494 if (!z->size)
495 BUG();
496
497 /*
498 * SUBTLE: direct_reclaim is only possible if the task
499 * becomes PF_MEMALLOC while looping above. This will
500 * happen when the OOM killer selects this task for
501 * instant execution...
93
502 */
503 if (direct_reclaim) {
504 page = reclaim_page(z);
505 if (page)
506 return page;
507 }
508
509 /* XXX: is pages_min/4 a good amount to reserve for this? */
510 if (z->free_pages < z->pages_min / 4 &&
511 !(current->flags & PF_MEMALLOC))
512 continue;
513 page = rmqueue(z, order);
514 if (page)
515 return page;
516 }
517
518 /* No luck.. */
519 printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
520 return NULL;
521 }
Linux内核情景分析的alloc_pages的更多相关文章
- linux内核情景分析之execve()
用来描述用户态的cpu寄存器在内核栈中保存情况.可以获取用户空间的信息 struct pt_regs { long ebx; //可执行文件路径的指针(regs.ebx中 long ecx; //命令 ...
- Linux内核情景分析之消息队列
早期的Unix通信只有管道与信号,管道的缺点: 所载送的信息是无格式的字节流,不知道分界线在哪,也没通信规范,另外缺乏控制手段,比如保温优先级,管道机制的大小只有1页,管道很容易写满而读取没有及时,发 ...
- Linux内核情景分析之异常访问,用户堆栈的扩展
情景假设: 在堆内存中申请了一块内存,然后释放掉该内存,然后再去访问这块内存.也就是所说的野指针访问. 当cpu产生页面错误时,会把失败的线性地址放在cr2寄存器.线性地址缺页异常的4种情况 1.如果 ...
- linux内核情景分析之exit与Wait
//第一层系统调用 asmlinkage long sys_exit(int error_code) { do_exit((error_code&0xff)<<8); } 其主体是 ...
- linux内核情景分析之内核中的互斥操作
信号量机制: struct sempahore是其结构,定义如下 struct semaphore { atomic_t count;//资源数目 int sleepers;//等待进程数目 wait ...
- linux内核情景分析之命名管道
管道是一种"无名","无形文件,只可以近亲进程使用,不可以再任意两个进程通信使用,所以只能实现"有名","有形"的文件来实现就可以 ...
- linux内核情景分析之信号实现
信号在进程间通信是异步的,每个进程的task_struct结构有一个sig指针,指向一个signal_struct结构 定义如下 struct signal_struct { atomic_t cou ...
- linux内核情景分析之强制性调度
从系统调用返回到用户空间是否调度,从ret_with_reschedule可看出,是否真正调度,取决于当前进程的pcb中的need_resched是否设置为1,那如何设置为1取决于以下几种情况: 时间 ...
- linux内核情景分析之匿名管道
管道的机制由pipe()创建,由pipe()所建立的管道两端都在同一进程.所以必须在fork的配合下,才可以在具有亲缘关系的进程通信 /* * sys_pipe() is the normal C c ...
随机推荐
- git重新下载项目
file-new-project from version control - git 修改网址为需要的网址
- python向多个邮箱发邮件--注意接收是垃圾邮件
群发邮件注意:三处标红的地方 # -*- coding: UTF-8 -*- import smtplib from email.mime.text import MIMEText from emai ...
- python基础之函数参数、嵌套、返回值、对象、命名空间和作用域
函数的使用原则 函数的使用必须遵循:先定义后使用的原则 函数的定义,与变量的定义是相似的,如果没有事先定义函数而直接引用就相当于在引用一个不存在变量名 定义阶段:只检测语法,不执行代码,当出现语法错误 ...
- 使用 Ajax
Ajax( Asynchronous JavaScript and XML) 在 Ajax 中 Asynchronous 是指异步, 代表 客户端(Client 通常是指浏览器) 可以向服务器(Ser ...
- P2341 [HAOI2006]受欢迎的牛(tarjan+缩点)
P2341 [HAOI2006]受欢迎的牛 题目描述 每头奶牛都梦想成为牛棚里的明星.被所有奶牛喜欢的奶牛就是一头明星奶牛.所有奶 牛都是自恋狂,每头奶牛总是喜欢自己的.奶牛之间的“喜欢”是可以传递的 ...
- 树莓派Raspberry Pi 3安装步骤
一.需要的硬件 1.Raspberry Pi 3(Model B+)树莓派.购买>https://item.jd.com/29225467867.html 2.输出5V/2A的电源 3.SD卡( ...
- php导出数据为CSV文件DEMO
代码示例: private function _download_send_headers($filename) { // disable caching $now = gmdate("D, ...
- winform对图片进行灰度处理
//图片进行灰度处理 //originalImage为原图像 返回灰度图像 private Bitmap GrayImage(Bitmap originalImage) { ImageAttribut ...
- Codeforces Round #326(Div2)
CodeForces 588A 题意:Duff喜欢吃肉,想在接下来的n天,每天都有Ai斤肉吃,但每一天肉的单价Pi不定,肉 可以保存不过期,现已知n天每天肉的斤数Ai,以及单价Pi,为了使每天都 ...
- Android可移动的Button
关键 package com.example.administrator.mystudent.ButtonMove; import android.app.Activity; import andro ...