Linux内存管理（深入理解Linux内核）

Linux的内存管理，实际上是借助80x86的硬件分段和分页电路，将逻辑地址转化为物理地址的。

物理内存中，有一部分是永久(Permanently)映射给内核使用的，这部分主要用于保存内核的代码，以及内核中静态的数据结构。之所以要将这些物理内存永久映射给内核，是因为这些内容（代码、静态数据结构）在整个操作系统运行过程中会被不断地引用，如果通过动态分配和翻译的方式来维护它们在物理内存中的位置，就会耗费太多的CPU时间。

这种方式可以理解为以空间换时间的策略。

物理内存中的其余部分，是动态内存。

动态内存是一项珍贵的资源，不仅被用户态的各个进程所需要，内核本身也是需要的。

Page Frame Management

Memory Area Management

是两种管理物理上连续的内存区域的方式。

Noncontiguous Memory Area Management

是处理物理上不连续的内存区域的方式。

Page Descriptor

内核必须维护每一个Page的当前状态，

比如，它必须能够区分某个物理页现在是被谁在使用：

1. 用户态的进程

2. 内核态的代码

3. 内核态的数据结构

同样，它也必须能够区分一个在Dynamic Memory中分配的物理内存页现在处于哪种状态：

1. 释放状态

2. 存储用户态的进程的数据

3. 存储一个软件的Cache

4. 存储动态分配的内核的数据结构

5. 存储设备驱动的缓存数据

6. 存储内核模块的代码

每个页的描述结构体，都存储成一个struct page的实例，这些实例保存在mem_map数组中。

每个struct page大小为32字节，因此在页大小为4096字节时，大约会耗费不到1%（32 / 4096 ≈ 0.8%）的物理内存来保存这个数组。

内核提供以下几个宏，来获得page结构体的位置：

   1: #define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)

2:

   3: /*

   4:  * supports 3 memory models.

   5:  */

   6: #if defined(CONFIG_FLATMEM)

7:

   8: #define __pfn_to_page(pfn)    (mem_map + ((pfn) - ARCH_PFN_OFFSET))

   9: #define __page_to_pfn(page)    ((unsigned long)((page) - mem_map) + \

  10:                  ARCH_PFN_OFFSET)

  11: #elif defined(CONFIG_DISCONTIGMEM)

12:

  13: #define __pfn_to_page(pfn)            \

  14: ({    unsigned long __pfn = (pfn);        \

  15:     unsigned long __nid = arch_pfn_to_nid(__pfn);  \

  16:     NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\

  17: })

18:

  19: #define __page_to_pfn(pg)                        \

  20: ({    struct page *__pg = (pg);                    \

  21:     struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));    \

  22:     (unsigned long)(__pg - __pgdat->node_mem_map) +            \

  23:      __pgdat->node_start_pfn;                    \

  24: })

25:

  26: #elif defined(CONFIG_SPARSEMEM_VMEMMAP)

27:

  28: /* memmap is virtually contiguous.  */

  29: #define __pfn_to_page(pfn)    (vmemmap + (pfn))

  30: #define __page_to_pfn(page)    (unsigned long)((page) - vmemmap)

31:

  32: #elif defined(CONFIG_SPARSEMEM)

  33: /*

  34:  * Note: section's mem_map is encorded to reflect its start_pfn.

  35:  * section[i].section_mem_map == mem_map's address - start_pfn;

  36:  */

  37: #define __page_to_pfn(pg)                    \

  38: ({    struct page *__pg = (pg);                \

  39:     int __sec = page_to_section(__pg);            \

  40:     (unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec)));    \

  41: })

42:

  43: #define __pfn_to_page(pfn)                \

  44: ({    unsigned long __pfn = (pfn);            \

  45:     struct mem_section *__sec = __pfn_to_section(__pfn);    \

  46:     __section_mem_map_addr(__sec) + __pfn;        \

  47: })

  48: #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */

49:

  50: #define page_to_pfn __page_to_pfn

  51: #define pfn_to_page __pfn_to_page

Page结构体

   1: /*

   2:  * Each physical page in the system has a struct page associated with

   3:  * it to keep track of whatever it is we are using the page for at the

   4:  * moment. Note that we have no way to track which tasks are using

   5:  * a page, though if it is a pagecache page, rmap structures can tell us

   6:  * who is mapping it.

   7:  */

   8: struct page {

   9:     unsigned long flags;        /* Atomic flags, some possibly

  10:                      * updated asynchronously */

  11:     atomic_t _count;        /* Usage count, see below. */

  12:     union {

  13:         /*

  14:          * Count of ptes mapped in

  15:          * mms, to show when page is

  16:          * mapped & limit reverse map

  17:          * searches.

  18:          *

  19:          * Used also for tail pages

  20:          * refcounting instead of

  21:          * _count. Tail pages cannot

  22:          * be mapped and keeping the

  23:          * tail page _count zero at

  24:          * all times guarantees

  25:          * get_page_unless_zero() will

  26:          * never succeed on tail

  27:          * pages.

  28:          */

  29:         atomic_t _mapcount;

30:

  31:         struct {        /* SLUB */

  32:             u16 inuse;

  33:             u16 objects;

  34:         };

  35:     };

  36:     union {

  37:         struct {

  38:         unsigned long private;        /* Mapping-private opaque data:

  39:                           * usually used for buffer_heads

  40:                          * if PagePrivate set; used for

  41:                          * swp_entry_t if PageSwapCache;

  42:                          * indicates order in the buddy

  43:                          * system if PG_buddy is set.

  44:                          */

  45:         struct address_space *mapping;    /* If low bit clear, points to

  46:                          * inode address_space, or NULL.

  47:                          * If page mapped as anonymous

  48:                          * memory, low bit is set, and

  49:                          * it points to anon_vma object:

  50:                          * see PAGE_MAPPING_ANON below.

  51:                          */

  52:         };

  53: #if USE_SPLIT_PTLOCKS

  54:         spinlock_t ptl;

  55: #endif

  56:         struct kmem_cache *slab;    /* SLUB: Pointer to slab */

  57:         struct page *first_page;    /* Compound tail pages */

  58:     };

  59:     union {

  60:         pgoff_t index;        /* Our offset within mapping. */

  61:         void *freelist;        /* SLUB: freelist req. slab lock */

  62:     };

  63:     struct list_head lru;        /* Pageout list, eg. active_list

  64:                      * protected by zone->lru_lock !

  65:                      */

  66:     /*

  67:      * On machines where all RAM is mapped into kernel address space,

  68:      * we can simply calculate the virtual address. On machines with

  69:      * highmem some memory is mapped into kernel virtual memory

  70:      * dynamically, so we need a place to store that address.

  71:      * Note that this field could be 16 bits on x86 ... ;)

  72:      *

  73:      * Architectures with slow multiplication can define

  74:      * WANT_PAGE_VIRTUAL in asm/page.h

  75:      */

  76: #if defined(WANT_PAGE_VIRTUAL)

  77:     void *virtual;            /* Kernel virtual address (NULL if

  78:                        not kmapped, ie. highmem) */

  79: #endif /* WANT_PAGE_VIRTUAL */

  80: #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS

  81:     unsigned long debug_flags;    /* Use atomic bitops on this */

  82: #endif

83:

  84: #ifdef CONFIG_KMEMCHECK

  85:     /*

  86:      * kmemcheck wants to track the status of each byte in a page; this

  87:      * is a pointer to such a status block. NULL if not tracked.

  88:      */

  89:     void *shadow;

  90: #endif

  91: };

1. flags, 保存当前页状态的一组位标志（类型为unsigned long，各个位的含义由下面的enum pageflags定义），同时也存储了当前页所在的zone号(also encodes the zone number to which the page frame belongs)

其中各个状态的定义如下：

   1: enum pageflags {

   2:     PG_locked,        /* Page is locked. Don't touch. */

   3:     PG_error,

   4:     PG_referenced,

   5:     PG_uptodate,

   6:     PG_dirty,

   7:     PG_lru,

   8:     PG_active,

   9:     PG_slab,

  10:     PG_owner_priv_1,    /* Owner use. If pagecache, fs may use*/

  11:     PG_arch_1,

  12:     PG_reserved,

  13:     PG_private,        /* If pagecache, has fs-private data */

  14:     PG_private_2,        /* If pagecache, has fs aux data */

  15:     PG_writeback,        /* Page is under writeback */

  16: #ifdef CONFIG_PAGEFLAGS_EXTENDED

  17:     PG_head,        /* A head page */

  18:     PG_tail,        /* A tail page */

  19: #else

  20:     PG_compound,        /* A compound page */

  21: #endif

  22:     PG_swapcache,        /* Swap page: swp_entry_t in private */

  23:     PG_mappedtodisk,    /* Has blocks allocated on-disk */

  24:     PG_reclaim,        /* To be reclaimed asap */

  25:     PG_swapbacked,        /* Page is backed by RAM/swap */

  26:     PG_unevictable,        /* Page is "unevictable"  */

  27: #ifdef CONFIG_MMU

  28:     PG_mlocked,        /* Page is vma mlocked */

  29: #endif

  30: #ifdef CONFIG_ARCH_USES_PG_UNCACHED

  31:     PG_uncached,        /* Page has been mapped as uncached */

  32: #endif

  33: #ifdef CONFIG_MEMORY_FAILURE

  34:     PG_hwpoison,        /* hardware poisoned page. Don't touch */

  35: #endif

  36: #ifdef CONFIG_TRANSPARENT_HUGEPAGE

  37:     PG_compound_lock,

  38: #endif

  39:     __NR_PAGEFLAGS,

40:

  41:     /* Filesystems */

  42:     PG_checked = PG_owner_priv_1,

43:

  44:     /* Two page bits are conscripted by FS-Cache to maintain local caching

  45:      * state.  These bits are set on pages belonging to the netfs's inodes

  46:      * when those inodes are being locally cached.

  47:      */

  48:     PG_fscache = PG_private_2,    /* page backed by cache */

49:

  50:     /* XEN */

  51:     PG_pinned = PG_owner_priv_1,

  52:     PG_savepinned = PG_dirty,

53:

  54:     /* SLOB */

  55:     PG_slob_free = PG_private,

56:

  57:     /* SLUB */

  58:     PG_slub_frozen = PG_active,

  59: };

其中，内核定义了方便操作状态的宏：

   1: PageXXX()

2:

   3: SetPageXXX()

4:

   5: ClearPageXXX()

分别用于查询、设置和清除相应的状态位。

2. _count, 引用计数

   1: page_count()

可以用于查询引用计数

The pool of Reserved Page Frames

保留的页分配池

当分配内存页时，可能会出现两种情况：

1. 空闲的内存页足够，分配立即成功；

2. 空闲的内存页不足够，必须进行内存回收(Memory Reclaiming), 而申请内存页的内核控制路径(Kernel Control Path)必须被block直到有足够的空闲内存页出现。

但是，某些Kernel Control Path是不能够被block的，比如：

1. 正在处理中断的Handler;

2. 处在关键区中的代码(Critical Section)

这些Kernel Control Path在申请内存页时，应该使用GFP_ATOMIC标志，该标志表示申请不应该被block，如果没有足够的内存页，就直接失败而返回。

但是内核必须尽量保证GFP_ATOMIC类型的申请能够成功，因此内核保留了一定数量的物理内存页，这些内存页仅供内存不足(Low-On-Memory)时的GFP_ATOMIC申请使用。

通常会分配min_free_kbytes（这么多KB）的内存来作为Pool。

通过公式

   1: reserved_pool_size = floor(sqrt(16 * (ZONE_DMA + ZONE_NORMAL)))

来计算，而且限制在128~65536KB之间。

而且会按照ZONE_DMA与ZONE_NORMAL各自的大小比例，在这两个zone之间分配各自需要保留的内存页数量。

Linux内核能够直接映射的线性地址范围为3GB~3GB+896MB。

如果分配这个范围内的物理内存页，那么可以直接返回分配到的页的线性内存地址。但是如果分配的物理内存不是这个范围内的，无法直接返回其对应的内核空间的线性地址，但是可以返回页结构体（struct page）的地址，因为所有物理内存页的页结构体都存放在mem_map中。

这种方式的限制是，如果分配到的是线性地址空间3GB~3GB+896MB范围之外的内存，就需要为该物理内存页建立到线性地址的映射，即重新设置页表。

对页表的额外操作，使得分配这些内存就不如分配固定映射的那部分内存来得高效。

高端内存映射参考：http://linux.chinaitlab.com/administer/831348.html