The previous articles covered the preparatory work for the Linux memory management algorithm (the buddy allocator).
The initialization of the algorithm itself continues from start_kernel(); the next function down the line is mm_init():
【file:/init/main.c】
/*
 * Set up kernel memory allocators
 */
static void __init mm_init(void)
{
    /*
     * page_cgroup requires contiguous pages,
     * bigger than MAX_ORDER unless SPARSEMEM.
     */
    page_cgroup_init_flatmem();
    mem_init();
    kmem_cache_init();
    percpu_init_late();
    pgtable_init();
    vmalloc_init();
}
At first glance this is just a handful of function calls, but far more is going on here than it appears. page_cgroup_init_flatmem() is cgroup-related; mem_init() performs the initialization of the buddy management algorithm; kmem_cache_init() initializes the kernel's slub allocation framework; and vmalloc_init() initializes vmalloc.
Since the focus here is the buddy algorithm, only mem_init() is analyzed in detail; the others are left for later.
The entry point of buddy allocator initialization is mem_init(); its implementation:
【file:/arch/x86/mm/init_32.c】
void __init mem_init(void)
{
    pci_iommu_alloc();

#ifdef CONFIG_FLATMEM
    BUG_ON(!mem_map);
#endif
    /*
     * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
     * be done before free_all_bootmem(). Memblock use free low memory for
     * temporary data (see find_range_array()) and for this purpose can use
     * pages that was already passed to the buddy allocator, hence marked as
     * not accessible in the page tables when compiled with
     * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
     * important here.
     */
    set_highmem_pages_init();

    /* this will put all low memory onto the freelists */
    free_all_bootmem();

    after_bootmem = 1;

    mem_init_print_info(NULL);
    printk(KERN_INFO "virtual kernel memory layout:\n"
        "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
        "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#endif
        "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
        "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
        "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
        FIXADDR_START, FIXADDR_TOP,
        (FIXADDR_TOP - FIXADDR_START) >> 10,
#ifdef CONFIG_HIGHMEM
        PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
        (LAST_PKMAP*PAGE_SIZE) >> 10,
#endif
        VMALLOC_START, VMALLOC_END,
        (VMALLOC_END - VMALLOC_START) >> 20,
        (unsigned long)__va(0), (unsigned long)high_memory,
        ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
        (unsigned long)&__init_begin, (unsigned long)&__init_end,
        ((unsigned long)&__init_end -
         (unsigned long)&__init_begin) >> 10,
        (unsigned long)&_etext, (unsigned long)&_edata,
        ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
        (unsigned long)&_text, (unsigned long)&_etext,
        ((unsigned long)&_etext - (unsigned long)&_text) >> 10);

    /*
     * Check boundaries twice: Some fundamental inconsistencies can
     * be detected at build time already.
     */
#define __FIXADDR_TOP (-PAGE_SIZE)
#ifdef CONFIG_HIGHMEM
    BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
    BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
#define high_memory (-128UL << 20)
    BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP
#ifdef CONFIG_RANDOMIZE_BASE
    BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
#endif

#ifdef CONFIG_HIGHMEM
    BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
    BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
    BUG_ON(VMALLOC_START >= VMALLOC_END);
    BUG_ON((unsigned long)high_memory > VMALLOC_START);

    if (boot_cpu_data.wp_works_ok < 0)
        test_wp_bit();
}
pci_iommu_alloc() is not closely related to the buddy algorithm, but it is worth a brief note:
【file:/arch/x86/kernel/pci-dma.c】
void __init pci_iommu_alloc(void)
{
    struct iommu_table_entry *p;

    sort_iommu_table(__iommu_table, __iommu_table_end);
    check_iommu_entries(__iommu_table, __iommu_table_end);

    for (p = __iommu_table; p < __iommu_table_end; p++) {
        if (p && p->detect && p->detect() > 0) {
            p->flags |= IOMMU_DETECTED;
            if (p->early_init)
                p->early_init();
            if (p->flags & IOMMU_FINISH_IF_DETECTED)
                break;
        }
    }
}
This function first sorts and sanity-checks the iommu table, then walks its entries, calling each registered detect() hook and, when detection succeeds, the corresponding early_init() initializer.
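Entries end up in __iommu_table through the IOMMU_INIT family of macros from arch/x86/include/asm/iommu_table.h. As an illustration (taken from the swiotlb registration of this kernel era; verify against your tree), a driver registers itself roughly like this:

/* Places the detect/depend/init/late_init hooks into the .iommu_table
 * section; the _FINISH variant also sets IOMMU_FINISH_IF_DETECTED. */
IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
                  pci_xen_swiotlb_detect,
                  pci_swiotlb_init,
                  pci_swiotlb_late_init);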
The following set_highmem_pages_init() is where the buddy allocator work really begins:
【file:/arch/x86/mm/highmem_32.c】
void __init set_highmem_pages_init(void)
{
    struct zone *zone;
    int nid;

    /*
     * Explicitly reset zone->managed_pages because set_highmem_pages_init()
     * is invoked before free_all_bootmem()
     */
    reset_all_zones_managed_pages();
    for_each_zone(zone) {
        unsigned long zone_start_pfn, zone_end_pfn;

        if (!is_highmem(zone))
            continue;

        zone_start_pfn = zone->zone_start_pfn;
        zone_end_pfn = zone_start_pfn + zone->spanned_pages;

        nid = zone_to_nid(zone);
        printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
            zone->name, nid, zone_start_pfn, zone_end_pfn);

        add_highpages_with_active_regions(nid, zone_start_pfn,
                          zone_end_pfn);
    }
}
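For each highmem zone, this hands the zone's pfn range to add_highpages_with_active_regions(). A sketch of that function, reconstructed from arch/x86/mm/init_32.c of the same kernel generation (verify against your source tree):

void __init add_highpages_with_active_regions(int nid,
             unsigned long start_pfn, unsigned long end_pfn)
{
    phys_addr_t start, end;
    u64 i;

    for_each_free_mem_range(i, nid, &start, &end, NULL) {
        unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
                                    start_pfn, end_pfn);
        unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
                                      start_pfn, end_pfn);

        for ( ; pfn < e_pfn; pfn++)
            if (pfn_valid(pfn))
                free_highmem_page(pfn_to_page(pfn));
    }
}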
Here for_each_free_mem_range(i, nid, &start, &end, NULL) iterates over the free regions recorded by the memblock algorithm, and clamp_t() trims each region to the zone's pfn range, removing memory holes. The inner for ( ; pfn < e_pfn; pfn++) loop then releases every page of the region via free_highmem_page(), where if (pfn_valid(pfn)) checks that the page frame is valid and pfn_to_page(pfn) converts the page frame number into its struct page management structure.
Going one level deeper into free_highmem_page():
【file:/mm/page_alloc.c】
void free_highmem_page(struct page *page)
{
    __free_reserved_page(page);
    totalram_pages++;
    page_zone(page)->managed_pages++;
    totalhigh_pages++;
}
totalram_pages records the total number of pages in the system; page_zone(page)->managed_pages counts the pages managed by the page's zone; totalhigh_pages records the total number of high memory pages.
A concrete look at __free_reserved_page():
【file:/include/linux/mm.h】
/* Free the reserved page into the buddy system, so it gets managed. */
static inline void __free_reserved_page(struct page *page)
{
    ClearPageReserved(page);
    init_page_count(page);
    __free_page(page);
}
ClearPageReserved is generated by the CLEARPAGEFLAG macro in /include/linux/page-flags.h:

#define CLEARPAGEFLAG(uname, lname) \
static inline void ClearPage##uname(struct page *page) \
    { clear_bit(PG_##lname, &page->flags); }

It clears the PG_reserved bit in the page's flags, marking the page as part of dynamic memory.
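For instance, instantiating the macro for the reserved flag (which page-flags.h does through its PAGEFLAG(Reserved, reserved) line) expands to:

static inline void ClearPageReserved(struct page *page)
{
    clear_bit(PG_reserved, &page->flags);
}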
Next, init_page_count() sets the page's _count reference counter to 1, preparing the page for __free_page() to release it into the memory management algorithm. Finally comes __free_page(): this function both feeds pages into the buddy allocator during initialization and serves as the buddy allocator's regular page-freeing operation. Its implementation is set aside for now and examined in depth later.
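For reference, init_page_count() is a one-liner; as found in include/linux/mm.h of this kernel generation:

static inline void init_page_count(struct page *page)
{
    atomic_set(&page->_count, 1);
}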
Back in mem_init(), the next call is free_all_bootmem():
【file:/mm/nobootmem.c】
unsigned long __init free_all_bootmem(void)
{
    unsigned long pages;

    reset_all_zones_managed_pages();

    /*
     * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
     * because in some case like Node0 doesn't have RAM installed
     * low ram will be on Node1
     */
    pages = free_low_memory_core_early();
    totalram_pages += pages;

    return pages;
}
reset_all_zones_managed_pages() resets the managed_pages member of every zone structure; the part worth dwelling on is free_low_memory_core_early():
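A sketch of its implementation, reconstructed from mm/nobootmem.c of this kernel generation (verify against your tree):

static unsigned long __init free_low_memory_core_early(void)
{
    unsigned long count = 0;
    phys_addr_t start, end;
    u64 i;

    for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
        count += __free_memory_core(start, end);

#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
    {
        phys_addr_t size;

        /* Free memblock.reserved array if it was allocated */
        size = get_allocated_memblock_reserved_regions_info(&start);
        if (size)
            count += __free_memory_core(start, start + size);

        /* Free memblock.memory array if it was allocated */
        size = get_allocated_memblock_memory_regions_info(&start);
        if (size)
            count += __free_memory_core(start, start + size);
    }
#endif

    return count;
}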
The function walks memblock's free memory regions with for_each_free_mem_range() and releases each of them through __free_memory_core(). The following get_allocated_memblock_reserved_regions_info() and get_allocated_memblock_memory_regions_info() calls locate the region arrays that the memblock algorithm allocated dynamically for its own bookkeeping so that they, too, can be released; if the management arrays are still the statically defined memblock_reserved_init_regions and memblock_memory_init_regions, they are kept and not freed.
Finally, a closer look at __free_memory_core(). Its own body is thin: it converts the physical address range into page frame numbers (clamping the end to max_low_pfn) and delegates to __free_pages_memory().
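A sketch of __free_memory_core(), reconstructed from mm/nobootmem.c of the same kernel generation (verify against your tree):

static unsigned long __init __free_memory_core(phys_addr_t start,
                 phys_addr_t end)
{
    unsigned long start_pfn = PFN_UP(start);
    unsigned long end_pfn = min_t(unsigned long,
                      PFN_DOWN(end), max_low_pfn);

    if (start_pfn > end_pfn)
        return 0;

    __free_pages_memory(start_pfn, end_pfn);

    return end_pfn - start_pfn;
}

The real work happens in __free_pages_memory():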
【file:/mm/nobootmem.c】
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
    int order;

    while (start < end) {
        order = min(MAX_ORDER - 1UL, __ffs(start));

        while (start + (1UL << order) > end)
            order--;

        __free_pages_bootmem(pfn_to_page(start), order);

        start += (1UL << order);
    }
}
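The loop carves the pfn range into the largest naturally aligned power-of-two blocks the buddy system accepts: __ffs(start) yields the alignment of the current start pfn, capped at MAX_ORDER - 1, and the inner while shrinks the block whenever it would overrun end. A minimal user-space simulation of this chunking logic (hypothetical pfn values; __ffs() replaced with the equivalent GCC builtin):

#include <stdio.h>

#define MAX_ORDER 11    /* same value as the kernel default */

int main(void)
{
    unsigned long start = 0x305, end = 0x1000;  /* hypothetical free range */

    while (start < end) {
        /* __builtin_ctzl() counts trailing zero bits, like __ffs() */
        unsigned long order = __builtin_ctzl(start);

        if (order > MAX_ORDER - 1UL)
            order = MAX_ORDER - 1UL;
        while (start + (1UL << order) > end)
            order--;

        printf("free pfn 0x%05lx, order %2lu (%4lu pages)\n",
               start, order, 1UL << order);
        start += 1UL << order;
    }
    return 0;
}

Running it shows the range being released as 1-, 2-, 8-, 16-, ... page blocks until the start pfn reaches 1024-page alignment, after which whole order-10 blocks go to the allocator.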
Inside it, __free_pages_bootmem():
【file:/mm/page_alloc.c】
void __init __free_pages_bootmem(struct page *page, unsigned int order)
{
    unsigned int nr_pages = 1 << order;
    struct page *p = page;
    unsigned int loop;

    prefetchw(p);
    for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
        prefetchw(p + 1);
        __ClearPageReserved(p);
        set_page_count(p, 0);
    }
    __ClearPageReserved(p);
    set_page_count(p, 0);

    page_zone(page)->managed_pages += nr_pages;
    set_page_refcounted(page);
    __free_pages(page, order);
}
As can be seen, the function clears the reserved flag and zeroes the _count of every page in the block, adds the block to the zone's managed_pages, gives the head page a reference count of 1 via set_page_refcounted(), and in the end still calls __free_pages() to release the pages. That function will be analyzed in a dedicated section later.
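One detail worth noting: set_page_refcounted() supplies the nonzero reference count that __free_pages() expects to drop. A sketch of it as defined in mm/internal.h of this kernel generation (reconstructed; verify against your tree):

/*
 * Turn a non-refcounted page (->_count == 0) into refcounted with
 * a count of one.
 */
static inline void set_page_refcounted(struct page *page)
{
    VM_BUG_ON_PAGE(PageTail(page), page);
    VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
    set_page_count(page, 1);
}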
With that, the initialization of the buddy management algorithm is complete.