memblock演算法是linux內核初始化階段的一個記憶體分配器(它取代了原來的bootmem演算法),實現較為簡單,負責page allocator初始化之前的記憶體管理和分配請求。本文從兩點入手分析memblock演算法:一是memblock演算法的初始化;二是memblock演算法對記憶體的申請和釋放。
memblock演算法是linux內核初始化階段的一個記憶體分配器(它取代了原來的bootmem演算法),實現較為簡單。負責page allocator初始化之前的記憶體管理和分配請求。
分析memblock演算法,可以從幾點入手:
- memblock演算法初始化;
- memblock演算法管理記憶體的申請和釋放;
memblock演算法前的準備:
前面已經分析了linux系統在初始化的過程中,使用int 15中斷探知了機器的記憶體分佈圖(e820圖),其數據是存儲在boot_params.e820_map裡面,這裡面是沒有經過整理的數據,雜亂無章,畢竟BIOS沒有責任做整理記憶體的事情,所以這部分事情由系統來實現。那麼看一下linux如何實現這部分功能的,這部分功能是在setup_memory_map裡面實現的。
該函數在初始化中的調用位置:
start_kernel()
└->setup_arch()
└->setup_memory_map();
函數實現:
【file:/arch/x86/kernel/e820.c】
/*
 * Build the kernel's view of the physical RAM map during early boot
 * (called from setup_arch()).  Invokes the platform memory-setup hook
 * from the x86_init ops table, snapshots the freshly built e820 map
 * into e820_saved, and prints the map.
 */
void __init setup_memory_map(void)
{
char *who; /* label naming the map's data source, e.g. "BIOS-e820" */
who = x86_init.resources.memory_setup();
/* keep an untouched backup copy of the map just constructed */
memcpy(&e820_saved, &e820, sizeof(struct e820map));
printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
可以看到實現很簡單,先是調用了一個鉤子函數,然後將e820保存到e820_saved裡面,再往下就是print函數。很明顯可以看到關鍵點在於這個鉤子函數的實現,在x86_init.c裡面可以找到x86_init該結構體變數的定義:
【file:/arch/x86/kernel/x86_init.c】
/*
 * x86 platform setup hook table.  Marked __initdata, so it is discarded
 * once boot-time initialization completes.  .resources.memory_setup is
 * the hook called by setup_memory_map(); by default it points at
 * default_machine_specific_memory_setup().
 */
struct x86_init_ops x86_init __initdata = {
.resources = {
.probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = default_machine_specific_memory_setup,
},
.mpparse = {
.mpc_record = x86_init_uint_noop,
.setup_ioapic_ids = x86_init_noop,
.mpc_apic_id = default_mpc_apic_id,
.smp_read_mpc_oem = default_smp_read_mpc_oem,
.mpc_oem_bus_info = default_mpc_oem_bus_info,
.find_smp_config = default_find_smp_config,
.get_smp_config = default_get_smp_config,
},
.irqs = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
.trap_init = x86_init_noop,
},
.oem = {
.arch_setup = x86_init_noop,
.banner = default_banner,
},
.paging = {
.pagetable_init = native_pagetable_init,
},
.timers = {
.setup_percpu_clockev = setup_boot_APIC_clock,
.tsc_pre_init = x86_init_noop,
.timer_init = hpet_time_init,
.wallclock_init = x86_init_noop,
},
.iommu = {
.iommu_init = iommu_init_noop,
},
.pci = {
.init = x86_default_pci_init,
.init_irq = x86_default_pci_init_irq,
.fixup_irqs = x86_default_pci_fixup_irqs,
},
};
由此可以看到該鉤子函數掛的是default_machine_specific_memory_setup()。
進一步看一下該函數的實現:
【file:/arch/x86/kernel/e820.c】
/*
 * Default .memory_setup hook: sanitize the raw BIOS e820 entries held
 * in boot_params and append them to the kernel's global e820 map.  If
 * the BIOS map is unusable (append_e820_map() returns < 0), fall back
 * to a fake two-region map derived from the BIOS 88h/e801h reported
 * sizes.  Returns a label naming which data source was used.
 */
char *__init default_machine_specific_memory_setup(void)
{
char *who = "BIOS-e820";
u32 new_nr;
/*
* Try to copy the BIOS-supplied E820-map.
*
* Otherwise fake a memory map; one section from 0k->640k,
* the next section from 1mb->appropriate_mem_k
*/
new_nr = boot_params.e820_entries;
/* merge/split overlapping raw entries in place; new_nr is updated */
sanitize_e820_map(boot_params.e820_map,
ARRAY_SIZE(boot_params.e820_map),
&new_nr);
boot_params.e820_entries = new_nr;
if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
< 0) {
u64 mem_size;
/* compare results from other methods and take the greater */
if (boot_params.alt_mem_k
< boot_params.screen_info.ext_mem_k) {
mem_size = boot_params.screen_info.ext_mem_k;
who = "BIOS-88";
} else {
mem_size = boot_params.alt_mem_k;
who = "BIOS-e801";
}
e820.nr_map = 0;
e820_add_region(0, LOWMEMSIZE(), E820_RAM);
e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); /* _k suffix: size in KiB, <<10 -> bytes */
}
/* In case someone cares... */
return who;
}
在這個函數裡面,可以看到前面探測到的記憶體佈局信息boot_params.e820_map在這裡被使用了。
首先分析一下sanitize_e820_map函數對e820信息做了怎樣的處理,函數實現:
【file:/arch/x86/kernel/e820.c】
/*
 * Sanitize the BIOS-provided e820 map: collect every region start/end
 * as a "change point", sort them by address, resolve overlapping
 * regions in favour of the highest-numbered type, merge adjacent
 * ranges of equal type, and write the result back over *biosmap.
 *
 * Returns 0 if the map was rewritten (with *pnr_map updated to the new
 * entry count); -1 if nothing was done: fewer than two input entries,
 * or an entry whose addr + size wraps around zero.
 */
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
u32 *pnr_map)
{
/* static __initdata scratch tables: discarded after init completes */
static struct change_member change_point_list[2*E820_X_MAX] __initdata;
static struct change_member *change_point[2*E820_X_MAX] __initdata;
static struct e820entry *overlap_list[E820_X_MAX] __initdata;
static struct e820entry new_bios[E820_X_MAX] __initdata;
unsigned long current_type, last_type;
unsigned long long last_addr;
int chgidx;
int overlap_entries;
int new_bios_entry;
int old_nr, new_nr, chg_nr;
int i;
/* if there's only one memory region, don't bother */
if (*pnr_map < 2)
return -1;
old_nr = *pnr_map;
BUG_ON(old_nr > max_nr_map);
/* bail out if we find any unreasonable addresses in bios map */
for (i = 0; i < old_nr; i++)
if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
return -1;
/* create pointers for initial change-point information (for sorting) */
for (i = 0; i < 2 * old_nr; i++)
change_point[i] = &change_point_list[i];
/* record all known change-points (starting and ending addresses),
omitting those that are for empty memory regions */
chgidx = 0;
for (i = 0; i < old_nr; i++) {
if (biosmap[i].size != 0) {
/* one change point for the region start, one for its end */
change_point[chgidx]->addr = biosmap[i].addr;
change_point[chgidx++]->pbios = &biosmap[i];
change_point[chgidx]->addr = biosmap[i].addr +
biosmap[i].size;
change_point[chgidx++]->pbios = &biosmap[i];
}
}
chg_nr = chgidx;
/* sort change-point list by memory addresses (low -> high) */
sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
/* create a new bios memory map, removing overlaps */
overlap_entries = 0; /* number of entries in the overlap table */
new_bios_entry = 0; /* index for creating new bios map entries */
last_type = 0; /* start with undefined memory type */
last_addr = 0; /* start with 0 as last starting address */
/* loop through change-points, determining affect on the new bios map */
for (chgidx = 0; chgidx < chg_nr; chgidx++) {
/* keep track of all overlapping bios entries */
if (change_point[chgidx]->addr ==
change_point[chgidx]->pbios->addr) {
/*
* add map entry to overlap list (> 1 entry
* implies an overlap)
*/
overlap_list[overlap_entries++] =
change_point[chgidx]->pbios;
} else {
/*
* remove entry from list (order independent,
* so swap with last)
*/
for (i = 0; i < overlap_entries; i++) {
if (overlap_list[i] ==
change_point[chgidx]->pbios)
overlap_list[i] =
overlap_list[overlap_entries-1];
}
overlap_entries--;
}
/*
* if there are overlapping entries, decide which
* "type" to use (larger value takes precedence --
* 1=usable, 2,3,4,4+=unusable)
*/
current_type = 0;
for (i = 0; i < overlap_entries; i++)
if (overlap_list[i]->type > current_type)
current_type = overlap_list[i]->type;
/*
* continue building up new bios map based on this
* information
*/
if (current_type != last_type) {
/* a type transition closes the previous output entry... */
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
/*
* move forward only if the new size
* was non-zero
*/
if (new_bios[new_bios_entry].size != 0)
/*
* no more space left for new
* bios entries ?
*/
if (++new_bios_entry >= max_nr_map)
break;
}
/* ...and (unless type is now 0 = hole) opens a new one */
if (current_type != 0) {
new_bios[new_bios_entry].addr =
change_point[chgidx]->addr;
new_bios[new_bios_entry].type = current_type;
last_addr = change_point[chgidx]->addr;
}
last_type = current_type;
}
}
/* retain count for new bios entries */
new_nr = new_bios_entry;
/* copy new bios mapping into original location */
memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
*pnr_map = new_nr;
return 0;
}
第一個for迴圈:
/* bail out if we find any unreasonable addresses in bios map */
for (i = 0; i < old_nr; i++)
if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
return -1;
這裡是將e820做一個全面檢測,檢測是否存在不合理的記憶體佈局信息項,存在不合理項則直接退出。
第二個for迴圈:
/* create pointers for initial change-point information (for sorting) */
for (i = 0; i < 2 * old_nr; i++)
change_point[i] = &change_point_list[i];
將change_point和change_point_list關聯起來。實際上change_point_list只是提供一段靜態存儲空間而已(注意它是static __initdata修飾的,並非棧上空間),真正起作用的是change_point這個指針數組。
第三個for迴圈和sort函數的調用一起來分析:
/* record all known change-points (starting and ending addresses),
omitting those that are for empty memory regions */
chgidx = 0;
for (i = 0; i < old_nr; i++) {
if (biosmap[i].size != 0) {
change_point[chgidx]->addr = biosmap[i].addr;
change_point[chgidx++]->pbios = &biosmap[i];
change_point[chgidx]->addr = biosmap[i].addr +
biosmap[i].size;
change_point[chgidx++]->pbios = &biosmap[i];
}
}
chg_nr = chgidx;
/* sort change-point list by memory addresses (low -> high) */
sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
這裡是把change_point初始化並與boot_params.e820_map各項的起始地址和結束地址分別關聯起來,繼而通過sort來進行排序。排序的結果就是將各項記憶體佈局信息所標示的記憶體空間起始地址和結束地址由低往高進行排序。如果兩者地址值相等,則以兩者的e820_map項信息所標示的記憶體空間尾做排序依據,哪個空間尾更後,則該項排在等值項後面。
看一下最後的for迴圈:
/* loop through change-points, determining affect on the new bios map */
for (chgidx = 0; chgidx < chg_nr; chgidx++) {
/* keep track of all overlapping bios entries */
if (change_point[chgidx]->addr ==
change_point[chgidx]->pbios->addr) {
/*
* add map entry to overlap list (> 1 entry
* implies an overlap)
*/
overlap_list[overlap_entries++] =
change_point[chgidx]->pbios;
} else {
/*
* remove entry from list (order independent,
* so swap with last)
*/
for (i = 0; i < overlap_entries; i++) {
if (overlap_list[i] ==
change_point[chgidx]->pbios)
overlap_list[i] =
overlap_list[overlap_entries-1];
}
overlap_entries--;
}
/*
* if there are overlapping entries, decide which
* "type" to use (larger value takes precedence --
* 1=usable, 2,3,4,4+=unusable)
*/
current_type = 0;
for (i = 0; i < overlap_entries; i++)
if (overlap_list[i]->type > current_type)
current_type = overlap_list[i]->type;
/*
* continue building up new bios map based on this
* information
*/
if (current_type != last_type) {
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
/*
* move forward only if the new size
* was non-zero
*/
if (new_bios[new_bios_entry].size != 0)
/*
* no more space left for new
* bios entries ?
*/
if (++new_bios_entry >= max_nr_map)
break;
}
if (current_type != 0) {
new_bios[new_bios_entry].addr =
change_point[chgidx]->addr;
new_bios[new_bios_entry].type = current_type;
last_addr = change_point[chgidx]->addr;
}
last_type = current_type;
}
}
這個迴圈裡面幹了什麼事情呢?其大概作用就是把已經排序完了的change_point做整合,將重疊的記憶體空間根據屬性進行篩選,並將同屬性的相鄰記憶體空間進行合併處理。具體實現流程為:
- 最初第一次迴圈加入change_point項,將根據當前change_point記錄的地址項是否與其關聯的e820_map項的起始地址一致,如果一致(第一次必然是一致的),表示當前項是某記憶體塊的起始地址,將其加入到overlap_list中去,然後該項將會添加到new_bios中並更新last_addr,最後更新當前記憶體塊的記憶體類型到last_type中(這個類型當做優先順序理解,會容易理解代碼的,0優先順序最低);
- 第二次迴圈加入change_point項,就會開始遇到各種狀況了:
A. 假設change_point新加入項是第一項的記憶體尾,那麼overlap_list將會移除該項,new_bios將會更新當前項的記憶體塊大小,只要記憶體塊大小不為0,那麼new_bios_entry自加,new_bios開始新一項的內容記錄;
B. 假設change_point新加入項是新的記憶體塊地址頭,那麼overlap_list將會新增一項,然後current_type將會更新到overlap_list列表中的所有項的類型最大值(最高優先順序),接下來又出現新狀況了:
a) 如果當前新加入change_point的類型值等於前者,繼續開始下一迴圈;
b) 如果當前新加入change_point的類型值大於前者,那麼new_bios將會以該change_point項紀錄的地址作為new_bios前一項的結束地址,然後更新大小到new_bios中,開啟new_bios的新一項的記錄;
c) 如果當前新加入change_point的類型值小於前者,由於current_type將會仍然保持前者的類型值,後續將會跳過開始下一迴圈;
第三次迴圈加入change_point項,基於前面第二次迴圈加入change_point項的情況,如果是情況A,那麼問題很簡單,就是一個新開始而已;如果是情況B,將會出現稍微複雜的情況:
A. 假設該change_point新加入項又是新的記憶體塊頭,將會涉及類型值不同的問題,其實也就是重覆前面類型第二次迴圈的情況B的場景,這裡就不細述
B. 假設該change_point新加入項是前面的記憶體塊尾,那麼可能的狀況是:
a) 如果前面已加入的兩項類型相同,即B.a的情形,當前作為記憶體尾新加入change_point的類型值必然也是相同的,這僅會把與overlap_list中配對的那一項從隊列中刪除,繼續開始加入change_point下一項; b) 如果前面兩項,第一項類型值大於第二項,即B.c的情形,若當前項作為第一項的尾加入,那麼當前的new_bios項將會以此作為結尾;但是若當前項作為第二項的尾加入,那麼將把第二項從overlap_list中刪除; c) 如果前面兩項,第二項類型值大於第一項,即B.b的情形,若當前項作為第一項的尾加入,也僅是將第一項從overlap_list中刪除;若當前項作為第二項的尾加入,那麼new_bios將會就此作為尾完成當前項;
後續的迴圈加入change_point項也僅是前面的場景演變而已,這裡不細述了。那麼從前面的信息來看,這個函數的目的是什麼基本明瞭,也就是將boot_params.e820_map裡面的記憶體佈局根據順序和重疊情況,將連續的同類型的記憶體塊合併,將不同類型的記憶體塊根據高低優先順序進行記憶體塊劃分拆開。
正如下描述的一樣:
【file:/arch/x86/kernel/e820.c】
/*
* Sanitize the BIOS e820 map.
*
* Some e820 responses include overlapping entries. The following
* replaces the original e820 map with a new one, removing overlaps,
* and resolving conflicting memory types in favor of highest
* numbered type.
*
* The input parameter biosmap points to an array of 'struct
* e820entry' which on entry has elements in the range [0, *pnr_map)
* valid, and which has space for up to max_nr_map entries.
* On return, the resulting sanitized e820 map entries will be in
* overwritten in the same location, starting at biosmap.
*
* The integer pointed to by pnr_map must be valid on entry (the
* current number of valid entries located at biosmap) and will
* be updated on return, with the new number of valid entries
* (something no more than max_nr_map.)
*
* The return value from sanitize_e820_map() is zero if it
* successfully 'sanitized' the map entries passed in, and is -1
* if it did nothing, which can happen if either of (1) it was
* only passed one map entry, or (2) any of the input map entries
* were invalid (start + size < start, meaning that the size was
* so big the described memory range wrapped around through zero.)
*
* Visually we're performing the following
* (1,2,3,4 = memory types)...
*
* Sample memory map (w/overlaps):
* ____22__________________
* ______________________4_
* ____1111________________
* _44_____________________
* 11111111________________
* ____________________33__
* ___________44___________
* __________33333_________
* ______________22________
* ___________________2222_
* _________111111111______
* _____________________11_
* _________________4______
*
* Sanitized equivalent (no overlap):
* 1_______________________
* _44_____________________
* ___1____________________
* ____22__________________
* ______11________________
* _________1______________
* __________3_____________
* ___________44___________
* _____________33_________
* _______________2________
* ________________1_______
* _________________4______
* ___________________2____
* ____________________33__
* ______________________4_
*/
為了方便理解,整理出一張草圖:
連續的同類型的合併到一塊裡面,不同類型的各自為政,不同類型重疊部分根據類型優先順序高低拆分,依高優先順序順序保證各類型的記憶體塊的完整性。額外廢話一句:目前玩虛擬機沒有遇到過上面這麼複雜的情況,但是不能否認這段代碼是設計來乾這活的。
從sanitize_e820_map出來後,回到default_machine_specific_memory_setup中,接下來是append_e820_map的函數:
【file:/arch/x86/kernel/e820.c】
/*
 * Thin wrapper around __append_e820_map(): reject the BIOS map outright
 * when it has fewer than two entries, returning -1 so that the caller
 * (default_machine_specific_memory_setup) takes its fallback path.
 */
static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
{
/* Only one memory region (or negative)? Ignore it */
if (nr_map < 2)
return -1;
return __append_e820_map(biosmap, nr_map);
}
append_e820_map封裝調用__append_e820_map:
【file:/arch/x86/kernel/e820.c】
/*
 * __append_e820_map - copy BIOS e820 entries into the kernel e820 map
 * @biosmap: array of (already sanitized) BIOS e820 entries
 * @nr_map:  number of valid entries in @biosmap
 *
 * Feeds every entry to e820_add_region().  Returns 0 on success, or -1
 * as soon as an entry's range wraps past the top of the 64-bit address
 * space (start + size overflows), in which case the caller discards
 * the whole map.
 *
 * NOTE(review): the quoted excerpt was missing the function's closing
 * brace; restored here.
 */
static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
{
	while (nr_map) {
		u64 start = biosmap->addr;
		u64 size = biosmap->size;
		u64 end = start + size;
		u32 type = biosmap->type;

		/* Overflow in 64 bits? Ignore the memory map. */
		if (start > end)
			return -1;

		e820_add_region(start, size, type);

		biosmap++;
		nr_map--;
	}
	return 0;
}
迴圈調用e820_add_region將整理後的boot_params.e820_map做添加操作:
【file:/arch/x86/kernel/e820.c】
/* Append one region to the kernel's global e820 map. */
void __init e820_add_region(u64 start, u64 size, int type)
{
__e820_add_region(&e820, start, size, type);
}
e820_add_region封裝__e820_add_region:
【file:/arch/x86/kernel/e820.c】
/*
 * Append one region of @type covering [start, start + size) to @e820x.
 * If the map's fixed-size entry array is already full, the region is
 * dropped with an error message instead of overwriting existing data.
 */
static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
int type)
{
int x = e820x->nr_map; /* index of the next free slot */
if (x >= ARRAY_SIZE(e820x->map)) {
printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
(unsigned long long) start,
(unsigned long long) (start + size - 1));
return;
}
e820x->map[x].addr = start;
e820x->map[x].size = size;
e820x->map[x].type = type;
e820x->nr_map++;
}
而__e820_add_region則是將各項信息往e820做添加操作。說白了,就是將boot_params.e820_map轉入到e820圖中。
最後順便看一下e820_print_map函數的實現:
【file:/arch/x86/kernel/e820.c】
/*
 * Print every entry of the global e820 map as an inclusive
 * [start-end] physical address range, prefixed with @who (the label
 * returned by the memory_setup hook) and followed by the region type.
 */
void __init e820_print_map(char *who)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
(unsigned long long) e820.map[i].addr,
(unsigned long long)
(e820.map[i].addr + e820.map[i].size - 1)); /* end = addr + size - 1 (inclusive) */
e820_print_type(e820.map[i].type);
printk(KERN_CONT "\n");
}
}
它是將整理後的e820圖連同鉤子函數返回的來源標籤一併列印出來,列印出來的內容可以在shell上面通過dmesg命令查看得到。
最後總結一下,以上代碼幹了這麼多活,主要就是為了把通過BIOS中斷探測到的記憶體佈局信息boot_params.e820_map做整合處理,完了轉存到變數e820中。