关于 unikernel 多进程框架的研究
如何在 unikernel 中运行多进程应用程序是 unikernel 领域的一个新课题,我将在此记录我对这个问题的研究进展。该问题涉及到 unikernel, qemu等底层软件的基础知识以及调试技巧。
qemu拦截指令
- 使用无效指令
为了使 unikernel 能够与 qemu 进行交互,这里我们使用一条无效指令
ud2
进行标记。
#![cfg_attr(feature = "axstd", no_std)]
#![cfg_attr(feature = "axstd", no_main)]
#[macro_use]
#[cfg(feature = "axstd")]
extern crate axstd as std;
use core::arch::asm;
/// Requests a "fork" from the hypervisor by executing the invalid-opcode
/// instruction `ud2`, then returns whatever value is in `rax` afterwards.
///
/// NOTE(review): this assumes QEMU intercepts the resulting #UD, stores the
/// new VM id in `rax` and resumes the guest after the `ud2` — confirm against
/// the QEMU-side patch. Without such interception the guest kernel's own trap
/// handler sees the #UD.
///
/// Returns `Ok(id)` with the value read from `rax`; no error path exists yet,
/// so `Err` is currently never produced.
fn fork() -> Result<u64, &'static str> {
    let new_vm_id: u64;
    unsafe {
        // Trigger the trap and capture `rax` in ONE asm block. With two
        // separate blocks the compiler is free to use `rax` for its own
        // purposes in between, so a later `mov {0}, rax` could read garbage.
        // Declaring `rax` as an output also tells the compiler that the
        // register is overwritten by this instruction sequence.
        asm!(
            "ud2",
            out("rax") new_vm_id,
            options(nostack, preserves_flags)
        );
    }
    Ok(new_vm_id)
}
/// Demo entry point: calls `fork()` once and reports its outcome.
#[cfg_attr(feature = "axstd", no_mangle)]
fn main() {
    println!("About to call fork()");
    // Inspect the Result instead of binding it to an unused variable, so the
    // id handed back by the hypervisor (or a failure) is actually visible.
    match fork() {
        Ok(vm_id) => println!("fork returned: vm_id = {:#x}", vm_id),
        Err(msg) => println!("fork failed: {}", msg),
    }
}
当我们运行上面的代码之后,会发生报错。
About to call fork()
[ 0.006446 0:2 axruntime::lang_items:5] panicked at modules/axhal/src/arch/x86_64/trap.rs:44:13:
Unhandled exception 6 (#UD, error_code=0x0) @ 0xffffff8000200c00:
TrapFrame {
rax: 0xffffff8000214a00,
rcx: 0x0,
rdx: 0x2,
rbx: 0xffffff8000256ca8,
rbp: 0x1,
rsi: 0xffffff800020d8fd,
rdi: 0xa,
r8: 0x97,
r9: 0x400000,
r10: 0xffffff8000257900,
r11: 0x16,
r12: 0xffffff8000216118,
r13: 0x3,
r14: 0xffffff80002c4010,
r15: 0xffffff8000207f70,
vector: 0x6,
error_code: 0x0,
rip: 0xffffff8000200c00,
cs: 0x10,
rflags: 0x10082,
rsp: 0xffffff8000256ca0,
ss: 0x0,
}
可以看到,当执行到 ud2
指令时, 出现了未处理的异常 #UD(Invalid Opcode),ud2 是 x86 架构下专门用于产生无效指令异常的指令,所以出现这种异常是预期的行为。
- 使用端口通信
qemu pc 初始化
下面这段代码是 qemu 中基于 q35 芯片组的 PC 机器初始化函数,负责配置虚拟化环境,包括内存布局、PCI 总线初始化、设备模拟等。
/*
 * PC hardware initialisation for the Q35 machine type.
 *
 * In order, this function:
 *  - computes how guest RAM is split into a below-4G and an above-4G part;
 *  - initialises SGX EPC state and the CPUs (plus kvmclock when KVM is used);
 *  - creates the Q35 PCI host bridge, allocates RAM and loads ROM/BIOS;
 *  - creates the ICH9 LPC device and wires its GSI outputs to the machine;
 *  - creates the remaining devices (basic PC devices, AHCI/SATA, USB, SMBus,
 *    VGA, NIC, NVDIMM ACPI state) depending on the machine configuration.
 *
 * @machine: the machine instance being initialised.
 */
static void pc_q35_init(MachineState *machine)
{
PCMachineState *pcms = PC_MACHINE(machine);
PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
X86MachineState *x86ms = X86_MACHINE(machine);
Object *phb;
PCIDevice *lpc;
DeviceState *lpc_dev;
MemoryRegion *system_memory = get_system_memory();
MemoryRegion *system_io = get_system_io();
MemoryRegion *pci_memory = g_new(MemoryRegion, 1);
GSIState *gsi_state;
ISABus *isa_bus;
int i;
ram_addr_t lowmem;
DriveInfo *hd[MAX_SATA_PORTS];
MachineClass *mc = MACHINE_GET_CLASS(machine);
bool acpi_pcihp;
bool keep_pci_slot_hpc;
uint64_t pci_hole64_size = 0;
assert(pcmc->pci_enabled);
/* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory
* and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping
* also known as MMCFG).
* If it doesn't, we need to split it in chunks below and above 4G.
* In any case, try to make sure that guest addresses aligned at
* 1G boundaries get mapped to host addresses aligned at 1G boundaries.
*/
// Default guest memory layout: RAM is split into a part below 4 GiB and a
// part above 4 GiB, leaving 512 MiB for IO memory and 256 MiB for MMCFG.
if (machine->ram_size >= 0xb0000000) {
lowmem = 0x80000000;
} else {
lowmem = 0xb0000000;
}
/* Handle the machine opt max-ram-below-4g. It is basically doing
* min(qemu limit, user limit).
*/
if (!pcms->max_ram_below_4g) {
pcms->max_ram_below_4g = 4 * GiB;
}
if (lowmem > pcms->max_ram_below_4g) {
lowmem = pcms->max_ram_below_4g;
/* Warn when more RAM ends up above the split than below it and the
* split point is not 1G-aligned. */
if (machine->ram_size - lowmem > lowmem &&
lowmem & (1 * GiB - 1)) {
warn_report("There is possibly poor performance as the ram size "
" (0x%" PRIx64 ") is more then twice the size of"
" max-ram-below-4g (%"PRIu64") and"
" max-ram-below-4g is not a multiple of 1G.",
(uint64_t)machine->ram_size, pcms->max_ram_below_4g);
}
}
/* Record the final split: everything beyond lowmem goes above 4G. */
if (machine->ram_size >= lowmem) {
x86ms->above_4g_mem_size = machine->ram_size - lowmem;
x86ms->below_4g_mem_size = lowmem;
} else {
x86ms->above_4g_mem_size = 0;
x86ms->below_4g_mem_size = machine->ram_size;
}
// Initialise the SGX-related (EPC) emulation state for this machine
pc_machine_init_sgx_epc(pcms);
// Initialise the CPUs
x86_cpus_init(x86ms, pcmc->default_cpu_version);
// When running under KVM, create the kvmclock device
if (kvm_enabled()) {
kvmclock_create(pcmc->kvmclock_create_always);
}
// PCI bus and device initialisation
/* create pci host bus */
phb = OBJECT(qdev_new(TYPE_Q35_HOST_DEVICE));
pci_hole64_size = object_property_get_uint(phb,
PCI_HOST_PROP_PCI_HOLE64_SIZE,
&error_abort);
/* allocate ram and load rom/bios */
memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
pc_memory_init(pcms, system_memory, pci_memory, pci_hole64_size);
/* Attach the host bridge to the machine and hand it the memory regions
* and RAM split sizes computed above before realizing it. */
object_property_add_child(OBJECT(machine), "q35", phb);
object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
OBJECT(machine->ram), NULL);
object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
OBJECT(pci_memory), NULL);
object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
OBJECT(system_memory), NULL);
object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
OBJECT(system_io), NULL);
object_property_set_int(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
x86ms->below_4g_mem_size, NULL);
object_property_set_int(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
x86ms->above_4g_mem_size, NULL);
object_property_set_bool(phb, PCI_HOST_BYPASS_IOMMU,
pcms->default_bus_bypass_iommu, NULL);
object_property_set_bool(phb, PCI_HOST_PROP_SMM_RANGES,
x86_machine_is_smm_enabled(x86ms), NULL);
sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
/* pci */
pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pcie.0"));
/* irq lines */
/* interrupt (GSI) initialisation */
gsi_state = pc_gsi_create(&x86ms->gsi, true);
/* create ISA bus */
lpc = pci_new_multifunction(PCI_DEVFN(ICH9_LPC_DEV, ICH9_LPC_FUNC),
TYPE_ICH9_LPC_DEVICE);
lpc_dev = DEVICE(lpc);
qdev_prop_set_bit(lpc_dev, "smm-enabled",
x86_machine_is_smm_enabled(x86ms));
/* Connect each GSI output of the LPC device to the machine's GSI lines. */
for (i = 0; i < IOAPIC_NUM_PINS; i++) {
qdev_connect_gpio_out_named(lpc_dev, ICH9_GPIO_GSI, i, x86ms->gsi[i]);
}
pci_realize_and_unref(lpc, pcms->pcibus, &error_fatal);
x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(lpc), "rtc"));
/* Expose the LPC device as the machine's ACPI device link. */
object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
TYPE_HOTPLUG_HANDLER,
(Object **)&x86ms->acpi_dev,
object_property_allow_set_link,
OBJ_PROP_LINK_STRONG);
object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
OBJECT(lpc), &error_abort);
acpi_pcihp = object_property_get_bool(OBJECT(lpc),
ACPI_PM_PROP_ACPI_PCIHP_BRIDGE,
NULL);
keep_pci_slot_hpc = object_property_get_bool(OBJECT(lpc),
"x-keep-pci-slot-hpc",
NULL);
/* With ACPI PCI hotplug on bridges (and unless x-keep-pci-slot-hpc is
* set), default PCIe slots to not exposing the native hotplug capability. */
if (!keep_pci_slot_hpc && acpi_pcihp) {
object_register_sugar_prop(TYPE_PCIE_SLOT,
"x-do-not-expose-native-hotplug-cap",
"true", true);
}
isa_bus = ISA_BUS(qdev_get_child_bus(lpc_dev, "isa.0"));
/* Create the i8259 PIC when the "pic" option is on or auto. */
if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
pc_i8259_create(isa_bus, gsi_state->i8259_irq);
}
ioapic_init_gsi(gsi_state, OBJECT(phb));
/* Under TCG, GSI 13 is registered as the FERR interrupt line. */
if (tcg_enabled()) {
x86_register_ferr_irq(x86ms->gsi[13]);
}
/* init basic PC hardware */
pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, !mc->no_floppy,
0xff0104);
if (pcms->sata_enabled) {
PCIDevice *pdev;
AHCIPCIState *ich9;
/* ahci and SATA device, for q35 1 ahci controller is built-in */
pdev = pci_create_simple_multifunction(pcms->pcibus,
PCI_DEVFN(ICH9_SATA1_DEV,
ICH9_SATA1_FUNC),
"ich9-ahci");
ich9 = ICH9_AHCI(pdev);
pcms->idebus[0] = qdev_get_child_bus(DEVICE(pdev), "ide.0");
pcms->idebus[1] = qdev_get_child_bus(DEVICE(pdev), "ide.1");
/* hd[] is sized by MAX_SATA_PORTS, so it must match the controller. */
g_assert(MAX_SATA_PORTS == ich9->ahci.ports);
ide_drive_get(hd, ich9->ahci.ports);
ahci_ide_create_devs(&ich9->ahci, hd);
}
if (machine_usb(machine)) {
/* Should we create 6 UHCI according to ich9 spec? */
ehci_create_ich9_with_companions(pcms->pcibus, 0x1d);
}
if (pcms->smbus_enabled) {
PCIDevice *smb;
/* TODO: Populate SPD eeprom data. */
smb = pci_create_simple_multifunction(pcms->pcibus,
PCI_DEVFN(ICH9_SMB_DEV,
ICH9_SMB_FUNC),
TYPE_ICH9_SMB_DEVICE);
pcms->smbus = I2C_BUS(qdev_get_child_bus(DEVICE(smb), "i2c"));
smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
}
/* the rest devices to which pci devfn is automatically assigned */
pc_vga_init(isa_bus, pcms->pcibus);
pc_nic_init(pcmc, isa_bus, pcms->pcibus);
if (machine->nvdimms_state->is_enabled) {
nvdimm_init_acpi_state(machine->nvdimms_state, system_io,
x86_nvdimm_acpi_dsmio,
x86ms->fw_cfg, OBJECT(pcms));
}
}
register_ioport_write(大善人)
用于注册特定I/O 端口的写回调,用来拦截guest对某个I/O端口的写操作,将操作委托给注册的回调函数处理。
qemu 冷迁移
qemu 热迁移
对于最右边的部分,迁移前和迁移后必须完全一致。首先,存储要一致;其次,共享的方式也非常重要:所使用的文件系统必须配置为禁止缓存(否则,源主机上的数据可能在迁移过程中尚未写入存储,而目标主机已经开始运行并访问存储,导致客户机看到损坏的数据)。
墙上时钟也必须一致,迁移前后的网络配置也必须一致,host CPU的类型必须一致,否则指令集可能不兼容。机器类型也要一致,包括ROM,BIOS等等。
三个阶段:1. 将所有 RAM 页标记为脏页。2. 持续发送被标记为脏的 RAM 页面。3. 停止运行 guest,传输剩余的脏 RAM 页和设备状态。
在第二阶段的时候,QEMU 可以通过 KVM 提供的服务找到自从上次请求以来哪些数据被更改了。