3 minute read

如何在 unikernel 中运行多进程应用程序是 unikernel 领域的一个新课题,我将在此记录我对这个问题的研究进展。该问题涉及到 unikernel, qemu等底层软件的基础知识以及调试技巧。

qemu拦截指令

  1. 使用无效指令 为了能够使 unikernel 能够与 qemu 进行交互,这里我们使用一个无效的指令 ud2 进行标记。
#![cfg_attr(feature = "axstd", no_std)]
#![cfg_attr(feature = "axstd", no_main)]

#[macro_use]
#[cfg(feature = "axstd")]
extern crate axstd as std;

use core::arch::asm;

/// Request a "fork" from the (modified) hypervisor.
///
/// Executes `ud2`, an x86 instruction that always raises #UD (Invalid
/// Opcode). A patched QEMU intercepts this exception as a fork request and
/// resumes the guest with the new VM id placed in `rax`. Under a stock
/// QEMU/axhal the #UD is unhandled and the kernel panics (as shown in the
/// trap dump below in this article).
///
/// Returns `Ok(new_vm_id)` with the id the hypervisor wrote into `rax`.
/// The `Err` arm of the signature is kept for future error reporting.
fn fork() -> Result<u64, &'static str> {
    let new_vm_id: u64;
    // The trap and the read of `rax` must live in ONE asm block. With two
    // separate `asm!` invocations (as originally written) the compiler is
    // free to clobber or reallocate `rax` between them, so the value read
    // back would be unrelated to what the hypervisor stored. Declaring
    // `out("rax")` on the same block both reserves the register and hands
    // its post-trap contents straight to `new_vm_id`.
    unsafe {
        asm!(
            "ud2",
            out("rax") new_vm_id,
            options(nostack, preserves_flags)
        );
    }
    Ok(new_vm_id)
}

#[cfg_attr(feature = "axstd", no_mangle)]
fn main() {
    println!("About to call fork()");
    // Bind with a leading underscore: the VM id is not consumed yet, and a
    // plain `let vm_id = ...` triggers an unused-variable warning. The
    // binding still consumes the `#[must_use]` Result returned by fork().
    let _vm_id = fork();
    println!("fork returned");
}

当我们运行上面的代码之后,会发生报错。

About to call fork()
[  0.006446 0:2 axruntime::lang_items:5] panicked at modules/axhal/src/arch/x86_64/trap.rs:44:13:
Unhandled exception 6 (#UD, error_code=0x0) @ 0xffffff8000200c00:
TrapFrame {
    rax: 0xffffff8000214a00,
    rcx: 0x0,
    rdx: 0x2,
    rbx: 0xffffff8000256ca8,
    rbp: 0x1,
    rsi: 0xffffff800020d8fd,
    rdi: 0xa,
    r8: 0x97,
    r9: 0x400000,
    r10: 0xffffff8000257900,
    r11: 0x16,
    r12: 0xffffff8000216118,
    r13: 0x3,
    r14: 0xffffff80002c4010,
    r15: 0xffffff8000207f70,
    vector: 0x6,
    error_code: 0x0,
    rip: 0xffffff8000200c00,
    cs: 0x10,
    rflags: 0x10082,
    rsp: 0xffffff8000256ca0,
    ss: 0x0,
}

可以看到,当执行到 ud2 指令时, 出现了未处理的异常 #UD(Invalid Opcode),ud2 是 x86 架构下专门用于产生无效指令异常的指令,所以出现这种异常是预期的行为。

  2. 使用端口通信

qemu pc 初始化

下面这段代码是 qemu 基于 q35 芯片组,负责配置虚拟化环境,包括内存布局、PCI总线初始化、设备模拟等。

/* PC hardware initialisation for the Q35 machine type.
 * Sets up guest memory layout, CPUs, the PCIe host bridge, the ICH9 LPC
 * bridge / ISA bus, interrupts, and the standard on-board devices
 * (SATA/AHCI, USB, SMBus, VGA, NICs, NVDIMM ACPI state). */
static void pc_q35_init(MachineState *machine)
{
    PCMachineState *pcms = PC_MACHINE(machine);
    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
    X86MachineState *x86ms = X86_MACHINE(machine);
    Object *phb;
    PCIDevice *lpc;
    DeviceState *lpc_dev;
    MemoryRegion *system_memory = get_system_memory();
    MemoryRegion *system_io = get_system_io();
    MemoryRegion *pci_memory = g_new(MemoryRegion, 1);
    GSIState *gsi_state;
    ISABus *isa_bus;
    int i;
    ram_addr_t lowmem;
    DriveInfo *hd[MAX_SATA_PORTS];
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    bool acpi_pcihp;
    bool keep_pci_slot_hpc;
    uint64_t pci_hole64_size = 0;

    assert(pcmc->pci_enabled);

    /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory
     * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping
     * also known as MMCFG).
     * If it doesn't, we need to split it in chunks below and above 4G.
     * In any case, try to make sure that guest addresses aligned at
     * 1G boundaries get mapped to host addresses aligned at 1G boundaries.
     */
     /* Default guest memory layout: split RAM into a below-4GB part and an
      * above-4GB part, reserving 512MB for I/O devices and 256MB for the
      * PCI MMCFG window by default. */
    if (machine->ram_size >= 0xb0000000) {
        lowmem = 0x80000000;
    } else {
        lowmem = 0xb0000000;
    }

    /* Handle the machine opt max-ram-below-4g.  It is basically doing
     * min(qemu limit, user limit).
     */
    if (!pcms->max_ram_below_4g) {
        pcms->max_ram_below_4g = 4 * GiB;
    }
    if (lowmem > pcms->max_ram_below_4g) {
        lowmem = pcms->max_ram_below_4g;
        if (machine->ram_size - lowmem > lowmem &&
            lowmem & (1 * GiB - 1)) {
            warn_report("There is possibly poor performance as the ram size "
                        " (0x%" PRIx64 ") is more then twice the size of"
                        " max-ram-below-4g (%"PRIu64") and"
                        " max-ram-below-4g is not a multiple of 1G.",
                        (uint64_t)machine->ram_size, pcms->max_ram_below_4g);
        }
    }

    if (machine->ram_size >= lowmem) {
        x86ms->above_4g_mem_size = machine->ram_size - lowmem;
        x86ms->below_4g_mem_size = lowmem;
    } else {
        x86ms->above_4g_mem_size = 0;
        x86ms->below_4g_mem_size = machine->ram_size;
    }
    /* Set up the emulated SGX EPC (Enclave Page Cache) regions. */
    pc_machine_init_sgx_epc(pcms);
    /* Create and initialise the vCPUs. */
    x86_cpus_init(x86ms, pcmc->default_cpu_version);
    /* When running under KVM, expose the kvmclock paravirtual clock. */
    if (kvm_enabled()) {
        kvmclock_create(pcmc->kvmclock_create_always);
    }

    /* PCI bus and device initialisation. */
    /* create pci host bus */
    phb = OBJECT(qdev_new(TYPE_Q35_HOST_DEVICE));

    pci_hole64_size = object_property_get_uint(phb,
                                               PCI_HOST_PROP_PCI_HOLE64_SIZE,
                                               &error_abort);

    /* allocate ram and load rom/bios */
    memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
    pc_memory_init(pcms, system_memory, pci_memory, pci_hole64_size);

    /* Wire the host bridge's memory/IO links before realizing it. */
    object_property_add_child(OBJECT(machine), "q35", phb);
    object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
                             OBJECT(machine->ram), NULL);
    object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
                             OBJECT(pci_memory), NULL);
    object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
                             OBJECT(system_memory), NULL);
    object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
                             OBJECT(system_io), NULL);
    object_property_set_int(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
                            x86ms->below_4g_mem_size, NULL);
    object_property_set_int(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
                            x86ms->above_4g_mem_size, NULL);
    object_property_set_bool(phb, PCI_HOST_BYPASS_IOMMU,
                             pcms->default_bus_bypass_iommu, NULL);
    object_property_set_bool(phb, PCI_HOST_PROP_SMM_RANGES,
                             x86_machine_is_smm_enabled(x86ms), NULL);
    sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);

    /* pci */
    pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pcie.0"));

    /* irq lines */
    /* Interrupt (GSI) initialisation. */
    gsi_state = pc_gsi_create(&x86ms->gsi, true);

    /* create ISA bus */
    lpc = pci_new_multifunction(PCI_DEVFN(ICH9_LPC_DEV, ICH9_LPC_FUNC),
                                TYPE_ICH9_LPC_DEVICE);
    lpc_dev = DEVICE(lpc);
    qdev_prop_set_bit(lpc_dev, "smm-enabled",
                      x86_machine_is_smm_enabled(x86ms));
    /* Connect each LPC GSI output pin to the machine's GSI array. */
    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
        qdev_connect_gpio_out_named(lpc_dev, ICH9_GPIO_GSI, i, x86ms->gsi[i]);
    }
    pci_realize_and_unref(lpc, pcms->pcibus, &error_fatal);

    /* The RTC lives inside the ICH9 LPC bridge on Q35. */
    x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(lpc), "rtc"));

    /* The LPC bridge also acts as the machine's ACPI device. */
    object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
                             TYPE_HOTPLUG_HANDLER,
                             (Object **)&x86ms->acpi_dev,
                             object_property_allow_set_link,
                             OBJ_PROP_LINK_STRONG);
    object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
                             OBJECT(lpc), &error_abort);

    acpi_pcihp = object_property_get_bool(OBJECT(lpc),
                                          ACPI_PM_PROP_ACPI_PCIHP_BRIDGE,
                                          NULL);

    keep_pci_slot_hpc = object_property_get_bool(OBJECT(lpc),
                                                 "x-keep-pci-slot-hpc",
                                                 NULL);

    /* With ACPI PCI hotplug enabled, hide native PCIe hotplug caps
     * unless explicitly asked to keep them. */
    if (!keep_pci_slot_hpc && acpi_pcihp) {
        object_register_sugar_prop(TYPE_PCIE_SLOT,
                                   "x-do-not-expose-native-hotplug-cap",
                                   "true", true);
    }

    isa_bus = ISA_BUS(qdev_get_child_bus(lpc_dev, "isa.0"));

    /* Legacy i8259 PIC, unless the user disabled it. */
    if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
        pc_i8259_create(isa_bus, gsi_state->i8259_irq);
    }

    ioapic_init_gsi(gsi_state, OBJECT(phb));

    /* FERR# (x87 error) IRQ is only needed for TCG emulation. */
    if (tcg_enabled()) {
        x86_register_ferr_irq(x86ms->gsi[13]);
    }

    /* init basic PC hardware */
    pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, !mc->no_floppy,
                         0xff0104);

    if (pcms->sata_enabled) {
        PCIDevice *pdev;
        AHCIPCIState *ich9;

        /* ahci and SATA device, for q35 1 ahci controller is built-in */
        pdev = pci_create_simple_multifunction(pcms->pcibus,
                                               PCI_DEVFN(ICH9_SATA1_DEV,
                                                         ICH9_SATA1_FUNC),
                                               "ich9-ahci");
        ich9 = ICH9_AHCI(pdev);
        pcms->idebus[0] = qdev_get_child_bus(DEVICE(pdev), "ide.0");
        pcms->idebus[1] = qdev_get_child_bus(DEVICE(pdev), "ide.1");
        g_assert(MAX_SATA_PORTS == ich9->ahci.ports);
        ide_drive_get(hd, ich9->ahci.ports);
        ahci_ide_create_devs(&ich9->ahci, hd);
    }

    if (machine_usb(machine)) {
        /* Should we create 6 UHCI according to ich9 spec? */
        ehci_create_ich9_with_companions(pcms->pcibus, 0x1d);
    }

    if (pcms->smbus_enabled) {
        PCIDevice *smb;

        /* TODO: Populate SPD eeprom data.  */
        smb = pci_create_simple_multifunction(pcms->pcibus,
                                              PCI_DEVFN(ICH9_SMB_DEV,
                                                        ICH9_SMB_FUNC),
                                              TYPE_ICH9_SMB_DEVICE);
        pcms->smbus = I2C_BUS(qdev_get_child_bus(DEVICE(smb), "i2c"));

        smbus_eeprom_init(pcms->smbus, 8, NULL, 0);
    }

    /* the rest devices to which pci devfn is automatically assigned */
    pc_vga_init(isa_bus, pcms->pcibus);
    pc_nic_init(pcmc, isa_bus, pcms->pcibus);

    if (machine->nvdimms_state->is_enabled) {
        nvdimm_init_acpi_state(machine->nvdimms_state, system_io,
                               x86_nvdimm_acpi_dsmio,
                               x86ms->fw_cfg, OBJECT(pcms));
    }
}

register_ioport_write(大善人)

用于注册特定I/O 端口的写回调,用来拦截guest对某个I/O端口的写操作,将操作委托给注册的回调函数处理。

qemu 冷迁移

qemu 热迁移

对于最右边的部分迁移前的和迁移后的要完全一致。首先,存储要一致,然后共享的方式也非常重要,使用的文件系统必须配置为禁止缓存(否则,源主机上的数据可能在迁移过程中尚未写入存储,目标主机已经开始运行并访问存储,导致客户机看到数据损坏) 墙上时钟也必须一致,迁移前后的网络配置也必须一致,host CPU的类型必须一致,否则指令集可能不兼容。机器类型也要一致,包括ROM,BIOS等等。
三个阶段:1. 标记所有的 RAM 为脏页。2. 持续发送被设置了脏位的 RAM 页面。3. 停止运行 guest,传输剩余的脏 RAM 页面和设备状态。 在第二阶段时,QEMU 可以通过 KVM 提供的脏页跟踪服务找到自从上次请求以来哪些内存被更改了。

Updated: