From 7ed2b60be27f12f08ba794e5bbf6d58ec39c293d Mon Sep 17 00:00:00 2001 From: n00899518 Date: Mon, 11 Aug 2025 15:33:49 +0800 Subject: [PATCH] feat: add migration driver for 5.4 kernel. --- .../KAEKernelDriver-OLK-5.10/Makefile | 57 +- .../hisilicon/migration/acc_vf_migration.c | 321 ++- .../hisilicon/migration/acc_vf_migration.h | 40 +- .../{hisilicon => include_linux}/vfio.h | 8 +- .../include_uapi_linux/vfio.h | 1444 ++++++++++++++ .../KAEKernelDriver-OLK-5.4/Makefile | 51 +- .../hisilicon/Makefile | 1 + .../hisilicon/migration/Makefile | 2 + .../hisilicon/migration/acc_vf_migration.c | 1719 +++++++++++++++++ .../hisilicon/migration/acc_vf_migration.h | 242 +++ .../include_linux/vfio.h | 298 +++ .../include_uapi_linux/vfio.h | 1444 ++++++++++++++ .../KAEKernelDriver-OLK-6.6/Makefile | 43 +- build.sh | 38 +- 14 files changed, 5421 insertions(+), 287 deletions(-) rename KAEKernelDriver/KAEKernelDriver-OLK-5.10/{hisilicon => include_linux}/vfio.h (98%) create mode 100644 KAEKernelDriver/KAEKernelDriver-OLK-5.10/include_uapi_linux/vfio.h create mode 100644 KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/Makefile create mode 100644 KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.c create mode 100644 KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.h create mode 100644 KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_linux/vfio.h create mode 100644 KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_uapi_linux/vfio.h diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/Makefile b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/Makefile index 16da4c2..e11901d 100644 --- a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/Makefile +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/Makefile @@ -1,10 +1,10 @@ KERNEL_VERSION_BY_BUILDENV :=`rpm -q --qf '%{VERSION}-%{RELEASE}.%{ARCH}\n' kernel-devel | head -n 1` KERNEL_PATH := /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build -KSP := $(shell if test -d /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; then \ - echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; \ - else \ - echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build; \ - fi) +# KSP := $(shell if test -d /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; then \ +# echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; \ +# else \ +# echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build; \ +# fi) obj-m += uacce/ obj-m += hisilicon/ @@ -13,18 +13,25 @@ DIRS := $(shell find . 
-maxdepth 3 -type d) TARGET = $(foreach dir,$(DIRS),$(wildcard \ $(dir)/*.o) $(dir)/*.ko $(dir)/*.tmp_versions $(dir)/*.depend $(dir)/*.mod.c $(dir)/*.order $(dir)/*.symvers) +CONFIG_FLAGS = CONFIG_CC_STACKPROTECTOR_STRONG=y \ + CONFIG_UACCE=m \ + CONFIG_CRYPTO_QM_UACCE=m \ + CONFIG_CRYPTO_DEV_HISI_SGL=m \ + CONFIG_CRYPTO_DEV_HISI_QM=m \ + CONFIG_CRYPTO_DEV_HISI_ZIP=m \ + CONFIG_CRYPTO_DEV_HISI_HPRE=m \ + CONFIG_CRYPTO_DEV_HISI_SEC2=m \ + CONFIG_CRYPTO_DEV_HISI_TRNG=m + +ifeq ($(ENABLE_MIGRATION), y) +CONFIG_FLAGS += CONFIG_CRYPTO_DEV_HISI_MIGRATION=m +else +CONFIG_FLAGS += CONFIG_CRYPTO_DEV_HISI_MIGRATION=n +endif + default: - $(MAKE) -C $(KERNEL_PATH) M=$(shell pwd) modules \ - CONFIG_CC_STACKPROTECTOR_STRONG=y \ - CONFIG_UACCE=m \ - CONFIG_CRYPTO_QM_UACCE=m \ - CONFIG_CRYPTO_DEV_HISI_SGL=m \ - CONFIG_CRYPTO_DEV_HISI_QM=m \ - CONFIG_CRYPTO_DEV_HISI_ZIP=m \ - CONFIG_CRYPTO_DEV_HISI_HPRE=m \ - CONFIG_CRYPTO_DEV_HISI_SEC2=m \ - CONFIG_CRYPTO_DEV_HISI_TRNG=m \ - CONFIG_CRYPTO_DEV_HISI_MIGRATION=m + $(MAKE) -C $(KERNEL_PATH) M=$(shell pwd) modules $(CONFIG_FLAGS) + #copy: # cp -f $(shell pwd)/include_linux/uacce.h $(KSP)/include/linux # cp -f $(shell pwd)/include_uapi_linux/uacce.h $(KSP)/include/uapi/linux @@ -41,7 +48,9 @@ install: -modprobe hisi_sec2 uacce_mode=1 pf_q_num=256 -modprobe hisi_hpre uacce_mode=1 pf_q_num=256 -modprobe hisi_zip uacce_mode=1 pf_q_num=256 - -modprobe hisi_migration + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + modprobe hisi_migration; \ + fi) -echo "options hisi_sec2 uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_sec2.conf -echo "options hisi_hpre uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_hpre.conf -echo "options hisi_zip uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_zip.conf @@ -63,7 +72,9 @@ check: done uninstall: - modprobe -r hisi_migration + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + modprobe -r hisi_migration; \ + fi) modprobe -r hisi_zip modprobe -r hisi_hpre modprobe -r hisi_sec2 @@ -72,13 +83,15 @@ uninstall: rm -rf /etc/modprobe.d/hisi_sec2.conf rm -rf /etc/modprobe.d/hisi_hpre.conf rm -rf /etc/modprobe.d/hisi_zip.conf - dracut -f rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/uacce.ko rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_qm.ko rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_sec2.ko rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_hpre.ko rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_zip.ko - rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_migration.ko + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_migration.ko; \ + fi) + depmod -a nosva: $(shell mkdir -p /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra) @@ -92,7 +105,9 @@ nosva: -modprobe hisi_sec2 uacce_mode=2 pf_q_num=256 -modprobe hisi_hpre uacce_mode=2 pf_q_num=256 -modprobe hisi_zip uacce_mode=2 pf_q_num=256 - -modprobe hisi_migration + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + modprobe hisi_migration; \ + fi) -echo "options hisi_sec2 uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_sec2.conf -echo "options hisi_hpre uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_hpre.conf -echo "options hisi_zip uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_zip.conf diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.c b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.c index 7547e6e..8a7196a 100644 --- 
a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.c +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.c @@ -10,7 +10,7 @@ #include #include #include -#include "../vfio.h" +#include #include "acc_vf_migration.h" @@ -18,6 +18,16 @@ static struct dentry *mig_debugfs_root; static int mig_root_ref; +/* return 0 mailbox ready, -ETIMEDOUT hardware timeout */ +static int qm_wait_mb_ready(struct hisi_qm *qm) +{ + u32 val; + + return readl_relaxed_poll_timeout(qm->io_base + QM_MB_CMD_SEND_BASE, + val, !((val >> QM_MB_BUSY_SHIFT) & + 0x1), POLL_PERIOD, POLL_TIMEOUT); +} + /* return 0 VM acc device ready, -ETIMEDOUT hardware timeout */ static int qm_wait_dev_ready(struct hisi_qm *qm) { @@ -27,6 +37,7 @@ static int qm_wait_dev_ready(struct hisi_qm *qm) val, !(val & 0x1), POLL_PERIOD, POLL_TIMEOUT); } + /* 128 bit should be written to hardware at one time to trigger a mailbox */ static void qm_mb_write(struct hisi_qm *qm, const void *src) { @@ -50,147 +61,57 @@ static void qm_mb_write(struct hisi_qm *qm, const void *src) : "memory"); } -/* 128 bit should be read from hardware at one time */ -static void qm_mb_read(struct hisi_qm *qm, void *dst) -{ - const void __iomem *fun_base = qm->io_base + QM_MB_CMD_SEND_BASE; - unsigned long tmp0 = 0, tmp1 = 0; - - if (!IS_ENABLED(CONFIG_ARM64)) { - memcpy_fromio(dst, fun_base, 16); - dma_wmb(); - return; - } - - asm volatile("ldp %0, %1, %3\n" - "stp %0, %1, %2\n" - "dmb oshst\n" - : "=&r" (tmp0), - "=&r" (tmp1), - "+Q" (*((char *)dst)) - : "Q" (*((char __iomem *)fun_base)) - : "memory"); -} - static void qm_mb_pre_init(struct qm_mailbox *mailbox, u8 cmd, - u64 base, u16 queue, bool op) + u16 queue, bool op) { - mailbox->w0 = cpu_to_le16((cmd) | - ((op) ? 0x1 << QM_MB_OP_SHIFT : 0) | - (0x1 << QM_MB_BUSY_SHIFT)); + mailbox->w0 = cpu_to_le16(cmd | + (op ? 
0x1 << QM_MB_OP_SHIFT : 0) | + (0x1 << QM_MB_BUSY_SHIFT)); mailbox->queue_num = cpu_to_le16(queue); - mailbox->base_l = cpu_to_le32(lower_32_bits(base)); - mailbox->base_h = cpu_to_le32(upper_32_bits(base)); mailbox->rsvd = 0; } -static int qm_wait_mb_ready(struct hisi_qm *qm) +static int qm_mb_nolock(struct hisi_qm *qm, struct qm_mailbox *mailbox) { - struct qm_mailbox mailbox; - int i = 0; - - while (i++ < QM_MB_WAIT_READY_CNT) { - qm_mb_read(qm, &mailbox); - if (!((le16_to_cpu(mailbox.w0) >> QM_MB_BUSY_SHIFT) & 0x1)) - return 0; + int cnt = 0; - usleep_range(WAIT_PERIOD_US_MIN, WAIT_PERIOD_US_MAX); + if (unlikely(qm_wait_mb_ready(qm))) { + dev_err(&qm->pdev->dev, "QM mailbox is busy to start!\n"); + return -EBUSY; } - return -EBUSY; -} - -static int qm_wait_mb_finish(struct hisi_qm *qm, struct qm_mailbox *mailbox) -{ - int i = 0; - - while (++i) { - qm_mb_read(qm, mailbox); - if (!((le16_to_cpu(mailbox->w0) >> QM_MB_BUSY_SHIFT) & 0x1)) + qm_mb_write(qm, mailbox); + while (true) { + if (!qm_wait_mb_ready(qm)) break; - - if (i == QM_MB_MAX_WAIT_CNT) { + if (++cnt > QM_MB_MAX_WAIT_CNT) { dev_err(&qm->pdev->dev, "QM mailbox operation timeout!\n"); - return -ETIMEDOUT; + return -EBUSY; } - - usleep_range(WAIT_PERIOD_US_MIN, WAIT_PERIOD_US_MAX); - } - - if (le16_to_cpu(mailbox->w0) & QM_MB_STATUS_MASK) { - dev_err(&qm->pdev->dev, "QM mailbox operation failed!\n"); - return -EIO; } - return 0; } -static int qm_mb(struct hisi_qm *qm, struct qm_mailbox *mailbox) +static int qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue, + bool op) { + struct qm_mailbox mailbox; int ret; - mutex_lock(&qm->mailbox_lock); - ret = qm_wait_mb_ready(qm); - if (ret) - goto unlock; + dev_dbg(&qm->pdev->dev, "QM mailbox request to q%u: %u-0x%llx\n", + queue, cmd, (unsigned long long)dma_addr); - qm_mb_write(qm, mailbox); - ret = qm_wait_mb_finish(qm, mailbox); + qm_mb_pre_init(&mailbox, cmd, queue, op); + mailbox.base_l = cpu_to_le32(lower_32_bits(dma_addr)); + mailbox.base_h = cpu_to_le32(upper_32_bits(dma_addr)); -unlock: + mutex_lock(&qm->mailbox_lock); + ret = qm_mb_nolock(qm, &mailbox); mutex_unlock(&qm->mailbox_lock); return ret; } -static int qm_config_set(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, - u16 queue, bool op) -{ - struct qm_mailbox mailbox; - - qm_mb_pre_init(&mailbox, cmd, dma_addr, queue, op); - - return qm_mb(qm, &mailbox); -} - -static int qm_config_get(struct hisi_qm *qm, u64 *base, u8 cmd, u16 queue) -{ - struct qm_mailbox mailbox; - int ret; - - qm_mb_pre_init(&mailbox, cmd, 0, queue, 1); - - ret = qm_mb(qm, &mailbox); - if (ret) - return ret; - - *base = le32_to_cpu(mailbox.base_l) | - ((u64)le32_to_cpu(mailbox.base_h) << 32); - - return 0; -} - -static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd, - u16 index, u8 priority) -{ - void __iomem *io_base = qm->io_base; - u16 randata = 0; - u64 doorbell; - - if (cmd == QM_DOORBELL_CMD_SQ || cmd == QM_DOORBELL_CMD_CQ) - io_base = qm->db_io_base + (u64)qn * qm->db_interval + - QM_DOORBELL_SQ_CQ_BASE_V2; - else - io_base += QM_DOORBELL_EQ_AEQ_BASE_V2; - - doorbell = qn | ((u64)cmd << QM_DB_CMD_SHIFT_V2) | - ((u64)randata << QM_DB_RAND_SHIFT_V2) | - ((u64)index << QM_DB_INDEX_SHIFT_V2) | - ((u64)priority << QM_DB_PRIORITY_SHIFT_V2); - - writeq(doorbell, io_base); -} - /* * Each state Reg is checked 100 times, * with a delay of 100 microseconds after each check @@ -309,10 +230,13 @@ static int qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number) u64 sqc_vft; int ret; - ret = qm_config_get(qm, &sqc_vft, QM_MB_CMD_SQC_VFT_V2, 0); 
+ ret = qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); if (ret) return ret; + sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); *number = (QM_SQC_VFT_NUM_MASK_V2 & (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; @@ -320,6 +244,36 @@ static int qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number) return 0; } +static int qm_get_sqc(struct hisi_qm *qm, u64 *addr) +{ + int ret; + + ret = qm_mb(qm, QM_MB_CMD_SQC_BT, 0, 0, 1); + if (ret) + return ret; + + *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + + return 0; +} + +static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) +{ + int ret; + + ret = qm_mb(qm, QM_MB_CMD_CQC_BT, 0, 0, 1); + if (ret) + return ret; + + *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + + return 0; +} + static int qm_rw_regs_read(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; @@ -427,6 +381,12 @@ static int qm_rw_regs_write(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + ret = qm_write_reg(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1); + if (ret) { + dev_err(dev, "failed to write QM_QUE_ISO_CFG_V!\n"); + return ret; + } + ret = qm_write_reg(qm, QM_PAGE_SIZE, &vf_data->page_size, 1); if (ret) { dev_err(dev, "failed to write QM_PAGE_SIZE!\n"); @@ -456,19 +416,6 @@ static int qm_rw_regs_write(struct hisi_qm *qm, struct acc_vf_data *vf_data) return 0; } -static void vf_qm_xeqc_save(struct hisi_qm *qm, - struct acc_vf_migration *acc_vf_dev) -{ - struct acc_vf_data *vf_data = acc_vf_dev->vf_data; - u16 eq_head, aeq_head; - - eq_head = vf_data->qm_eqc_dw[0] & 0xFFFF; - qm_db(qm, 0, QM_DOORBELL_CMD_EQ, eq_head, 0); - - aeq_head = vf_data->qm_aeqc_dw[0] & 0xFFFF; - qm_db(qm, 0, QM_DOORBELL_CMD_AEQ, aeq_head, 0); -} - /* * the vf QM have unbind from host, insmod in the VM * so, qm just have the addr from pci dev @@ -492,29 +439,26 @@ static int vf_migration_data_store(struct hisi_qm *qm, * every Reg is 32 bit, the dma address is 64 bit * so, the dma address is store in the Reg2 and Reg1 */ - vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH]; + vf_data->eqe_dma = vf_data->qm_eqc_dw[2]; vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW]; - vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH]; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[1]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[2]; vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW]; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[1]; /* Through SQC_BT/CQC_BT to get sqc and cqc address */ - ret = qm_config_get(qm, &vf_data->sqc_dma, QM_MB_CMD_SQC_BT, 0); + ret = qm_get_sqc(qm, &vf_data->sqc_dma); if (ret) { dev_err(dev, "failed to read SQC addr!\n"); return -EINVAL; } - ret = qm_config_get(qm, &vf_data->cqc_dma, QM_MB_CMD_CQC_BT, 0); + ret = qm_get_cqc(qm, &vf_data->cqc_dma); if (ret) { dev_err(dev, "failed to read CQC addr!\n"); return -EINVAL; } - /* Save eqc and aeqc interrupt information */ - vf_qm_xeqc_save(qm, acc_vf_dev); - return 0; } @@ -527,6 +471,27 @@ static void qm_dev_cmd_init(struct hisi_qm *qm) writel(0x0, qm->io_base + QM_IFC_INT_MASK); } +static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd, + u16 index, u8 priority) +{ + void __iomem *io_base = 
qm->io_base; + u16 randata = 0; + u64 doorbell; + + if (cmd == QM_DOORBELL_CMD_SQ || cmd == QM_DOORBELL_CMD_CQ) + io_base = qm->db_io_base + (u64)qn * qm->db_interval + + QM_DOORBELL_SQ_CQ_BASE_V2; + else + io_base += QM_DOORBELL_EQ_AEQ_BASE_V2; + + doorbell = qn | ((u64)cmd << QM_DB_CMD_SHIFT_V2) | + ((u64)randata << QM_DB_RAND_SHIFT_V2) | + ((u64)index << QM_DB_INDEX_SHIFT_V2) | + ((u64)priority << QM_DB_PRIORITY_SHIFT_V2); + + writeq(doorbell, io_base); +} + static void vf_qm_fun_restart(struct hisi_qm *qm, struct acc_vf_migration *acc_vf_dev) { @@ -535,10 +500,9 @@ static void vf_qm_fun_restart(struct hisi_qm *qm, int i; /* - * When the Guest is rebooted or reseted, the SMMU page table - * will be destroyed, and the QP queue cannot be returned - * normally at this time. so if Guest acc driver have removed, - * don't need to restart QP. + * When the system is rebooted, the SMMU page table is destroyed, + * and the QP queue cannot be returned normally at this time. + * if vf_ready == 0x2, don't need to restart QP. */ if (vf_data->vf_state != QM_READY) { dev_err(dev, "failed to restart VF!\n"); @@ -553,7 +517,6 @@ static int vf_match_info_check(struct hisi_qm *qm, struct acc_vf_migration *acc_vf_dev) { struct acc_vf_data *vf_data = acc_vf_dev->vf_data; - struct hisi_qm *pf_qm = acc_vf_dev->pf_qm; struct device *dev = &qm->pdev->dev; u32 que_iso_state; int ret; @@ -577,7 +540,7 @@ static int vf_match_info_check(struct hisi_qm *qm, } /* vf isolation state check */ - ret = qm_read_reg(pf_qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); + ret = qm_read_reg(qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); if (ret) { dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n"); return ret; @@ -610,13 +573,13 @@ static int vf_migration_data_recover(struct hisi_qm *qm, return ret; } - ret = qm_config_set(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); + ret = qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); if (ret) { dev_err(dev, "Set sqc failed!\n"); return ret; } - ret = qm_config_set(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); + ret = qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); if (ret) { dev_err(dev, "Set cqc failed!\n"); return ret; @@ -645,7 +608,7 @@ static int vf_qm_cache_wb(struct hisi_qm *qm) static int vf_qm_func_stop(struct hisi_qm *qm) { - return qm_config_set(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0); + return qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0); } static int pf_qm_get_qp_num(struct hisi_qm *qm, int vf_id, @@ -694,17 +657,19 @@ static int pf_qm_state_pre_save(struct hisi_qm *qm, int vf_id = acc_vf_dev->vf_id; int ret; - /* Vf acc type save */ + /* vf acc type save */ vf_data->acc_type = acc_vf_dev->acc_type; - /* Vf qp num save from PF */ - ret = pf_qm_get_qp_num(qm, vf_id, &vf_data->qp_base, &vf_data->qp_num); - if (ret) { + /* vf qp num save from PF */ + ret = pf_qm_get_qp_num(qm, vf_id, &qm->qp_base, &qm->qp_num); + if (ret || qm->qp_num <= 1) { dev_err(dev, "failed to get vft qp nums!\n"); return -EINVAL; } + vf_data->qp_base = qm->qp_base; + vf_data->qp_num = qm->qp_num; - /* Vf isolation state save from PF */ + /* vf isolation state save from PF */ ret = qm_read_reg(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1); if (ret) { dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n"); @@ -842,7 +807,12 @@ static int acc_vf_set_device_state(struct acc_vf_migration *acc_vf_dev, break; case VFIO_DEVICE_STATE_STOP: + /* restart all VF's QP */ + vf_qm_fun_restart(qm, acc_vf_dev); + + break; case VFIO_DEVICE_STATE_RESUMING: + break; default: ret = -EFAULT; @@ -1242,30 +1212,12 @@ static void acc_vf_release(void *device_data) 
module_put(THIS_MODULE); } -static void acc_vf_reset(void *device_data) -{ - struct acc_vf_migration *acc_vf_dev = - vfio_pci_vendor_data(device_data); - struct hisi_qm *qm = acc_vf_dev->vf_qm; - struct device *dev = &qm->pdev->dev; - u32 vf_state = QM_NOT_READY; - int ret; - - dev_info(dev, "QEMU prepare to Reset Guest!\n"); - ret = qm_write_reg(qm, QM_VF_STATE, &vf_state, 1); - if (ret) - dev_err(dev, "failed to write QM_VF_STATE\n"); -} - static long acc_vf_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { switch (cmd) { case VFIO_DEVICE_GET_REGION_INFO: return acc_vf_get_region_info(device_data, cmd, arg); - case VFIO_DEVICE_RESET: - acc_vf_reset(device_data); - return vfio_pci_ioctl(device_data, cmd, arg); default: return vfio_pci_ioctl(device_data, cmd, arg); } @@ -1651,19 +1603,6 @@ init_qm_error: return -ENOMEM; } -static int hisi_acc_get_vf_id(struct pci_dev *dev) -{ - struct pci_dev *pf; - - if (!dev->is_virtfn) - return -EINVAL; - - pf = pci_physfn(dev); - return (((dev->bus->number << 8) + dev->devfn) - - ((pf->bus->number << 8) + pf->devfn + pf->sriov->offset)) / - pf->sriov->stride; -} - static void *acc_vf_probe(struct pci_dev *pdev) { struct acc_vf_migration *acc_vf_dev; @@ -1689,7 +1628,7 @@ static void *acc_vf_probe(struct pci_dev *pdev) return ERR_PTR(-EINVAL); } - vf_id = hisi_acc_get_vf_id(vf_dev); + vf_id = PCI_FUNC(vf_dev->devfn); if (vf_id < 0) { dev_info(&pdev->dev, "vf device: %s, vf id: %d\n", pf_qm->dev_name, vf_id); @@ -1706,7 +1645,7 @@ static void *acc_vf_probe(struct pci_dev *pdev) return ERR_PTR(-ENOMEM); } - acc_vf_dev->vf_id = vf_id + 1; + acc_vf_dev->vf_id = vf_id; acc_vf_dev->vf_vendor = pdev->vendor; acc_vf_dev->vf_device = pdev->device; acc_vf_dev->pf_dev = pf_dev; @@ -1736,8 +1675,6 @@ static void acc_vf_remove(void *vendor_data) static struct vfio_pci_vendor_driver_ops sec_vf_mig_ops = { .owner = THIS_MODULE, .name = "hisi_sec2", - .vendor = PCI_VENDOR_ID_HUAWEI, - .device = PCI_DEVICE_ID_HUAWEI_SEC_VF, .probe = acc_vf_probe, .remove = acc_vf_remove, .device_ops = &acc_vf_device_ops_node, @@ -1746,8 +1683,6 @@ static struct vfio_pci_vendor_driver_ops sec_vf_mig_ops = { static struct vfio_pci_vendor_driver_ops hpre_vf_mig_ops = { .owner = THIS_MODULE, .name = "hisi_hpre", - .vendor = PCI_VENDOR_ID_HUAWEI, - .device = PCI_DEVICE_ID_HUAWEI_HPRE_VF, .probe = acc_vf_probe, .remove = acc_vf_remove, .device_ops = &acc_vf_device_ops_node, @@ -1756,8 +1691,6 @@ static struct vfio_pci_vendor_driver_ops hpre_vf_mig_ops = { static struct vfio_pci_vendor_driver_ops zip_vf_mig_ops = { .owner = THIS_MODULE, .name = "hisi_zip", - .vendor = PCI_VENDOR_ID_HUAWEI, - .device = PCI_DEVICE_ID_HUAWEI_ZIP_VF, .probe = acc_vf_probe, .remove = acc_vf_remove, .device_ops = &acc_vf_device_ops_node, @@ -1776,13 +1709,11 @@ static int __init acc_vf_module_init(void) static void __exit acc_vf_module_exit(void) { - vfio_pci_unregister_vendor_driver(&sec_vf_mig_ops); - vfio_pci_unregister_vendor_driver(&hpre_vf_mig_ops); - vfio_pci_unregister_vendor_driver(&zip_vf_mig_ops); + vfio_pci_unregister_vendor_driver(&acc_vf_device_ops_node); }; module_init(acc_vf_module_init); module_exit(acc_vf_module_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Longfang Liu "); -MODULE_DESCRIPTION("HiSilicon Accelerator VF live migration driver"); +MODULE_DESCRIPTION("HiSilicon Accelerator VF live migration driver"); \ No newline at end of file diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.h 
b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.h index a2368ba..1fdcba0 100644 --- a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.h +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/migration/acc_vf_migration.h @@ -6,7 +6,7 @@ #include #include -#include "../vfio.h" +#include "../../include_linux/vfio.h" #include "../hisi_acc_qm.h" @@ -58,11 +58,9 @@ #define QM_MB_CMD_SEND_BASE 0x300 #define QM_MB_BUSY_SHIFT 13 #define QM_MB_OP_SHIFT 14 -#define QM_MB_WAIT_READY_CNT 10 -#define QM_MB_MAX_WAIT_CNT 3000 -#define WAIT_PERIOD_US_MIN 100 -#define WAIT_PERIOD_US_MAX 200 -#define QM_MB_STATUS_MASK GENMASK(12, 9) +#define QM_MB_CMD_DATA_ADDR_L 0x304 +#define QM_MB_CMD_DATA_ADDR_H 0x308 +#define QM_MB_MAX_WAIT_CNT 6000 /* doorbell */ #define QM_DOORBELL_CMD_SQ 0 @@ -79,8 +77,6 @@ #define QM_REG_ADDR_OFFSET 0x0004 #define QM_XQC_ADDR_OFFSET 32U -#define QM_XQC_ADDR_LOW 0x1 -#define QM_XQC_ADDR_HIGH 0x2 #define QM_VF_AEQ_INT_MASK 0x0004 #define QM_VF_EQ_INT_MASK 0x000c #define QM_IFC_INT_SOURCE_V 0x0020 @@ -219,32 +215,6 @@ struct acc_vf_region_ops { struct vfio_info_cap *caps); }; -/* Single Root I/O Virtualization */ -struct pci_sriov { - int pos; /* Capability position */ - int nres; /* Number of resources */ - u32 cap; /* SR-IOV Capabilities */ - u16 ctrl; /* SR-IOV Control */ - u16 total_VFs; /* Total VFs associated with the PF */ - u16 initial_VFs; /* Initial VFs associated with the PF */ - u16 num_VFs; /* Number of VFs available */ - u16 offset; /* First VF Routing ID offset */ - u16 stride; /* Following VF stride */ - u16 vf_device; /* VF device ID */ - u32 pgsz; /* Page size for BAR alignment */ - u8 link; /* Function Dependency Link */ - u8 max_VF_buses; /* Max buses consumed by VFs */ - u16 driver_max_VFs; /* Max num VFs driver supports */ - struct pci_dev *dev; /* Lowest numbered PF */ - struct pci_dev *self; /* This PF */ - u32 class; /* VF device */ - u8 hdr_type; /* VF header type */ - u16 subsystem_vendor; /* VF subsystem vendor */ - u16 subsystem_device; /* VF subsystem device */ - resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ - bool drivers_autoprobe; /* Auto probing of VFs by driver */ -}; - struct acc_vf_region { u32 type; u32 subtype; @@ -269,4 +239,4 @@ struct acc_vf_irq { const struct acc_vf_irqops *ops; }; -#endif /* ACC_MIG_H */ +#endif /* ACC_MIG_H */ \ No newline at end of file diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/vfio.h b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/include_linux/vfio.h similarity index 98% rename from KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/vfio.h rename to KAEKernelDriver/KAEKernelDriver-OLK-5.10/include_linux/vfio.h index 3489930..0b6cda3 100644 --- a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/hisilicon/vfio.h +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/include_linux/vfio.h @@ -13,7 +13,11 @@ #include #include #include -#include +#include "../include_uapi_linux/vfio.h" + +#ifndef KABI_EXTEND +#define KABI_EXTEND(_new) _new; +#endif struct vfio_device { struct device *dev; @@ -291,4 +295,4 @@ static void __exit device_ops ## _module_exit(void) \ module_init(device_ops ## _module_init); \ module_exit(device_ops ## _module_exit) -#endif /* VFIO_H */ +#endif /* VFIO_H */ \ No newline at end of file diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.10/include_uapi_linux/vfio.h b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/include_uapi_linux/vfio.h new file mode 100644 index 0000000..52658db --- /dev/null +++ 
b/KAEKernelDriver/KAEKernelDriver-OLK-5.10/include_uapi_linux/vfio.h @@ -0,0 +1,1444 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * VFIO API definition + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _UAPIVFIO_H +#define _UAPIVFIO_H + +#include +#include + +#define VFIO_API_VERSION 0 + + +/* Kernel & User level defines for VFIO IOCTLs. */ + +/* Extensions */ + +#define VFIO_TYPE1_IOMMU 1 +#define VFIO_SPAPR_TCE_IOMMU 2 +#define VFIO_TYPE1v2_IOMMU 3 +/* + * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping). This + * capability is subject to change as groups are added or removed. + */ +#define VFIO_DMA_CC_IOMMU 4 + +/* Check if EEH is supported */ +#define VFIO_EEH 5 + +/* Two-stage IOMMU */ +#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ + +#define VFIO_SPAPR_TCE_v2_IOMMU 7 + +/* + * The No-IOMMU IOMMU offers no translation or isolation for devices and + * supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU + * code will taint the host kernel and should be used with extreme caution. + */ +#define VFIO_NOIOMMU_IOMMU 8 + +/* + * The vfio_iommu driver may support user clears dirty log manually, which means + * dirty log can be requested to not cleared automatically after dirty log is + * copied to userspace, it's user's duty to clear dirty log. + * + * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and + * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. + */ +#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 + +/* + * The IOCTL interface is designed for extensibility by embedding the + * structure length (argsz) and flags into structures passed between + * kernel and userspace. We therefore use the _IO() macro for these + * defines to avoid implicitly embedding a size into the ioctl request. + * As structure fields are added, argsz will increase to match and flag + * bits will be defined to indicate additional fields with valid data. + * It's *always* the caller's responsibility to indicate the size of + * the structure passed by setting argsz appropriately. + */ + +#define VFIO_TYPE (';') +#define VFIO_BASE 100 + +/* + * For extension of INFO ioctls, VFIO makes use of a capability chain + * designed after PCI/e capabilities. A flag bit indicates whether + * this capability chain is supported and a field defined in the fixed + * structure defines the offset of the first capability in the chain. + * This field is only valid when the corresponding bit in the flags + * bitmap is set. This offset field is relative to the start of the + * INFO buffer, as is the next field within each capability header. + * The id within the header is a shared address space per INFO ioctl, + * while the version field is specific to the capability id. The + * contents following the header are specific to the capability id. + */ +struct vfio_info_cap_header { + __u16 id; /* Identifies capability */ + __u16 version; /* Version specific to the capability ID */ + __u32 next; /* Offset of next capability */ +}; + +/* + * Callers of INFO ioctls passing insufficiently sized buffers will see + * the capability chain flag bit set, a zero value for the first capability + * offset (if available within the provided argsz), and argsz will be + * updated to report the necessary buffer size. 
For compatibility, the + * INFO ioctl will not report error in this case, but the capability chain + * will not be available. + */ + +/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */ + +/** + * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0) + * + * Report the version of the VFIO API. This allows us to bump the entire + * API version should we later need to add or change features in incompatible + * ways. + * Return: VFIO_API_VERSION + * Availability: Always + */ +#define VFIO_GET_API_VERSION _IO(VFIO_TYPE, VFIO_BASE + 0) + +/** + * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32) + * + * Check whether an extension is supported. + * Return: 0 if not supported, 1 (or some other positive integer) if supported. + * Availability: Always + */ +#define VFIO_CHECK_EXTENSION _IO(VFIO_TYPE, VFIO_BASE + 1) + +/** + * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32) + * + * Set the iommu to the given type. The type must be supported by an + * iommu driver as verified by calling CHECK_EXTENSION using the same + * type. A group must be set to this file descriptor before this + * ioctl is available. The IOMMU interfaces enabled by this call are + * specific to the value set. + * Return: 0 on success, -errno on failure + * Availability: When VFIO group attached + */ +#define VFIO_SET_IOMMU _IO(VFIO_TYPE, VFIO_BASE + 2) + +/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */ + +/** + * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3, + * struct vfio_group_status) + * + * Retrieve information about the group. Fills in provided + * struct vfio_group_info. Caller sets argsz. + * Return: 0 on succes, -errno on failure. + * Availability: Always + */ +struct vfio_group_status { + __u32 argsz; + __u32 flags; +#define VFIO_GROUP_FLAGS_VIABLE (1 << 0) +#define VFIO_GROUP_FLAGS_CONTAINER_SET (1 << 1) +}; +#define VFIO_GROUP_GET_STATUS _IO(VFIO_TYPE, VFIO_BASE + 3) + +/** + * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32) + * + * Set the container for the VFIO group to the open VFIO file + * descriptor provided. Groups may only belong to a single + * container. Containers may, at their discretion, support multiple + * groups. Only when a container is set are all of the interfaces + * of the VFIO file descriptor and the VFIO group file descriptor + * available to the user. + * Return: 0 on success, -errno on failure. + * Availability: Always + */ +#define VFIO_GROUP_SET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 4) + +/** + * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5) + * + * Remove the group from the attached container. This is the + * opposite of the SET_CONTAINER call and returns the group to + * an initial state. All device file descriptors must be released + * prior to calling this interface. When removing the last group + * from a container, the IOMMU will be disabled and all state lost, + * effectively also returning the VFIO file descriptor to an initial + * state. + * Return: 0 on success, -errno on failure. + * Availability: When attached to container + */ +#define VFIO_GROUP_UNSET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 5) + +/** + * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char) + * + * Return a new file descriptor for the device object described by + * the provided string. The string should match a device listed in + * the devices subdirectory of the IOMMU group sysfs entry. The + * group containing the device must already be added to this context. 
+ * Return: new file descriptor on success, -errno on failure. + * Availability: When attached to container + */ +#define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6) + +/* --------------- IOCTLs for DEVICE file descriptors --------------- */ + +/** + * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7, + * struct vfio_device_info) + * + * Retrieve information about the device. Fills in provided + * struct vfio_device_info. Caller sets argsz. + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_info { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ +#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ +#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ +#define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ +#define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ +#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ +#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ + __u32 num_regions; /* Max region index + 1 */ + __u32 num_irqs; /* Max IRQ index + 1 */ + __u32 cap_offset; /* Offset within info struct of first cap */ +}; +#define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) + +/* + * Vendor driver using Mediated device framework should provide device_api + * attribute in supported type attribute groups. Device API string should be one + * of the following corresponding to device flags in vfio_device_info structure. + */ + +#define VFIO_DEVICE_API_PCI_STRING "vfio-pci" +#define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform" +#define VFIO_DEVICE_API_AMBA_STRING "vfio-amba" +#define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" +#define VFIO_DEVICE_API_AP_STRING "vfio-ap" + +/* + * The following capabilities are unique to s390 zPCI devices. Their contents + * are further-defined in vfio_zdev.h + */ +#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1 +#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2 +#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 +#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 + +/** + * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, + * struct vfio_region_info) + * + * Retrieve information about a device region. Caller provides + * struct vfio_region_info with index value set. Caller sets argsz. + * Implementation of region mapping is bus driver specific. This is + * intended to describe MMIO, I/O port, as well as bus specific + * regions (ex. PCI config space). Zero sized regions may be used + * to describe unimplemented regions (ex. unimplemented PCI BARs). + * Return: 0 on success, -errno on failure. + */ +struct vfio_region_info { + __u32 argsz; + __u32 flags; +#define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ +#define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ +#define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ +#define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ + __u32 index; /* Region index */ + __u32 cap_offset; /* Offset within info struct of first cap */ + __u64 size; /* Region size (bytes) */ + __u64 offset; /* Region offset from start of device fd */ +}; +#define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) + +/* + * The sparse mmap capability allows finer granularity of specifying areas + * within a region with mmap support. When specified, the user should only + * mmap the offset ranges specified by the areas array. 
mmaps outside of the + * areas specified may fail (such as the range covering a PCI MSI-X table) or + * may result in improper device behavior. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 + +struct vfio_region_sparse_mmap_area { + __u64 offset; /* Offset of mmap'able area within region */ + __u64 size; /* Size of mmap'able area */ +}; + +struct vfio_region_info_cap_sparse_mmap { + struct vfio_info_cap_header header; + __u32 nr_areas; + __u32 reserved; + struct vfio_region_sparse_mmap_area areas[]; +}; + +/* + * The device specific type capability allows regions unique to a specific + * device or class of devices to be exposed. This helps solve the problem for + * vfio bus drivers of defining which region indexes correspond to which region + * on the device, without needing to resort to static indexes, as done by + * vfio-pci. For instance, if we were to go back in time, we might remove + * VFIO_PCI_VGA_REGION_INDEX and let vfio-pci simply define that all indexes + * greater than or equal to VFIO_PCI_NUM_REGIONS are device specific and we'd + * make a "VGA" device specific type to describe the VGA access space. This + * means that non-VGA devices wouldn't need to waste this index, and thus the + * address space associated with it due to implementation of device file + * descriptor offsets in vfio-pci. + * + * The current implementation is now part of the user ABI, so we can't use this + * for VGA, but there are other upcoming use cases, such as opregions for Intel + * IGD devices and framebuffers for vGPU devices. We missed VGA, but we'll + * use this for future additions. + * + * The structure below defines version 1 of this capability. + */ +#define VFIO_REGION_INFO_CAP_TYPE 2 + +struct vfio_region_info_cap_type { + struct vfio_info_cap_header header; + __u32 type; /* global per bus driver */ + __u32 subtype; /* type specific */ +}; + +/* + * List of region types, global per bus driver. + * If you introduce a new type, please add it here. + */ + +/* PCI region type containing a PCI vendor part */ +#define VFIO_REGION_TYPE_PCI_VENDOR_TYPE (1 << 31) +#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) +#define VFIO_REGION_TYPE_GFX (1) +#define VFIO_REGION_TYPE_CCW (2) +#define VFIO_REGION_TYPE_MIGRATION (3) + +/* sub-types for VFIO_REGION_TYPE_PCI_* */ + +/* 8086 vendor PCI sub-types */ +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) + +/* 10de vendor PCI sub-types */ +/* + * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space. + */ +#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1) + +/* 1014 vendor PCI sub-types */ +/* + * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU + * to do TLB invalidation on a GPU. + */ +#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) + +/* sub-types for VFIO_REGION_TYPE_GFX */ +#define VFIO_REGION_SUBTYPE_GFX_EDID (1) + +/** + * struct vfio_region_gfx_edid - EDID region layout. + * + * Set display link state and EDID blob. + * + * The EDID blob has monitor information such as brand, name, serial + * number, physical size, supported video modes and more. + * + * This special region allows userspace (typically qemu) set a virtual + * EDID for the virtual monitor, which allows a flexible display + * configuration. 
+ * + * For the edid blob spec look here: + * https://en.wikipedia.org/wiki/Extended_Display_Identification_Data + * + * On linux systems you can find the EDID blob in sysfs: + * /sys/class/drm/${card}/${connector}/edid + * + * You can use the edid-decode ulility (comes with xorg-x11-utils) to + * decode the EDID blob. + * + * @edid_offset: location of the edid blob, relative to the + * start of the region (readonly). + * @edid_max_size: max size of the edid blob (readonly). + * @edid_size: actual edid size (read/write). + * @link_state: display link state (read/write). + * VFIO_DEVICE_GFX_LINK_STATE_UP: Monitor is turned on. + * VFIO_DEVICE_GFX_LINK_STATE_DOWN: Monitor is turned off. + * @max_xres: max display width (0 == no limitation, readonly). + * @max_yres: max display height (0 == no limitation, readonly). + * + * EDID update protocol: + * (1) set link-state to down. + * (2) update edid blob and size. + * (3) set link-state to up. + */ +struct vfio_region_gfx_edid { + __u32 edid_offset; + __u32 edid_max_size; + __u32 edid_size; + __u32 max_xres; + __u32 max_yres; + __u32 link_state; +#define VFIO_DEVICE_GFX_LINK_STATE_UP 1 +#define VFIO_DEVICE_GFX_LINK_STATE_DOWN 2 +}; + +/* sub-types for VFIO_REGION_TYPE_CCW */ +#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) +#define VFIO_REGION_SUBTYPE_CCW_CRW (3) + +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +/* + * The structure vfio_device_migration_info is placed at the 0th offset of + * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related + * migration information. Field accesses from this structure are only supported + * at their native width and alignment. Otherwise, the result is undefined and + * vendor drivers should return an error. + * + * device_state: (read/write) + * - The user application writes to this field to inform the vendor driver + * about the device state to be transitioned to. + * - The vendor driver should take the necessary actions to change the + * device state. After successful transition to a given state, the + * vendor driver should return success on write(device_state, state) + * system call. If the device state transition fails, the vendor driver + * should return an appropriate -errno for the fault condition. + * - On the user application side, if the device state transition fails, + * that is, if write(device_state, state) returns an error, read + * device_state again to determine the current state of the device from + * the vendor driver. + * - The vendor driver should return previous state of the device unless + * the vendor driver has encountered an internal error, in which case + * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR. + * - The user application must use the device reset ioctl to recover the + * device from VFIO_DEVICE_STATE_ERROR state. If the device is + * indicated to be in a valid device state by reading device_state, the + * user application may attempt to transition the device to any valid + * state reachable from the current state or terminate itself. + * + * device_state consists of 3 bits: + * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear, + * it indicates the _STOP state. When the device state is changed to + * _STOP, driver should stop the device before write() returns. 
+ * - If bit 1 is set, it indicates the _SAVING state, which means that the + * driver should start gathering device state information that will be + * provided to the VFIO user application to save the device's state. + * - If bit 2 is set, it indicates the _RESUMING state, which means that + * the driver should prepare to resume the device. Data provided through + * the migration region should be used to resume the device. + * Bits 3 - 31 are reserved for future use. To preserve them, the user + * application should perform a read-modify-write operation on this + * field when modifying the specified bits. + * + * +------- _RESUMING + * |+------ _SAVING + * ||+----- _RUNNING + * ||| + * 000b => Device Stopped, not saving or resuming + * 001b => Device running, which is the default state + * 010b => Stop the device & save the device state, stop-and-copy state + * 011b => Device running and save the device state, pre-copy state + * 100b => Device stopped and the device state is resuming + * 101b => Invalid state + * 110b => Error state + * 111b => Invalid state + * + * State transitions: + * + * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP + * (100b) (001b) (011b) (010b) (000b) + * 0. Running or default state + * | + * + * 1. Normal Shutdown (optional) + * |------------------------------------->| + * + * 2. Save the state or suspend + * |------------------------->|---------->| + * + * 3. Save the state during live migration + * |----------->|------------>|---------->| + * + * 4. Resuming + * |<---------| + * + * 5. Resumed + * |--------->| + * + * 0. Default state of VFIO device is _RUNNING when the user application starts. + * 1. During normal shutdown of the user application, the user application may + * optionally change the VFIO device state from _RUNNING to _STOP. This + * transition is optional. The vendor driver must support this transition but + * must not require it. + * 2. When the user application saves state or suspends the application, the + * device state transitions from _RUNNING to stop-and-copy and then to _STOP. + * On state transition from _RUNNING to stop-and-copy, driver must stop the + * device, save the device state and send it to the application through the + * migration region. The sequence to be followed for such transition is given + * below. + * 3. In live migration of user application, the state transitions from _RUNNING + * to pre-copy, to stop-and-copy, and to _STOP. + * On state transition from _RUNNING to pre-copy, the driver should start + * gathering the device state while the application is still running and send + * the device state data to application through the migration region. + * On state transition from pre-copy to stop-and-copy, the driver must stop + * the device, save the device state and send it to the user application + * through the migration region. + * Vendor drivers must support the pre-copy state even for implementations + * where no data is provided to the user before the stop-and-copy state. The + * user must not be required to consume all migration data before the device + * transitions to a new state, including the stop-and-copy state. + * The sequence to be followed for above two transitions is given below. + * 4. To start the resuming phase, the device state should be transitioned from + * the _RUNNING to the _RESUMING state. + * In the _RESUMING state, the driver should use the device state data + * received through the migration region to resume the device. + * 5. 
After providing saved device data to the driver, the application should + * change the state from _RESUMING to _RUNNING. + * + * reserved: + * Reads on this field return zero and writes are ignored. + * + * pending_bytes: (read only) + * The number of pending bytes still to be migrated from the vendor driver. + * + * data_offset: (read only) + * The user application should read data_offset field from the migration + * region. The user application should read the device data from this + * offset within the migration region during the _SAVING state or write + * the device data during the _RESUMING state. See below for details of + * sequence to be followed. + * + * data_size: (read/write) + * The user application should read data_size to get the size in bytes of + * the data copied in the migration region during the _SAVING state and + * write the size in bytes of the data copied in the migration region + * during the _RESUMING state. + * + * The format of the migration region is as follows: + * ------------------------------------------------------------------ + * |vfio_device_migration_info| data section | + * | | /////////////////////////////// | + * ------------------------------------------------------------------ + * ^ ^ + * offset 0-trapped part data_offset + * + * The structure vfio_device_migration_info is always followed by the data + * section in the region, so data_offset will always be nonzero. The offset + * from where the data is copied is decided by the kernel driver. The data + * section can be trapped, mmapped, or partitioned, depending on how the kernel + * driver defines the data section. The data section partition can be defined + * as mapped by the sparse mmap capability. If mmapped, data_offset must be + * page aligned, whereas initial section which contains the + * vfio_device_migration_info structure, might not end at the offset, which is + * page aligned. The user is not required to access through mmap regardless + * of the capabilities of the region mmap. + * The vendor driver should determine whether and how to partition the data + * section. The vendor driver should return data_offset accordingly. + * + * The sequence to be followed while in pre-copy state and stop-and-copy state + * is as follows: + * a. Read pending_bytes, indicating the start of a new iteration to get device + * data. Repeated read on pending_bytes at this stage should have no side + * effects. + * If pending_bytes == 0, the user application should not iterate to get data + * for that device. + * If pending_bytes > 0, perform the following steps. + * b. Read data_offset, indicating that the vendor driver should make data + * available through the data section. The vendor driver should return this + * read operation only after data is available from (region + data_offset) + * to (region + data_offset + data_size). + * c. Read data_size, which is the amount of data in bytes available through + * the migration region. + * Read on data_offset and data_size should return the offset and size of + * the current buffer if the user application reads data_offset and + * data_size more than once here. + * d. Read data_size bytes of data from (region + data_offset) from the + * migration region. + * e. Process the data. + * f. Read pending_bytes, which indicates that the data from the previous + * iteration has been read. If pending_bytes > 0, go to step b. 
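+ *
+ * As an illustrative sketch only (device_fd, reg_off and buf are
+ * placeholder names; error handling and short-read checks are
+ * omitted), steps a. through f. above could be driven from
+ * userspace as:
+ *
+ *	__u64 pending, off, size;
+ *
+ *	pread(device_fd, &pending, sizeof(pending), reg_off +
+ *	      offsetof(struct vfio_device_migration_info, pending_bytes));
+ *	while (pending > 0) {
+ *		pread(device_fd, &off, sizeof(off), reg_off +
+ *		      offsetof(struct vfio_device_migration_info, data_offset));
+ *		pread(device_fd, &size, sizeof(size), reg_off +
+ *		      offsetof(struct vfio_device_migration_info, data_size));
+ *		pread(device_fd, buf, size, reg_off + off);
+ *		... process buf, then re-read pending_bytes (step f.) ...
+ *		pread(device_fd, &pending, sizeof(pending), reg_off +
+ *		      offsetof(struct vfio_device_migration_info, pending_bytes));
+ *	}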
+ * + * The user application can transition from the _SAVING|_RUNNING + * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the + * number of pending bytes. The user application should iterate in _SAVING + * (stop-and-copy) until pending_bytes is 0. + * + * The sequence to be followed while _RESUMING device state is as follows: + * While data for this device is available, repeat the following steps: + * a. Read data_offset from where the user application should write data. + * b. Write migration data starting at the migration region + data_offset for + * the length determined by data_size from the migration source. + * c. Write data_size, which indicates to the vendor driver that data is + * written in the migration region. Vendor driver must return this write + * operations on consuming data. Vendor driver should apply the + * user-provided migration region data to the device resume state. + * + * If an error occurs during the above sequences, the vendor driver can return + * an error code for next read() or write() operation, which will terminate the + * loop. The user application should then take the next necessary action, for + * example, failing migration or terminating the user application. + * + * For the user application, data is opaque. The user application should write + * data in the same order as the data is received and the data should be of + * same transaction size at the source. + */ + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? \ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + +#define VFIO_DEVICE_STATE_IS_ERROR(state) \ + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING)) + +#define VFIO_DEVICE_STATE_SET_ERROR(state) \ + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + + __u32 reserved; + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + +/* + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped + * which allows direct access to non-MSIX registers which happened to be within + * the same system page. + * + * Even though the userspace gets direct access to the MSIX data, the existing + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration. + */ +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 + +/* + * Capability with compressed real address (aka SSA - small system address) + * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing + * and by the userspace to associate a NVLink bridge with a GPU. + */ +#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4 + +struct vfio_region_info_cap_nvlink2_ssatgt { + struct vfio_info_cap_header header; + __u64 tgt; +}; + +/* + * Capability with an NVLink link speed. The value is read by + * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed" + * property in the device tree. The value is fixed in the hardware + * and failing to provide the correct value results in the link + * not working with no indication from the driver why. 
+ */ +#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5 + +struct vfio_region_info_cap_nvlink2_lnkspd { + struct vfio_info_cap_header header; + __u32 link_speed; + __u32 __pad; +}; + +/** + * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, + * struct vfio_irq_info) + * + * Retrieve information about a device IRQ. Caller provides + * struct vfio_irq_info with index value set. Caller sets argsz. + * Implementation of IRQ mapping is bus driver specific. Indexes + * using multiple IRQs are primarily intended to support MSI-like + * interrupt blocks. Zero count irq blocks may be used to describe + * unimplemented interrupt types. + * + * The EVENTFD flag indicates the interrupt index supports eventfd based + * signaling. + * + * The MASKABLE flags indicates the index supports MASK and UNMASK + * actions described below. + * + * AUTOMASKED indicates that after signaling, the interrupt line is + * automatically masked by VFIO and the user needs to unmask the line + * to receive new interrupts. This is primarily intended to distinguish + * level triggered interrupts. + * + * The NORESIZE flag indicates that the interrupt lines within the index + * are setup as a set and new subindexes cannot be enabled without first + * disabling the entire index. This is used for interrupts like PCI MSI + * and MSI-X where the driver may only use a subset of the available + * indexes, but VFIO needs to enable a specific number of vectors + * upfront. In the case of MSI-X, where the user can enable MSI-X and + * then add and unmask vectors, it's up to userspace to make the decision + * whether to allocate the maximum supported number of vectors or tear + * down setup and incrementally increase the vectors as each is enabled. + */ +struct vfio_irq_info { + __u32 argsz; + __u32 flags; +#define VFIO_IRQ_INFO_EVENTFD (1 << 0) +#define VFIO_IRQ_INFO_MASKABLE (1 << 1) +#define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) +#define VFIO_IRQ_INFO_NORESIZE (1 << 3) + __u32 index; /* IRQ index */ + __u32 count; /* Number of IRQs within this index */ +}; +#define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) + +/** + * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) + * + * Set signaling, masking, and unmasking of interrupts. Caller provides + * struct vfio_irq_set with all fields set. 'start' and 'count' indicate + * the range of subindexes being specified. + * + * The DATA flags specify the type of data provided. If DATA_NONE, the + * operation performs the specified action immediately on the specified + * interrupt(s). For example, to unmask AUTOMASKED interrupt [0,0]: + * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1. + * + * DATA_BOOL allows sparse support for the same on arrays of interrupts. + * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]): + * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3, + * data = {1,0,1} + * + * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd. + * A value of -1 can be used to either de-assign interrupts if already + * assigned or skip un-assigned interrupts. For example, to set an eventfd + * to be trigger for interrupts [0,0] and [0,2]: + * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3, + * data = {fd1, -1, fd2} + * If index [0,1] is previously set, two count = 1 ioctls calls would be + * required to set [0,0] and [0,2] without changing [0,1]. 
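+ *
+ * As an illustrative sketch (device_fd is a placeholder and error
+ * handling is omitted), binding a single eventfd as the trigger for
+ * MSI vector [0,0] could look like:
+ *
+ *	struct vfio_irq_set *set;
+ *	__s32 fd = eventfd(0, EFD_CLOEXEC);
+ *
+ *	set = malloc(sizeof(*set) + sizeof(fd));
+ *	set->argsz = sizeof(*set) + sizeof(fd);
+ *	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+ *	set->index = VFIO_PCI_MSI_IRQ_INDEX;
+ *	set->start = 0;
+ *	set->count = 1;
+ *	memcpy(set->data, &fd, sizeof(fd));
+ *	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
+ *	free(set);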
+ * + * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used + * with ACTION_TRIGGER to perform kernel level interrupt loopback testing + * from userspace (ie. simulate hardware triggering). + * + * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER + * enables the interrupt index for the device. Individual subindex interrupts + * can be disabled using the -1 value for DATA_EVENTFD or the index can be + * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0. + * + * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while + * ACTION_TRIGGER specifies kernel->user signaling. + */ +struct vfio_irq_set { + __u32 argsz; + __u32 flags; +#define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */ +#define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */ +#define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */ +#define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */ +#define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */ +#define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */ + __u32 index; + __u32 start; + __u32 count; + __u8 data[]; +}; +#define VFIO_DEVICE_SET_IRQS _IO(VFIO_TYPE, VFIO_BASE + 10) + +#define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \ + VFIO_IRQ_SET_DATA_BOOL | \ + VFIO_IRQ_SET_DATA_EVENTFD) +#define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \ + VFIO_IRQ_SET_ACTION_UNMASK | \ + VFIO_IRQ_SET_ACTION_TRIGGER) +/** + * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11) + * + * Reset a device. + */ +#define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11) + +/* + * The VFIO-PCI bus driver makes use of the following fixed region and + * IRQ index mapping. Unimplemented regions return a size of zero. + * Unimplemented IRQ types return a count of zero. + */ + +enum { + VFIO_PCI_BAR0_REGION_INDEX, + VFIO_PCI_BAR1_REGION_INDEX, + VFIO_PCI_BAR2_REGION_INDEX, + VFIO_PCI_BAR3_REGION_INDEX, + VFIO_PCI_BAR4_REGION_INDEX, + VFIO_PCI_BAR5_REGION_INDEX, + VFIO_PCI_ROM_REGION_INDEX, + VFIO_PCI_CONFIG_REGION_INDEX, + /* + * Expose VGA regions defined for PCI base class 03, subclass 00. + * This includes I/O port ranges 0x3b0 to 0x3bb and 0x3c0 to 0x3df + * as well as the MMIO range 0xa0000 to 0xbffff. Each implemented + * range is found at it's identity mapped offset from the region + * offset, for example 0x3b0 is region_info.offset + 0x3b0. Areas + * between described ranges are unimplemented. + */ + VFIO_PCI_VGA_REGION_INDEX, + VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */ + /* device specific cap to define content. */ +}; + +enum { + VFIO_PCI_INTX_IRQ_INDEX, + VFIO_PCI_MSI_IRQ_INDEX, + VFIO_PCI_MSIX_IRQ_INDEX, + VFIO_PCI_ERR_IRQ_INDEX, + VFIO_PCI_REQ_IRQ_INDEX, + VFIO_PCI_NUM_IRQS +}; + +/* + * The vfio-ccw bus driver makes use of the following fixed region and + * IRQ index mapping. Unimplemented regions return a size of zero. + * Unimplemented IRQ types return a count of zero. + */ + +enum { + VFIO_CCW_CONFIG_REGION_INDEX, + VFIO_CCW_NUM_REGIONS +}; + +enum { + VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_CRW_IRQ_INDEX, + VFIO_CCW_NUM_IRQS +}; + +/** + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12, + * struct vfio_pci_hot_reset_info) + * + * Return: 0 on success, -errno on failure: + * -enospc = insufficient buffer, -enodev = unsupported for device. 
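+ *
+ * Since the number of dependent devices is not known up front, a
+ * common (illustrative) userspace pattern is to probe with a minimal
+ * buffer, which is expected to fail with -ENOSPC while still filling
+ * in count, and then to retry with a buffer sized from that count
+ * (device_fd is a placeholder, error handling omitted):
+ *
+ *	struct vfio_pci_hot_reset_info hdr = { .argsz = sizeof(hdr) };
+ *	struct vfio_pci_hot_reset_info *info;
+ *	size_t sz;
+ *
+ *	ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, &hdr);
+ *	sz = sizeof(*info) +
+ *	     hdr.count * sizeof(struct vfio_pci_dependent_device);
+ *	info = calloc(1, sz);
+ *	info->argsz = sz;
+ *	ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);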
+ */ +struct vfio_pci_dependent_device { + __u32 group_id; + __u16 segment; + __u8 bus; + __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ +}; + +struct vfio_pci_hot_reset_info { + __u32 argsz; + __u32 flags; + __u32 count; + struct vfio_pci_dependent_device devices[]; +}; + +#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +/** + * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13, + * struct vfio_pci_hot_reset) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_pci_hot_reset { + __u32 argsz; + __u32 flags; + __u32 count; + __s32 group_fds[]; +}; + +#define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13) + +/** + * VFIO_DEVICE_QUERY_GFX_PLANE - _IOW(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_device_query_gfx_plane) + * + * Set the drm_plane_type and flags, then retrieve the gfx plane info. + * + * flags supported: + * - VFIO_GFX_PLANE_TYPE_PROBE and VFIO_GFX_PLANE_TYPE_DMABUF are set + * to ask if the mdev supports dma-buf. 0 on support, -EINVAL on no + * support for dma-buf. + * - VFIO_GFX_PLANE_TYPE_PROBE and VFIO_GFX_PLANE_TYPE_REGION are set + * to ask if the mdev supports region. 0 on support, -EINVAL on no + * support for region. + * - VFIO_GFX_PLANE_TYPE_DMABUF or VFIO_GFX_PLANE_TYPE_REGION is set + * with each call to query the plane info. + * - Others are invalid and return -EINVAL. + * + * Note: + * 1. Plane could be disabled by guest. In that case, success will be + * returned with zero-initialized drm_format, size, width and height + * fields. + * 2. x_hot/y_hot is set to 0xFFFFFFFF if no hotspot information available + * + * Return: 0 on success, -errno on other failure. + */ +struct vfio_device_gfx_plane_info { + __u32 argsz; + __u32 flags; +#define VFIO_GFX_PLANE_TYPE_PROBE (1 << 0) +#define VFIO_GFX_PLANE_TYPE_DMABUF (1 << 1) +#define VFIO_GFX_PLANE_TYPE_REGION (1 << 2) + /* in */ + __u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */ + /* out */ + __u32 drm_format; /* drm format of plane */ + __u64 drm_format_mod; /* tiled mode */ + __u32 width; /* width of plane */ + __u32 height; /* height of plane */ + __u32 stride; /* stride of plane */ + __u32 size; /* size of plane in bytes, align on page*/ + __u32 x_pos; /* horizontal position of cursor plane */ + __u32 y_pos; /* vertical position of cursor plane*/ + __u32 x_hot; /* horizontal position of cursor hotspot */ + __u32 y_hot; /* vertical position of cursor hotspot */ + union { + __u32 region_index; /* region index */ + __u32 dmabuf_id; /* dma-buf id */ + }; +}; + +#define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14) + +/** + * VFIO_DEVICE_GET_GFX_DMABUF - _IOW(VFIO_TYPE, VFIO_BASE + 15, __u32) + * + * Return a new dma-buf file descriptor for an exposed guest framebuffer + * described by the provided dmabuf_id. The dmabuf_id is returned from VFIO_ + * DEVICE_QUERY_GFX_PLANE as a token of the exposed guest framebuffer. + */ + +#define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15) + +/** + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 16, + * struct vfio_device_ioeventfd) + * + * Perform a write to the device at the specified device fd offset, with + * the specified data and width when the provided eventfd is triggered. + * vfio bus drivers may not support this for all regions, for all widths, + * or at all. vfio-pci currently only enables support for BAR regions, + * excluding the MSI-X vector table. + * + * Return: 0 on success, -errno on failure. 
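+ *
+ * For example (userspace sketch; device_fd, the region offset obtained
+ * from VFIO_DEVICE_GET_REGION_INFO and the 0x1000 register offset are
+ * illustrative assumptions): arm a 4-byte write of 0x1 whenever efd
+ * fires:
+ *
+ *	struct vfio_device_ioeventfd ioe = {
+ *		.argsz = sizeof(ioe),
+ *		.flags = VFIO_DEVICE_IOEVENTFD_32,
+ *		.offset = region_info.offset + 0x1000,
+ *		.data = 0x1,
+ *		.fd = efd,
+ *	};
+ *
+ *	ioctl(device_fd, VFIO_DEVICE_IOEVENTFD, &ioe);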
+ */ +struct vfio_device_ioeventfd { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */ +#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */ +#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ +#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ +#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) + __u64 offset; /* device fd offset of write */ + __u64 data; /* data to be written */ + __s32 fd; /* -1 for de-assignment */ +}; + +#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) + +/** + * VFIO_DEVICE_FEATURE - _IORW(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_device_feature) + * + * Get, set, or probe feature data of the device. The feature is selected + * using the FEATURE_MASK portion of the flags field. Support for a feature + * can be probed by setting both the FEATURE_MASK and PROBE bits. A probe + * may optionally include the GET and/or SET bits to determine read vs write + * access of the feature respectively. Probing a feature will return success + * if the feature is supported and all of the optionally indicated GET/SET + * methods are supported. The format of the data portion of the structure is + * specific to the given feature. The data portion is not required for + * probing. GET and SET are mutually exclusive, except for use with PROBE. + * + * Return 0 on success, -errno on failure. + */ +struct vfio_device_feature { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_FEATURE_MASK (0xffff) /* 16-bit feature index */ +#define VFIO_DEVICE_FEATURE_GET (1 << 16) /* Get feature into data[] */ +#define VFIO_DEVICE_FEATURE_SET (1 << 17) /* Set feature from data[] */ +#define VFIO_DEVICE_FEATURE_PROBE (1 << 18) /* Probe feature support */ + __u8 data[]; +}; + +#define VFIO_DEVICE_FEATURE _IO(VFIO_TYPE, VFIO_BASE + 17) + +/* + * Provide support for setting a PCI VF Token, which is used as a shared + * secret between PF and VF drivers. This feature may only be set on a + * PCI SR-IOV PF when SR-IOV is enabled on the PF and there are no existing + * open VFs. Data provided when setting this feature is a 16-byte array + * (__u8 b[16]), representing a UUID. + */ +#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0) + +/* -------- API for Type1 VFIO IOMMU -------- */ + +/** + * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_info) + * + * Retrieve information about the IOMMU object. Fills in provided + * struct vfio_iommu_info. Caller sets argsz. + * + * XXX Should we do these by CHECK_EXTENSION too? + */ +struct vfio_iommu_type1_info { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ +#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ + __u64 iova_pgsizes; /* Bitmap of supported page sizes */ + __u32 cap_offset; /* Offset within info struct of first cap */ +}; + +/* + * The IOVA capability allows to report the valid IOVA range(s) + * excluding any non-relaxable reserved regions exposed by + * devices attached to the container. Any DMA map attempt + * outside the valid iova range will return error. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE 1 + +struct vfio_iova_range { + __u64 start; + __u64 end; +}; + +struct vfio_iommu_type1_info_cap_iova_range { + struct vfio_info_cap_header header; + __u32 nr_iovas; + __u32 reserved; + struct vfio_iova_range iova_ranges[]; +}; + +/* + * The migration capability allows to report supported features for migration. 
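+ * Like any info capability, it is located by walking the chain of
+ * vfio_info_cap_header entries hanging off cap_offset. A minimal walker
+ * (userspace sketch; "info" is assumed to be a fully sized
+ * vfio_iommu_type1_info returned by VFIO_IOMMU_GET_INFO with
+ * VFIO_IOMMU_INFO_CAPS set):
+ *
+ *	struct vfio_info_cap_header *hdr;
+ *	__u32 off = info->cap_offset;
+ *
+ *	while (off) {
+ *		hdr = (struct vfio_info_cap_header *)((char *)info + off);
+ *		if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION)
+ *			break;	(hdr now points at the migration cap)
+ *		off = hdr->next;
+ *	}
+ *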
+ * + * The structures below define version 1 of this capability. + * + * The existence of this capability indicates that IOMMU kernel driver supports + * dirty page logging. + * + * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty + * page logging. + * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap + * size in bytes that can be used by user applications when getting the dirty + * bitmap. + */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 2 + +struct vfio_iommu_type1_info_cap_migration { + struct vfio_info_cap_header header; + __u32 flags; + __u64 pgsize_bitmap; + __u64 max_dirty_bitmap_size; /* in bytes */ +}; + +/* + * The DMA available capability allows to report the current number of + * simultaneously outstanding DMA mappings that are allowed. + * + * The structure below defines version 1 of this capability. + * + * avail: specifies the current number of outstanding DMA mappings allowed. + */ +#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3 + +struct vfio_iommu_type1_info_dma_avail { + struct vfio_info_cap_header header; + __u32 avail; +}; + +#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +/** + * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_dma_map) + * + * Map process virtual addresses to IO virtual addresses using the + * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required. + */ +struct vfio_iommu_type1_dma_map { + __u32 argsz; + __u32 flags; +#define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */ +#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */ + __u64 vaddr; /* Process virtual address */ + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of mapping (bytes) */ +}; + +#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) + +struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ + __u64 __user *data; /* one bit per page */ +}; + +/** + * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_dma_unmap) + * + * Unmap IO virtual addresses using the provided struct vfio_dma_unmap. + * Caller sets argsz. The actual unmapped size is returned in the size + * field. No guarantee is made to the user that arbitrary unmaps of iova + * or size different from those used in the original mapping call will + * succeed. + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap + * before unmapping IO virtual addresses. When this flag is set, the user must + * provide a struct vfio_bitmap in data[]. User must provide zero-allocated + * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field. + * A bit in the bitmap represents one page, of user provided page size in + * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set + * indicates that the page at that offset from iova is dirty. A Bitmap of the + * pages in the range of unmapped size is returned in the user-provided + * vfio_bitmap.data. + */ +struct vfio_iommu_type1_dma_unmap { + __u32 argsz; + __u32 flags; +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of mapping (bytes) */ + __u8 data[]; +}; + +#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) + +/* + * IOCTLs to enable/disable IOMMU container usage. + * No parameters are supported. 
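+ *
+ * As a usage sketch for VFIO_IOMMU_UNMAP_DMA above (container_fd and the
+ * previously mapped 2MB range at IOVA 0x100000 are illustrative
+ * assumptions): unmap the range and collect its dirty bitmap in one
+ * call:
+ *
+ *	char buf[sizeof(struct vfio_iommu_type1_dma_unmap) +
+ *		 sizeof(struct vfio_bitmap)];
+ *	struct vfio_iommu_type1_dma_unmap *unmap = (void *)buf;
+ *	struct vfio_bitmap *bitmap = (void *)unmap->data;
+ *	__u64 bits[8] = {};	(512 x 4K pages -> 64 bytes of bitmap)
+ *
+ *	unmap->argsz = sizeof(buf);
+ *	unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
+ *	unmap->iova = 0x100000;
+ *	unmap->size = 2 * 1024 * 1024;
+ *	bitmap->pgsize = 4096;
+ *	bitmap->size = sizeof(bits);
+ *	bitmap->data = bits;
+ *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);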
+ */
+#define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/**
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ *                                struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty page logging.
+ * The caller should set a flag depending on which operation to perform,
+ * as detailed below:
+ *
+ * Calling the IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set
+ * instructs the IOMMU driver to log pages that are dirtied or potentially
+ * dirtied by the device; designed to be used when a migration is in
+ * progress. Dirty pages are logged until logging is disabled by the user
+ * application by calling the IOCTL with the
+ * VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
+ *
+ * Calling the IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set
+ * instructs the IOMMU driver to stop logging dirtied pages.
+ *
+ * Calling the IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag
+ * set returns the dirty pages bitmap of the IOMMU container for a given
+ * IOVA range. The user must specify the IOVA range and the pgsize through
+ * the structure vfio_iommu_type1_dirty_bitmap_get in the data[] portion.
+ * This interface supports getting a bitmap of the smallest supported
+ * pgsize only and can be modified in the future to get a bitmap of any
+ * specified supported pgsize. The user must provide a zeroed memory area
+ * for the bitmap memory and specify its size in bitmap.size. One bit is
+ * used to represent one page, consecutively starting from the iova
+ * offset. The user should provide the page size in the bitmap.pgsize
+ * field. A bit set in the bitmap indicates that the page at that offset
+ * from iova is dirty. The caller must set argsz to a value including the
+ * size of structure vfio_iommu_type1_dirty_bitmap_get, but excluding the
+ * size of the actual bitmap. If dirty pages logging is not enabled, an
+ * error will be returned.
+ *
+ * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost the
+ * same as VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that the
+ * underlying dirty bitmap is not cleared automatically. The user can
+ * clear it manually by calling the IOCTL with the
+ * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set.
+ *
+ * Calling the IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP
+ * flag set instructs the IOMMU driver to clear the dirty status of pages
+ * in a bitmap of the IOMMU container for a given IOVA range. The user
+ * must specify the IOVA range, the bitmap and the pgsize through the
+ * structure vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This
+ * interface supports clearing a bitmap of the smallest supported pgsize
+ * only and can be modified in the future to clear a bitmap of any
+ * specified supported pgsize. The user must provide a memory area for the
+ * bitmap memory and specify its size in bitmap.size. One bit is used to
+ * represent one page, consecutively starting from the iova offset. The
+ * user should provide the page size in the bitmap.pgsize field. A bit set
+ * in the bitmap indicates that the page at that offset from iova has its
+ * dirty status cleared, and dirty tracking is re-enabled for that page.
+ * The caller must set argsz to a value including the size of structure
+ * vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the actual
+ * bitmap. If dirty pages logging is not enabled, an error will be
+ * returned. Note: the user should clear the dirty log before handling the
+ * corresponding dirty pages.
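+ *
+ * A minimal tracking round trip might look as follows (userspace sketch;
+ * container_fd, the 2MB range at IOVA 0x100000 and the 4K pgsize are
+ * illustrative assumptions, and the range must already be mapped):
+ *
+ *	struct vfio_iommu_type1_dirty_bitmap start = {
+ *		.argsz = sizeof(start),
+ *		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
+ *	};
+ *	char buf[sizeof(struct vfio_iommu_type1_dirty_bitmap) +
+ *		 sizeof(struct vfio_iommu_type1_dirty_bitmap_get)];
+ *	struct vfio_iommu_type1_dirty_bitmap *db = (void *)buf;
+ *	struct vfio_iommu_type1_dirty_bitmap_get *get = (void *)db->data;
+ *	__u64 bits[8] = {};	(512 pages -> 512 bits -> 64 bytes)
+ *
+ *	ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &start);
+ *	db->argsz = sizeof(buf);	(argsz excludes the bitmap itself)
+ *	db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+ *	get->iova = 0x100000;
+ *	get->size = 2 * 1024 * 1024;
+ *	get->bitmap.pgsize = 4096;
+ *	get->bitmap.size = sizeof(bits);
+ *	get->bitmap.data = bits;
+ *	ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);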
+ * + * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be + * specified at a time. + */ +struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) + __u8 data[]; +}; + +struct vfio_iommu_type1_dirty_bitmap_get { + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of iova range */ + struct vfio_bitmap bitmap; +}; + +#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) + +/* + * VFIO_IOMMU_BIND_PROCESS + * + * Allocate a PASID for a process address space, and use it to attach this + * process to all devices in the container. Devices can then tag their DMA + * traffic with the returned @pasid to perform transactions on the associated + * virtual address space. Mapping and unmapping buffers is performed by standard + * functions such as mmap and malloc. + * + * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to + * bind. Otherwise the current task is bound. Given that the caller owns the + * device, setting this flag grants the caller read and write permissions on the + * entire address space of foreign process described by @pid. Therefore, + * permission to perform the bind operation on a foreign process is governed by + * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) + * for more information. + * + * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This + * ID is unique to a process and can be used on all devices in the container. + * + * On fork, the child inherits the device fd and can use the bonds setup by its + * parent. Consequently, the child has R/W access on the address spaces bound by + * its parent. After an execv, the device fd is closed and the child doesn't + * have access to the address space anymore. + * + * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is + * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND, + * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current + * task from the container. + */ +struct vfio_iommu_type1_bind_process { + __u32 flags; +#define VFIO_IOMMU_BIND_PID (1 << 0) + __u32 pasid; + __s32 pid; +}; + +/* + * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes + * vfio_iommu_type1_bind_process in data. + */ +struct vfio_iommu_type1_bind { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_BIND_PROCESS (1 << 0) + __u8 data[]; +}; + +/* + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind) + * + * Manage address spaces of devices in this container. Initially a TYPE1 + * container can only have one address space, managed with + * VFIO_IOMMU_MAP/UNMAP_DMA. + * + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page + * tables, and BIND manages the stage-1 (guest) page tables. Other types of + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls + * non-PASID traffic and BIND controls PASID traffic. But this depends on the + * underlying IOMMU architecture and isn't guaranteed. + * + * Availability of this feature depends on the device, its bus, the underlying + * IOMMU and the CPU architecture. 
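+ *
+ * A minimal sketch (container_fd is an assumed open container fd): bind
+ * the current task and read back its PASID:
+ *
+ *	char buf[sizeof(struct vfio_iommu_type1_bind) +
+ *		 sizeof(struct vfio_iommu_type1_bind_process)] = {};
+ *	struct vfio_iommu_type1_bind *bind = (void *)buf;
+ *	struct vfio_iommu_type1_bind_process *proc = (void *)bind->data;
+ *
+ *	bind->argsz = sizeof(buf);
+ *	bind->flags = VFIO_IOMMU_BIND_PROCESS;
+ *	ioctl(container_fd, VFIO_IOMMU_BIND, bind);
+ *	(on success, proc->pasid holds the PASID for the current task)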
+ * + * returns: 0 on success, -errno on failure. + */ +#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 22) + +/* + * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind) + * + * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl. + */ +#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23) + +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ + +/* + * The SPAPR TCE DDW info struct provides the information about + * the details of Dynamic DMA window capability. + * + * @pgsizes contains a page size bitmask, 4K/64K/16M are supported. + * @max_dynamic_windows_supported tells the maximum number of windows + * which the platform can create. + * @levels tells the maximum number of levels in multi-level IOMMU tables; + * this allows splitting a table into smaller chunks which reduces + * the amount of physically contiguous memory required for the table. + */ +struct vfio_iommu_spapr_tce_ddw_info { + __u64 pgsizes; /* Bitmap of supported page sizes */ + __u32 max_dynamic_windows_supported; + __u32 levels; +}; + +/* + * The SPAPR TCE info struct provides the information about the PCI bus + * address ranges available for DMA, these values are programmed into + * the hardware so the guest has to know that information. + * + * The DMA 32 bit window start is an absolute PCI bus address. + * The IOVA address passed via map/unmap ioctls are absolute PCI bus + * addresses too so the window works as a filter rather than an offset + * for IOVA addresses. + * + * Flags supported: + * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows + * (DDW) support is present. @ddw is only supported when DDW is present. + */ +struct vfio_iommu_spapr_tce_info { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_SPAPR_INFO_DDW (1 << 0) /* DDW supported */ + __u32 dma32_window_start; /* 32 bit window start (bytes) */ + __u32 dma32_window_size; /* 32 bit window size (bytes) */ + struct vfio_iommu_spapr_tce_ddw_info ddw; +}; + +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +/* + * EEH PE operation struct provides ways to: + * - enable/disable EEH functionality; + * - unfreeze IO/DMA for frozen PE; + * - read PE state; + * - reset PE; + * - configure PE; + * - inject EEH error. 
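+ *
+ * A minimal sketch using the op codes defined below (container_fd is an
+ * assumed open SPAPR container fd): enable EEH, then query the PE state,
+ * which is reported through the ioctl return value:
+ *
+ *	struct vfio_eeh_pe_op op = { .argsz = sizeof(op) };
+ *	int state;
+ *
+ *	op.op = VFIO_EEH_PE_ENABLE;
+ *	ioctl(container_fd, VFIO_EEH_PE_OP, &op);
+ *	op.op = VFIO_EEH_PE_GET_STATE;
+ *	state = ioctl(container_fd, VFIO_EEH_PE_OP, &op);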
+ */ +struct vfio_eeh_pe_err { + __u32 type; + __u32 func; + __u64 addr; + __u64 mask; +}; + +struct vfio_eeh_pe_op { + __u32 argsz; + __u32 flags; + __u32 op; + union { + struct vfio_eeh_pe_err err; + }; +}; + +#define VFIO_EEH_PE_DISABLE 0 /* Disable EEH functionality */ +#define VFIO_EEH_PE_ENABLE 1 /* Enable EEH functionality */ +#define VFIO_EEH_PE_UNFREEZE_IO 2 /* Enable IO for frozen PE */ +#define VFIO_EEH_PE_UNFREEZE_DMA 3 /* Enable DMA for frozen PE */ +#define VFIO_EEH_PE_GET_STATE 4 /* PE state retrieval */ +#define VFIO_EEH_PE_STATE_NORMAL 0 /* PE in functional state */ +#define VFIO_EEH_PE_STATE_RESET 1 /* PE reset in progress */ +#define VFIO_EEH_PE_STATE_STOPPED 2 /* Stopped DMA and IO */ +#define VFIO_EEH_PE_STATE_STOPPED_DMA 4 /* Stopped DMA only */ +#define VFIO_EEH_PE_STATE_UNAVAIL 5 /* State unavailable */ +#define VFIO_EEH_PE_RESET_DEACTIVATE 5 /* Deassert PE reset */ +#define VFIO_EEH_PE_RESET_HOT 6 /* Assert hot reset */ +#define VFIO_EEH_PE_RESET_FUNDAMENTAL 7 /* Assert fundamental reset */ +#define VFIO_EEH_PE_CONFIGURE 8 /* PE configuration */ +#define VFIO_EEH_PE_INJECT_ERR 9 /* Inject EEH error */ + +#define VFIO_EEH_PE_OP _IO(VFIO_TYPE, VFIO_BASE + 21) + +/** + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory) + * + * Registers user space memory where DMA is allowed. It pins + * user pages and does the locked memory accounting so + * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls + * get faster. + */ +struct vfio_iommu_spapr_register_memory { + __u32 argsz; + __u32 flags; + __u64 vaddr; /* Process virtual address */ + __u64 size; /* Size of mapping (bytes) */ +}; +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) + +/** + * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory) + * + * Unregisters user space memory registered with + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY. + * Uses vfio_iommu_spapr_register_memory for parameters. + */ +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) + +/** + * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create) + * + * Creates an additional TCE table and programs it (sets a new DMA window) + * to every IOMMU group in the container. It receives page shift, window + * size and number of levels in the TCE table being created. + * + * It allocates and returns an offset on a PCI bus of the new DMA window. + */ +struct vfio_iommu_spapr_tce_create { + __u32 argsz; + __u32 flags; + /* in */ + __u32 page_shift; + __u32 __resv1; + __u64 window_size; + __u32 levels; + __u32 __resv2; + /* out */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) + +/** + * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove) + * + * Unprograms a TCE table from all groups in the container and destroys it. + * It receives a PCI bus offset as a window id. 
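+ *
+ * Together with VFIO_IOMMU_SPAPR_TCE_CREATE above, a window's lifetime is
+ * a create/remove pair on the structure defined just below (sketch;
+ * container_fd, the 64K page shift and the 1GB window size are
+ * illustrative assumptions subject to the ddw info limits):
+ *
+ *	struct vfio_iommu_spapr_tce_create create = {
+ *		.argsz = sizeof(create),
+ *		.page_shift = 16,
+ *		.window_size = 1ULL << 30,
+ *		.levels = 1,
+ *	};
+ *	struct vfio_iommu_spapr_tce_remove remove = {
+ *		.argsz = sizeof(remove),
+ *	};
+ *
+ *	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ *	remove.start_addr = create.start_addr;
+ *	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);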
+ */ +struct vfio_iommu_spapr_tce_remove { + __u32 argsz; + __u32 flags; + /* in */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + +/* ***************************************************************** */ + +#endif /* _UAPIVFIO_H */ diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/Makefile b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/Makefile index 1ad372c..cecd57f 100644 --- a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/Makefile +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/Makefile @@ -1,10 +1,10 @@ KERNEL_VERSION_BY_BUILDENV := `uname -r` KERNEL_PATH := /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build -KSP := $(shell if test -d /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; then \ - echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; \ - else \ - echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build; \ - fi) +# KSP := $(shell if test -d /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; then \ +# echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; \ +# else \ +# echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build; \ +# fi) obj-m += uacce/ obj-m += hisilicon/ @@ -13,17 +13,25 @@ DIRS := $(shell find . -maxdepth 3 -type d) TARGET = $(foreach dir,$(DIRS),$(wildcard \ $(dir)/*.o) $(dir)/*.ko $(dir)/*.tmp_versions $(dir)/*.depend $(dir)/*.mod.c $(dir)/*.order $(dir)/*.symvers) +CONFIG_FLAGS = CONFIG_CC_STACKPROTECTOR_STRONG=y \ + CONFIG_UACCE=m \ + CONFIG_CRYPTO_QM_UACCE=m \ + CONFIG_CRYPTO_DEV_HISI_SGL=m \ + CONFIG_CRYPTO_DEV_HISI_QM=m \ + CONFIG_CRYPTO_DEV_HISI_ZIP=m \ + CONFIG_CRYPTO_DEV_HISI_HPRE=m \ + CONFIG_CRYPTO_DEV_HISI_SEC2=m \ + CONFIG_CRYPTO_DEV_HISI_TRNG=m + +ifeq ($(ENABLE_MIGRATION), y) +CONFIG_FLAGS += CONFIG_CRYPTO_DEV_HISI_MIGRATION=m +else +CONFIG_FLAGS += CONFIG_CRYPTO_DEV_HISI_MIGRATION=n +endif + default: - $(MAKE) -C $(KERNEL_PATH) M=$(shell pwd) modules \ - CONFIG_CC_STACKPROTECTOR_STRONG=y \ - CONFIG_UACCE=m \ - CONFIG_CRYPTO_QM_UACCE=m \ - CONFIG_CRYPTO_DEV_HISI_SGL=m \ - CONFIG_CRYPTO_DEV_HISI_QM=m \ - CONFIG_CRYPTO_DEV_HISI_ZIP=m \ - CONFIG_CRYPTO_DEV_HISI_HPRE=m \ - CONFIG_CRYPTO_DEV_HISI_SEC2=m \ - CONFIG_CRYPTO_DEV_HISI_TRNG=m + $(MAKE) -C $(KERNEL_PATH) M=$(shell pwd) modules $(CONFIG_FLAGS) + #copy: # cp -f $(shell pwd)/include_linux/uacce.h $(KSP)/include/linux # cp -f $(shell pwd)/include_uapi_linux/uacce.h $(KSP)/include/uapi/linux @@ -40,6 +48,9 @@ install: -modprobe hisi_sec2 uacce_mode=1 pf_q_num=256 -modprobe hisi_hpre uacce_mode=1 pf_q_num=256 -modprobe hisi_zip uacce_mode=1 pf_q_num=256 + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + modprobe hisi_migration; \ + fi) -echo "options hisi_sec2 uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_sec2.conf -echo "options hisi_hpre uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_hpre.conf -echo "options hisi_zip uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_zip.conf @@ -61,6 +72,9 @@ check: done uninstall: + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + modprobe -r hisi_migration; \ + fi) modprobe -r hisi_zip modprobe -r hisi_hpre modprobe -r hisi_sec2 @@ -75,6 +89,9 @@ uninstall: rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_sec2.ko rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_hpre.ko rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_zip.ko + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_migration.ko; \ + fi) nosva: $(shell mkdir -p /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra) @@ -88,7 +105,9 @@ nosva: -modprobe 
hisi_sec2 uacce_mode=2 pf_q_num=256 -modprobe hisi_hpre uacce_mode=2 pf_q_num=256 -modprobe hisi_zip uacce_mode=2 pf_q_num=256 - + $(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \ + modprobe hisi_migration; \ + fi) -echo "options hisi_sec2 uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_sec2.conf -echo "options hisi_hpre uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_hpre.conf -echo "options hisi_zip uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_zip.conf diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/Makefile b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/Makefile index a62965b..19e6627 100644 --- a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/Makefile +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_CRYPTO_DEV_HISI_SEC2) += sec2/ obj-$(CONFIG_CRYPTO_DEV_HISI_QM) += hisi_qm.o hisi_qm-objs = qm.o sgl.o debugfs.o obj-$(CONFIG_CRYPTO_DEV_HISI_ZIP) += zip/ +obj-$(CONFIG_CRYPTO_DEV_HISI_MIGRATION) += migration/ \ No newline at end of file diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/Makefile b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/Makefile new file mode 100644 index 0000000..c2c5219 --- /dev/null +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_CRYPTO_DEV_HISI_MIGRATION) += hisi_migration.o +hisi_migration-objs = acc_vf_migration.o \ No newline at end of file diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.c b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.c new file mode 100644 index 0000000..8a7196a --- /dev/null +++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.c @@ -0,0 +1,1719 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 HiSilicon Limited. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acc_vf_migration.h" + +#define VDM_OFFSET(x) offsetof(struct vfio_device_migration_info, x) +static struct dentry *mig_debugfs_root; +static int mig_root_ref; + +/* return 0 mailbox ready, -ETIMEDOUT hardware timeout */ +static int qm_wait_mb_ready(struct hisi_qm *qm) +{ + u32 val; + + return readl_relaxed_poll_timeout(qm->io_base + QM_MB_CMD_SEND_BASE, + val, !((val >> QM_MB_BUSY_SHIFT) & + 0x1), POLL_PERIOD, POLL_TIMEOUT); +} + +/* return 0 VM acc device ready, -ETIMEDOUT hardware timeout */ +static int qm_wait_dev_ready(struct hisi_qm *qm) +{ + u32 val; + + return readl_relaxed_poll_timeout(qm->io_base + QM_VF_STATE, + val, !(val & 0x1), POLL_PERIOD, POLL_TIMEOUT); +} + + +/* 128 bit should be written to hardware at one time to trigger a mailbox */ +static void qm_mb_write(struct hisi_qm *qm, const void *src) +{ + void __iomem *fun_base = qm->io_base + QM_MB_CMD_SEND_BASE; + unsigned long tmp0 = 0; + unsigned long tmp1 = 0; + + if (!IS_ENABLED(CONFIG_ARM64)) { + memcpy_toio(fun_base, src, 16); + wmb(); + return; + } + + asm volatile("ldp %0, %1, %3\n" + "stp %0, %1, %2\n" + "dsb sy\n" + : "=&r" (tmp0), + "=&r" (tmp1), + "+Q" (*((char __iomem *)fun_base)) + : "Q" (*((char *)src)) + : "memory"); +} + +static void qm_mb_pre_init(struct qm_mailbox *mailbox, u8 cmd, + u16 queue, bool op) +{ + mailbox->w0 = cpu_to_le16(cmd | + (op ? 
0x1 << QM_MB_OP_SHIFT : 0) | + (0x1 << QM_MB_BUSY_SHIFT)); + mailbox->queue_num = cpu_to_le16(queue); + mailbox->rsvd = 0; +} + +static int qm_mb_nolock(struct hisi_qm *qm, struct qm_mailbox *mailbox) +{ + int cnt = 0; + + if (unlikely(qm_wait_mb_ready(qm))) { + dev_err(&qm->pdev->dev, "QM mailbox is busy to start!\n"); + return -EBUSY; + } + + qm_mb_write(qm, mailbox); + while (true) { + if (!qm_wait_mb_ready(qm)) + break; + if (++cnt > QM_MB_MAX_WAIT_CNT) { + dev_err(&qm->pdev->dev, "QM mailbox operation timeout!\n"); + return -EBUSY; + } + } + return 0; +} + +static int qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue, + bool op) +{ + struct qm_mailbox mailbox; + int ret; + + dev_dbg(&qm->pdev->dev, "QM mailbox request to q%u: %u-0x%llx\n", + queue, cmd, (unsigned long long)dma_addr); + + qm_mb_pre_init(&mailbox, cmd, queue, op); + mailbox.base_l = cpu_to_le32(lower_32_bits(dma_addr)); + mailbox.base_h = cpu_to_le32(upper_32_bits(dma_addr)); + + mutex_lock(&qm->mailbox_lock); + ret = qm_mb_nolock(qm, &mailbox); + mutex_unlock(&qm->mailbox_lock); + + return ret; +} + +/* + * Each state Reg is checked 100 times, + * with a delay of 100 microseconds after each check + */ +static u32 acc_check_reg_state(struct hisi_qm *qm, u32 regs) +{ + int check_times = 0; + u32 state; + + state = readl(qm->io_base + regs); + while (state && check_times < ERROR_CHECK_TIMEOUT) { + udelay(CHECK_DELAY_TIME); + state = readl(qm->io_base + regs); + check_times++; + } + + return state; +} + +/* Check the PF's RAS state and Function INT state */ +static int qm_check_int_state(struct acc_vf_migration *acc_vf_dev) +{ + struct hisi_qm *vfqm = acc_vf_dev->vf_qm; + struct hisi_qm *qm = acc_vf_dev->pf_qm; + struct device *dev = &qm->pdev->dev; + u32 state; + + /* Check RAS state */ + state = acc_check_reg_state(qm, QM_ABNORMAL_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM RAS state!\n"); + return -EBUSY; + } + + /* Check Function Communication state between PF and VF */ + state = acc_check_reg_state(vfqm, QM_IFC_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM IFC INT state!\n"); + return -EBUSY; + } + state = acc_check_reg_state(vfqm, QM_IFC_INT_SET_V); + if (state) { + dev_err(dev, "failed to check QM IFC INT SET state!\n"); + return -EBUSY; + } + + /* Check submodule task state */ + switch (acc_vf_dev->acc_type) { + case HISI_SEC: + state = acc_check_reg_state(qm, SEC_CORE_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM SEC Core INT state!\n"); + return -EBUSY; + } + break; + case HISI_HPRE: + state = acc_check_reg_state(qm, HPRE_HAC_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM HPRE HAC INT state!\n"); + return -EBUSY; + } + break; + case HISI_ZIP: + state = acc_check_reg_state(qm, HZIP_CORE_INT_STATUS); + if (state) { + dev_err(dev, "failed to check QM ZIP Core INT state!\n"); + return -EBUSY; + } + break; + default: + dev_err(dev, "failed to detect acc module type!\n"); + return -EINVAL; + } + + return 0; +} + +static int qm_read_reg(struct hisi_qm *qm, u32 reg_addr, + u32 *data, u8 nums) +{ + int i; + + if (nums < 1 || nums > QM_REGS_MAX_LEN) { + dev_err(&qm->pdev->dev, "QM read input parameter is error!\n"); + return -EINVAL; + } + + for (i = 0; i < nums; i++) { + data[i] = readl(qm->io_base + reg_addr); + reg_addr += QM_REG_ADDR_OFFSET; + } + + return 0; +} + +static int qm_write_reg(struct hisi_qm *qm, u32 reg_addr, + u32 *data, u8 nums) +{ + int i; + + if (nums < 1 || nums > QM_REGS_MAX_LEN) { + dev_err(&qm->pdev->dev, "QM write 
input parameter is error!\n"); + return -EINVAL; + } + + for (i = 0; i < nums; i++) { + writel(data[i], qm->io_base + reg_addr); + reg_addr += QM_REG_ADDR_OFFSET; + } + + return 0; +} + +static int qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number) +{ + u64 sqc_vft; + int ret; + + ret = qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); + if (ret) + return ret; + + sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); + *number = (QM_SQC_VFT_NUM_MASK_V2 & + (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; + + return 0; +} + +static int qm_get_sqc(struct hisi_qm *qm, u64 *addr) +{ + int ret; + + ret = qm_mb(qm, QM_MB_CMD_SQC_BT, 0, 0, 1); + if (ret) + return ret; + + *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + + return 0; +} + +static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) +{ + int ret; + + ret = qm_mb(qm, QM_MB_CMD_CQC_BT, 0, 0, 1); + if (ret) + return ret; + + *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | + ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << + QM_XQC_ADDR_OFFSET); + + return 0; +} + +static int qm_rw_regs_read(struct hisi_qm *qm, struct acc_vf_data *vf_data) +{ + struct device *dev = &qm->pdev->dev; + int ret; + + ret = qm_read_reg(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to read QM_VF_AEQ_INT_MASK!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to read QM_VF_EQ_INT_MASK!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_IFC_INT_SOURCE_V, + &vf_data->ifc_int_source, 1); + if (ret) { + dev_err(dev, "failed to read QM_IFC_INT_SOURCE_V!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1); + if (ret) { + dev_err(dev, "failed to read QM_IFC_INT_MASK!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1); + if (ret) { + dev_err(dev, "failed to read QM_IFC_INT_SET_V!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_PAGE_SIZE, &vf_data->page_size, 1); + if (ret) { + dev_err(dev, "failed to read QM_PAGE_SIZE!\n"); + return ret; + } + + ret = qm_read_reg(qm, QM_VF_STATE, &vf_data->vf_state, 1); + if (ret) { + dev_err(dev, "failed to read QM_VF_STATE!\n"); + return ret; + } + + /* QM_EQC_DW has 7 regs */ + ret = qm_read_reg(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + if (ret) { + dev_err(dev, "failed to read QM_EQC_DW!\n"); + return ret; + } + + /* QM_AEQC_DW has 7 regs */ + ret = qm_read_reg(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + if (ret) { + dev_err(dev, "failed to read QM_AEQC_DW!\n"); + return ret; + } + + return 0; +} + +static int qm_rw_regs_write(struct hisi_qm *qm, struct acc_vf_data *vf_data) +{ + struct device *dev = &qm->pdev->dev; + int ret; + + /* check VF state */ + if (unlikely(qm_wait_mb_ready(qm))) { + dev_err(&qm->pdev->dev, "QM device is not ready to write!\n"); + return -EBUSY; + } + + ret = qm_write_reg(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_AEQ_INT_MASK!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_EQ_INT_MASK!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_IFC_INT_SOURCE_V, + &vf_data->ifc_int_source, 1); + if (ret) { + dev_err(dev, 
"failed to write QM_IFC_INT_SOURCE_V!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1); + if (ret) { + dev_err(dev, "failed to write QM_IFC_INT_MASK!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1); + if (ret) { + dev_err(dev, "failed to write QM_IFC_INT_SET_V!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1); + if (ret) { + dev_err(dev, "failed to write QM_QUE_ISO_CFG_V!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_PAGE_SIZE, &vf_data->page_size, 1); + if (ret) { + dev_err(dev, "failed to write QM_PAGE_SIZE!\n"); + return ret; + } + + ret = qm_write_reg(qm, QM_VF_STATE, &vf_data->vf_state, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_STATE!\n"); + return ret; + } + + /* QM_EQC_DW has 7 regs */ + ret = qm_write_reg(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + if (ret) { + dev_err(dev, "failed to write QM_EQC_DW!\n"); + return ret; + } + + /* QM_AEQC_DW has 7 regs */ + ret = qm_write_reg(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + if (ret) { + dev_err(dev, "failed to write QM_AEQC_DW!\n"); + return ret; + } + + return 0; +} + +/* + * the vf QM have unbind from host, insmod in the VM + * so, qm just have the addr from pci dev + * others is null. + * so we need read from the SEC hardware REGs. + */ +static int vf_migration_data_store(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + int ret; + + ret = qm_rw_regs_read(qm, vf_data); + if (ret) { + dev_err(dev, "failed to read QM regs!\n"); + return -EINVAL; + } + + /* + * every Reg is 32 bit, the dma address is 64 bit + * so, the dma address is store in the Reg2 and Reg1 + */ + vf_data->eqe_dma = vf_data->qm_eqc_dw[2]; + vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[1]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[2]; + vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[1]; + + /* Through SQC_BT/CQC_BT to get sqc and cqc address */ + ret = qm_get_sqc(qm, &vf_data->sqc_dma); + if (ret) { + dev_err(dev, "failed to read SQC addr!\n"); + return -EINVAL; + } + + ret = qm_get_cqc(qm, &vf_data->cqc_dma); + if (ret) { + dev_err(dev, "failed to read CQC addr!\n"); + return -EINVAL; + } + + return 0; +} + +static void qm_dev_cmd_init(struct hisi_qm *qm) +{ + /* clear VF communication status registers. */ + writel(0x1, qm->io_base + QM_IFC_INT_SOURCE_V); + + /* enable pf and vf communication. */ + writel(0x0, qm->io_base + QM_IFC_INT_MASK); +} + +static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd, + u16 index, u8 priority) +{ + void __iomem *io_base = qm->io_base; + u16 randata = 0; + u64 doorbell; + + if (cmd == QM_DOORBELL_CMD_SQ || cmd == QM_DOORBELL_CMD_CQ) + io_base = qm->db_io_base + (u64)qn * qm->db_interval + + QM_DOORBELL_SQ_CQ_BASE_V2; + else + io_base += QM_DOORBELL_EQ_AEQ_BASE_V2; + + doorbell = qn | ((u64)cmd << QM_DB_CMD_SHIFT_V2) | + ((u64)randata << QM_DB_RAND_SHIFT_V2) | + ((u64)index << QM_DB_INDEX_SHIFT_V2) | + ((u64)priority << QM_DB_PRIORITY_SHIFT_V2); + + writeq(doorbell, io_base); +} + +static void vf_qm_fun_restart(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + int i; + + /* + * When the system is rebooted, the SMMU page table is destroyed, + * and the QP queue cannot be returned normally at this time. 
+ * if vf_ready == 0x2, don't need to restart QP. + */ + if (vf_data->vf_state != QM_READY) { + dev_err(dev, "failed to restart VF!\n"); + return; + } + + for (i = 0; i < qm->qp_num; i++) + qm_db(qm, i, QM_DOORBELL_CMD_SQ, 0, 1); +} + +static int vf_match_info_check(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + u32 que_iso_state; + int ret; + + /* vf acc type check */ + if (vf_data->acc_type != acc_vf_dev->acc_type) { + dev_err(dev, "failed to match VF acc type!\n"); + return -EINVAL; + } + + /* vf qp num check */ + ret = qm_get_vft(qm, &qm->qp_base, &qm->qp_num); + if (ret || qm->qp_num <= 1) { + dev_err(dev, "failed to get vft qp nums!\n"); + return ret; + } + + if (vf_data->qp_num != qm->qp_num) { + dev_err(dev, "failed to match VF qp num!\n"); + return -EINVAL; + } + + /* vf isolation state check */ + ret = qm_read_reg(qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); + if (ret) { + dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n"); + return ret; + } + if (vf_data->que_iso_cfg != que_iso_state) { + dev_err(dev, "failed to match isolation state!\n"); + return -EINVAL; + } + + return 0; +} + +static int vf_migration_data_recover(struct hisi_qm *qm, + struct acc_vf_data *vf_data) +{ + struct device *dev = &qm->pdev->dev; + int ret; + + qm->eqe_dma = vf_data->eqe_dma; + qm->aeqe_dma = vf_data->aeqe_dma; + qm->sqc_dma = vf_data->sqc_dma; + qm->cqc_dma = vf_data->cqc_dma; + + qm->qp_base = vf_data->qp_base; + qm->qp_num = vf_data->qp_num; + + ret = qm_rw_regs_write(qm, vf_data); + if (ret) { + dev_err(dev, "Set VF regs failed!\n"); + return ret; + } + + ret = qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); + if (ret) { + dev_err(dev, "Set sqc failed!\n"); + return ret; + } + + ret = qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); + if (ret) { + dev_err(dev, "Set cqc failed!\n"); + return ret; + } + + /* which ACC module need to reinit? 
*/ + qm_dev_cmd_init(qm); + + return 0; +} + +static int vf_qm_cache_wb(struct hisi_qm *qm) +{ + unsigned int val; + + writel(0x1, qm->io_base + QM_CACHE_WB_START); + if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE, + val, val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT)) { + dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail!\n"); + return -EINVAL; + } + + return 0; +} + +static int vf_qm_func_stop(struct hisi_qm *qm) +{ + return qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0); +} + +static int pf_qm_get_qp_num(struct hisi_qm *qm, int vf_id, + u32 *rbase, u32 *rnumber) +{ + unsigned int val; + u64 sqc_vft; + int ret; + + ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, + val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT); + if (ret) + return ret; + + writel(0x1, qm->io_base + QM_VFT_CFG_OP_WR); + /* 0 mean SQC VFT */ + writel(0x0, qm->io_base + QM_VFT_CFG_TYPE); + writel(vf_id, qm->io_base + QM_VFT_CFG); + + writel(0x0, qm->io_base + QM_VFT_CFG_RDY); + writel(0x1, qm->io_base + QM_VFT_CFG_OP_ENABLE); + + ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val, + val & BIT(0), POLL_PERIOD, + POLL_TIMEOUT); + if (ret) + return ret; + + sqc_vft = readl(qm->io_base + QM_VFT_CFG_DATA_L) | + ((u64)readl(qm->io_base + QM_VFT_CFG_DATA_H) << + QM_XQC_ADDR_OFFSET); + *rbase = QM_SQC_VFT_BASE_MASK_V2 & + (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); + *rnumber = (QM_SQC_VFT_NUM_MASK_V2 & + (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; + + return 0; +} + +static int pf_qm_state_pre_save(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct acc_vf_data *vf_data = acc_vf_dev->vf_data; + struct device *dev = &qm->pdev->dev; + int vf_id = acc_vf_dev->vf_id; + int ret; + + /* vf acc type save */ + vf_data->acc_type = acc_vf_dev->acc_type; + + /* vf qp num save from PF */ + ret = pf_qm_get_qp_num(qm, vf_id, &qm->qp_base, &qm->qp_num); + if (ret || qm->qp_num <= 1) { + dev_err(dev, "failed to get vft qp nums!\n"); + return -EINVAL; + } + vf_data->qp_base = qm->qp_base; + vf_data->qp_num = qm->qp_num; + + /* vf isolation state save from PF */ + ret = qm_read_reg(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1); + if (ret) { + dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n"); + return ret; + } + + return 0; +} + +static int vf_qm_state_save(struct hisi_qm *qm, + struct acc_vf_migration *acc_vf_dev) +{ + struct device *dev = &acc_vf_dev->vf_dev->dev; + int ret; + + /* + * check VM task driver state + * if vf_ready == 0x1, skip migrate. 
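+	 * The ordering below matters: stop the VF function first, then
+	 * verify the RAS/interrupt state, write the QM cache back to DDR,
+	 * and only then snapshot the registers, so that the saved image
+	 * cannot race with in-flight work.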
+	 */
+	if (unlikely(qm_wait_dev_ready(qm))) {
+		acc_vf_dev->mig_ignore = true;
+		dev_err(&qm->pdev->dev, "QM device is not ready to read!\n");
+		return 0;
+	}
+
+	/* First stop the ACC VF function */
+	ret = vf_qm_func_stop(qm);
+	if (ret) {
+		dev_err(dev, "failed to stop QM VF function!\n");
+		return ret;
+	}
+
+	/* Check the VF's RAS and interrupt state */
+	ret = qm_check_int_state(acc_vf_dev);
+	if (ret) {
+		dev_err(dev, "failed to check QM INT state!\n");
+		goto state_error;
+	}
+
+	/* write the QM cache data back to DDR */
+	ret = vf_qm_cache_wb(qm);
+	if (ret) {
+		dev_err(dev, "failed to writeback QM Cache!\n");
+		goto state_error;
+	}
+
+	ret = vf_migration_data_store(qm, acc_vf_dev);
+	if (ret) {
+		dev_err(dev, "failed to get and store migration data!\n");
+		goto state_error;
+	}
+
+	return 0;
+
+state_error:
+	vf_qm_fun_restart(qm, acc_vf_dev);
+	return ret;
+}
+
+static int vf_qm_state_resume(struct hisi_qm *qm,
+	struct acc_vf_migration *acc_vf_dev)
+{
+	struct device *dev = &acc_vf_dev->vf_dev->dev;
+	int ret;
+
+	/* recover data to the VF */
+	ret = vf_migration_data_recover(qm, acc_vf_dev->vf_data);
+	if (ret) {
+		dev_err(dev, "failed to recover the VF!\n");
+		return ret;
+	}
+
+	/* restart all of the destination VF's QPs */
+	vf_qm_fun_restart(qm, acc_vf_dev);
+
+	return 0;
+}
+
+static int acc_vf_set_device_state(struct acc_vf_migration *acc_vf_dev,
+	u32 state)
+{
+	struct vfio_device_migration_info *mig_ctl = acc_vf_dev->mig_ctl;
+	struct device *dev = &acc_vf_dev->vf_dev->dev;
+	struct hisi_qm *pfqm = acc_vf_dev->pf_qm;
+	struct hisi_qm *qm = acc_vf_dev->vf_qm;
+	int ret = 0;
+
+	if (state == mig_ctl->device_state)
+		return 0;
+
+	switch (state) {
+	case VFIO_DEVICE_STATE_RUNNING:
+		if (!mig_ctl->data_size)
+			break;
+
+		if (mig_ctl->device_state == VFIO_DEVICE_STATE_RESUMING) {
+			ret = vf_qm_state_resume(qm, acc_vf_dev);
+			if (ret) {
+				dev_err(dev, "failed to resume device!\n");
+				return -EFAULT;
+			}
+		}
+
+		break;
+	case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING:
+		/* ACC reads the match information data in the pre-copy cycle */
+		ret = pf_qm_state_pre_save(pfqm, acc_vf_dev);
+		if (ret) {
+			dev_err(dev, "failed to pre save device state!\n");
+			return -EFAULT;
+		}
+
+		/* set pending_bytes and the match data size */
+		mig_ctl->data_size = QM_MATCH_SIZE;
+		mig_ctl->pending_bytes = mig_ctl->data_size;
+
+		break;
+	case VFIO_DEVICE_STATE_SAVING:
+		/* stop the VF function */
+		ret = vf_qm_state_save(qm, acc_vf_dev);
+		if (ret) {
+			dev_err(dev, "failed to save device state!\n");
+			return -EFAULT;
+		}
+
+		if (acc_vf_dev->mig_ignore) {
+			mig_ctl->data_size = 0;
+			mig_ctl->pending_bytes = 0;
+			break;
+		}
+
+		/* set pending_bytes and data_size */
+		mig_ctl->data_size = sizeof(struct acc_vf_data);
+		mig_ctl->pending_bytes = mig_ctl->data_size;
+
+		break;
+	case VFIO_DEVICE_STATE_STOP:
+		/* restart all of the VF's QPs */
+		vf_qm_fun_restart(qm, acc_vf_dev);
+
+		break;
+	case VFIO_DEVICE_STATE_RESUMING:
+
+		break;
+	default:
+		ret = -EFAULT;
+	}
+
+	if (!ret) {
+		dev_info(dev, "migration state: %s ----------> %s!\n",
+			 vf_dev_state[mig_ctl->device_state],
+			 vf_dev_state[state]);
+		mig_ctl->device_state = state;
+	}
+
+	return ret;
+}
+
+static int acc_vf_data_transfer(struct acc_vf_migration *acc_vf_dev,
+	char __user *buf, size_t count, u64 pos, bool iswrite)
+{
+	struct vfio_device_migration_info *mig_ctl = acc_vf_dev->mig_ctl;
+	void *data_addr = acc_vf_dev->vf_data;
+	int ret = 0;
+
+	if (!count) {
+		dev_err(&acc_vf_dev->vf_dev->dev,
+			"Qemu operation data size error!\n");
return -EINVAL; + } + + data_addr += pos - mig_ctl->data_offset; + if (iswrite) { + ret = copy_from_user(data_addr, buf, count) ? + -EFAULT : count; + if (ret == count) + mig_ctl->pending_bytes += count; + } else { + ret = copy_to_user(buf, data_addr, count) ? + -EFAULT : count; + if (ret == count) + mig_ctl->pending_bytes -= count; + } + + return ret; +} + +static int acc_vf_region_migration_rw(struct acc_vf_migration *acc_vf_dev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) +{ + struct vfio_device_migration_info *mig_ctl = acc_vf_dev->mig_ctl; + struct device *dev = &acc_vf_dev->vf_dev->dev; + struct hisi_qm *qm = acc_vf_dev->vf_qm; + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + u32 device_state; + int ret = 0; + + switch (pos) { + case VDM_OFFSET(device_state): + if (count != sizeof(mig_ctl->device_state)) { + ret = -EINVAL; + break; + } + + if (iswrite) { + if (copy_from_user(&device_state, buf, count)) { + ret = -EFAULT; + break; + } + + ret = acc_vf_set_device_state(acc_vf_dev, + device_state) ? ret : count; + } else { + ret = copy_to_user(buf, &mig_ctl->device_state, + count) ? -EFAULT : count; + } + break; + case VDM_OFFSET(reserved): + ret = -EFAULT; + break; + case VDM_OFFSET(pending_bytes): + if (count != sizeof(mig_ctl->pending_bytes)) { + ret = -EINVAL; + break; + } + + if (iswrite) + ret = -EFAULT; + else + ret = copy_to_user(buf, &mig_ctl->pending_bytes, + count) ? -EFAULT : count; + break; + case VDM_OFFSET(data_offset): + if (count != sizeof(mig_ctl->data_offset)) { + ret = -EINVAL; + break; + } + if (iswrite) + ret = copy_from_user(&mig_ctl->data_offset, buf, count) ? + -EFAULT : count; + else + ret = copy_to_user(buf, &mig_ctl->data_offset, count) ? + -EFAULT : count; + break; + case VDM_OFFSET(data_size): + if (count != sizeof(mig_ctl->data_size)) { + ret = -EINVAL; + break; + } + + if (iswrite) + ret = copy_from_user(&mig_ctl->data_size, buf, count) ? + -EFAULT : count; + else + ret = copy_to_user(buf, &mig_ctl->data_size, count) ? 
+ -EFAULT : count; + break; + default: + ret = -EFAULT; + break; + } + + /* Transfer data section */ + if (pos >= mig_ctl->data_offset && + pos < MIGRATION_REGION_SZ) { + ret = acc_vf_data_transfer(acc_vf_dev, buf, + count, pos, iswrite); + if (ret != count) + return ret; + } + + if (mig_ctl->device_state == VFIO_DEVICE_STATE_RESUMING && + mig_ctl->pending_bytes == QM_MATCH_SIZE && + mig_ctl->data_size == QM_MATCH_SIZE) { + /* check the VF match information */ + ret = vf_match_info_check(qm, acc_vf_dev); + if (ret) { + dev_err(dev, "failed to check match information!\n"); + return -EFAULT; + } + ret = count; + + /* clear the VF match data size */ + mig_ctl->pending_bytes = 0; + mig_ctl->data_size = 0; + } + return ret; +} + +static int acc_vf_region_migration_mmap(struct acc_vf_migration *acc_vf_dev, + struct acc_vf_region *region, + struct vm_area_struct *vma) +{ + return -EFAULT; +} + +static void acc_vf_region_migration_release(struct acc_vf_migration *acc_vf_dev, + struct acc_vf_region *region) +{ + kfree(acc_vf_dev->mig_ctl); + acc_vf_dev->mig_ctl = NULL; +} + +static const struct acc_vf_region_ops acc_vf_region_ops_migration = { + .rw = acc_vf_region_migration_rw, + .release = acc_vf_region_migration_release, + .mmap = acc_vf_region_migration_mmap, +}; + +static int acc_vf_register_region(struct acc_vf_migration *acc_vf_dev, + const struct acc_vf_region_ops *ops, + void *data) +{ + struct acc_vf_region *regions; + + regions = krealloc(acc_vf_dev->regions, + (acc_vf_dev->num_regions + 1) * sizeof(*regions), + GFP_KERNEL); + if (!regions) + return -ENOMEM; + + acc_vf_dev->regions = regions; + regions[acc_vf_dev->num_regions].type = + VFIO_REGION_TYPE_MIGRATION; + regions[acc_vf_dev->num_regions].subtype = + VFIO_REGION_SUBTYPE_MIGRATION; + regions[acc_vf_dev->num_regions].ops = ops; + regions[acc_vf_dev->num_regions].size = + MIGRATION_REGION_SZ; + regions[acc_vf_dev->num_regions].flags = + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + regions[acc_vf_dev->num_regions].data = data; + acc_vf_dev->num_regions++; + + return 0; +} + +static long acc_vf_get_region_info(void *device_data, + unsigned int cmd, unsigned long arg) +{ + int num_vdev_regions = vfio_pci_num_regions(device_data); + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + struct vfio_region_info_cap_type cap_type; + struct acc_vf_region *regions; + struct vfio_region_info info; + struct vfio_info_cap caps; + unsigned long minsz; + int index, ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (cmd != VFIO_DEVICE_GET_REGION_INFO) + return -EINVAL; + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + goto default_handle; + + index = info.index - VFIO_PCI_NUM_REGIONS - num_vdev_regions; + if (index > acc_vf_dev->num_regions) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check region numbers!\n"); + return -EINVAL; + } + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + regions = acc_vf_dev->regions; + info.size = regions[index].size; + info.flags = regions[index].flags; + caps.buf = NULL; + caps.size = 0; + cap_type.header.id = VFIO_REGION_INFO_CAP_TYPE; + cap_type.header.version = 1; + cap_type.type = regions[index].type; + cap_type.subtype = regions[index].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + + if (regions[index].ops->add_cap) { + ret = 
regions[index].ops->add_cap(acc_vf_dev, + ®ions[index], &caps); + if (ret) { + kfree(caps.buf); + return ret; + } + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; + +default_handle: + ret = vfio_pci_ioctl(device_data, cmd, arg); + if (ret) + return ret; + + if (info.index == VFIO_PCI_BAR0_REGION_INDEX) { + if (!acc_vf_dev->in_dirty_track) + return ret; + + /* read default handler's data back */ + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + /* update customized region info */ + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + } + + if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + /* + * ACC VF dev BAR2 region(64K) consists of both functional + * register space and migration control register space. + * Report only the first 32K(functional region) to Guest. + */ + info.size = pci_resource_len(acc_vf_dev->vf_dev, info.index) >> 1; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + } + + return ret; +} + +static int acc_vf_open(void *device_data) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + struct vfio_device_migration_info *mig_ctl; + __u64 mig_offset; + void *vf_data; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&acc_vf_dev->reflock); + if (!acc_vf_dev->refcnt) { + ret = acc_vf_register_region(acc_vf_dev, + &acc_vf_region_ops_migration, + NULL); + if (ret) + goto region_error; + vfio_pci_set_vendor_regions(device_data, + acc_vf_dev->num_regions); + + /* the data region must follow migration info */ + mig_offset = sizeof(struct vfio_device_migration_info); + mig_ctl = kzalloc(MIGRATION_REGION_SZ, GFP_KERNEL); + if (!mig_ctl) { + ret = -ENOMEM; + goto mig_error; + } + acc_vf_dev->mig_ctl = mig_ctl; + + vf_data = (void *)mig_ctl + mig_offset; + acc_vf_dev->vf_data = vf_data; + + mig_ctl->device_state = VFIO_DEVICE_STATE_RUNNING; + mig_ctl->data_offset = mig_offset; + mig_ctl->data_size = 0; + } + + ret = vfio_pci_open(device_data); + if (ret) + goto open_error; + + acc_vf_dev->refcnt++; + mutex_unlock(&acc_vf_dev->reflock); + + return 0; + +open_error: + if (!acc_vf_dev->refcnt) { + kfree(acc_vf_dev->mig_ctl); + acc_vf_dev->mig_ctl = NULL; + } +mig_error: + vfio_pci_set_vendor_regions(device_data, 0); +region_error: + mutex_unlock(&acc_vf_dev->reflock); + module_put(THIS_MODULE); + return ret; +} + +static void acc_vf_release(void *device_data) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + int i; + + mutex_lock(&acc_vf_dev->reflock); + if (!--acc_vf_dev->refcnt) { + for (i = 0; i < acc_vf_dev->num_regions; i++) { + if (!acc_vf_dev->regions[i].ops) + continue; + acc_vf_dev->regions[i].ops->release(acc_vf_dev, + &acc_vf_dev->regions[i]); + } + kfree(acc_vf_dev->regions); + acc_vf_dev->regions = NULL; + acc_vf_dev->num_regions = 0; + 
vfio_pci_set_vendor_regions(device_data, 0); + + kfree(acc_vf_dev->mig_ctl); + acc_vf_dev->mig_ctl = NULL; + } + vfio_pci_release(device_data); + mutex_unlock(&acc_vf_dev->reflock); + module_put(THIS_MODULE); +} + +static long acc_vf_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_REGION_INFO: + return acc_vf_get_region_info(device_data, cmd, arg); + default: + return vfio_pci_ioctl(device_data, cmd, arg); + } +} + +static ssize_t acc_vf_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + int num_vdev_regions = vfio_pci_num_regions(device_data); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int num_vendor_region = acc_vf_dev->num_regions; + struct acc_vf_region *region; + + if (index >= VFIO_PCI_NUM_REGIONS + num_vdev_regions + + num_vendor_region) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check read regions index!\n"); + return -EINVAL; + } + + if (index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + return vfio_pci_read(device_data, buf, count, ppos); + + index -= VFIO_PCI_NUM_REGIONS + num_vdev_regions; + + region = &acc_vf_dev->regions[index]; + if (!region->ops->rw) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check regions read ops!\n"); + return -EINVAL; + } + + return region->ops->rw(acc_vf_dev, buf, count, ppos, false); +} + +static ssize_t acc_vf_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct acc_vf_migration *acc_vf_dev = + vfio_pci_vendor_data(device_data); + int num_vdev_regions = vfio_pci_num_regions(device_data); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int num_vendor_region = acc_vf_dev->num_regions; + struct acc_vf_region *region; + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + pr_debug("vfio bar 0 write\n"); + + if (index >= VFIO_PCI_NUM_REGIONS + num_vdev_regions + + num_vendor_region) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check write regions index!\n"); + return -EINVAL; + } + + if (index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + return vfio_pci_write(device_data, buf, count, ppos); + + index -= VFIO_PCI_NUM_REGIONS + num_vdev_regions; + + region = &acc_vf_dev->regions[index]; + + if (!region->ops->rw) { + dev_err(&acc_vf_dev->vf_dev->dev, + "failed to check regions write ops!\n"); + return -EINVAL; + } + + return region->ops->rw(acc_vf_dev, (char __user *)buf, + count, ppos, true); +} + +static int acc_vf_mmap(void *device_data, struct vm_area_struct *vma) +{ + return vfio_pci_mmap(device_data, vma); +} + +static void acc_vf_request(void *device_data, unsigned int count) +{ + vfio_pci_request(device_data, count); +} + +static struct vfio_device_ops acc_vf_device_ops_node = { + .name = "acc_vf", + .open = acc_vf_open, + .release = acc_vf_release, + .ioctl = acc_vf_ioctl, + .read = acc_vf_read, + .write = acc_vf_write, + .mmap = acc_vf_mmap, + .request = acc_vf_request, +}; + +static ssize_t acc_vf_debug_read(struct file *filp, char __user *buffer, + size_t count, loff_t *pos) +{ + char buf[VFIO_DEV_DBG_LEN]; + int len; + + len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", + "echo 0: test vf data store\n" + "echo 1: test vf data writeback\n" + "echo 2: test vf send mailbox\n" + "echo 3: dump vf dev data\n" + "echo 4: dump migration state\n"); + + return simple_read_from_buffer(buffer, count, pos, buf, len); +} + +static ssize_t acc_vf_debug_write(struct file *filp, const char __user *buffer, + size_t count, loff_t 
*pos)
+{
+	struct acc_vf_migration *acc_vf_dev = filp->private_data;
+	struct device *dev = &acc_vf_dev->vf_dev->dev;
+	struct hisi_qm *qm = acc_vf_dev->vf_qm;
+	char tbuf[VFIO_DEV_DBG_LEN];
+	unsigned long val;
+	u64 data;
+	int len, ret;
+
+	if (*pos)
+		return 0;
+
+	if (count >= VFIO_DEV_DBG_LEN)
+		return -ENOSPC;
+
+	len = simple_write_to_buffer(tbuf, VFIO_DEV_DBG_LEN - 1,
+				     pos, buffer, count);
+	if (len < 0)
+		return len;
+	tbuf[len] = '\0';
+	if (kstrtoul(tbuf, 0, &val))
+		return -EFAULT;
+
+	switch (val) {
+	case STATE_SAVE:
+		ret = vf_qm_state_save(qm, acc_vf_dev);
+		if (ret)
+			return -EINVAL;
+		break;
+	case STATE_RESUME:
+		ret = vf_qm_state_resume(qm, acc_vf_dev);
+		if (ret)
+			return -EINVAL;
+		break;
+	case MB_TEST:
+		data = readl(qm->io_base + QM_MB_CMD_SEND_BASE);
+		dev_info(dev, "debug mailbox addr: 0x%lx, mailbox val: 0x%llx\n",
+			 (uintptr_t)qm->phys_base, data);
+		break;
+	case MIG_DATA_DUMP:
+		dev_info(dev, "dumped vf migration data:\n");
+		print_hex_dump(KERN_INFO, "Mig Data:", DUMP_PREFIX_OFFSET,
+			       VFIO_DBG_LOG_LEN, 1,
+			       (unsigned char *)acc_vf_dev->vf_data,
+			       sizeof(struct acc_vf_data), false);
+		break;
+	case MIG_DEV_SHOW:
+		if (!acc_vf_dev->mig_ctl)
+			dev_info(dev, "migration region has been released!\n");
+		else
+			dev_info(dev,
+				 "device state: %u\n"
+				 "data offset: %llu\n"
+				 "data size: %llu\n"
+				 "pending bytes: %llu\n"
+				 "data addr: 0x%lx\n",
+				 acc_vf_dev->mig_ctl->device_state,
+				 acc_vf_dev->mig_ctl->data_offset,
+				 acc_vf_dev->mig_ctl->data_size,
+				 acc_vf_dev->mig_ctl->pending_bytes,
+				 (uintptr_t)acc_vf_dev->vf_data);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+static const struct file_operations acc_vf_debug_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = acc_vf_debug_read,
+	.write = acc_vf_debug_write,
+};
+
+static ssize_t acc_vf_state_read(struct file *filp, char __user *buffer,
+				 size_t count, loff_t *pos)
+{
+	struct acc_vf_migration *acc_vf_dev = filp->private_data;
+	char buf[VFIO_DEV_DBG_LEN];
+	u32 state;
+	int len;
+
+	if (!acc_vf_dev->mig_ctl) {
+		len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n", "Invalid\n");
+	} else {
+		state = acc_vf_dev->mig_ctl->device_state;
+		switch (state) {
+		case VFIO_DEVICE_STATE_RUNNING:
+			len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n",
+					"RUNNING\n");
+			break;
+		case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING:
+			len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n",
+					"SAVING and RUNNING\n");
+			break;
+		case VFIO_DEVICE_STATE_SAVING:
+			len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n",
+					"SAVING\n");
+			break;
+		case VFIO_DEVICE_STATE_STOP:
+			len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n",
+					"STOP\n");
+			break;
+		case VFIO_DEVICE_STATE_RESUMING:
+			len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n",
+					"RESUMING\n");
+			break;
+		default:
+			len = scnprintf(buf, VFIO_DEV_DBG_LEN, "%s\n",
+					"Error\n");
+		}
+	}
+
+	return simple_read_from_buffer(buffer, count, pos, buf, len);
+}
+
+static const struct file_operations acc_vf_state_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = acc_vf_state_read,
+};
+
+static void vf_debugfs_init(struct acc_vf_migration *acc_vf_dev)
+{
+	char name[VFIO_DEV_DBG_LEN];
+	int node_id;
+
+	if (!mig_root_ref)
+		mig_debugfs_root = debugfs_create_dir("vfio_acc", NULL);
+	mutex_lock(&acc_vf_dev->reflock);
+	mig_root_ref++;
+	mutex_unlock(&acc_vf_dev->reflock);
+
+	node_id = dev_to_node(&acc_vf_dev->vf_dev->dev);
+	if (node_id < 0)
+		node_id = 0;
+
+	if (acc_vf_dev->acc_type == HISI_SEC)
+		scnprintf(name, VFIO_DEV_DBG_LEN, "sec_vf%d-%d",
+			  node_id, acc_vf_dev->vf_id);
+	else if (acc_vf_dev->acc_type == HISI_HPRE)
+		scnprintf(name, VFIO_DEV_DBG_LEN, "hpre_vf%d-%d",
+			  node_id, acc_vf_dev->vf_id);
+	else
+		scnprintf(name, VFIO_DEV_DBG_LEN, "zip_vf%d-%d",
+			  node_id, acc_vf_dev->vf_id);
+
+	acc_vf_dev->debug_root = debugfs_create_dir(name, mig_debugfs_root);
+
+	debugfs_create_file("debug", 0644, acc_vf_dev->debug_root,
+			    acc_vf_dev, &acc_vf_debug_fops);
+	debugfs_create_file("state", 0444, acc_vf_dev->debug_root,
+			    acc_vf_dev, &acc_vf_state_fops);
+}
+
+static void vf_debugfs_exit(struct acc_vf_migration *acc_vf_dev)
+{
+	debugfs_remove_recursive(acc_vf_dev->debug_root);
+
+	mutex_lock(&acc_vf_dev->reflock);
+	mig_root_ref--;
+	mutex_unlock(&acc_vf_dev->reflock);
+
+	if (!mig_root_ref)
+		debugfs_remove_recursive(mig_debugfs_root);
+}
+
+static int qm_acc_type_init(struct acc_vf_migration *acc_vf_dev)
+{
+	struct hisi_qm *qm = acc_vf_dev->vf_qm;
+	int i;
+
+	acc_vf_dev->acc_type = 0;
+	for (i = 0; i < ARRAY_SIZE(vf_acc_types); i++) {
+		if (!strncmp(qm->dev_name, vf_acc_types[i].name,
+			     strlen(vf_acc_types[i].name)))
+			acc_vf_dev->acc_type = vf_acc_types[i].type;
+	}
+	if (!acc_vf_dev->acc_type) {
+		dev_err(&acc_vf_dev->vf_dev->dev, "failed to check acc type!\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int vf_qm_pci_init(struct pci_dev *pdev, struct hisi_qm *vfqm)
+{
+	struct device *dev = &pdev->dev;
+	u32 val;
+	int ret;
+
+	ret = pci_request_mem_regions(pdev, vfqm->dev_name);
+	if (ret < 0) {
+		dev_err(dev, "failed to request mem regions!\n");
+		return ret;
+	}
+
+	vfqm->phys_base = pci_resource_start(pdev, PCI_BAR_2);
+	vfqm->io_base = devm_ioremap(dev, pci_resource_start(pdev, PCI_BAR_2),
+				     pci_resource_len(pdev, PCI_BAR_2));
+	if (!vfqm->io_base) {
+		ret = -EIO;
+		goto err_ioremap;
+	}
+
+	val = readl(vfqm->io_base + QM_QUE_ISO_CFG_V);
+	val = val & BIT(0);
+	if (val) {
+		vfqm->db_phys_base = pci_resource_start(pdev, PCI_BAR_4);
+		vfqm->db_io_base = devm_ioremap(dev, pci_resource_start(pdev,
+				   PCI_BAR_4), pci_resource_len(pdev, PCI_BAR_4));
+		if (!vfqm->db_io_base) {
+			ret = -EIO;
+			goto err_db_ioremap;
+		}
+	} else {
+		vfqm->db_phys_base = vfqm->phys_base;
+		vfqm->db_io_base = vfqm->io_base;
+	}
+
+	vfqm->pdev = pdev;
+	mutex_init(&vfqm->mailbox_lock);
+
+	/*
+	 * Allow the VF device to be loaded in the VM while it is
+	 * loaded in the migration driver.
+	 */
+	pci_release_mem_regions(pdev);
+
+	return 0;
+
+err_db_ioremap:
+	devm_iounmap(dev, vfqm->io_base);
+err_ioremap:
+	pci_release_mem_regions(pdev);
+	return ret;
+}
+
+static int acc_vf_dev_init(struct pci_dev *pdev, struct hisi_qm *pf_qm,
+			   struct acc_vf_migration *acc_vf_dev)
+{
+	struct hisi_qm *vf_qm;
+	int ret;
+
+	vf_qm = kzalloc(sizeof(struct hisi_qm), GFP_KERNEL);
+	if (!vf_qm)
+		return -ENOMEM;
+
+	/* get vf qm dev name from pf */
+	vf_qm->dev_name = pf_qm->dev_name;
+	vf_qm->fun_type = QM_HW_VF;
+	acc_vf_dev->vf_qm = vf_qm;
+	acc_vf_dev->pf_qm = pf_qm;
+
+	ret = vf_qm_pci_init(pdev, vf_qm);
+	if (ret)
+		goto init_qm_error;
+
+	ret = qm_acc_type_init(acc_vf_dev);
+	if (ret)
+		goto init_qm_error;
+
+	return 0;
+
+init_qm_error:
+	kfree(vf_qm);
+	return ret;
+}
+
+static void *acc_vf_probe(struct pci_dev *pdev)
+{
+	struct acc_vf_migration *acc_vf_dev;
+	struct pci_dev *pf_dev, *vf_dev;
+	struct hisi_qm *pf_qm;
+	int vf_id, ret;
+
+	pf_dev = pdev->physfn;
+	vf_dev = pdev;
+	/*
+	 * The VF driver has been removed after unbind;
+	 * the PF driver has been probed.
+	 */
+	pf_qm = pci_get_drvdata(pf_dev);
+	if (!pf_qm) {
+		dev_err(&pdev->dev, "host qm driver not loaded!\n");
+		return ERR_PTR(-EINVAL);
+	}
+	if (pf_qm->ver < QM_HW_V3) {
+		dev_err(&pdev->dev,
+			"device can't support migration! version: 0x%x\n",
+			pf_qm->ver);
+		return ERR_PTR(-EINVAL);
+	}
+
+	vf_id = PCI_FUNC(vf_dev->devfn);
+	if (vf_id < 0) {
+		dev_info(&pdev->dev, "vf device: %s, vf id: %d\n",
+			 pf_qm->dev_name, vf_id);
+		return ERR_PTR(-EINVAL);
+	}
+
+	acc_vf_dev = kzalloc(sizeof(*acc_vf_dev), GFP_KERNEL);
+	if (!acc_vf_dev)
+		return ERR_PTR(-ENOMEM);
+
+	ret = acc_vf_dev_init(pdev, pf_qm, acc_vf_dev);
+	if (ret) {
+		kfree(acc_vf_dev);
+		return ERR_PTR(ret);
+	}
+
+	acc_vf_dev->vf_id = vf_id;
+	acc_vf_dev->vf_vendor = pdev->vendor;
+	acc_vf_dev->vf_device = pdev->device;
+	acc_vf_dev->pf_dev = pf_dev;
+	acc_vf_dev->vf_dev = vf_dev;
+	acc_vf_dev->mig_ignore = false;
+	mutex_init(&acc_vf_dev->reflock);
+
+	vf_debugfs_init(acc_vf_dev);
+
+	return acc_vf_dev;
+}
+
+static void acc_vf_remove(void *vendor_data)
+{
+	struct acc_vf_migration *acc_vf_dev = vendor_data;
+	struct device *dev = &acc_vf_dev->vf_dev->dev;
+	struct hisi_qm *qm = acc_vf_dev->vf_qm;
+
+	vf_debugfs_exit(acc_vf_dev);
+
+	devm_iounmap(dev, qm->io_base);
+
+	kfree(qm);
+	kfree(acc_vf_dev);
+}
+
+static struct vfio_pci_vendor_driver_ops sec_vf_mig_ops = {
+	.owner = THIS_MODULE,
+	.name = "hisi_sec2",
+	.probe = acc_vf_probe,
+	.remove = acc_vf_remove,
+	.device_ops = &acc_vf_device_ops_node,
+};
+
+static struct vfio_pci_vendor_driver_ops hpre_vf_mig_ops = {
+	.owner = THIS_MODULE,
+	.name = "hisi_hpre",
+	.probe = acc_vf_probe,
+	.remove = acc_vf_remove,
+	.device_ops = &acc_vf_device_ops_node,
+};
+
+static struct vfio_pci_vendor_driver_ops zip_vf_mig_ops = {
+	.owner = THIS_MODULE,
+	.name = "hisi_zip",
+	.probe = acc_vf_probe,
+	.remove = acc_vf_remove,
+	.device_ops = &acc_vf_device_ops_node,
+};
+
+static int __init acc_vf_module_init(void)
+{
+	__vfio_pci_register_vendor_driver(&sec_vf_mig_ops);
+
+	__vfio_pci_register_vendor_driver(&hpre_vf_mig_ops);
+
+	__vfio_pci_register_vendor_driver(&zip_vf_mig_ops);
+
+	return 0;
+}
+
+static void __exit acc_vf_module_exit(void)
+{
+	vfio_pci_unregister_vendor_driver(&acc_vf_device_ops_node);
+}
+module_init(acc_vf_module_init);
+module_exit(acc_vf_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Longfang Liu <liulongfang@huawei.com>");
+MODULE_DESCRIPTION("HiSilicon Accelerator VF live migration driver");
\ No newline at end of file
diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.h b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.h
new file mode 100644
index 0000000..1fdcba0
--- /dev/null
+++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/hisilicon/migration/acc_vf_migration.h
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021 HiSilicon Limited.
*/ + +#ifndef ACC_MIG_H +#define ACC_MIG_H + +#include +#include +#include "../../include_linux/vfio.h" + +#include "../hisi_acc_qm.h" + +#define VFIO_PCI_OFFSET_SHIFT 40 +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + +#define MIGRATION_REGION_SZ (sizeof(struct acc_vf_data) + \ + sizeof(struct vfio_device_migration_info)) +#define VFIO_DEV_DBG_LEN 256 +#define VFIO_DBG_LOG_LEN 16 +#define VFIO_DEVFN_MASK 0xFF + +#define PCI_BAR_2 2 +#define PCI_BAR_4 4 +#define POLL_PERIOD 10 +#define POLL_TIMEOUT 1000 +#define QM_CACHE_WB_START 0x204 +#define QM_CACHE_WB_DONE 0x208 +#define QM_MB_CMD_PAUSE_QM 0xe +#define QM_ABNORMAL_INT_STATUS 0x100008 +#define QM_IFC_INT_STATUS 0x0028 +#define SEC_CORE_INT_STATUS 0x301008 +#define HPRE_HAC_INT_STATUS 0x301800 +#define HZIP_CORE_INT_STATUS 0x3010AC + +#define QM_VFT_CFG_RDY 0x10006c +#define QM_VFT_CFG_OP_WR 0x100058 +#define QM_VFT_CFG_TYPE 0x10005c +#define QM_VFT_CFG 0x100060 +#define QM_VFT_CFG_OP_ENABLE 0x100054 +#define QM_VFT_CFG_DATA_L 0x100064 +#define QM_VFT_CFG_DATA_H 0x100068 + +#define ERROR_CHECK_TIMEOUT 100 +#define CHECK_DELAY_TIME 100 + +#define QM_SQC_VFT_BASE_SHIFT_V2 28 +#define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0) +#define QM_SQC_VFT_NUM_SHIFT_V2 45 +#define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0) + +/* mailbox */ +#define QM_MB_CMD_SQC_BT 0x4 +#define QM_MB_CMD_CQC_BT 0x5 +#define QM_MB_CMD_SQC_VFT_V2 0x6 + +#define QM_MB_CMD_SEND_BASE 0x300 +#define QM_MB_BUSY_SHIFT 13 +#define QM_MB_OP_SHIFT 14 +#define QM_MB_CMD_DATA_ADDR_L 0x304 +#define QM_MB_CMD_DATA_ADDR_H 0x308 +#define QM_MB_MAX_WAIT_CNT 6000 + +/* doorbell */ +#define QM_DOORBELL_CMD_SQ 0 +#define QM_DOORBELL_CMD_CQ 1 +#define QM_DOORBELL_SQ_CQ_BASE_V2 0x1000 +#define QM_DOORBELL_EQ_AEQ_BASE_V2 0x2000 +#define QM_DB_CMD_SHIFT_V2 12 +#define QM_DB_RAND_SHIFT_V2 16 +#define QM_DB_INDEX_SHIFT_V2 32 +#define QM_DB_PRIORITY_SHIFT_V2 48 + +/* RW regs */ +#define QM_REGS_MAX_LEN 7 +#define QM_REG_ADDR_OFFSET 0x0004 + +#define QM_XQC_ADDR_OFFSET 32U +#define QM_VF_AEQ_INT_MASK 0x0004 +#define QM_VF_EQ_INT_MASK 0x000c +#define QM_IFC_INT_SOURCE_V 0x0020 +#define QM_IFC_INT_MASK 0x0024 +#define QM_IFC_INT_SET_V 0x002c +#define QM_QUE_ISO_CFG_V 0x0030 +#define QM_PAGE_SIZE 0x0034 + +#define QM_EQC_DW0 0X8000 +#define QM_AEQC_DW0 0X8020 + +struct qm_mailbox { + __le16 w0; + __le16 queue_num; + __le32 base_l; + __le32 base_h; + __le32 rsvd; +}; + +enum acc_type { + HISI_SEC = 0x1, + HISI_HPRE = 0x2, + HISI_ZIP = 0x3, +}; + +struct vf_acc_type { + const char *name; + u32 type; +}; + +static struct vf_acc_type vf_acc_types[] = { + {"hisi_sec2", HISI_SEC}, + {"hisi_hpre", HISI_HPRE}, + {"hisi_zip", HISI_ZIP}, +}; + +enum mig_debug_cmd { + STATE_SAVE, + STATE_RESUME, + MB_TEST, + MIG_DATA_DUMP, + MIG_DEV_SHOW, +}; + +static const char * const vf_dev_state[] = { + "Stop", + "Running", + "Saving", + "Running & Saving", + "Resuming", +}; + +#define QM_MATCH_SIZE 32L +struct acc_vf_data { + /* QM match information */ + u32 qp_num; + u32 acc_type; + u32 que_iso_cfg; + u32 qp_base; + /* QM reserved 4 match information */ + u32 qm_rsv_state[4]; + + /* QM RW regs */ + u32 aeq_int_mask; + u32 eq_int_mask; + u32 ifc_int_source; + u32 ifc_int_mask; + u32 ifc_int_set; + u32 page_size; + u32 vf_state; + + /* + * QM_VF_MB has 4 regs don't need to migration + * mailbox regs writeback value will cause + * hardware to perform 
command operations.
+	 */
+
+	/* QM_EQC_DW has 7 regs */
+	u32 qm_eqc_dw[7];
+
+	/* QM_AEQC_DW has 7 regs */
+	u32 qm_aeqc_dw[7];
+
+	/* QM reserved 5 regs */
+	u32 qm_rsv_regs[5];
+
+	/* qm memory init information */
+	dma_addr_t eqe_dma;
+	dma_addr_t aeqe_dma;
+	dma_addr_t sqc_dma;
+	dma_addr_t cqc_dma;
+};
+
+struct acc_vf_remap_irq_ctx {
+	struct eventfd_ctx *trigger;
+	struct virqfd *sync;
+	atomic_t cnt;
+	wait_queue_head_t waitq;
+	bool init;
+};
+
+struct acc_vf_migration {
+	__u32 vf_vendor;
+	__u32 vf_device;
+	__u32 handle;
+	struct pci_dev *pf_dev;
+	struct pci_dev *vf_dev;
+	struct hisi_qm *pf_qm;
+	struct hisi_qm *vf_qm;
+	int vf_id;
+	int refcnt;
+	u8 acc_type;
+	bool mig_ignore;
+	struct mutex reflock;
+
+	struct vfio_device_migration_info *mig_ctl;
+	struct acc_vf_data *vf_data;
+	bool in_dirty_track;
+	struct acc_vf_remap_irq_ctx remap_irq_ctx;
+	struct acc_vf_region *regions;
+	int num_regions;
+	struct dentry *debug_root;
+};
+
+struct acc_vf_region_ops {
+	int (*rw)(struct acc_vf_migration *acc_vf_dev,
+		  char __user *buf, size_t count,
+		  loff_t *ppos, bool iswrite);
+	void (*release)(struct acc_vf_migration *acc_vf_dev,
+			struct acc_vf_region *region);
+	int (*mmap)(struct acc_vf_migration *acc_vf_dev,
+		    struct acc_vf_region *region,
+		    struct vm_area_struct *vma);
+	int (*add_cap)(struct acc_vf_migration *acc_vf_dev,
+		       struct acc_vf_region *region,
+		       struct vfio_info_cap *caps);
+};
+
+struct acc_vf_region {
+	u32 type;
+	u32 subtype;
+	size_t size;
+	u32 flags;
+	const struct acc_vf_region_ops *ops;
+	void *data;
+};
+
+struct acc_vf_irqops {
+	int (*set_irqs)(struct acc_vf_migration *acc_vf_dev,
+			u32 flags, unsigned int index,
+			unsigned int start, unsigned int count,
+			void *data);
+};
+
+struct acc_vf_irq {
+	u32 type;
+	u32 subtype;
+	u32 flags;
+	u32 count;
+	const struct acc_vf_irqops *ops;
+};
+
+#endif /* ACC_MIG_H */
\ No newline at end of file
diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_linux/vfio.h b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_linux/vfio.h
new file mode 100644
index 0000000..0b6cda3
--- /dev/null
+++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_linux/vfio.h
@@ -0,0 +1,298 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * VFIO API definition
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+#ifndef VFIO_H
+#define VFIO_H
+
+
+#include <linux/iommu.h>
+#include <linux/mm.h>
+#include <linux/workqueue.h>
+#include <linux/poll.h>
+#include "../include_uapi_linux/vfio.h"
+
+#ifndef KABI_EXTEND
+#define KABI_EXTEND(_new) _new;
+#endif
+
+struct vfio_device {
+	struct device *dev;
+	const struct vfio_device_ops *ops;
+	struct vfio_group *group;
+
+	/* Members below here are private, not for driver use */
+	refcount_t refcount;
+	struct completion comp;
+	struct list_head group_next;
+	void *device_data;
+};
+
+/**
+ * struct vfio_device_ops - VFIO bus driver device callbacks
+ *
+ * @open: Called when userspace creates new file descriptor for device
+ * @release: Called when userspace releases file descriptor for device
+ * @read: Perform read(2) on device file descriptor
+ * @write: Perform write(2) on device file descriptor
+ * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
+ *         operations documented below
+ * @mmap: Perform mmap(2) on a region of the device file descriptor
+ * @request: Request for the bus driver to release the device
+ * @match: Optional device name match callback (return: 0 for no-match, >0 for
+ *         match, -errno for abort (ex.
match with insufficient or incorrect + * additional args) + */ +struct vfio_device_ops { + char *name; + int (*open)(void *device_data); + void (*release)(void *device_data); + ssize_t (*read)(void *device_data, char __user *buf, + size_t count, loff_t *ppos); + ssize_t (*write)(void *device_data, const char __user *buf, + size_t count, loff_t *size); + long (*ioctl)(void *device_data, unsigned int cmd, + unsigned long arg); + int (*mmap)(void *device_data, struct vm_area_struct *vma); + void (*request)(void *device_data, unsigned int count); + int (*match)(void *device_data, char *buf); +}; + +extern struct iommu_group *vfio_iommu_group_get(struct device *dev); +extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); + +void vfio_init_group_dev(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops, void *device_data); +int vfio_register_group_dev(struct vfio_device *device); +extern int vfio_add_group_dev(struct device *dev, + const struct vfio_device_ops *ops, + void *device_data); + +extern void *vfio_del_group_dev(struct device *dev); +void vfio_unregister_group_dev(struct vfio_device *device); +extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); +extern void vfio_device_put(struct vfio_device *device); +extern void *vfio_device_data(struct vfio_device *device); + +/** + * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks + */ +struct vfio_iommu_driver_ops { + char *name; + struct module *owner; + void *(*open)(unsigned long arg); + void (*release)(void *iommu_data); + ssize_t (*read)(void *iommu_data, char __user *buf, + size_t count, loff_t *ppos); + ssize_t (*write)(void *iommu_data, const char __user *buf, + size_t count, loff_t *size); + long (*ioctl)(void *iommu_data, unsigned int cmd, + unsigned long arg); + int (*mmap)(void *iommu_data, struct vm_area_struct *vma); + int (*attach_group)(void *iommu_data, + struct iommu_group *group); + void (*detach_group)(void *iommu_data, + struct iommu_group *group); + int (*pin_pages)(void *iommu_data, + struct iommu_group *group, + unsigned long *user_pfn, + int npage, int prot, + unsigned long *phys_pfn); + int (*unpin_pages)(void *iommu_data, + unsigned long *user_pfn, int npage); + int (*register_notifier)(void *iommu_data, + unsigned long *events, + struct notifier_block *nb); + int (*unregister_notifier)(void *iommu_data, + struct notifier_block *nb); + int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, + void *data, size_t count, bool write); + KABI_EXTEND(struct iommu_domain *(*group_iommu_domain)(void *iommu_data, + struct iommu_group *group)) +}; + +extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); + +extern void vfio_unregister_iommu_driver( + const struct vfio_iommu_driver_ops *ops); + +/* + * External user API + */ +extern struct vfio_group *vfio_group_get_external_user(struct file *filep); +extern void vfio_group_put_external_user(struct vfio_group *group); +extern struct vfio_group *vfio_group_get_external_user_from_dev(struct device + *dev); +extern bool vfio_external_group_match_file(struct vfio_group *group, + struct file *filep); +extern int vfio_external_user_iommu_id(struct vfio_group *group); +extern long vfio_external_check_extension(struct vfio_group *group, + unsigned long arg); + +#define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) + +extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, + int npage, int prot, unsigned long *phys_pfn); +extern int 
vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, + int npage); + +extern int vfio_group_pin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage, + int prot, unsigned long *phys_pfn); +extern int vfio_group_unpin_pages(struct vfio_group *group, + unsigned long *user_iova_pfn, int npage); + +extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, + void *data, size_t len, bool write); + +extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); + +/* each type has independent events */ +enum vfio_notify_type { + VFIO_IOMMU_NOTIFY = 0, + VFIO_GROUP_NOTIFY = 1, +}; + +/* events for VFIO_IOMMU_NOTIFY */ +#define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0) + +/* events for VFIO_GROUP_NOTIFY */ +#define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) + +extern int vfio_register_notifier(struct device *dev, + enum vfio_notify_type type, + unsigned long *required_events, + struct notifier_block *nb); +extern int vfio_unregister_notifier(struct device *dev, + enum vfio_notify_type type, + struct notifier_block *nb); + +struct kvm; +extern void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm); + +/* + * Sub-module helpers + */ +struct vfio_info_cap { + struct vfio_info_cap_header *buf; + size_t size; +}; +extern struct vfio_info_cap_header *vfio_info_cap_add( + struct vfio_info_cap *caps, size_t size, u16 id, u16 version); +extern void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset); + +extern int vfio_info_add_capability(struct vfio_info_cap *caps, + struct vfio_info_cap_header *cap, + size_t size); + +extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, + int num_irqs, int max_irq_type, + size_t *data_size); + +struct pci_dev; +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); +extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); +extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, + unsigned int cmd, + unsigned long arg); +#else +static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) +{ +} + +static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) +{ +} + +static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, + unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} +#endif /* CONFIG_VFIO_SPAPR_EEH */ + +/* + * IRQfd - generic + */ +struct virqfd { + void *opaque; + struct eventfd_ctx *eventfd; + int (*handler)(void *, void *); + void (*thread)(void *, void *); + void *data; + struct work_struct inject; + wait_queue_entry_t wait; + poll_table pt; + struct work_struct shutdown; + struct work_struct flush_inject; + struct virqfd **pvirqfd; +}; + +extern int vfio_virqfd_enable(void *opaque, + int (*handler)(void *, void *), + void (*thread)(void *, void *), + void *data, struct virqfd **pvirqfd, int fd); +extern void vfio_virqfd_disable(struct virqfd **pvirqfd); +void vfio_virqfd_flush_thread(struct virqfd **pvirqfd); + +extern int vfio_pci_num_regions(void *device_data); +extern struct pci_dev *vfio_pci_pdev(void *device_data); +extern long vfio_pci_ioctl(void *device_data, + unsigned int cmd, unsigned long arg); +extern ssize_t vfio_pci_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos); +extern ssize_t vfio_pci_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos); +extern int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma); +extern void vfio_pci_request(void *device_data, unsigned int count); +extern int vfio_pci_open(void 
*device_data);
+extern void vfio_pci_release(void *device_data);
+extern void *vfio_pci_vendor_data(void *device_data);
+extern int vfio_pci_set_vendor_regions(void *device_data,
+				       int num_vendor_regions);
+
+struct vfio_pci_vendor_driver_ops {
+	char *name;
+	struct module *owner;
+	/* Used to match device */
+	unsigned short vendor;
+	unsigned short device;
+	void *(*probe)(struct pci_dev *pdev);
+	void (*remove)(void *vendor_data);
+	struct vfio_device_ops *device_ops;
+};
+int __vfio_pci_register_vendor_driver(struct vfio_pci_vendor_driver_ops *ops);
+void vfio_pci_unregister_vendor_driver(struct vfio_device_ops *device_ops);
+
+#define vfio_pci_register_vendor_driver(__name, __probe, __remove,	\
+					__device_ops)			\
+static struct vfio_pci_vendor_driver_ops  __ops ## _node = {		\
+	.owner		= THIS_MODULE,					\
+	.name		= __name,					\
+	.probe		= __probe,					\
+	.remove		= __remove,					\
+	.device_ops	= __device_ops,					\
+};									\
+__vfio_pci_register_vendor_driver(&__ops ## _node)
+
+#define module_vfio_pci_register_vendor_handler(name, probe, remove,	\
+						device_ops)		\
+static int __init device_ops ## _module_init(void)			\
+{									\
+	vfio_pci_register_vendor_driver(name, probe, remove,		\
+					device_ops);			\
+	return 0;							\
+};									\
+static void __exit device_ops ## _module_exit(void)			\
+{									\
+	vfio_pci_unregister_vendor_driver(device_ops);			\
+};									\
+module_init(device_ops ## _module_init);				\
+module_exit(device_ops ## _module_exit)
+
+#endif /* VFIO_H */
\ No newline at end of file
diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_uapi_linux/vfio.h b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_uapi_linux/vfio.h
new file mode 100644
index 0000000..52658db
--- /dev/null
+++ b/KAEKernelDriver/KAEKernelDriver-OLK-5.4/include_uapi_linux/vfio.h
@@ -0,0 +1,1444 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * VFIO API definition
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _UAPIVFIO_H
+#define _UAPIVFIO_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VFIO_API_VERSION	0
+
+
+/* Kernel & User level defines for VFIO IOCTLs. */
+
+/* Extensions */
+
+#define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
+#define VFIO_TYPE1v2_IOMMU		3
+/*
+ * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping).  This
+ * capability is subject to change as groups are added or removed.
+ */
+#define VFIO_DMA_CC_IOMMU		4
+
+/* Check if EEH is supported */
+#define VFIO_EEH			5
+
+/* Two-stage IOMMU */
+#define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
+
+#define VFIO_SPAPR_TCE_v2_IOMMU		7
+
+/*
+ * The No-IOMMU IOMMU offers no translation or isolation for devices and
+ * supports no ioctls outside of VFIO_CHECK_EXTENSION.  Use of VFIO's No-IOMMU
+ * code will taint the host kernel and should be used with extreme caution.
+ */
+#define VFIO_NOIOMMU_IOMMU		8
+
+/*
+ * The vfio_iommu driver may support the user clearing the dirty log manually,
+ * which means the dirty log can be requested not to be cleared automatically
+ * after it is copied to userspace; it is then the user's duty to clear the
+ * dirty log.
+ *
+ * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and
+ * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP.
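+ *
+ * For example, a minimal userspace sketch (container is assumed to be an
+ * open /dev/vfio/vfio fd; error handling omitted):
+ *
+ *	if (ioctl(container, VFIO_CHECK_EXTENSION,
+ *		  VFIO_DIRTY_LOG_MANUAL_CLEAR) > 0)
+ *		manual_clear_supported = true;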
+ */
+#define VFIO_DIRTY_LOG_MANUAL_CLEAR	11
+
+/*
+ * The IOCTL interface is designed for extensibility by embedding the
+ * structure length (argsz) and flags into structures passed between
+ * kernel and userspace.  We therefore use the _IO() macro for these
+ * defines to avoid implicitly embedding a size into the ioctl request.
+ * As structure fields are added, argsz will increase to match and flag
+ * bits will be defined to indicate additional fields with valid data.
+ * It's *always* the caller's responsibility to indicate the size of
+ * the structure passed by setting argsz appropriately.
+ */
+
+#define VFIO_TYPE	(';')
+#define VFIO_BASE	100
+
+/*
+ * For extension of INFO ioctls, VFIO makes use of a capability chain
+ * designed after PCI/e capabilities.  A flag bit indicates whether
+ * this capability chain is supported and a field defined in the fixed
+ * structure defines the offset of the first capability in the chain.
+ * This field is only valid when the corresponding bit in the flags
+ * bitmap is set.  This offset field is relative to the start of the
+ * INFO buffer, as is the next field within each capability header.
+ * The id within the header is a shared address space per INFO ioctl,
+ * while the version field is specific to the capability id.  The
+ * contents following the header are specific to the capability id.
+ */
+struct vfio_info_cap_header {
+	__u16	id;		/* Identifies capability */
+	__u16	version;	/* Version specific to the capability ID */
+	__u32	next;		/* Offset of next capability */
+};
+
+/*
+ * Callers of INFO ioctls passing insufficiently sized buffers will see
+ * the capability chain flag bit set, a zero value for the first capability
+ * offset (if available within the provided argsz), and argsz will be
+ * updated to report the necessary buffer size.  For compatibility, the
+ * INFO ioctl will not report error in this case, but the capability chain
+ * will not be available.
+ */
+
+/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
+
+/**
+ * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
+ *
+ * Report the version of the VFIO API.  This allows us to bump the entire
+ * API version should we later need to add or change features in incompatible
+ * ways.
+ * Return: VFIO_API_VERSION
+ * Availability: Always
+ */
+#define VFIO_GET_API_VERSION	_IO(VFIO_TYPE, VFIO_BASE + 0)
+
+/**
+ * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
+ *
+ * Check whether an extension is supported.
+ * Return: 0 if not supported, 1 (or some other positive integer) if supported.
+ * Availability: Always
+ */
+#define VFIO_CHECK_EXTENSION	_IO(VFIO_TYPE, VFIO_BASE + 1)
+
+/**
+ * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
+ *
+ * Set the iommu to the given type.  The type must be supported by an
+ * iommu driver as verified by calling CHECK_EXTENSION using the same
+ * type.  A group must be set to this file descriptor before this
+ * ioctl is available.  The IOMMU interfaces enabled by this call are
+ * specific to the value set.
+ * Return: 0 on success, -errno on failure
+ * Availability: When VFIO group attached
+ */
+#define VFIO_SET_IOMMU	_IO(VFIO_TYPE, VFIO_BASE + 2)
+
+/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
+
+/**
+ * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
+ *						struct vfio_group_status)
+ *
+ * Retrieve information about the group.  Fills in provided
+ * struct vfio_group_status.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always + */ +struct vfio_group_status { + __u32 argsz; + __u32 flags; +#define VFIO_GROUP_FLAGS_VIABLE (1 << 0) +#define VFIO_GROUP_FLAGS_CONTAINER_SET (1 << 1) +}; +#define VFIO_GROUP_GET_STATUS _IO(VFIO_TYPE, VFIO_BASE + 3) + +/** + * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32) + * + * Set the container for the VFIO group to the open VFIO file + * descriptor provided. Groups may only belong to a single + * container. Containers may, at their discretion, support multiple + * groups. Only when a container is set are all of the interfaces + * of the VFIO file descriptor and the VFIO group file descriptor + * available to the user. + * Return: 0 on success, -errno on failure. + * Availability: Always + */ +#define VFIO_GROUP_SET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 4) + +/** + * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5) + * + * Remove the group from the attached container. This is the + * opposite of the SET_CONTAINER call and returns the group to + * an initial state. All device file descriptors must be released + * prior to calling this interface. When removing the last group + * from a container, the IOMMU will be disabled and all state lost, + * effectively also returning the VFIO file descriptor to an initial + * state. + * Return: 0 on success, -errno on failure. + * Availability: When attached to container + */ +#define VFIO_GROUP_UNSET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 5) + +/** + * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char) + * + * Return a new file descriptor for the device object described by + * the provided string. The string should match a device listed in + * the devices subdirectory of the IOMMU group sysfs entry. The + * group containing the device must already be added to this context. + * Return: new file descriptor on success, -errno on failure. + * Availability: When attached to container + */ +#define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6) + +/* --------------- IOCTLs for DEVICE file descriptors --------------- */ + +/** + * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7, + * struct vfio_device_info) + * + * Retrieve information about the device. Fills in provided + * struct vfio_device_info. Caller sets argsz. + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_info { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */ +#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ +#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ +#define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ +#define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ +#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ +#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ + __u32 num_regions; /* Max region index + 1 */ + __u32 num_irqs; /* Max IRQ index + 1 */ + __u32 cap_offset; /* Offset within info struct of first cap */ +}; +#define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) + +/* + * Vendor driver using Mediated device framework should provide device_api + * attribute in supported type attribute groups. Device API string should be one + * of the following corresponding to device flags in vfio_device_info structure. 
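+ *
+ * For example, a minimal sketch of the check a userspace driver might
+ * perform (info is assumed to be a struct vfio_device_info already filled
+ * in by VFIO_DEVICE_GET_INFO, and the device_api string to have been read
+ * from sysfs by the caller):
+ *
+ *	if (info.flags & VFIO_DEVICE_FLAGS_PCI)
+ *		device_api is expected to equal VFIO_DEVICE_API_PCI_STRING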
+ */ + +#define VFIO_DEVICE_API_PCI_STRING "vfio-pci" +#define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform" +#define VFIO_DEVICE_API_AMBA_STRING "vfio-amba" +#define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" +#define VFIO_DEVICE_API_AP_STRING "vfio-ap" + +/* + * The following capabilities are unique to s390 zPCI devices. Their contents + * are further-defined in vfio_zdev.h + */ +#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1 +#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2 +#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 +#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 + +/** + * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, + * struct vfio_region_info) + * + * Retrieve information about a device region. Caller provides + * struct vfio_region_info with index value set. Caller sets argsz. + * Implementation of region mapping is bus driver specific. This is + * intended to describe MMIO, I/O port, as well as bus specific + * regions (ex. PCI config space). Zero sized regions may be used + * to describe unimplemented regions (ex. unimplemented PCI BARs). + * Return: 0 on success, -errno on failure. + */ +struct vfio_region_info { + __u32 argsz; + __u32 flags; +#define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ +#define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ +#define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ +#define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ + __u32 index; /* Region index */ + __u32 cap_offset; /* Offset within info struct of first cap */ + __u64 size; /* Region size (bytes) */ + __u64 offset; /* Region offset from start of device fd */ +}; +#define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) + +/* + * The sparse mmap capability allows finer granularity of specifying areas + * within a region with mmap support. When specified, the user should only + * mmap the offset ranges specified by the areas array. mmaps outside of the + * areas specified may fail (such as the range covering a PCI MSI-X table) or + * may result in improper device behavior. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 + +struct vfio_region_sparse_mmap_area { + __u64 offset; /* Offset of mmap'able area within region */ + __u64 size; /* Size of mmap'able area */ +}; + +struct vfio_region_info_cap_sparse_mmap { + struct vfio_info_cap_header header; + __u32 nr_areas; + __u32 reserved; + struct vfio_region_sparse_mmap_area areas[]; +}; + +/* + * The device specific type capability allows regions unique to a specific + * device or class of devices to be exposed. This helps solve the problem for + * vfio bus drivers of defining which region indexes correspond to which region + * on the device, without needing to resort to static indexes, as done by + * vfio-pci. For instance, if we were to go back in time, we might remove + * VFIO_PCI_VGA_REGION_INDEX and let vfio-pci simply define that all indexes + * greater than or equal to VFIO_PCI_NUM_REGIONS are device specific and we'd + * make a "VGA" device specific type to describe the VGA access space. This + * means that non-VGA devices wouldn't need to waste this index, and thus the + * address space associated with it due to implementation of device file + * descriptor offsets in vfio-pci. + * + * The current implementation is now part of the user ABI, so we can't use this + * for VGA, but there are other upcoming use cases, such as opregions for Intel + * IGD devices and framebuffers for vGPU devices. 
We missed VGA, but we'll
+ * use this for future additions.
+ *
+ * The structure below defines version 1 of this capability.
+ */
+#define VFIO_REGION_INFO_CAP_TYPE	2
+
+struct vfio_region_info_cap_type {
+	struct vfio_info_cap_header header;
+	__u32 type;	/* global per bus driver */
+	__u32 subtype;	/* type specific */
+};
+
+/*
+ * List of region types, global per bus driver.
+ * If you introduce a new type, please add it here.
+ */
+
+/* PCI region type containing a PCI vendor part */
+#define VFIO_REGION_TYPE_PCI_VENDOR_TYPE	(1 << 31)
+#define VFIO_REGION_TYPE_PCI_VENDOR_MASK	(0xffff)
+#define VFIO_REGION_TYPE_GFX			(1)
+#define VFIO_REGION_TYPE_CCW			(2)
+#define VFIO_REGION_TYPE_MIGRATION		(3)
+
+/* sub-types for VFIO_REGION_TYPE_PCI_* */
+
+/* 8086 vendor PCI sub-types */
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION	(1)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
+
+/* 10de vendor PCI sub-types */
+/*
+ * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
+ */
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM	(1)
+
+/* 1014 vendor PCI sub-types */
+/*
+ * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
+ * to do TLB invalidation on a GPU.
+ */
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
+
+/* sub-types for VFIO_REGION_TYPE_GFX */
+#define VFIO_REGION_SUBTYPE_GFX_EDID		(1)
+
+/**
+ * struct vfio_region_gfx_edid - EDID region layout.
+ *
+ * Set display link state and EDID blob.
+ *
+ * The EDID blob has monitor information such as brand, name, serial
+ * number, physical size, supported video modes and more.
+ *
+ * This special region allows userspace (typically qemu) to set a virtual
+ * EDID for the virtual monitor, which allows a flexible display
+ * configuration.
+ *
+ * For the edid blob spec look here:
+ *    https://en.wikipedia.org/wiki/Extended_Display_Identification_Data
+ *
+ * On linux systems you can find the EDID blob in sysfs:
+ *    /sys/class/drm/${card}/${connector}/edid
+ *
+ * You can use the edid-decode utility (comes with xorg-x11-utils) to
+ * decode the EDID blob.
+ *
+ * @edid_offset: location of the edid blob, relative to the
+ *               start of the region (readonly).
+ * @edid_max_size: max size of the edid blob (readonly).
+ * @edid_size: actual edid size (read/write).
+ * @link_state: display link state (read/write).
+ * VFIO_DEVICE_GFX_LINK_STATE_UP: Monitor is turned on.
+ * VFIO_DEVICE_GFX_LINK_STATE_DOWN: Monitor is turned off.
+ * @max_xres: max display width (0 == no limitation, readonly).
+ * @max_yres: max display height (0 == no limitation, readonly).
+ *
+ * EDID update protocol:
+ *   (1) set link-state to down.
+ *   (2) update edid blob and size.
+ *   (3) set link-state to up.
+ */
+struct vfio_region_gfx_edid {
+	__u32 edid_offset;
+	__u32 edid_max_size;
+	__u32 edid_size;
+	__u32 max_xres;
+	__u32 max_yres;
+	__u32 link_state;
+#define VFIO_DEVICE_GFX_LINK_STATE_UP    1
+#define VFIO_DEVICE_GFX_LINK_STATE_DOWN  2
+};
+
+/* sub-types for VFIO_REGION_TYPE_CCW */
+#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD	(1)
+#define VFIO_REGION_SUBTYPE_CCW_SCHIB		(2)
+#define VFIO_REGION_SUBTYPE_CCW_CRW		(3)
+
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION		(1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment.
Otherwise, the result is undefined and + * vendor drivers should return an error. + * + * device_state: (read/write) + * - The user application writes to this field to inform the vendor driver + * about the device state to be transitioned to. + * - The vendor driver should take the necessary actions to change the + * device state. After successful transition to a given state, the + * vendor driver should return success on write(device_state, state) + * system call. If the device state transition fails, the vendor driver + * should return an appropriate -errno for the fault condition. + * - On the user application side, if the device state transition fails, + * that is, if write(device_state, state) returns an error, read + * device_state again to determine the current state of the device from + * the vendor driver. + * - The vendor driver should return previous state of the device unless + * the vendor driver has encountered an internal error, in which case + * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR. + * - The user application must use the device reset ioctl to recover the + * device from VFIO_DEVICE_STATE_ERROR state. If the device is + * indicated to be in a valid device state by reading device_state, the + * user application may attempt to transition the device to any valid + * state reachable from the current state or terminate itself. + * + * device_state consists of 3 bits: + * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear, + * it indicates the _STOP state. When the device state is changed to + * _STOP, driver should stop the device before write() returns. + * - If bit 1 is set, it indicates the _SAVING state, which means that the + * driver should start gathering device state information that will be + * provided to the VFIO user application to save the device's state. + * - If bit 2 is set, it indicates the _RESUMING state, which means that + * the driver should prepare to resume the device. Data provided through + * the migration region should be used to resume the device. + * Bits 3 - 31 are reserved for future use. To preserve them, the user + * application should perform a read-modify-write operation on this + * field when modifying the specified bits. + * + * +------- _RESUMING + * |+------ _SAVING + * ||+----- _RUNNING + * ||| + * 000b => Device Stopped, not saving or resuming + * 001b => Device running, which is the default state + * 010b => Stop the device & save the device state, stop-and-copy state + * 011b => Device running and save the device state, pre-copy state + * 100b => Device stopped and the device state is resuming + * 101b => Invalid state + * 110b => Error state + * 111b => Invalid state + * + * State transitions: + * + * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP + * (100b) (001b) (011b) (010b) (000b) + * 0. Running or default state + * | + * + * 1. Normal Shutdown (optional) + * |------------------------------------->| + * + * 2. Save the state or suspend + * |------------------------->|---------->| + * + * 3. Save the state during live migration + * |----------->|------------>|---------->| + * + * 4. Resuming + * |<---------| + * + * 5. Resumed + * |--------->| + * + * 0. Default state of VFIO device is _RUNNING when the user application starts. + * 1. During normal shutdown of the user application, the user application may + * optionally change the VFIO device state from _RUNNING to _STOP. This + * transition is optional. The vendor driver must support this transition but + * must not require it. 
+ * 2. When the user application saves state or suspends the application, the + * device state transitions from _RUNNING to stop-and-copy and then to _STOP. + * On state transition from _RUNNING to stop-and-copy, driver must stop the + * device, save the device state and send it to the application through the + * migration region. The sequence to be followed for such transition is given + * below. + * 3. In live migration of user application, the state transitions from _RUNNING + * to pre-copy, to stop-and-copy, and to _STOP. + * On state transition from _RUNNING to pre-copy, the driver should start + * gathering the device state while the application is still running and send + * the device state data to application through the migration region. + * On state transition from pre-copy to stop-and-copy, the driver must stop + * the device, save the device state and send it to the user application + * through the migration region. + * Vendor drivers must support the pre-copy state even for implementations + * where no data is provided to the user before the stop-and-copy state. The + * user must not be required to consume all migration data before the device + * transitions to a new state, including the stop-and-copy state. + * The sequence to be followed for above two transitions is given below. + * 4. To start the resuming phase, the device state should be transitioned from + * the _RUNNING to the _RESUMING state. + * In the _RESUMING state, the driver should use the device state data + * received through the migration region to resume the device. + * 5. After providing saved device data to the driver, the application should + * change the state from _RESUMING to _RUNNING. + * + * reserved: + * Reads on this field return zero and writes are ignored. + * + * pending_bytes: (read only) + * The number of pending bytes still to be migrated from the vendor driver. + * + * data_offset: (read only) + * The user application should read data_offset field from the migration + * region. The user application should read the device data from this + * offset within the migration region during the _SAVING state or write + * the device data during the _RESUMING state. See below for details of + * sequence to be followed. + * + * data_size: (read/write) + * The user application should read data_size to get the size in bytes of + * the data copied in the migration region during the _SAVING state and + * write the size in bytes of the data copied in the migration region + * during the _RESUMING state. + * + * The format of the migration region is as follows: + * ------------------------------------------------------------------ + * |vfio_device_migration_info| data section | + * | | /////////////////////////////// | + * ------------------------------------------------------------------ + * ^ ^ + * offset 0-trapped part data_offset + * + * The structure vfio_device_migration_info is always followed by the data + * section in the region, so data_offset will always be nonzero. The offset + * from where the data is copied is decided by the kernel driver. The data + * section can be trapped, mmapped, or partitioned, depending on how the kernel + * driver defines the data section. The data section partition can be defined + * as mapped by the sparse mmap capability. If mmapped, data_offset must be + * page aligned, whereas initial section which contains the + * vfio_device_migration_info structure, might not end at the offset, which is + * page aligned. 
The user is not required to access through mmap regardless + * of the capabilities of the region mmap. + * The vendor driver should determine whether and how to partition the data + * section. The vendor driver should return data_offset accordingly. + * + * The sequence to be followed while in pre-copy state and stop-and-copy state + * is as follows: + * a. Read pending_bytes, indicating the start of a new iteration to get device + * data. Repeated read on pending_bytes at this stage should have no side + * effects. + * If pending_bytes == 0, the user application should not iterate to get data + * for that device. + * If pending_bytes > 0, perform the following steps. + * b. Read data_offset, indicating that the vendor driver should make data + * available through the data section. The vendor driver should return this + * read operation only after data is available from (region + data_offset) + * to (region + data_offset + data_size). + * c. Read data_size, which is the amount of data in bytes available through + * the migration region. + * Read on data_offset and data_size should return the offset and size of + * the current buffer if the user application reads data_offset and + * data_size more than once here. + * d. Read data_size bytes of data from (region + data_offset) from the + * migration region. + * e. Process the data. + * f. Read pending_bytes, which indicates that the data from the previous + * iteration has been read. If pending_bytes > 0, go to step b. + * + * The user application can transition from the _SAVING|_RUNNING + * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the + * number of pending bytes. The user application should iterate in _SAVING + * (stop-and-copy) until pending_bytes is 0. + * + * The sequence to be followed while _RESUMING device state is as follows: + * While data for this device is available, repeat the following steps: + * a. Read data_offset from where the user application should write data. + * b. Write migration data starting at the migration region + data_offset for + * the length determined by data_size from the migration source. + * c. Write data_size, which indicates to the vendor driver that data is + * written in the migration region. Vendor driver must return this write + * operations on consuming data. Vendor driver should apply the + * user-provided migration region data to the device resume state. + * + * If an error occurs during the above sequences, the vendor driver can return + * an error code for next read() or write() operation, which will terminate the + * loop. The user application should then take the next necessary action, for + * example, failing migration or terminating the user application. + * + * For the user application, data is opaque. The user application should write + * data in the same order as the data is received and the data should be of + * same transaction size at the source. + */ + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? 
\
+	(state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
+	((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
+					      VFIO_DEVICE_STATE_RESUMING))
+
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
+	((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \
+					     VFIO_DEVICE_STATE_RESUMING)
+
+	__u32 reserved;
+	__u64 pending_bytes;
+	__u64 data_offset;
+	__u64 data_size;
+};
+
+/*
+ * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
+ * which allows direct access to non-MSIX registers which happened to be within
+ * the same system page.
+ *
+ * Even though the userspace gets direct access to the MSIX data, the existing
+ * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
+ */
+#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
+
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus.  Used by a GPU for DMA routing
+ * and by the userspace to associate a NVLink bridge with a GPU.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT	4
+
+struct vfio_region_info_cap_nvlink2_ssatgt {
+	struct vfio_info_cap_header header;
+	__u64 tgt;
+};
+
+/*
+ * Capability with an NVLink link speed.  The value is read by
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
+ * property in the device tree.  The value is fixed in the hardware
+ * and failing to provide the correct value results in the link
+ * not working with no indication from the driver why.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD	5
+
+struct vfio_region_info_cap_nvlink2_lnkspd {
+	struct vfio_info_cap_header header;
+	__u32 link_speed;
+	__u32 __pad;
+};
+
+/**
+ * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
+ *				    struct vfio_irq_info)
+ *
+ * Retrieve information about a device IRQ.  Caller provides
+ * struct vfio_irq_info with index value set.  Caller sets argsz.
+ * Implementation of IRQ mapping is bus driver specific.  Indexes
+ * using multiple IRQs are primarily intended to support MSI-like
+ * interrupt blocks.  Zero count irq blocks may be used to describe
+ * unimplemented interrupt types.
+ *
+ * The EVENTFD flag indicates the interrupt index supports eventfd based
+ * signaling.
+ *
+ * The MASKABLE flag indicates the index supports MASK and UNMASK
+ * actions described below.
+ *
+ * AUTOMASKED indicates that after signaling, the interrupt line is
+ * automatically masked by VFIO and the user needs to unmask the line
+ * to receive new interrupts.  This is primarily intended to distinguish
+ * level triggered interrupts.
+ *
+ * The NORESIZE flag indicates that the interrupt lines within the index
+ * are setup as a set and new subindexes cannot be enabled without first
+ * disabling the entire index.  This is used for interrupts like PCI MSI
+ * and MSI-X where the driver may only use a subset of the available
+ * indexes, but VFIO needs to enable a specific number of vectors
+ * upfront.  In the case of MSI-X, where the user can enable MSI-X and
+ * then add and unmask vectors, it's up to userspace to make the decision
+ * whether to allocate the maximum supported number of vectors or tear
+ * down setup and incrementally increase the vectors as each is enabled.
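+ *
+ * For example, a minimal sketch of querying the MSI-X index (device is
+ * assumed to be an open VFIO device fd; error handling omitted):
+ *
+ *	struct vfio_irq_info irq = {
+ *		.argsz = sizeof(irq),
+ *		.index = VFIO_PCI_MSIX_IRQ_INDEX,
+ *	};
+ *
+ *	ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+ *	irq.count then holds the number of vectors the device exposes.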
+
+/*
+ * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
+ * which allows direct access to non-MSIX registers which happened to be within
+ * the same system page.
+ *
+ * Even though the userspace gets direct access to the MSIX data, the existing
+ * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
+ */
+#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
+
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
+ * and by the userspace to associate a NVLink bridge with a GPU.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT	4
+
+struct vfio_region_info_cap_nvlink2_ssatgt {
+	struct vfio_info_cap_header header;
+	__u64 tgt;
+};
+
+/*
+ * Capability with an NVLink link speed. The value is read by
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
+ * property in the device tree. The value is fixed in the hardware
+ * and failing to provide the correct value results in the link
+ * not working with no indication from the driver why.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD	5
+
+struct vfio_region_info_cap_nvlink2_lnkspd {
+	struct vfio_info_cap_header header;
+	__u32 link_speed;
+	__u32 __pad;
+};
+
+/**
+ * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
+ *				    struct vfio_irq_info)
+ *
+ * Retrieve information about a device IRQ. Caller provides
+ * struct vfio_irq_info with index value set. Caller sets argsz.
+ * Implementation of IRQ mapping is bus driver specific. Indexes
+ * using multiple IRQs are primarily intended to support MSI-like
+ * interrupt blocks. Zero count irq blocks may be used to describe
+ * unimplemented interrupt types.
+ *
+ * The EVENTFD flag indicates the interrupt index supports eventfd based
+ * signaling.
+ *
+ * The MASKABLE flag indicates the index supports MASK and UNMASK
+ * actions described below.
+ *
+ * AUTOMASKED indicates that after signaling, the interrupt line is
+ * automatically masked by VFIO and the user needs to unmask the line
+ * to receive new interrupts. This is primarily intended to distinguish
+ * level triggered interrupts.
+ *
+ * The NORESIZE flag indicates that the interrupt lines within the index
+ * are set up as a set and new subindexes cannot be enabled without first
+ * disabling the entire index. This is used for interrupts like PCI MSI
+ * and MSI-X where the driver may only use a subset of the available
+ * indexes, but VFIO needs to enable a specific number of vectors
+ * upfront. In the case of MSI-X, where the user can enable MSI-X and
+ * then add and unmask vectors, it's up to userspace to make the decision
+ * whether to allocate the maximum supported number of vectors or tear
+ * down setup and incrementally increase the vectors as each is enabled.
+ */
+struct vfio_irq_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_INFO_EVENTFD		(1 << 0)
+#define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
+#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
+#define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
+	__u32	index;		/* IRQ index */
+	__u32	count;		/* Number of IRQs within this index */
+};
+#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)
+
+/**
+ * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
+ *
+ * Set signaling, masking, and unmasking of interrupts. Caller provides
+ * struct vfio_irq_set with all fields set. 'start' and 'count' indicate
+ * the range of subindexes being specified.
+ *
+ * The DATA flags specify the type of data provided. If DATA_NONE, the
+ * operation performs the specified action immediately on the specified
+ * interrupt(s). For example, to unmask AUTOMASKED interrupt [0,0]:
+ * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
+ *
+ * DATA_BOOL allows sparse support for the same on arrays of interrupts.
+ * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
+ * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
+ * data = {1,0,1}
+ *
+ * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
+ * A value of -1 can be used to either de-assign interrupts if already
+ * assigned or skip un-assigned interrupts. For example, to set an eventfd
+ * to be triggered for interrupts [0,0] and [0,2]:
+ * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
+ * data = {fd1, -1, fd2}
+ * If index [0,1] is previously set, two count = 1 ioctl calls would be
+ * required to set [0,0] and [0,2] without changing [0,1].
+ *
+ * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
+ * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
+ * from userspace (i.e. simulate hardware triggering).
+ *
+ * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
+ * enables the interrupt index for the device. Individual subindex interrupts
+ * can be disabled using the -1 value for DATA_EVENTFD or the index can be
+ * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
+ *
+ * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
+ * ACTION_TRIGGER specifies kernel->user signaling.
+ */
+struct vfio_irq_set {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
+#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
+#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
+#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
+#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
+#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
+	__u32	index;
+	__u32	start;
+	__u32	count;
+	__u8	data[];
+};
+#define VFIO_DEVICE_SET_IRQS	_IO(VFIO_TYPE, VFIO_BASE + 10)
+
+#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
+					 VFIO_IRQ_SET_DATA_BOOL | \
+					 VFIO_IRQ_SET_DATA_EVENTFD)
+#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
+					 VFIO_IRQ_SET_ACTION_UNMASK | \
+					 VFIO_IRQ_SET_ACTION_TRIGGER)
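As a usage illustration for VFIO_DEVICE_SET_IRQS (a sketch, not part of the header), wiring MSI vector 0 of a device to a fresh eventfd could look like this; VFIO_PCI_MSI_IRQ_INDEX is defined later in this header:

#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Bind MSI vector 0 of an open VFIO device fd to a new eventfd. */
static int enable_msi_trigger(int device)
{
	struct {
		struct vfio_irq_set set;
		__s32 fd;		/* occupies set.data[] */
	} arg;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;
	memset(&arg, 0, sizeof(arg));
	arg.set.argsz = sizeof(arg);
	arg.set.flags = VFIO_IRQ_SET_DATA_EVENTFD |
			VFIO_IRQ_SET_ACTION_TRIGGER;
	arg.set.index = VFIO_PCI_MSI_IRQ_INDEX;	/* defined further below */
	arg.set.start = 0;
	arg.set.count = 1;
	arg.fd = efd;			/* eventfd lands in data[] */
	return ioctl(device, VFIO_DEVICE_SET_IRQS, &arg) ? -1 : efd;
}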
+/**
+ * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
+ *
+ * Reset a device.
+ */
+#define VFIO_DEVICE_RESET	_IO(VFIO_TYPE, VFIO_BASE + 11)
+
+/*
+ * The VFIO-PCI bus driver makes use of the following fixed region and
+ * IRQ index mapping. Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+	VFIO_PCI_BAR0_REGION_INDEX,
+	VFIO_PCI_BAR1_REGION_INDEX,
+	VFIO_PCI_BAR2_REGION_INDEX,
+	VFIO_PCI_BAR3_REGION_INDEX,
+	VFIO_PCI_BAR4_REGION_INDEX,
+	VFIO_PCI_BAR5_REGION_INDEX,
+	VFIO_PCI_ROM_REGION_INDEX,
+	VFIO_PCI_CONFIG_REGION_INDEX,
+	/*
+	 * Expose VGA regions defined for PCI base class 03, subclass 00.
+	 * This includes I/O port ranges 0x3b0 to 0x3bb and 0x3c0 to 0x3df
+	 * as well as the MMIO range 0xa0000 to 0xbffff. Each implemented
+	 * range is found at its identity mapped offset from the region
+	 * offset, for example 0x3b0 is region_info.offset + 0x3b0. Areas
+	 * between described ranges are unimplemented.
+	 */
+	VFIO_PCI_VGA_REGION_INDEX,
+	VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */
+				 /* device specific cap to define content. */
+};
+
+enum {
+	VFIO_PCI_INTX_IRQ_INDEX,
+	VFIO_PCI_MSI_IRQ_INDEX,
+	VFIO_PCI_MSIX_IRQ_INDEX,
+	VFIO_PCI_ERR_IRQ_INDEX,
+	VFIO_PCI_REQ_IRQ_INDEX,
+	VFIO_PCI_NUM_IRQS
+};
+
+/*
+ * The vfio-ccw bus driver makes use of the following fixed region and
+ * IRQ index mapping. Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+	VFIO_CCW_CONFIG_REGION_INDEX,
+	VFIO_CCW_NUM_REGIONS
+};
+
+enum {
+	VFIO_CCW_IO_IRQ_INDEX,
+	VFIO_CCW_CRW_IRQ_INDEX,
+	VFIO_CCW_NUM_IRQS
+};
+
+/**
+ * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
+ *					      struct vfio_pci_hot_reset_info)
+ *
+ * Return: 0 on success, -errno on failure:
+ *	-enospc = insufficient buffer, -enodev = unsupported for device.
+ */
+struct vfio_pci_dependent_device {
+	__u32	group_id;
+	__u16	segment;
+	__u8	bus;
+	__u8	devfn; /* Use PCI_SLOT/PCI_FUNC */
+};
+
+struct vfio_pci_hot_reset_info {
+	__u32	argsz;
+	__u32	flags;
+	__u32	count;
+	struct vfio_pci_dependent_device	devices[];
+};
+
+#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
+ *				    struct vfio_pci_hot_reset)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_hot_reset {
+	__u32	argsz;
+	__u32	flags;
+	__u32	count;
+	__s32	group_fds[];
+};
+
+#define VFIO_DEVICE_PCI_HOT_RESET	_IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/**
+ * VFIO_DEVICE_QUERY_GFX_PLANE - _IOW(VFIO_TYPE, VFIO_BASE + 14,
+ *				      struct vfio_device_query_gfx_plane)
+ *
+ * Set the drm_plane_type and flags, then retrieve the gfx plane info.
+ *
+ * flags supported:
+ *   - VFIO_GFX_PLANE_TYPE_PROBE and VFIO_GFX_PLANE_TYPE_DMABUF are set
+ *     to ask if the mdev supports dma-buf. 0 on support, -EINVAL on no
+ *     support for dma-buf.
+ *   - VFIO_GFX_PLANE_TYPE_PROBE and VFIO_GFX_PLANE_TYPE_REGION are set
+ *     to ask if the mdev supports region. 0 on support, -EINVAL on no
+ *     support for region.
+ *   - VFIO_GFX_PLANE_TYPE_DMABUF or VFIO_GFX_PLANE_TYPE_REGION is set
+ *     with each call to query the plane info.
+ *   - Others are invalid and return -EINVAL.
+ *
+ * Note:
+ * 1. Plane could be disabled by guest. In that case, success will be
+ *    returned with zero-initialized drm_format, size, width and height
+ *    fields.
+ * 2. x_hot/y_hot is set to 0xFFFFFFFF if no hotspot information available
+ *
+ * Return: 0 on success, -errno on other failure.
+ */
+struct vfio_device_gfx_plane_info {
+	__u32 argsz;
+	__u32 flags;
+#define VFIO_GFX_PLANE_TYPE_PROBE (1 << 0)
+#define VFIO_GFX_PLANE_TYPE_DMABUF (1 << 1)
+#define VFIO_GFX_PLANE_TYPE_REGION (1 << 2)
+	/* in */
+	__u32 drm_plane_type;	/* type of plane: DRM_PLANE_TYPE_* */
+	/* out */
+	__u32 drm_format;	/* drm format of plane */
+	__u64 drm_format_mod;	/* tiled mode */
+	__u32 width;		/* width of plane */
+	__u32 height;		/* height of plane */
+	__u32 stride;		/* stride of plane */
+	__u32 size;		/* size of plane in bytes, align on page */
+	__u32 x_pos;		/* horizontal position of cursor plane */
+	__u32 y_pos;		/* vertical position of cursor plane */
+	__u32 x_hot;		/* horizontal position of cursor hotspot */
+	__u32 y_hot;		/* vertical position of cursor hotspot */
+	union {
+		__u32 region_index;	/* region index */
+		__u32 dmabuf_id;	/* dma-buf id */
+	};
+};
+
+#define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+/**
+ * VFIO_DEVICE_GET_GFX_DMABUF - _IOW(VFIO_TYPE, VFIO_BASE + 15, __u32)
+ *
+ * Return a new dma-buf file descriptor for an exposed guest framebuffer
+ * described by the provided dmabuf_id. The dmabuf_id is returned from VFIO_
+ * DEVICE_QUERY_GFX_PLANE as a token of the exposed guest framebuffer.
+ */
+
+#define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15)
+
+/**
+ * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 16,
+ *                              struct vfio_device_ioeventfd)
+ *
+ * Perform a write to the device at the specified device fd offset, with
+ * the specified data and width when the provided eventfd is triggered.
+ * vfio bus drivers may not support this for all regions, for all widths,
+ * or at all. vfio-pci currently only enables support for BAR regions,
+ * excluding the MSI-X vector table.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_ioeventfd {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DEVICE_IOEVENTFD_8		(1 << 0) /* 1-byte write */
+#define VFIO_DEVICE_IOEVENTFD_16	(1 << 1) /* 2-byte write */
+#define VFIO_DEVICE_IOEVENTFD_32	(1 << 2) /* 4-byte write */
+#define VFIO_DEVICE_IOEVENTFD_64	(1 << 3) /* 8-byte write */
+#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK	(0xf)
+	__u64	offset;			/* device fd offset of write */
+	__u64	data;			/* data to be written */
+	__s32	fd;			/* -1 for de-assignment */
+};
+
+#define VFIO_DEVICE_IOEVENTFD		_IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/**
+ * VFIO_DEVICE_FEATURE - _IORW(VFIO_TYPE, VFIO_BASE + 17,
+ *			       struct vfio_device_feature)
+ *
+ * Get, set, or probe feature data of the device. The feature is selected
+ * using the FEATURE_MASK portion of the flags field. Support for a feature
+ * can be probed by setting both the FEATURE_MASK and PROBE bits. A probe
+ * may optionally include the GET and/or SET bits to determine read vs write
+ * access of the feature respectively. Probing a feature will return success
+ * if the feature is supported and all of the optionally indicated GET/SET
+ * methods are supported. The format of the data portion of the structure is
+ * specific to the given feature. The data portion is not required for
+ * probing. GET and SET are mutually exclusive, except for use with PROBE.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+struct vfio_device_feature {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DEVICE_FEATURE_MASK	(0xffff) /* 16-bit feature index */
+#define VFIO_DEVICE_FEATURE_GET		(1 << 16) /* Get feature into data[] */
+#define VFIO_DEVICE_FEATURE_SET		(1 << 17) /* Set feature from data[] */
+#define VFIO_DEVICE_FEATURE_PROBE	(1 << 18) /* Probe feature support */
+	__u8	data[];
+};
+
+#define VFIO_DEVICE_FEATURE		_IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/*
+ * Provide support for setting a PCI VF Token, which is used as a shared
+ * secret between PF and VF drivers. This feature may only be set on a
+ * PCI SR-IOV PF when SR-IOV is enabled on the PF and there are no existing
+ * open VFs. Data provided when setting this feature is a 16-byte array
+ * (__u8 b[16]), representing a UUID.
+ */
+#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN	(0)
+
+/* -------- API for Type1 VFIO IOMMU -------- */
+
+/**
+ * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_info)
+ *
+ * Retrieve information about the IOMMU object. Fills in provided
+ * struct vfio_iommu_info. Caller sets argsz.
+ *
+ * XXX Should we do these by CHECK_EXTENSION too?
+ */
+struct vfio_iommu_type1_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IOMMU_INFO_PGSIZES	(1 << 0) /* supported page sizes info */
+#define VFIO_IOMMU_INFO_CAPS	(1 << 1) /* Info supports caps */
+	__u64	iova_pgsizes;	/* Bitmap of supported page sizes */
+	__u32	cap_offset;	/* Offset within info struct of first cap */
+};
+
+/*
+ * The IOVA capability allows reporting the valid IOVA range(s)
+ * excluding any non-relaxable reserved regions exposed by
+ * devices attached to the container. Any DMA map attempt
+ * outside the valid iova range will return an error.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE	1
+
+struct vfio_iova_range {
+	__u64	start;
+	__u64	end;
+};
+
+struct vfio_iommu_type1_info_cap_iova_range {
+	struct vfio_info_cap_header	header;
+	__u32	nr_iovas;
+	__u32	reserved;
+	struct vfio_iova_range		iova_ranges[];
+};
+
+/*
+ * The migration capability allows reporting supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * The existence of this capability indicates that the IOMMU kernel driver
+ * supports dirty page logging.
+ *
+ * pgsize_bitmap: Kernel driver returns a bitmap of supported page sizes for
+ * dirty page logging.
+ * max_dirty_bitmap_size: Kernel driver returns the maximum supported dirty
+ * bitmap size in bytes that can be used by user applications when getting
+ * the dirty bitmap.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION	2
+
+struct vfio_iommu_type1_info_cap_migration {
+	struct vfio_info_cap_header header;
+	__u32	flags;
+	__u64	pgsize_bitmap;
+	__u64	max_dirty_bitmap_size;	/* in bytes */
+};
+
+/*
+ * The DMA available capability allows reporting the current number of
+ * simultaneously outstanding DMA mappings that are allowed.
+ *
+ * The structure below defines version 1 of this capability.
+ *
+ * avail: specifies the current number of outstanding DMA mappings allowed.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL	3
+
+struct vfio_iommu_type1_info_dma_avail {
+	struct vfio_info_cap_header header;
+	__u32	avail;
+};
+
+#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
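A sketch of how a user application might locate the migration capability in the VFIO_IOMMU_GET_INFO capability chain (illustrative, not part of the header; `container` is assumed to be a VFIO container fd with a group attached and the Type1 IOMMU enabled):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Look up VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION in the GET_INFO cap chain. */
static int query_migration_cap(int container,
			       struct vfio_iommu_type1_info_cap_migration *out)
{
	struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
	struct vfio_iommu_type1_info *info;
	__u32 off;
	int ret = -1;

	/* First call only discovers the full argsz including capabilities. */
	if (ioctl(container, VFIO_IOMMU_GET_INFO, &probe))
		return -1;
	info = calloc(1, probe.argsz);
	if (!info)
		return -1;
	info->argsz = probe.argsz;
	if (ioctl(container, VFIO_IOMMU_GET_INFO, info) == 0 &&
	    (info->flags & VFIO_IOMMU_INFO_CAPS)) {
		/* Capabilities are chained via 'next' offsets from 'info'. */
		for (off = info->cap_offset; off; ) {
			struct vfio_info_cap_header *hdr =
				(void *)((char *)info + off);

			if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION) {
				memcpy(out, hdr, sizeof(*out));
				ret = 0;
				break;
			}
			off = hdr->next;
		}
	}
	free(info);
	return ret;
}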
+
+/**
+ * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_dma_map)
+ *
+ * Map process virtual addresses to IO virtual addresses using the
+ * provided struct vfio_dma_map. Caller sets argsz. READ and/or WRITE
+ * required.
+ */
+struct vfio_iommu_type1_dma_map {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
+#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+	__u64	vaddr;				/* Process virtual address */
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+
+struct vfio_bitmap {
+	__u64	pgsize;		/* page size for bitmap in bytes */
+	__u64	size;		/* in bytes */
+	__u64 __user *data;	/* one bit per page */
+};
+
+/**
+ * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
+ *			       struct vfio_dma_unmap)
+ *
+ * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
+ * Caller sets argsz. The actual unmapped size is returned in the size
+ * field. No guarantee is made to the user that arbitrary unmaps of iova
+ * or size different from those used in the original mapping call will
+ * succeed.
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
+ * before unmapping IO virtual addresses. When this flag is set, the user must
+ * provide a struct vfio_bitmap in data[]. The user must provide
+ * zero-initialized memory via vfio_bitmap.data and its size in the
+ * vfio_bitmap.size field. A bit in the bitmap represents one page, of the
+ * user-provided page size in the vfio_bitmap.pgsize field, consecutively
+ * starting from the iova offset. A set bit indicates that the page at that
+ * offset from iova is dirty. A bitmap of the pages in the range of the
+ * unmapped size is returned in the user-provided vfio_bitmap.data.
+ */
+struct vfio_iommu_type1_dma_unmap {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+	__u8	data[];
+};
+
+#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+/*
+ * IOCTLs to enable/disable IOMMU container usage.
+ * No parameters are supported.
+ */
+#define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
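A minimal VFIO_IOMMU_MAP_DMA usage sketch (illustrative only; `container` is assumed to be a configured Type1 container fd):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Map one anonymous page at the given IOVA so the device can read and write it. */
static int map_one_page(int container, __u64 iova)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.iova  = iova,
		.size  = 4096,
	};
	void *buf = mmap(NULL, map.size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return -1;
	map.vaddr = (__u64)(unsigned long)buf;
	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}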
+
+/**
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ *				  struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty pages logging.
+ * Caller should set flag depending on which operation to perform, details as
+ * below:
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs
+ * the IOMMU driver to log pages that are dirtied or potentially dirtied by
+ * the device; designed to be used when a migration is in progress. Dirty pages
+ * are logged until logging is disabled by the user application by calling the
+ * IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs
+ * the IOMMU driver to stop logging dirtied pages.
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set
+ * returns the dirty pages bitmap for IOMMU container for a given IOVA range.
+ * The user must specify the IOVA range and the pgsize through the structure
+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
+ * supports getting a bitmap of the smallest supported pgsize only and can be
+ * modified in future to get a bitmap of any specified supported pgsize. The
+ * user must provide a zeroed memory area for the bitmap memory and specify
+ * its size in bitmap.size. One bit is used to represent one page consecutively
+ * starting from the iova offset. The user should provide the page size in the
+ * bitmap.pgsize field. A bit set in the bitmap indicates that the page at that
+ * offset from iova is dirty. The caller must set argsz to a value including
+ * the size of structure vfio_iommu_type1_dirty_bitmap_get, but excluding the
+ * size of the actual bitmap. If dirty pages logging is not enabled, an error
+ * will be returned.
+ *
+ * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost the same
+ * as VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that the underlying dirty
+ * bitmap is not cleared automatically. The user can clear it manually by
+ * calling the IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag
+ * set.
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set,
+ * instructs the IOMMU driver to clear the dirty status of pages in a bitmap
+ * for IOMMU container for a given IOVA range. The user must specify the IOVA
+ * range, the bitmap and the pgsize through the structure
+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
+ * supports clearing a bitmap of the smallest supported pgsize only and can be
+ * modified in future to clear a bitmap of any specified supported pgsize. The
+ * user must provide a memory area for the bitmap memory and specify its size
+ * in bitmap.size. One bit is used to represent one page consecutively starting
+ * from the iova offset. The user should provide the page size in the
+ * bitmap.pgsize field. A bit set in the bitmap indicates that the page at that
+ * offset from iova has its dirty status cleared, and dirty tracking is
+ * re-enabled for that page. The caller must set argsz to a value including the
+ * size of structure vfio_iommu_dirty_bitmap_get, but excluding the size of the
+ * actual bitmap. If dirty pages logging is not enabled, an error will be
+ * returned. Note: the user should clear the dirty log before handling the
+ * corresponding dirty pages.
+ *
+ * Only one of the flags _START, _STOP, _GET_BITMAP, _GET_BITMAP_NOCLEAR and
+ * _CLEAR_BITMAP may be specified at a time.
+ */
+struct vfio_iommu_type1_dirty_bitmap {
+	__u32        argsz;
+	__u32        flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START	(1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP	(1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP	(1 << 2)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR	(1 << 3)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP	(1 << 4)
+	__u8         data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+	__u64              iova;	/* IO virtual address */
+	__u64              size;	/* Size of iova range */
+	struct vfio_bitmap bitmap;
+};
+
+#define VFIO_IOMMU_DIRTY_PAGES             _IO(VFIO_TYPE, VFIO_BASE + 17)
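A sketch of fetching the dirty bitmap for one tracked range (illustrative only; it assumes logging was started with VFIO_IOMMU_DIRTY_PAGES_FLAG_START, that `pgsize` comes from the migration capability's pgsize_bitmap, and that the header as seen by userspace makes `vfio_bitmap.data` a plain pointer):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Fetch the dirty bitmap for one IOVA range. 'iova'/'size' must match a
 * tracked mapping; 'bitmap' is a caller-owned buffer of 'bitmap_bytes'.
 */
static int get_dirty_bitmap(int container, __u64 iova, __u64 size,
			    __u64 pgsize, __u64 *bitmap, __u64 bitmap_bytes)
{
	struct vfio_iommu_type1_dirty_bitmap *req;
	struct vfio_iommu_type1_dirty_bitmap_get *range;
	size_t argsz = sizeof(*req) + sizeof(*range);
	int ret;

	req = calloc(1, argsz);
	if (!req)
		return -1;
	/* argsz covers the range descriptor but not the bitmap itself. */
	req->argsz = argsz;
	req->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	range = (struct vfio_iommu_type1_dirty_bitmap_get *)req->data;
	range->iova = iova;
	range->size = size;
	range->bitmap.pgsize = pgsize;
	range->bitmap.size = bitmap_bytes;
	range->bitmap.data = bitmap;
	memset(bitmap, 0, bitmap_bytes);	/* must be zeroed by the user */
	ret = ioctl(container, VFIO_IOMMU_DIRTY_PAGES, req);
	free(req);
	return ret;
}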
+
+/*
+ * VFIO_IOMMU_BIND_PROCESS
+ *
+ * Allocate a PASID for a process address space, and use it to attach this
+ * process to all devices in the container. Devices can then tag their DMA
+ * traffic with the returned @pasid to perform transactions on the associated
+ * virtual address space. Mapping and unmapping buffers is performed by
+ * standard functions such as mmap and malloc.
+ *
+ * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process
+ * to bind. Otherwise the current task is bound. Given that the caller owns
+ * the device, setting this flag grants the caller read and write permissions
+ * on the entire address space of the foreign process described by @pid.
+ * Therefore, permission to perform the bind operation on a foreign process is
+ * governed by the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See
+ * man ptrace(2) for more information.
+ *
+ * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
+ * ID is unique to a process and can be used on all devices in the container.
+ *
+ * On fork, the child inherits the device fd and can use the bonds set up by
+ * its parent. Consequently, the child has R/W access on the address spaces
+ * bound by its parent. After an execv, the device fd is closed and the child
+ * doesn't have access to the address space anymore.
+ *
+ * To remove a bond between process and container, the VFIO_IOMMU_UNBIND ioctl
+ * is issued with the same parameters. If a pid was specified in
+ * VFIO_IOMMU_BIND, it should also be present for VFIO_IOMMU_UNBIND. Otherwise
+ * the current task is unbound from the container.
+ */
+struct vfio_iommu_type1_bind_process {
+	__u32	flags;
+#define VFIO_IOMMU_BIND_PID		(1 << 0)
+	__u32	pasid;
+	__s32	pid;
+};
+
+/*
+ * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes
+ * vfio_iommu_type1_bind_process in data.
+ */
+struct vfio_iommu_type1_bind {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IOMMU_BIND_PROCESS		(1 << 0)
+	__u8	data[];
+};
+
+/*
+ * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind)
+ *
+ * Manage address spaces of devices in this container. Initially a TYPE1
+ * container can only have one address space, managed with
+ * VFIO_IOMMU_MAP/UNMAP_DMA.
+ *
+ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
+ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
+ * tables, and BIND manages the stage-1 (guest) page tables. Other types of
+ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
+ * non-PASID traffic and BIND controls PASID traffic. But this depends on the
+ * underlying IOMMU architecture and isn't guaranteed.
+ *
+ * Availability of this feature depends on the device, its bus, the underlying
+ * IOMMU and the CPU architecture.
+ *
+ * returns: 0 on success, -errno on failure.
+ */
+#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind)
+ *
+ * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl.
+ */
+#define VFIO_IOMMU_UNBIND	_IO(VFIO_TYPE, VFIO_BASE + 23)
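A sketch of binding the calling task with VFIO_IOMMU_BIND (illustrative only; it relies on vfio_iommu_type1_bind_process starting immediately after the bind header's data[], which holds for these layouts):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Bind the calling task's address space; on success fills *pasid. */
static int bind_current_task(int container, __u32 *pasid)
{
	struct {
		struct vfio_iommu_type1_bind hdr;
		struct vfio_iommu_type1_bind_process proc;
	} bind;

	memset(&bind, 0, sizeof(bind));
	bind.hdr.argsz = sizeof(bind);
	bind.hdr.flags = VFIO_IOMMU_BIND_PROCESS;
	/* bind.proc.flags == 0: bind the current task, not a foreign pid. */
	if (ioctl(container, VFIO_IOMMU_BIND, &bind))
		return -1;
	*pasid = bind.proc.pasid;
	return 0;
}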
+
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE DDW info struct provides the information about
+ * the details of Dynamic DMA window capability.
+ *
+ * @pgsizes contains a page size bitmask, 4K/64K/16M are supported.
+ * @max_dynamic_windows_supported tells the maximum number of windows
+ * which the platform can create.
+ * @levels tells the maximum number of levels in multi-level IOMMU tables;
+ * this allows splitting a table into smaller chunks which reduces
+ * the amount of physically contiguous memory required for the table.
+ */
+struct vfio_iommu_spapr_tce_ddw_info {
+	__u64	pgsizes;	/* Bitmap of supported page sizes */
+	__u32	max_dynamic_windows_supported;
+	__u32	levels;
+};
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA; these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA addresses passed via map/unmap ioctls are absolute PCI bus
+ * addresses too so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * Flags supported:
+ * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows
+ *   (DDW) support is present. @ddw is only supported when DDW is present.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IOMMU_SPAPR_INFO_DDW	(1 << 0)	/* DDW supported */
+	__u32	dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32	dma32_window_size;	/* 32 bit window size (bytes) */
+	struct vfio_iommu_spapr_tce_ddw_info ddw;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/*
+ * EEH PE operation struct provides ways to:
+ * - enable/disable EEH functionality;
+ * - unfreeze IO/DMA for frozen PE;
+ * - read PE state;
+ * - reset PE;
+ * - configure PE;
+ * - inject EEH error.
+ */
+struct vfio_eeh_pe_err {
+	__u32 type;
+	__u32 func;
+	__u64 addr;
+	__u64 mask;
+};
+
+struct vfio_eeh_pe_op {
+	__u32 argsz;
+	__u32 flags;
+	__u32 op;
+	union {
+		struct vfio_eeh_pe_err err;
+	};
+};
+
+#define VFIO_EEH_PE_DISABLE		0	/* Disable EEH functionality */
+#define VFIO_EEH_PE_ENABLE		1	/* Enable EEH functionality  */
+#define VFIO_EEH_PE_UNFREEZE_IO		2	/* Enable IO for frozen PE   */
+#define VFIO_EEH_PE_UNFREEZE_DMA	3	/* Enable DMA for frozen PE  */
+#define VFIO_EEH_PE_GET_STATE		4	/* PE state retrieval        */
+#define  VFIO_EEH_PE_STATE_NORMAL	0	/* PE in functional state    */
+#define  VFIO_EEH_PE_STATE_RESET	1	/* PE reset in progress      */
+#define  VFIO_EEH_PE_STATE_STOPPED	2	/* Stopped DMA and IO        */
+#define  VFIO_EEH_PE_STATE_STOPPED_DMA	4	/* Stopped DMA only          */
+#define  VFIO_EEH_PE_STATE_UNAVAIL	5	/* State unavailable         */
+#define VFIO_EEH_PE_RESET_DEACTIVATE	5	/* Deassert PE reset         */
+#define VFIO_EEH_PE_RESET_HOT		6	/* Assert hot reset          */
+#define VFIO_EEH_PE_RESET_FUNDAMENTAL	7	/* Assert fundamental reset  */
+#define VFIO_EEH_PE_CONFIGURE		8	/* PE configuration          */
+#define VFIO_EEH_PE_INJECT_ERR		9	/* Inject EEH error          */
+
+#define VFIO_EEH_PE_OP	_IO(VFIO_TYPE, VFIO_BASE + 21)
+
+/**
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17,
+ *					   struct vfio_iommu_spapr_register_memory)
+ *
+ * Registers user space memory where DMA is allowed. It pins
+ * user pages and does the locked memory accounting so
+ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
+ * get faster.
+ */
+struct vfio_iommu_spapr_register_memory {
+	__u32	argsz;
+	__u32	flags;
+	__u64	vaddr;				/* Process virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/**
+ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18,
+ *					     struct vfio_iommu_spapr_register_memory)
+ *
+ * Unregisters user space memory registered with
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
+ * Uses vfio_iommu_spapr_register_memory for parameters.
+ */
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19,
+ *				       struct vfio_iommu_spapr_tce_create)
+ *
+ * Creates an additional TCE table and programs it (sets a new DMA window)
+ * to every IOMMU group in the container. It receives page shift, window
+ * size and number of levels in the TCE table being created.
+ *
+ * It allocates and returns an offset on a PCI bus of the new DMA window.
+ */
+struct vfio_iommu_spapr_tce_create {
+	__u32	argsz;
+	__u32	flags;
+	/* in */
+	__u32	page_shift;
+	__u32	__resv1;
+	__u64	window_size;
+	__u32	levels;
+	__u32	__resv2;
+	/* out */
+	__u64	start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_CREATE	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20,
+ *				      struct vfio_iommu_spapr_tce_remove)
+ *
+ * Unprograms a TCE table from all groups in the container and destroys it.
+ * It receives a PCI bus offset as a window id.
+ */
+struct vfio_iommu_spapr_tce_remove {
+	__u32	argsz;
+	__u32	flags;
+	/* in */
+	__u64	start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE	_IO(VFIO_TYPE, VFIO_BASE + 20)
+
+/* ***************************************************************** */
+
+#endif /* _UAPIVFIO_H */
diff --git a/KAEKernelDriver/KAEKernelDriver-OLK-6.6/Makefile b/KAEKernelDriver/KAEKernelDriver-OLK-6.6/Makefile
index 9328cee..e5c78b6 100644
--- a/KAEKernelDriver/KAEKernelDriver-OLK-6.6/Makefile
+++ b/KAEKernelDriver/KAEKernelDriver-OLK-6.6/Makefile
@@ -1,10 +1,10 @@
 KERNEL_VERSION_BY_BUILDENV :=`rpm -q --qf '%{VERSION}-%{RELEASE}.%{ARCH}\n' kernel-devel | head -n 1`
 KERNEL_PATH := /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build
-KSP := $(shell if test -d /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; then \
-		echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; \
-	else \
-		echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build; \
-	fi)
+# KSP := $(shell if test -d /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; then \
+#		echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/source; \
+#	else \
+#		echo /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/build; \
+#	fi)
 
 obj-m += uacce/
 obj-m += hisilicon/
@@ -13,9 +13,7 @@ DIRS := $(shell find . -maxdepth 3 -type d)
 TARGET = $(foreach dir,$(DIRS),$(wildcard \
 		$(dir)/*.o) $(dir)/*.ko $(dir)/*.tmp_versions $(dir)/*.depend $(dir)/*.mod.c $(dir)/*.order $(dir)/*.symvers)
 
-default:
-	$(MAKE) -C $(KERNEL_PATH) M=$(shell pwd) modules \
-	CONFIG_CC_STACKPROTECTOR_STRONG=y \
+CONFIG_FLAGS = CONFIG_CC_STACKPROTECTOR_STRONG=y \
 	CONFIG_UACCE=m \
 	CONFIG_CRYPTO_QM_UACCE=m \
 	CONFIG_CRYPTO_DEV_HISI_SGL=m \
@@ -23,8 +21,17 @@ default:
 	CONFIG_CRYPTO_DEV_HISI_ZIP=m \
 	CONFIG_CRYPTO_DEV_HISI_HPRE=m \
 	CONFIG_CRYPTO_DEV_HISI_SEC2=m \
-	CONFIG_CRYPTO_DEV_HISI_TRNG=m \
-	CONFIG_HISI_ACC_VFIO_PCI=m
+	CONFIG_CRYPTO_DEV_HISI_TRNG=m
+
+ifeq ($(ENABLE_MIGRATION), y)
+CONFIG_FLAGS += CONFIG_HISI_ACC_VFIO_PCI=m
+else
+CONFIG_FLAGS += CONFIG_HISI_ACC_VFIO_PCI=n
+endif
+
+default:
+	$(MAKE) -C $(KERNEL_PATH) M=$(shell pwd) modules $(CONFIG_FLAGS)
+
 #copy:
 #	cp -f $(shell pwd)/include_linux/uacce.h $(KSP)/include/linux
 #	cp -f $(shell pwd)/include_uapi_linux/uacce.h $(KSP)/include/uapi/linux
@@ -41,7 +48,9 @@ install:
 	-modprobe hisi_sec2 uacce_mode=1 pf_q_num=256
 	-modprobe hisi_hpre uacce_mode=1 pf_q_num=256
 	-modprobe hisi_zip uacce_mode=1 pf_q_num=256
-	-modprobe hisi_acc_vfio_pci
+	$(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \
+		modprobe hisi_acc_vfio_pci; \
+	fi)
 	-echo "options hisi_sec2 uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_sec2.conf
 	-echo "options hisi_hpre uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_hpre.conf
 	-echo "options hisi_zip uacce_mode=1 pf_q_num=256" > /etc/modprobe.d/hisi_zip.conf
@@ -62,7 +71,9 @@ check:
 	done
 
 uninstall:
-	modprobe -r hisi_acc_vfio_pci
+	$(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \
+		modprobe -r hisi_acc_vfio_pci; \
+	fi)
 	modprobe -r hisi_zip
 	modprobe -r hisi_hpre
 	modprobe -r hisi_sec2
@@ -73,7 +84,9 @@ uninstall:
 	rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_sec2.ko
 	rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_hpre.ko
 	rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_zip.ko
-	rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_acc_vfio_pci.ko
+	$(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \
+		rm -rf /lib/modules/$(KERNEL_VERSION_BY_BUILDENV)/extra/hisi_acc_vfio_pci.ko; \
+	fi)
 	rm -rf /etc/modprobe.d/hisi_sec2.conf
 	rm -rf /etc/modprobe.d/hisi_hpre.conf
 	rm -rf /etc/modprobe.d/hisi_zip.conf
@@ -90,7 +103,9 @@ nosva:
 	-modprobe hisi_sec2 uacce_mode=2 pf_q_num=256
 	-modprobe hisi_hpre uacce_mode=2 pf_q_num=256
 	-modprobe hisi_zip uacce_mode=2 pf_q_num=256
-	-modprobe hisi_acc_vfio_pci
+	$(shell if [ "$(ENABLE_MIGRATION)" = "y" ]; then \
+		modprobe hisi_acc_vfio_pci; \
+	fi)
 	-echo "options hisi_sec2 uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_sec2.conf
 	-echo "options hisi_hpre uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_hpre.conf
 	-echo "options hisi_zip uacce_mode=2 pf_q_num=256" > /etc/modprobe.d/hisi_zip.conf
diff --git a/build.sh b/build.sh
index 6cc57d2..4f834b8 100644
--- a/build.sh
+++ b/build.sh
@@ -265,8 +265,14 @@ function build_driver()
 	lsmod | grep -q "^uacce" && modprobe -r uacce
 
 	cd ${KAE_KERNEL_DIR}
-	make -j
-	make nosva #默认使用nosva模式
+
+	if [ "$1" = "migration" ]; then
+		make ENABLE_MIGRATION=y -j
+		make nosva ENABLE_MIGRATION=y
+	else
+		make -j
+		make nosva	# use nosva mode by default
+	fi
 	# make install
 	chmod 666 /dev/hisi_*
 }
@@ -276,14 +282,22 @@ function build_driver_sva()
 	cd ${KAE_KERNEL_DIR}
 	make -j
 	# make nosva #默认使用nosva模式
-	make install
+	if [ "$1" = "migration" ]; then
+		make install ENABLE_MIGRATION=y
+	else
+		make install	# default build, without the migration driver
+	fi
 	chmod 666 /dev/hisi_*
 }
 
 function driver_clean()
 {
 	cd ${KAE_KERNEL_DIR}
-	make uninstall
+	if [ "$1" = "migration" ]; then
+		make uninstall ENABLE_MIGRATION=y
+	else
+		make uninstall	# default build, without the migration driver
+	fi
 	make clean
 }
 
@@ -629,6 +643,9 @@ function help()
 	echo "sh build.sh driver       -- install KAE driver"
 	echo "sh build.sh driver clean -- uninstall KAE driver"
 
+	echo "sh build.sh driver_migration       -- install the KAE driver together with the migration driver"
+	echo "sh build.sh driver_migration clean -- uninstall the KAE driver together with the migration driver"
+
 	echo "sh build.sh uadk         -- install uadk"
 	echo "sh build.sh uadk clean   -- uninstall uadk"
 
@@ -721,6 +738,19 @@ main() {
 		else
 			build_driver
 		fi
+		;;
+	"driver_migration")
+		if [ "$2" = "clean" ]; then
+			driver_clean migration
+		elif [ "$2" = "sva" ]; then
+			build_driver_sva migration
+		elif [ "$2" = "check" ]; then
+			driver_check
+		elif [ "$2" = "delete" ]; then
+			driver_delete
+		else
+			build_driver migration
+		fi
 		;;
 	"uadk")
 		if [ "$2" = "clean" ]; then
-- 
Gitee