diff --git a/0005-Initial-redhat-build.patch b/0005-Initial-redhat-build.patch new file mode 100755 index 0000000000000000000000000000000000000000..cde66a17b6958e2551cca5f8775314eeb0a63ec9 --- /dev/null +++ b/0005-Initial-redhat-build.patch @@ -0,0 +1,167 @@ +From 4df157781801c50224373be57fa3c8c3741c0535 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 12 Oct 2018 07:31:11 +0200 +Subject: Initial redhat build + +This patch introduces redhat build structure in redhat subdirectory. In addition, +several issues are fixed in QEMU tree: + + - Change of app name for sasl_server_init in VNC code from qemu to qemu-kvm + - As we use qemu-kvm as name in all places, this is updated to be consistent + - Man page renamed from qemu to qemu-kvm + - man page is installed using make install so we have to fix it in qemu tree + - Use "/share/qemu-kvm" as SHARE_SUFFIX + - We reconfigured our share to qemu-kvm to be consistent with used name + +This rebase includes changes up to qemu-kvm-4.1.0-18.el8 + +Rebase notes (3.1.0): +- added new configure options + +Rebase notes (4.0.0): +- Added dependency to perl-Test-Harness (upstream) +- Added dependency to python3-sphinx (upstream) +- Change location of icons (upstream) +- Remove .desktop file (added upstream) +- Added qemu-trace-stap (added upstream) +- Removed elf2dmp (added upstream) +- Remove .buildinfo +- Added pvh.bin rom (added upstream) +- Added interop documentation files +- Use python module instead of qemu.py (upstream) + +Rebase notes (4.1.0): +- Remove edk2 files generated by build +- Switch to rhel-8.1-candidate build target +- Remove specs documentation +- Switched from libssh2 to libssh +- Add rc0 tarball usage hacks +- Added BuildRequires for wget, rpm-build and python3-sphinx +- Removed new unpacked files +- Update configure line to use new options + +Rebase notes (4.2.0): +- Disable iotest run during make check +- README renamed to README.rst (upstream) +- Removed ui-spice-app.so +- Added relevant changes from "505f7f4 redhat: Adding slirp to the exploded tree" +- Removed qemu-ga.8 install from spec file - installed by make +- Removed spapr-rtas.bin (upstream) +- Require newer SLOF (20191022) + +Merged patches (3.1.0): +- 01f0c9f RHEL8: Add disable configure options to qemu spec file +- Spec file cleanups + +Merged patches (4.0.0): +- aa4297c Add edk2 Requires to qemu-kvm +- d124ff5779 Fixing brew build target +- eb204b5 Introduce the qemu-kvm-tests rpm +- 223cf0c Load kvm module during boot (partial) + +Merged patches (4.1.0): +- ebb6e97 redhat: Fix LOCALVERSION creation +- b0ab0cc redhat: enable tpmdev passthrough (not disabling tests) +- 7cb3c4a Enable libpmem to support nvdimm +- 8943607 qemu-kvm.spec: bump libseccomp >= 2.4.0 +- 27b7c44 rh: set CONFIG_BOCHS_DISPLAY=y for x86 (partial) +- e1fe9fe x86_64-rh-devices: enable TPM emulation (partial) + +Merged patches (4.2.0): +- 69e1fb2 enable virgla +- d4f6115 enable virgl, for real this time ... + +Signed-off-by: Danilo C. L. de Paula +--- + .gitignore | 1 + + Makefile | 3 +- + configure | 1 + + os-posix.c | 2 +- + redhat/Makefile | 82 + + redhat/Makefile.common | 51 + + redhat/README.tests | 39 + + redhat/qemu-kvm.spec.template | 2434 +++++++++++++++++++++++++++++ + redhat/scripts/process-patches.sh | 7 +- + tests/Makefile.include | 2 +- + ui/vnc.c | 2 +- + 11 files changed, 2615 insertions(+), 9 deletions(-) + create mode 100644 redhat/Makefile + create mode 100644 redhat/Makefile.common + create mode 100644 redhat/README.tests + create mode 100644 redhat/qemu-kvm.spec.template + +diff --git a/Makefile b/Makefile +index b437a346d7..086727dbb9 100644 +--- a/Makefile ++++ b/Makefile +@@ -512,6 +512,7 @@ CAP_CFLAGS += -DCAPSTONE_HAS_ARM + CAP_CFLAGS += -DCAPSTONE_HAS_ARM64 + CAP_CFLAGS += -DCAPSTONE_HAS_POWERPC + CAP_CFLAGS += -DCAPSTONE_HAS_X86 ++CAP_CFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS + + .PHONY: capstone/all + capstone/all: .git-submodule-status +@@ -826,7 +827,7 @@ install-doc: $(DOCS) install-sphinxdocs + $(INSTALL_DATA) docs/interop/qemu-qmp-ref.txt "$(DESTDIR)$(qemu_docdir)" + ifdef CONFIG_POSIX + $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1" +- $(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1" ++ $(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1/qemu-kvm.1" + $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man7" + $(INSTALL_DATA) docs/interop/qemu-qmp-ref.7 "$(DESTDIR)$(mandir)/man7" + $(INSTALL_DATA) docs/qemu-block-drivers.7 "$(DESTDIR)$(mandir)/man7" +diff --git a/configure b/configure +index 6099be1d84..16564f8ccc 100755 +--- a/configure ++++ b/configure +@@ -2424,6 +2424,7 @@ if test "$seccomp" != "no" ; then + seccomp="no" + fi + fi ++ + ########################################## + # xen probe + +diff --git a/os-posix.c b/os-posix.c +index 86cffd2c7d..1c9f86768d 100644 +--- a/os-posix.c ++++ b/os-posix.c +@@ -83,7 +83,7 @@ void os_setup_signal_handling(void) + /* Find a likely location for support files using the location of the binary. + For installed binaries this will be "$bindir/../share/qemu". When + running from the build tree this will be "$bindir/../pc-bios". */ +-#define SHARE_SUFFIX "/share/qemu" ++#define SHARE_SUFFIX "/share/qemu-kvm" + #define BUILD_SUFFIX "/pc-bios" + char *os_find_datadir(void) + { +diff --git a/tests/Makefile.include b/tests/Makefile.include +index 8566f5f119..b483790cf3 100644 +--- a/tests/Makefile.include ++++ b/tests/Makefile.include +@@ -1194,7 +1194,7 @@ check-acceptance: check-venv $(TESTS_RESULTS_DIR) + check-qapi-schema: check-tests/qapi-schema/frontend check-tests/qapi-schema/doc-good.texi + check-qtest: $(patsubst %,check-qtest-%, $(QTEST_TARGETS)) + check-block: $(patsubst %,check-%, $(check-block-y)) +-check: check-block check-qapi-schema check-unit check-softfloat check-qtest check-decodetree ++check: check-qapi-schema check-unit check-softfloat check-qtest check-decodetree + check-clean: + rm -rf $(check-unit-y) tests/*.o $(QEMU_IOTESTS_HELPERS-y) + rm -rf $(sort $(foreach target,$(SYSEMU_TARGET_LIST), $(check-qtest-$(target)-y)) $(check-qtest-generic-y)) +diff --git a/ui/vnc.c b/ui/vnc.c +index 87b8045afe..ecf6276f5b 100644 +--- a/ui/vnc.c ++++ b/ui/vnc.c +@@ -3987,7 +3987,7 @@ void vnc_display_open(const char *id, Error **errp) + + #ifdef CONFIG_VNC_SASL + if (sasl) { +- int saslErr = sasl_server_init(NULL, "qemu"); ++ int saslErr = sasl_server_init(NULL, "qemu-kvm"); + + if (saslErr != SASL_OK) { + error_setg(errp, "Failed to initialize SASL auth: %s", +-- +2.21.0 + diff --git a/0006-Enable-disable-devices-for-RHEL.patch b/0006-Enable-disable-devices-for-RHEL.patch new file mode 100755 index 0000000000000000000000000000000000000000..b14bb1b514476538f3192678ca685890d229bc5d --- /dev/null +++ b/0006-Enable-disable-devices-for-RHEL.patch @@ -0,0 +1,994 @@ +From 67511676246cce57becbd2dcf5abccf08d9ef737 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Mon, 11 Jan 2016 11:53:33 +0100 +Subject: Enable/disable devices for RHEL + +This commit adds all changes related to changes in supported devices. + +Signed-off-by: Miroslav Rezanina + +Rebase notes (qemu 3.1.0) +- spapr_rng disabled in default_config +- new hyperv.mak in default configs +- Move changes from x86_64-softmmu.mak to i386-softmmu.mak +- Added CONFIG_VIRTIO_MMIO to aarch64-softmmu.mak +- Removed config_vga_isa.c changes as no longer needed +- Removed new devices + +Rebase notes (4.0.0): +- Added CONFIG_PCI_EXPRESS_GENERIC_BRIDGE for aarch64-softmmu.mak +- Added CONFIG_ARM_VIRT for aarch64-softmmu.mak +- Switch to KConfig (upstream) + - Using device whitelist + without-defualt-devices option + +Rebase notes (4.1.0): +- Added CONFIG_USB_OHCI_PCI for ppc64 +- Added CONFIG_XIVE_KVM for ppc64 +- Added CONFIG_ACPI_PCI for x86_64 +- Added CONFIG_SEMIHOSTING for aarch64 +- Cleanup aarch64 devices +- Do not build a15mpcore.c +- Removed ide-isa.c stub file +- Use CONFIG_USB_EHCI_PCI on x86_64 (new upstream) + +Rebase notes (4.2.0-rc0): +- Use conditional build for isa-superio.c (upstream change) +- Rename PCI_PIIX to PCI_I440FX (upstream change) + +Rebase notes (4.2.0-rc3): +- Disabled ccid-card-emulated (patch 92566) +- Disabled vfio-pci-igd-lpc-bridge (patch 92565) + +Merged patches (qemu 3.1.0): +- d51e082 Re-enable CONFIG_HYPERV_TESTDEV +- 4b889f3 Declare cirrus-vga as deprecated +- b579d32 Do not build bluetooth support +- 3eef52a Disable CONFIG_IPMI and CONFIG_I2C for ppc64 +- 9caf292 Disable CONFIG_CAN_BUS and CONFIG_CAN_SJA1000 + +Merged patches (4.1.0): +- 20a51f6 fdc: Revert downstream disablement of device "floppy" +- f869cc0 fdc: Restrict floppy controllers to RHEL-7 machine types +- 5909721 aarch64: Compile out IOH3420 +- 27b7c44 rh: set CONFIG_BOCHS_DISPLAY=y for x86 (partial) +- 495a27d x86_64-rh-devices: add missing TPM passthrough +- e1fe9fe x86_64-rh-devices: enable TPM emulation (partial) + +Merged patches (4.2.0): +- f7587dd RHEL: disable hostmem-memfd + +Signed-off-by: Danilo C. L. de Paula +--- + Makefile.objs | 4 +- + backends/Makefile.objs | 3 +- + default-configs/aarch64-rh-devices.mak | 20 +++++ + default-configs/aarch64-softmmu.mak | 10 ++- + default-configs/ppc64-rh-devices.mak | 32 ++++++++ + default-configs/ppc64-softmmu.mak | 8 +- + default-configs/rh-virtio.mak | 10 +++ + default-configs/s390x-rh-devices.mak | 15 ++++ + default-configs/s390x-softmmu.mak | 4 +- + default-configs/x86_64-rh-devices.mak | 100 +++++++++++++++++++++++++ + default-configs/x86_64-softmmu.mak | 4 +- + hw/acpi/ich9.c | 4 +- + hw/arm/Makefile.objs | 2 +- + hw/block/fdc.c | 10 +++ + hw/bt/Makefile.objs | 4 +- + hw/cpu/Makefile.objs | 5 +- + hw/display/Makefile.objs | 5 +- + hw/display/cirrus_vga.c | 3 + + hw/ide/piix.c | 5 +- + hw/input/pckbd.c | 2 + + hw/net/e1000.c | 2 + + hw/pci-host/i440fx.c | 4 + + hw/ppc/spapr_cpu_core.c | 2 + + hw/usb/Makefile.objs | 4 +- + hw/vfio/pci-quirks.c | 9 +++ + hw/vfio/pci.c | 5 ++ + qemu-options.hx | 7 +- + redhat/qemu-kvm.spec.template | 5 +- + target/arm/cpu.c | 4 +- + target/i386/cpu.c | 35 +++++++-- + target/ppc/cpu-models.c | 10 +++ + target/s390x/cpu_models.c | 3 + + target/s390x/kvm.c | 8 ++ + util/memfd.c | 2 +- + vl.c | 8 +- + 35 files changed, 317 insertions(+), 41 deletions(-) + create mode 100644 default-configs/aarch64-rh-devices.mak + create mode 100644 default-configs/ppc64-rh-devices.mak + create mode 100644 default-configs/rh-virtio.mak + create mode 100644 default-configs/s390x-rh-devices.mak + create mode 100644 default-configs/x86_64-rh-devices.mak + +diff --git a/Makefile.objs b/Makefile.objs +index 11ba1a36bd..fcf63e1096 100644 +--- a/Makefile.objs ++++ b/Makefile.objs +@@ -65,8 +65,8 @@ common-obj-y += replay/ + + common-obj-y += ui/ + common-obj-m += ui/ +-common-obj-y += bt-host.o bt-vhci.o +-bt-host.o-cflags := $(BLUEZ_CFLAGS) ++#common-obj-y += bt-host.o bt-vhci.o ++#bt-host.o-cflags := $(BLUEZ_CFLAGS) + + common-obj-y += dma-helpers.o + common-obj-y += vl.o +diff --git a/backends/Makefile.objs b/backends/Makefile.objs +index f0691116e8..f328d404bf 100644 +--- a/backends/Makefile.objs ++++ b/backends/Makefile.objs +@@ -16,4 +16,5 @@ endif + + common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o + +-common-obj-$(CONFIG_LINUX) += hostmem-memfd.o ++# RHEL: disable memfd ++# common-obj-$(CONFIG_LINUX) += hostmem-memfd.o +diff --git a/default-configs/aarch64-rh-devices.mak b/default-configs/aarch64-rh-devices.mak +new file mode 100644 +index 0000000000..a1ed641174 +--- /dev/null ++++ b/default-configs/aarch64-rh-devices.mak +@@ -0,0 +1,20 @@ ++include rh-virtio.mak ++ ++CONFIG_ARM_GIC_KVM=y ++CONFIG_ARM_SMMUV3=y ++CONFIG_ARM_V7M=y ++CONFIG_ARM_VIRT=y ++CONFIG_EDID=y ++CONFIG_PCIE_PORT=y ++CONFIG_PCI_DEVICES=y ++CONFIG_PCI_TESTDEV=y ++CONFIG_PFLASH_CFI01=y ++CONFIG_SCSI=y ++CONFIG_SEMIHOSTING=y ++CONFIG_USB=y ++CONFIG_USB_XHCI=y ++CONFIG_VFIO=y ++CONFIG_VFIO_PCI=y ++CONFIG_VIRTIO_MMIO=y ++CONFIG_VIRTIO_PCI=y ++CONFIG_XIO3130=y +diff --git a/default-configs/aarch64-softmmu.mak b/default-configs/aarch64-softmmu.mak +index 958b1e08e4..8f6867d48a 100644 +--- a/default-configs/aarch64-softmmu.mak ++++ b/default-configs/aarch64-softmmu.mak +@@ -1,8 +1,10 @@ + # Default configuration for aarch64-softmmu + + # We support all the 32 bit boards so need all their config +-include arm-softmmu.mak ++#include arm-softmmu.mak + +-CONFIG_XLNX_ZYNQMP_ARM=y +-CONFIG_XLNX_VERSAL=y +-CONFIG_SBSA_REF=y ++#CONFIG_XLNX_ZYNQMP_ARM=y ++#CONFIG_XLNX_VERSAL=y ++#CONFIG_SBSA_REF=y ++ ++include aarch64-rh-devices.mak +diff --git a/default-configs/ppc64-rh-devices.mak b/default-configs/ppc64-rh-devices.mak +new file mode 100644 +index 0000000000..35f2106d06 +--- /dev/null ++++ b/default-configs/ppc64-rh-devices.mak +@@ -0,0 +1,32 @@ ++include rh-virtio.mak ++ ++CONFIG_DIMM=y ++CONFIG_MEM_DEVICE=y ++CONFIG_PCI=y ++CONFIG_PCI_DEVICES=y ++CONFIG_PCI_TESTDEV=y ++CONFIG_PSERIES=y ++CONFIG_SCSI=y ++CONFIG_SPAPR_VSCSI=y ++CONFIG_TEST_DEVICES=y ++CONFIG_USB=y ++CONFIG_USB_OHCI=y ++CONFIG_USB_OHCI_PCI=y ++CONFIG_USB_SMARTCARD=y ++CONFIG_USB_STORAGE_BOT=y ++CONFIG_USB_XHCI=y ++CONFIG_USB_XHCI_NEC=y ++CONFIG_VFIO=y ++CONFIG_VFIO_PCI=y ++CONFIG_VGA=y ++CONFIG_VGA_PCI=y ++CONFIG_VHOST_USER=y ++CONFIG_VIRTIO_PCI=y ++CONFIG_VIRTIO_VGA=y ++CONFIG_WDT_IB6300ESB=y ++CONFIG_XICS=y ++CONFIG_XICS_KVM=y ++CONFIG_XICS_SPAPR=y ++CONFIG_XIVE=y ++CONFIG_XIVE_SPAPR=y ++CONFIG_XIVE_KVM=y +diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak +index cca52665d9..fec354f327 100644 +--- a/default-configs/ppc64-softmmu.mak ++++ b/default-configs/ppc64-softmmu.mak +@@ -1,10 +1,12 @@ + # Default configuration for ppc64-softmmu + + # Include all 32-bit boards +-include ppc-softmmu.mak ++#include ppc-softmmu.mak + + # For PowerNV +-CONFIG_POWERNV=y ++#CONFIG_POWERNV=y + + # For pSeries +-CONFIG_PSERIES=y ++#CONFIG_PSERIES=y ++ ++include ppc64-rh-devices.mak +diff --git a/default-configs/rh-virtio.mak b/default-configs/rh-virtio.mak +new file mode 100644 +index 0000000000..94ede1b5f6 +--- /dev/null ++++ b/default-configs/rh-virtio.mak +@@ -0,0 +1,10 @@ ++CONFIG_VIRTIO=y ++CONFIG_VIRTIO_BALLOON=y ++CONFIG_VIRTIO_BLK=y ++CONFIG_VIRTIO_GPU=y ++CONFIG_VIRTIO_INPUT=y ++CONFIG_VIRTIO_INPUT_HOST=y ++CONFIG_VIRTIO_NET=y ++CONFIG_VIRTIO_RNG=y ++CONFIG_VIRTIO_SCSI=y ++CONFIG_VIRTIO_SERIAL=y +diff --git a/default-configs/s390x-rh-devices.mak b/default-configs/s390x-rh-devices.mak +new file mode 100644 +index 0000000000..c3c73fe752 +--- /dev/null ++++ b/default-configs/s390x-rh-devices.mak +@@ -0,0 +1,15 @@ ++include rh-virtio.mak ++ ++CONFIG_PCI=y ++CONFIG_S390_CCW_VIRTIO=y ++CONFIG_S390_FLIC=y ++CONFIG_S390_FLIC_KVM=y ++CONFIG_SCLPCONSOLE=y ++CONFIG_SCSI=y ++CONFIG_TERMINAL3270=y ++CONFIG_VFIO=y ++CONFIG_VFIO_AP=y ++CONFIG_VFIO_PCI=y ++CONFIG_VHOST_USER=y ++CONFIG_VIRTIO_CCW=y ++CONFIG_WDT_DIAG288=y +diff --git a/default-configs/s390x-softmmu.mak b/default-configs/s390x-softmmu.mak +index f2287a133f..3e2e388e91 100644 +--- a/default-configs/s390x-softmmu.mak ++++ b/default-configs/s390x-softmmu.mak +@@ -10,4 +10,6 @@ + + # Boards: + # +-CONFIG_S390_CCW_VIRTIO=y ++#CONFIG_S390_CCW_VIRTIO=y ++ ++include s390x-rh-devices.mak +diff --git a/default-configs/x86_64-rh-devices.mak b/default-configs/x86_64-rh-devices.mak +new file mode 100644 +index 0000000000..d59b6d9bb5 +--- /dev/null ++++ b/default-configs/x86_64-rh-devices.mak +@@ -0,0 +1,100 @@ ++include rh-virtio.mak ++ ++CONFIG_AC97=y ++CONFIG_ACPI=y ++CONFIG_ACPI_PCI=y ++CONFIG_ACPI_CPU_HOTPLUG=y ++CONFIG_ACPI_MEMORY_HOTPLUG=y ++CONFIG_ACPI_NVDIMM=y ++CONFIG_ACPI_SMBUS=y ++CONFIG_ACPI_VMGENID=y ++CONFIG_ACPI_X86=y ++CONFIG_ACPI_X86_ICH=y ++CONFIG_AHCI=y ++CONFIG_APIC=y ++CONFIG_APM=y ++CONFIG_BOCHS_DISPLAY=y ++CONFIG_DIMM=y ++CONFIG_E1000E_PCI_EXPRESS=y ++CONFIG_E1000_PCI=y ++CONFIG_EDU=y ++CONFIG_FDC=y ++CONFIG_FW_CFG_DMA=y ++CONFIG_HDA=y ++CONFIG_HYPERV=y ++CONFIG_HYPERV_TESTDEV=y ++CONFIG_I2C=y ++CONFIG_I440FX=y ++CONFIG_I8254=y ++CONFIG_I8257=y ++CONFIG_I8259=y ++CONFIG_I82801B11=y ++CONFIG_IDE_CORE=y ++CONFIG_IDE_PCI=y ++CONFIG_IDE_PIIX=y ++CONFIG_IDE_QDEV=y ++CONFIG_IOAPIC=y ++CONFIG_IOH3420=y ++CONFIG_ISA_BUS=y ++CONFIG_ISA_DEBUG=y ++CONFIG_ISA_TESTDEV=y ++CONFIG_LPC_ICH9=y ++CONFIG_MC146818RTC=y ++CONFIG_MEM_DEVICE=y ++CONFIG_NVDIMM=y ++CONFIG_OPENGL=y ++CONFIG_PAM=y ++CONFIG_PC=y ++CONFIG_PCI=y ++CONFIG_PCIE_PORT=y ++CONFIG_PCI_DEVICES=y ++CONFIG_PCI_EXPRESS=y ++CONFIG_PCI_EXPRESS_Q35=y ++CONFIG_PCI_I440FX=y ++CONFIG_PCI_TESTDEV=y ++CONFIG_PCKBD=y ++CONFIG_PCSPK=y ++CONFIG_PC_ACPI=y ++CONFIG_PC_PCI=y ++CONFIG_PFLASH_CFI01=y ++CONFIG_PVPANIC=y ++CONFIG_PXB=y ++CONFIG_Q35=y ++CONFIG_QXL=y ++CONFIG_RTL8139_PCI=y ++CONFIG_SCSI=y ++CONFIG_SERIAL=y ++CONFIG_SERIAL_ISA=y ++CONFIG_SERIAL_PCI=y ++CONFIG_SEV=y ++CONFIG_SGA=y ++CONFIG_SMBIOS=y ++CONFIG_SMBUS_EEPROM=y ++CONFIG_SPICE=y ++CONFIG_TEST_DEVICES=y ++CONFIG_USB=y ++CONFIG_USB_EHCI=y ++CONFIG_USB_EHCI_PCI=y ++CONFIG_USB_SMARTCARD=y ++CONFIG_USB_STORAGE_BOT=y ++CONFIG_USB_UHCI=y ++CONFIG_USB_XHCI=y ++CONFIG_USB_XHCI_NEC=y ++CONFIG_VFIO=y ++CONFIG_VFIO_PCI=y ++CONFIG_VGA=y ++CONFIG_VGA_CIRRUS=y ++CONFIG_VGA_PCI=y ++CONFIG_VHOST_USER=y ++CONFIG_VIRTIO_PCI=y ++CONFIG_VIRTIO_VGA=y ++CONFIG_VMMOUSE=y ++CONFIG_VMPORT=y ++CONFIG_VTD=y ++CONFIG_WDT_IB6300ESB=y ++CONFIG_WDT_IB700=y ++CONFIG_XIO3130=y ++CONFIG_TPM_CRB=y ++CONFIG_TPM_TIS=y ++CONFIG_TPM_EMULATOR=y ++CONFIG_TPM_PASSTHROUGH=y +diff --git a/default-configs/x86_64-softmmu.mak b/default-configs/x86_64-softmmu.mak +index 64b2ee2960..b5de7e5279 100644 +--- a/default-configs/x86_64-softmmu.mak ++++ b/default-configs/x86_64-softmmu.mak +@@ -1,3 +1,5 @@ + # Default configuration for x86_64-softmmu + +-include i386-softmmu.mak ++#include i386-softmmu.mak ++ ++include x86_64-rh-devices.mak +diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c +index 2034dd749e..ab203ad448 100644 +--- a/hw/acpi/ich9.c ++++ b/hw/acpi/ich9.c +@@ -449,8 +449,8 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) + static const uint32_t gpe0_len = ICH9_PMIO_GPE0_LEN; + pm->acpi_memory_hotplug.is_enabled = true; + pm->cpu_hotplug_legacy = true; +- pm->disable_s3 = 0; +- pm->disable_s4 = 0; ++ pm->disable_s3 = 1; ++ pm->disable_s4 = 1; + pm->s4_val = 2; + + object_property_add_uint32_ptr(obj, ACPI_PM_PROP_PM_IO_BASE, +diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs +index fe749f65fd..2aa1a9efdd 100644 +--- a/hw/arm/Makefile.objs ++++ b/hw/arm/Makefile.objs +@@ -27,7 +27,7 @@ obj-$(CONFIG_VEXPRESS) += vexpress.o + obj-$(CONFIG_ZYNQ) += xilinx_zynq.o + obj-$(CONFIG_SABRELITE) += sabrelite.o + +-obj-$(CONFIG_ARM_V7M) += armv7m.o ++#obj-$(CONFIG_ARM_V7M) += armv7m.o + obj-$(CONFIG_EXYNOS4) += exynos4210.o + obj-$(CONFIG_PXA2XX) += pxa2xx.o pxa2xx_gpio.o pxa2xx_pic.o + obj-$(CONFIG_DIGIC) += digic.o +diff --git a/hw/block/fdc.c b/hw/block/fdc.c +index ac5d31e8c1..e925bac002 100644 +--- a/hw/block/fdc.c ++++ b/hw/block/fdc.c +@@ -46,6 +46,8 @@ + #include "qemu/module.h" + #include "trace.h" + ++#include "hw/boards.h" ++ + /********************************************************/ + /* debug Floppy devices */ + +@@ -2638,6 +2640,14 @@ static void fdctrl_realize_common(DeviceState *dev, FDCtrl *fdctrl, + int i, j; + static int command_tables_inited = 0; + ++ /* Restricted for Red Hat Enterprise Linux: */ ++ MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); ++ if (!strstr(mc->name, "-rhel7.")) { ++ error_setg(errp, "Device %s is not supported with machine type %s", ++ object_get_typename(OBJECT(dev)), mc->name); ++ return; ++ } ++ + if (fdctrl->fallback == FLOPPY_DRIVE_TYPE_AUTO) { + error_setg(errp, "Cannot choose a fallback FDrive type of 'auto'"); + } +diff --git a/hw/bt/Makefile.objs b/hw/bt/Makefile.objs +index 867a7d2e8a..e678e9ee3c 100644 +--- a/hw/bt/Makefile.objs ++++ b/hw/bt/Makefile.objs +@@ -1,3 +1,3 @@ +-common-obj-y += core.o l2cap.o sdp.o hci.o hid.o +-common-obj-y += hci-csr.o ++#common-obj-y += core.o l2cap.o sdp.o hci.o hid.o ++#common-obj-y += hci-csr.o + +diff --git a/hw/cpu/Makefile.objs b/hw/cpu/Makefile.objs +index 8db9e8a7b3..1601ea93c7 100644 +--- a/hw/cpu/Makefile.objs ++++ b/hw/cpu/Makefile.objs +@@ -1,5 +1,6 @@ + obj-$(CONFIG_ARM11MPCORE) += arm11mpcore.o + obj-$(CONFIG_REALVIEW) += realview_mpcore.o + obj-$(CONFIG_A9MPCORE) += a9mpcore.o +-obj-$(CONFIG_A15MPCORE) += a15mpcore.o +-common-obj-y += core.o cluster.o ++#obj-$(CONFIG_A15MPCORE) += a15mpcore.o ++common-obj-y += core.o ++# cluster.o +diff --git a/hw/display/Makefile.objs b/hw/display/Makefile.objs +index f2182e3bef..3d0cda1b52 100644 +--- a/hw/display/Makefile.objs ++++ b/hw/display/Makefile.objs +@@ -1,8 +1,9 @@ + common-obj-$(CONFIG_DDC) += i2c-ddc.o + common-obj-$(CONFIG_EDID) += edid-generate.o edid-region.o + +-common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o +-common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o ++# Disabled for Red Hat Enterprise Linux ++#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o ++#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o + + common-obj-$(CONFIG_ADS7846) += ads7846.o + common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o +diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c +index cd283e53b4..93afa26fda 100644 +--- a/hw/display/cirrus_vga.c ++++ b/hw/display/cirrus_vga.c +@@ -2975,6 +2975,9 @@ static void pci_cirrus_vga_realize(PCIDevice *dev, Error **errp) + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); + int16_t device_id = pc->device_id; + ++ warn_report("'cirrus-vga' is deprecated, " ++ "please use a different VGA card instead"); ++ + /* follow real hardware, cirrus card emulated has 4 MB video memory. + Also accept 8 MB/16 MB for backward compatibility. */ + if (s->vga.vram_size_mb != 4 && s->vga.vram_size_mb != 8 && +diff --git a/hw/ide/piix.c b/hw/ide/piix.c +index db313dd3b1..e14858ca64 100644 +--- a/hw/ide/piix.c ++++ b/hw/ide/piix.c +@@ -251,7 +251,8 @@ static void piix3_ide_class_init(ObjectClass *klass, void *data) + k->device_id = PCI_DEVICE_ID_INTEL_82371SB_1; + k->class_id = PCI_CLASS_STORAGE_IDE; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); +- dc->hotpluggable = false; ++ /* Disabled for Red Hat Enterprise Linux: */ ++ dc->user_creatable = false; + } + + static const TypeInfo piix3_ide_info = { +@@ -279,6 +280,8 @@ static void piix4_ide_class_init(ObjectClass *klass, void *data) + k->class_id = PCI_CLASS_STORAGE_IDE; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + dc->hotpluggable = false; ++ /* Disabled for Red Hat Enterprise Linux: */ ++ dc->user_creatable = false; + } + + static const TypeInfo piix4_ide_info = { +diff --git a/hw/input/pckbd.c b/hw/input/pckbd.c +index f0acfd86f7..390eb6579c 100644 +--- a/hw/input/pckbd.c ++++ b/hw/input/pckbd.c +@@ -571,6 +571,8 @@ static void i8042_class_initfn(ObjectClass *klass, void *data) + dc->realize = i8042_realizefn; + dc->vmsd = &vmstate_kbd_isa; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); ++ /* Disabled for Red Hat Enterprise Linux: */ ++ dc->user_creatable = false; + } + + static const TypeInfo i8042_info = { +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index a73f8d404e..fc73fdd6fa 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -1795,6 +1795,7 @@ static const E1000Info e1000_devices[] = { + .revision = 0x03, + .phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT, + }, ++#if 0 /* Disabled for Red Hat Enterprise Linux 7 */ + { + .name = "e1000-82544gc", + .device_id = E1000_DEV_ID_82544GC_COPPER, +@@ -1807,6 +1808,7 @@ static const E1000Info e1000_devices[] = { + .revision = 0x03, + .phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT, + }, ++#endif + }; + + static void e1000_register_types(void) +diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c +index f27131102d..17f10efae2 100644 +--- a/hw/pci-host/i440fx.c ++++ b/hw/pci-host/i440fx.c +@@ -386,6 +386,7 @@ static const TypeInfo i440fx_info = { + }, + }; + ++#if 0 /* Disabled in Red Hat Enterprise Linux */ + /* IGD Passthrough Host Bridge. */ + typedef struct { + uint8_t offset; +@@ -469,6 +470,7 @@ static const TypeInfo igd_passthrough_i440fx_info = { + .instance_size = sizeof(PCII440FXState), + .class_init = igd_passthrough_i440fx_class_init, + }; ++#endif + + static const char *i440fx_pcihost_root_bus_path(PCIHostState *host_bridge, + PCIBus *rootbus) +@@ -514,7 +516,9 @@ static const TypeInfo i440fx_pcihost_info = { + static void i440fx_register_types(void) + { + type_register_static(&i440fx_info); ++#if 0 /* Disabled in Red Hat Enterprise Linux */ + type_register_static(&igd_passthrough_i440fx_info); ++#endif + type_register_static(&i440fx_pcihost_info); + } + +diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c +index 8339c4c0f8..301cd7b4e4 100644 +--- a/hw/ppc/spapr_cpu_core.c ++++ b/hw/ppc/spapr_cpu_core.c +@@ -403,10 +403,12 @@ static const TypeInfo spapr_cpu_core_type_infos[] = { + .instance_size = sizeof(SpaprCpuCore), + .class_size = sizeof(SpaprCpuCoreClass), + }, ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + DEFINE_SPAPR_CPU_CORE_TYPE("970_v2.2"), + DEFINE_SPAPR_CPU_CORE_TYPE("970mp_v1.0"), + DEFINE_SPAPR_CPU_CORE_TYPE("970mp_v1.1"), + DEFINE_SPAPR_CPU_CORE_TYPE("power5+_v2.1"), ++#endif + DEFINE_SPAPR_CPU_CORE_TYPE("power7_v2.3"), + DEFINE_SPAPR_CPU_CORE_TYPE("power7+_v2.1"), + DEFINE_SPAPR_CPU_CORE_TYPE("power8_v2.0"), +diff --git a/hw/usb/Makefile.objs b/hw/usb/Makefile.objs +index 303ac084a0..700a91886e 100644 +--- a/hw/usb/Makefile.objs ++++ b/hw/usb/Makefile.objs +@@ -30,7 +30,9 @@ common-obj-$(CONFIG_USB_BLUETOOTH) += dev-bluetooth.o + ifeq ($(CONFIG_USB_SMARTCARD),y) + common-obj-y += dev-smartcard-reader.o + common-obj-$(CONFIG_SMARTCARD) += smartcard.mo +-smartcard.mo-objs := ccid-card-passthru.o ccid-card-emulated.o ++# Disabled for Red Hat Enterprise Linux: ++# smartcard.mo-objs := ccid-card-passthru.o ccid-card-emulated.o ++smartcard.mo-objs := ccid-card-passthru.o + smartcard.mo-cflags := $(SMARTCARD_CFLAGS) + smartcard.mo-libs := $(SMARTCARD_LIBS) + endif +diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c +index 136f3a9ad6..4505ffe48a 100644 +--- a/hw/vfio/pci-quirks.c ++++ b/hw/vfio/pci-quirks.c +@@ -1166,6 +1166,7 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) + trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + /* + * Intel IGD support + * +@@ -1239,6 +1240,7 @@ static int igd_gen(VFIOPCIDevice *vdev) + + return 8; /* Assume newer is compatible */ + } ++#endif + + typedef struct VFIOIGDQuirk { + struct VFIOPCIDevice *vdev; +@@ -1311,6 +1313,7 @@ typedef struct { + uint8_t len; + } IGDHostInfo; + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static const IGDHostInfo igd_host_bridge_infos[] = { + {PCI_REVISION_ID, 2}, + {PCI_SUBSYSTEM_VENDOR_ID, 2}, +@@ -1559,9 +1562,11 @@ static const MemoryRegionOps vfio_igd_index_quirk = { + .write = vfio_igd_quirk_index_write, + .endianness = DEVICE_LITTLE_ENDIAN, + }; ++#endif + + static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) + { ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + struct vfio_region_info *rom = NULL, *opregion = NULL, + *host = NULL, *lpc = NULL; + VFIOQuirk *quirk; +@@ -1572,6 +1577,7 @@ static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) + uint32_t gmch; + uint16_t cmd_orig, cmd; + Error *err = NULL; ++#endif + + /* + * This must be an Intel VGA device at address 00:02.0 for us to even +@@ -1585,6 +1591,8 @@ static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) + return; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ ++ + /* + * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we + * can stuff host values into, so if there's already one there and it's not +@@ -1809,6 +1817,7 @@ out: + g_free(opregion); + g_free(host); + g_free(lpc); ++#endif + } + + /* +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 2d40b396f2..c8534d3035 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3220,6 +3220,7 @@ static const TypeInfo vfio_pci_dev_info = { + }, + }; + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static Property vfio_pci_dev_nohotplug_properties[] = { + DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), + DEFINE_PROP_END_OF_LIST(), +@@ -3239,11 +3240,15 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { + .instance_size = sizeof(VFIOPCIDevice), + .class_init = vfio_pci_nohotplug_dev_class_init, + }; ++#endif + + static void register_vfio_pci_dev_type(void) + { + type_register_static(&vfio_pci_dev_info); ++ ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + type_register_static(&vfio_pci_nohotplug_dev_info); ++#endif + } + + type_init(register_vfio_pci_dev_type) +diff --git a/qemu-options.hx b/qemu-options.hx +index 65c9473b73..fc17aca631 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -2111,11 +2111,6 @@ ETEXI + + DEF("no-hpet", 0, QEMU_OPTION_no_hpet, + "-no-hpet disable HPET\n", QEMU_ARCH_I386) +-STEXI +-@item -no-hpet +-@findex -no-hpet +-Disable HPET support. +-ETEXI + + DEF("acpitable", HAS_ARG, QEMU_OPTION_acpitable, + "-acpitable [sig=str][,rev=n][,oem_id=str][,oem_table_id=str][,oem_rev=n][,asl_compiler_id=str][,asl_compiler_rev=n][,{data|file}=file1[:file2]...]\n" +@@ -3125,6 +3120,7 @@ STEXI + ETEXI + DEFHEADING() + ++#if 0 + DEFHEADING(Bluetooth(R) options:) + STEXI + @table @option +@@ -3203,6 +3199,7 @@ STEXI + @end table + ETEXI + DEFHEADING() ++#endif + + #ifdef CONFIG_TPM + DEFHEADING(TPM device options:) +diff --git a/target/arm/cpu.c b/target/arm/cpu.c +index 7a4ac9339b..3788fc3c4a 100644 +--- a/target/arm/cpu.c ++++ b/target/arm/cpu.c +@@ -2744,7 +2744,9 @@ static void arm_cpu_register_types(void) + type_register_static(&idau_interface_type_info); + + while (info->name) { +- cpu_register(info); ++ /* RHEL specific: Filter out unsupported cpu models */ ++ if (!strcmp(info->name, "cortex-a15")) ++ cpu_register(info); + info++; + } + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 69f518a21a..1b7880ae3a 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1835,14 +1835,14 @@ static X86CPUDefinition builtin_x86_defs[] = { + .family = 6, + .model = 6, + .stepping = 3, +- .features[FEAT_1_EDX] = +- PPRO_FEATURES | +- CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA | +- CPUID_PSE36, +- .features[FEAT_1_ECX] = +- CPUID_EXT_SSE3 | CPUID_EXT_CX16, +- .features[FEAT_8000_0001_EDX] = +- CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX, ++ .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | ++ CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | ++ CPUID_MCA | CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | ++ CPUID_CX8 | CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | ++ CPUID_PSE | CPUID_DE | CPUID_FP87, ++ .features[FEAT_1_ECX] = CPUID_EXT_CX16 | CPUID_EXT_SSE3, ++ .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_NX | ++ CPUID_EXT2_SYSCALL, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM, + .xlevel = 0x8000000A, +@@ -2128,6 +2128,25 @@ static X86CPUDefinition builtin_x86_defs[] = { + .xlevel = 0x80000008, + .model_id = "Intel(R) Atom(TM) CPU N270 @ 1.60GHz", + }, ++ { ++ .name = "cpu64-rhel6", ++ .level = 4, ++ .vendor = CPUID_VENDOR_AMD, ++ .family = 6, ++ .model = 13, ++ .stepping = 3, ++ .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | ++ CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | ++ CPUID_MCA | CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | ++ CPUID_CX8 | CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | ++ CPUID_PSE | CPUID_DE | CPUID_FP87, ++ .features[FEAT_1_ECX] = CPUID_EXT_CX16 | CPUID_EXT_SSE3, ++ .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_NX | CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | ++ CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, ++ .xlevel = 0x8000000A, ++ .model_id = "QEMU Virtual CPU version (cpu64-rhel6)", ++ }, + { + .name = "Conroe", + .level = 10, +diff --git a/target/ppc/cpu-models.c b/target/ppc/cpu-models.c +index 086548e9b9..1bbf378c18 100644 +--- a/target/ppc/cpu-models.c ++++ b/target/ppc/cpu-models.c +@@ -66,6 +66,7 @@ + #define POWERPC_DEF(_name, _pvr, _type, _desc) \ + POWERPC_DEF_SVR(_name, _desc, _pvr, POWERPC_SVR_NONE, _type) + ++#if 0 /* Embedded and 32-bit CPUs disabled for Red Hat Enterprise Linux */ + /* Embedded PowerPC */ + /* PowerPC 401 family */ + POWERPC_DEF("401", CPU_POWERPC_401, 401, +@@ -740,8 +741,10 @@ + "PowerPC 7447A v1.2 (G4)") + POWERPC_DEF("7457a_v1.2", CPU_POWERPC_74x7A_v12, 7455, + "PowerPC 7457A v1.2 (G4)") ++#endif + /* 64 bits PowerPC */ + #if defined(TARGET_PPC64) ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + POWERPC_DEF("970_v2.2", CPU_POWERPC_970_v22, 970, + "PowerPC 970 v2.2") + POWERPC_DEF("970fx_v1.0", CPU_POWERPC_970FX_v10, 970, +@@ -760,6 +763,7 @@ + "PowerPC 970MP v1.1") + POWERPC_DEF("power5+_v2.1", CPU_POWERPC_POWER5P_v21, POWER5P, + "POWER5+ v2.1") ++#endif + POWERPC_DEF("power7_v2.3", CPU_POWERPC_POWER7_v23, POWER7, + "POWER7 v2.3") + POWERPC_DEF("power7+_v2.1", CPU_POWERPC_POWER7P_v21, POWER7, +@@ -780,6 +784,7 @@ + /* PowerPC CPU aliases */ + + PowerPCCPUAlias ppc_cpu_aliases[] = { ++#if 0 /* Embedded and 32-bit CPUs disabled for Red Hat Enterprise Linux */ + { "403", "403gc" }, + { "405", "405d4" }, + { "405cr", "405crc" }, +@@ -938,12 +943,15 @@ PowerPCCPUAlias ppc_cpu_aliases[] = { + { "7447a", "7447a_v1.2" }, + { "7457a", "7457a_v1.2" }, + { "apollo7pm", "7457a_v1.0" }, ++#endif + #if defined(TARGET_PPC64) ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + { "970", "970_v2.2" }, + { "970fx", "970fx_v3.1" }, + { "970mp", "970mp_v1.1" }, + { "power5+", "power5+_v2.1" }, + { "power5gs", "power5+_v2.1" }, ++#endif + { "power7", "power7_v2.3" }, + { "power7+", "power7+_v2.1" }, + { "power8e", "power8e_v2.1" }, +@@ -952,6 +960,7 @@ PowerPCCPUAlias ppc_cpu_aliases[] = { + { "power9", "power9_v2.0" }, + #endif + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + /* Generic PowerPCs */ + #if defined(TARGET_PPC64) + { "ppc64", "970fx_v3.1" }, +@@ -959,5 +968,6 @@ PowerPCCPUAlias ppc_cpu_aliases[] = { + { "ppc32", "604" }, + { "ppc", "604" }, + { "default", "604" }, ++#endif + { NULL, NULL } + }; +diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c +index 7e92fb2e15..be718220d7 100644 +--- a/target/s390x/cpu_models.c ++++ b/target/s390x/cpu_models.c +@@ -404,6 +404,9 @@ static void check_unavailable_features(const S390CPUModel *max_model, + (max_model->def->gen == model->def->gen && + max_model->def->ec_ga < model->def->ec_ga)) { + list_add_feat("type", unavailable); ++ } else if (model->def->gen < 11 && kvm_enabled()) { ++ /* Older CPU models are not supported on Red Hat Enterprise Linux */ ++ list_add_feat("type", unavailable); + } + + /* detect missing features if any to properly report them */ +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 0c9d14b4b1..a02d569537 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2387,6 +2387,14 @@ void kvm_s390_apply_cpu_model(const S390CPUModel *model, Error **errp) + error_setg(errp, "KVM doesn't support CPU models"); + return; + } ++ ++ /* Older CPU models are not supported on Red Hat Enterprise Linux */ ++ if (model->def->gen < 11) { ++ error_setg(errp, "KVM: Unsupported CPU type specified: %s", ++ MACHINE(qdev_get_machine())->cpu_type); ++ return; ++ } ++ + prop.cpuid = s390_cpuid_from_cpu_model(model); + prop.ibc = s390_ibc_from_cpu_model(model); + /* configure cpu features indicated via STFL(e) */ +diff --git a/util/memfd.c b/util/memfd.c +index 4a3c07e0be..3303ec9da4 100644 +--- a/util/memfd.c ++++ b/util/memfd.c +@@ -193,7 +193,7 @@ bool qemu_memfd_alloc_check(void) + */ + bool qemu_memfd_check(unsigned int flags) + { +-#ifdef CONFIG_LINUX ++#if 0 /* RHEL: memfd support disabled */ + int mfd = memfd_create("test", flags | MFD_CLOEXEC); + + if (mfd >= 0) { +diff --git a/vl.c b/vl.c +index 6a65a64bfd..668a34577e 100644 +--- a/vl.c ++++ b/vl.c +@@ -166,7 +166,7 @@ Chardev *parallel_hds[MAX_PARALLEL_PORTS]; + int win2k_install_hack = 0; + int singlestep = 0; + int acpi_enabled = 1; +-int no_hpet = 0; ++int no_hpet = 1; /* Always disabled for Red Hat Enterprise Linux */ + int fd_bootchk = 1; + static int no_reboot; + int no_shutdown = 0; +@@ -914,6 +914,7 @@ static void configure_rtc(QemuOpts *opts) + } + } + ++#if 0 // Disabled for Red Hat Enterprise Linux + /***********************************************************/ + /* Bluetooth support */ + static int nb_hcis; +@@ -1035,6 +1036,7 @@ static int bt_parse(const char *opt) + error_report("bad bluetooth parameter '%s'", opt); + return 1; + } ++#endif + + static int parse_name(void *opaque, QemuOpts *opts, Error **errp) + { +@@ -3128,6 +3130,7 @@ int main(int argc, char **argv, char **envp) + } + break; + #endif ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + case QEMU_OPTION_bt: + warn_report("The bluetooth subsystem is deprecated and will " + "be removed soon. If the bluetooth subsystem is " +@@ -3135,6 +3138,7 @@ int main(int argc, char **argv, char **envp) + "qemu-devel@nongnu.org with your usecase."); + add_device_config(DEV_BT, optarg); + break; ++#endif + case QEMU_OPTION_audio_help: + audio_legacy_help(); + exit (0); +@@ -4282,9 +4286,11 @@ int main(int argc, char **argv, char **envp) + + tpm_init(); + ++#if 0 // Disabled for Red Hat Enterprise Linux + /* init the bluetooth world */ + if (foreach_device_config(DEV_BT, bt_parse)) + exit(1); ++#endif + + if (!xen_enabled()) { + /* On 32-bit hosts, QEMU is limited by virtual address space */ +-- +2.21.0 + diff --git a/0007-Machine-type-related-general-changes.patch b/0007-Machine-type-related-general-changes.patch new file mode 100755 index 0000000000000000000000000000000000000000..4ae3966f497773ed96a3f718800618874cbc2628 --- /dev/null +++ b/0007-Machine-type-related-general-changes.patch @@ -0,0 +1,675 @@ +From 113078b23a4747b07eb363719d7cbc0af403dd2a Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 11 Jan 2019 09:54:45 +0100 +Subject: Machine type related general changes + +This patch is first part of original "Add RHEL machine types" patch we +split to allow easier review. It contains changes not related to any +architecture. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- Remove e1000 device duplication changes to reflect upstream solution +- Rewrite machine compat properties to upstream solution + +Rebase changes (4.1.0): +- Removed optional flag for machine compat properties (upstream) +- Remove c3e002cb chunk from hw/net/e1000.c +- Reorder compat structures +- Use one format for compat scructures +- Added compat for virtio-balloon-pci.any_layout for rhel71 + +Merged patches (4.0.0): +- d4c0957 compat: Generic HW_COMPAT_RHEL7_6 +- cbac773 virtio: Make disable-legacy/disable-modern compat properties optional + +Merged patches (4.1.0): +- 479ad30 redhat: fix cut'n'paste garbage in hw_compat comments +- f19738e compat: Generic hw_compat_rhel_8_0 + +Merged patches (4.2.0): +- 9f2bfaa machine types: Update hw_compat_rhel_8_0 from hw_compat_4_0 +- ca4a5e8 virtio: Make disable-legacy/disable-modern compat properties optional +- compat: Generic hw_compat_rhel_8_1 (patch 93040/92956) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/ich9.c | 16 ++++ + hw/acpi/piix4.c | 5 +- + hw/char/serial.c | 16 ++++ + hw/core/machine.c | 170 ++++++++++++++++++++++++++++++++++++++++ + hw/display/vga-isa.c | 2 +- + hw/net/e1000e.c | 21 +++++ + hw/net/rtl8139.c | 4 +- + hw/rtc/mc146818rtc.c | 6 ++ + hw/smbios/smbios.c | 1 + + hw/timer/i8254_common.c | 2 +- + hw/usb/hcd-uhci.c | 4 +- + hw/usb/hcd-xhci.c | 20 +++++ + hw/usb/hcd-xhci.h | 2 + + include/hw/acpi/ich9.h | 3 + + include/hw/boards.h | 24 ++++++ + include/hw/usb.h | 4 + + migration/migration.c | 2 + + migration/migration.h | 5 ++ + 18 files changed, 301 insertions(+), 6 deletions(-) + +diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c +index ab203ad448..7ec26884e8 100644 +--- a/hw/acpi/ich9.c ++++ b/hw/acpi/ich9.c +@@ -444,6 +444,18 @@ static void ich9_pm_set_enable_tco(Object *obj, bool value, Error **errp) + s->pm.enable_tco = value; + } + ++static bool ich9_pm_get_force_rev1_fadt(Object *obj, Error **errp) ++{ ++ ICH9LPCState *s = ICH9_LPC_DEVICE(obj); ++ return s->pm.force_rev1_fadt; ++} ++ ++static void ich9_pm_set_force_rev1_fadt(Object *obj, bool value, Error **errp) ++{ ++ ICH9LPCState *s = ICH9_LPC_DEVICE(obj); ++ s->pm.force_rev1_fadt = value; ++} ++ + void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) + { + static const uint32_t gpe0_len = ICH9_PMIO_GPE0_LEN; +@@ -468,6 +480,10 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) + ich9_pm_get_cpu_hotplug_legacy, + ich9_pm_set_cpu_hotplug_legacy, + NULL); ++ object_property_add_bool(obj, "__com.redhat_force-rev1-fadt", ++ ich9_pm_get_force_rev1_fadt, ++ ich9_pm_set_force_rev1_fadt, ++ NULL); + object_property_add(obj, ACPI_PM_PROP_S3_DISABLED, "uint8", + ich9_pm_get_disable_s3, + ich9_pm_set_disable_s3, +diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c +index 93aec2dd2c..3a26193cbe 100644 +--- a/hw/acpi/piix4.c ++++ b/hw/acpi/piix4.c +@@ -274,6 +274,7 @@ static const VMStateDescription vmstate_acpi = { + .name = "piix4_pm", + .version_id = 3, + .minimum_version_id = 3, ++ .minimum_version_id = 2, + .post_load = vmstate_acpi_post_load, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, PIIX4PMState), +@@ -627,8 +628,8 @@ static void piix4_send_gpe(AcpiDeviceIf *adev, AcpiEventStatusBits ev) + + static Property piix4_pm_properties[] = { + DEFINE_PROP_UINT32("smb_io_base", PIIX4PMState, smb_io_base, 0), +- DEFINE_PROP_UINT8(ACPI_PM_PROP_S3_DISABLED, PIIX4PMState, disable_s3, 0), +- DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 0), ++ DEFINE_PROP_UINT8(ACPI_PM_PROP_S3_DISABLED, PIIX4PMState, disable_s3, 1), ++ DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_DISABLED, PIIX4PMState, disable_s4, 1), + DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_VAL, PIIX4PMState, s4_val, 2), + DEFINE_PROP_BOOL("acpi-pci-hotplug-with-bridge-support", PIIX4PMState, + use_acpi_pci_hotplug, true), +diff --git a/hw/char/serial.c b/hw/char/serial.c +index b4aa250950..0012f0e44d 100644 +--- a/hw/char/serial.c ++++ b/hw/char/serial.c +@@ -34,6 +34,7 @@ + #include "sysemu/runstate.h" + #include "qemu/error-report.h" + #include "trace.h" ++#include "migration/migration.h" + + //#define DEBUG_SERIAL + +@@ -703,6 +704,9 @@ static int serial_post_load(void *opaque, int version_id) + static bool serial_thr_ipending_needed(void *opaque) + { + SerialState *s = opaque; ++ if (migrate_pre_2_2) { ++ return false; ++ } + + if (s->ier & UART_IER_THRI) { + bool expected_value = ((s->iir & UART_IIR_ID) == UART_IIR_THRI); +@@ -784,6 +788,10 @@ static const VMStateDescription vmstate_serial_xmit_fifo = { + static bool serial_fifo_timeout_timer_needed(void *opaque) + { + SerialState *s = (SerialState *)opaque; ++ if (migrate_pre_2_2) { ++ return false; ++ } ++ + return timer_pending(s->fifo_timeout_timer); + } + +@@ -801,6 +809,10 @@ static const VMStateDescription vmstate_serial_fifo_timeout_timer = { + static bool serial_timeout_ipending_needed(void *opaque) + { + SerialState *s = (SerialState *)opaque; ++ if (migrate_pre_2_2) { ++ return false; ++ } ++ + return s->timeout_ipending != 0; + } + +@@ -818,6 +830,10 @@ static const VMStateDescription vmstate_serial_timeout_ipending = { + static bool serial_poll_needed(void *opaque) + { + SerialState *s = (SerialState *)opaque; ++ if (migrate_pre_2_2) { ++ return false; ++ } ++ + return s->poll_msl >= 0; + } + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index 1689ad3bf8..e0e0eec8bf 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -27,6 +27,176 @@ + #include "hw/pci/pci.h" + #include "hw/mem/nvdimm.h" + ++/* ++ * The same as hw_compat_4_1 ++ */ ++GlobalProperty hw_compat_rhel_8_1[] = { ++ /* hw_compat_rhel_8_1 from hw_compat_4_1 */ ++ { "virtio-pci", "x-pcie-flr-init", "off" }, ++}; ++const size_t hw_compat_rhel_8_1_len = G_N_ELEMENTS(hw_compat_rhel_8_1); ++ ++/* The same as hw_compat_3_1 ++ * format of array has been changed by: ++ * 6c36bddf5340 ("machine: Use shorter format for GlobalProperty arrays") ++ */ ++GlobalProperty hw_compat_rhel_8_0[] = { ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "pcie-root-port", "x-speed", "2_5" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "pcie-root-port", "x-width", "1" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "memory-backend-file", "x-use-canonical-path-for-ramblock-id", "true" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "memory-backend-memfd", "x-use-canonical-path-for-ramblock-id", "true" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "tpm-crb", "ppi", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "tpm-tis", "ppi", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "usb-kbd", "serial", "42" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "usb-mouse", "serial", "42" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "usb-tablet", "serial", "42" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "virtio-blk-device", "discard", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 */ ++ { "virtio-blk-device", "write-zeroes", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "VGA", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "secondary-vga", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "bochs-display", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "virtio-vga", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "virtio-gpu-pci", "edid", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_4_0 */ ++ { "virtio-device", "use-started", "false" }, ++ /* hw_compat_rhel_8_0 from hw_compat_3_1 - that was added in 4.1 */ ++ { "pcie-root-port-base", "disable-acs", "true" }, ++}; ++const size_t hw_compat_rhel_8_0_len = G_N_ELEMENTS(hw_compat_rhel_8_0); ++ ++/* The same as hw_compat_3_0 + hw_compat_2_12 ++ * except that ++ * there's nothing in 3_0 ++ * migration.decompress-error-check=off was in 7.5 from bz 1584139 ++ */ ++GlobalProperty hw_compat_rhel_7_6[] = { ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "hda-audio", "use-timer", "false" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "cirrus-vga", "global-vmstate", "true" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "VGA", "global-vmstate", "true" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "vmware-svga", "global-vmstate", "true" }, ++ /* hw_compat_rhel_7_6 from hw_compat_2_12 */ ++ { "qxl-vga", "global-vmstate", "true" }, ++}; ++const size_t hw_compat_rhel_7_6_len = G_N_ELEMENTS(hw_compat_rhel_7_6); ++ ++/* The same as hw_compat_2_11 + hw_compat_2_10 */ ++GlobalProperty hw_compat_rhel_7_5[] = { ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 */ ++ { "hpet", "hpet-offset-saved", "false" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 */ ++ { "virtio-blk-pci", "vectors", "2" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 */ ++ { "vhost-user-blk-pci", "vectors", "2" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_11 ++ bz 1608778 modified for our naming */ ++ { "e1000-82540em", "migrate_tso_props", "off" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_10 */ ++ { "virtio-mouse-device", "wheel-axis", "false" }, ++ /* hw_compat_rhel_7_5 from hw_compat_2_10 */ ++ { "virtio-tablet-device", "wheel-axis", "false" }, ++ { "cirrus-vga", "vgamem_mb", "16" }, ++ { "migration", "decompress-error-check", "off" }, ++}; ++const size_t hw_compat_rhel_7_5_len = G_N_ELEMENTS(hw_compat_rhel_7_5); ++ ++/* Mostly like hw_compat_2_9 except ++ * x-mtu-bypass-backend, x-migrate-msix has already been ++ * backported to RHEL7.4. shpc was already on in 7.4. ++ */ ++GlobalProperty hw_compat_rhel_7_4[] = { ++ { "intel-iommu", "pt", "off" }, ++}; ++ ++const size_t hw_compat_rhel_7_4_len = G_N_ELEMENTS(hw_compat_rhel_7_4); ++/* Mostly like HW_COMPAT_2_6 + HW_COMPAT_2_7 + HW_COMPAT_2_8 except ++ * disable-modern, disable-legacy, page-per-vq have already been ++ * backported to RHEL7.3 ++ */ ++GlobalProperty hw_compat_rhel_7_3[] = { ++ { "virtio-mmio", "format_transport_address", "off" }, ++ { "virtio-serial-device", "emergency-write", "off" }, ++ { "ioapic", "version", "0x11" }, ++ { "intel-iommu", "x-buggy-eim", "true" }, ++ { "virtio-pci", "x-ignore-backend-features", "on" }, ++ { "fw_cfg_mem", "x-file-slots", stringify(0x10) }, ++ { "fw_cfg_io", "x-file-slots", stringify(0x10) }, ++ { "pflash_cfi01", "old-multiple-chip-handling", "on" }, ++ { TYPE_PCI_DEVICE, "x-pcie-extcap-init", "off" }, ++ { "virtio-pci", "x-pcie-deverr-init", "off" }, ++ { "virtio-pci", "x-pcie-lnkctl-init", "off" }, ++ { "virtio-pci", "x-pcie-pm-init", "off" }, ++ { "virtio-net-device", "x-mtu-bypass-backend", "off" }, ++ { "e1000e", "__redhat_e1000e_7_3_intr_state", "on" }, ++}; ++const size_t hw_compat_rhel_7_3_len = G_N_ELEMENTS(hw_compat_rhel_7_3); ++ ++/* Mostly like hw_compat_2_4 + 2_3 but: ++ * we don't need "any_layout" as it has been backported to 7.2 ++ */ ++GlobalProperty hw_compat_rhel_7_2[] = { ++ { "virtio-blk-device", "scsi", "true" }, ++ { "e1000-82540em", "extra_mac_registers", "off" }, ++ { "virtio-pci", "x-disable-pcie", "on" }, ++ { "virtio-pci", "migrate-extra", "off" }, ++ { "fw_cfg_mem", "dma_enabled", "off" }, ++ { "fw_cfg_io", "dma_enabled", "off" }, ++ { "isa-fdc", "fallback", "144" }, ++ /* Optional because not all virtio-pci devices support legacy mode */ ++ { "virtio-pci", "disable-modern", "on", .optional = true }, ++ { "virtio-pci", "disable-legacy", "off", .optional = true }, ++ { TYPE_PCI_DEVICE, "x-pcie-lnksta-dllla", "off" }, ++ { "virtio-pci", "page-per-vq", "on" }, ++ /* hw_compat_rhel_7_2 - introduced with 2.10.0 */ ++ { "migration", "send-section-footer", "off" }, ++ /* hw_compat_rhel_7_2 - introduced with 2.10.0 */ ++ { "migration", "store-global-state", "off", ++ }, ++}; ++const size_t hw_compat_rhel_7_2_len = G_N_ELEMENTS(hw_compat_rhel_7_2); ++ ++/* Mostly like hw_compat_2_1 but: ++ * we don't need virtio-scsi-pci since 7.0 already had that on ++ * ++ * RH: Note, qemu-extended-regs should have been enabled in the 7.1 ++ * machine type, but was accidentally turned off in 7.2 onwards. ++ */ ++GlobalProperty hw_compat_rhel_7_1[] = { ++ { "intel-hda-generic", "old_msi_addr", "on" }, ++ { "VGA", "qemu-extended-regs", "off" }, ++ { "secondary-vga", "qemu-extended-regs", "off" }, ++ { "usb-mouse", "usb_version", stringify(1) }, ++ { "usb-kbd", "usb_version", stringify(1) }, ++ { "virtio-pci", "virtio-pci-bus-master-bug-migration", "on" }, ++ { "virtio-blk-pci", "any_layout", "off" }, ++ { "virtio-balloon-pci", "any_layout", "off" }, ++ { "virtio-serial-pci", "any_layout", "off" }, ++ { "virtio-9p-pci", "any_layout", "off" }, ++ { "virtio-rng-pci", "any_layout", "off" }, ++ /* HW_COMPAT_RHEL7_1 - introduced with 2.10.0 */ ++ { "migration", "send-configuration", "off" }, ++}; ++const size_t hw_compat_rhel_7_1_len = G_N_ELEMENTS(hw_compat_rhel_7_1); ++ + GlobalProperty hw_compat_4_1[] = { + { "virtio-pci", "x-pcie-flr-init", "off" }, + }; +diff --git a/hw/display/vga-isa.c b/hw/display/vga-isa.c +index 873e5e9706..d1a2efe47e 100644 +--- a/hw/display/vga-isa.c ++++ b/hw/display/vga-isa.c +@@ -82,7 +82,7 @@ static void vga_isa_realizefn(DeviceState *dev, Error **errp) + } + + static Property vga_isa_properties[] = { +- DEFINE_PROP_UINT32("vgamem_mb", ISAVGAState, state.vram_size_mb, 8), ++ DEFINE_PROP_UINT32("vgamem_mb", ISAVGAState, state.vram_size_mb, 16), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c +index b69fd7d8ad..d8be50a1ce 100644 +--- a/hw/net/e1000e.c ++++ b/hw/net/e1000e.c +@@ -79,6 +79,11 @@ typedef struct E1000EState { + + E1000ECore core; + ++ /* 7.3 had the intr_state field that was in the original e1000e code ++ * but that was removed prior to 2.7's release ++ */ ++ bool redhat_7_3_intr_state_enable; ++ uint32_t redhat_7_3_intr_state; + } E1000EState; + + #define E1000E_MMIO_IDX 0 +@@ -94,6 +99,10 @@ typedef struct E1000EState { + #define E1000E_MSIX_TABLE (0x0000) + #define E1000E_MSIX_PBA (0x2000) + ++/* Values as in RHEL 7.3 build and original upstream */ ++#define RH_E1000E_USE_MSI BIT(0) ++#define RH_E1000E_USE_MSIX BIT(1) ++ + static uint64_t + e1000e_mmio_read(void *opaque, hwaddr addr, unsigned size) + { +@@ -305,6 +314,8 @@ e1000e_init_msix(E1000EState *s) + } else { + if (!e1000e_use_msix_vectors(s, E1000E_MSIX_VEC_NUM)) { + msix_uninit(d, &s->msix, &s->msix); ++ } else { ++ s->redhat_7_3_intr_state |= RH_E1000E_USE_MSIX; + } + } + } +@@ -476,6 +487,8 @@ static void e1000e_pci_realize(PCIDevice *pci_dev, Error **errp) + ret = msi_init(PCI_DEVICE(s), 0xD0, 1, true, false, NULL); + if (ret) { + trace_e1000e_msi_init_fail(ret); ++ } else { ++ s->redhat_7_3_intr_state |= RH_E1000E_USE_MSI; + } + + if (e1000e_add_pm_capability(pci_dev, e1000e_pmrb_offset, +@@ -599,6 +612,11 @@ static const VMStateDescription e1000e_vmstate_intr_timer = { + VMSTATE_STRUCT_ARRAY(_f, _s, _num, 0, \ + e1000e_vmstate_intr_timer, E1000IntrDelayTimer) + ++static bool rhel_7_3_check(void *opaque, int version_id) ++{ ++ return ((E1000EState *)opaque)->redhat_7_3_intr_state_enable; ++} ++ + static const VMStateDescription e1000e_vmstate = { + .name = "e1000e", + .version_id = 1, +@@ -610,6 +628,7 @@ static const VMStateDescription e1000e_vmstate = { + VMSTATE_MSIX(parent_obj, E1000EState), + + VMSTATE_UINT32(ioaddr, E1000EState), ++ VMSTATE_UINT32_TEST(redhat_7_3_intr_state, E1000EState, rhel_7_3_check), + VMSTATE_UINT32(core.rxbuf_min_shift, E1000EState), + VMSTATE_UINT8(core.rx_desc_len, E1000EState), + VMSTATE_UINT32_ARRAY(core.rxbuf_sizes, E1000EState, +@@ -658,6 +677,8 @@ static PropertyInfo e1000e_prop_disable_vnet, + + static Property e1000e_properties[] = { + DEFINE_NIC_PROPERTIES(E1000EState, conf), ++ DEFINE_PROP_BOOL("__redhat_e1000e_7_3_intr_state", E1000EState, ++ redhat_7_3_intr_state_enable, false), + DEFINE_PROP_SIGNED("disable_vnet_hdr", E1000EState, disable_vnet, false, + e1000e_prop_disable_vnet, bool), + DEFINE_PROP_SIGNED("subsys_ven", E1000EState, subsys_ven, +diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c +index 88a97d756d..21d80e96cf 100644 +--- a/hw/net/rtl8139.c ++++ b/hw/net/rtl8139.c +@@ -3177,7 +3177,7 @@ static int rtl8139_pre_save(void *opaque) + + static const VMStateDescription vmstate_rtl8139 = { + .name = "rtl8139", +- .version_id = 5, ++ .version_id = 4, + .minimum_version_id = 3, + .post_load = rtl8139_post_load, + .pre_save = rtl8139_pre_save, +@@ -3258,7 +3258,9 @@ static const VMStateDescription vmstate_rtl8139 = { + VMSTATE_UINT32(tally_counters.TxMCol, RTL8139State), + VMSTATE_UINT64(tally_counters.RxOkPhy, RTL8139State), + VMSTATE_UINT64(tally_counters.RxOkBrd, RTL8139State), ++#if 0 /* Disabled for Red Hat Enterprise Linux bz 1420195 */ + VMSTATE_UINT32_V(tally_counters.RxOkMul, RTL8139State, 5), ++#endif + VMSTATE_UINT16(tally_counters.TxAbt, RTL8139State), + VMSTATE_UINT16(tally_counters.TxUndrn, RTL8139State), + +diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c +index 74ae74bc5c..73820517df 100644 +--- a/hw/rtc/mc146818rtc.c ++++ b/hw/rtc/mc146818rtc.c +@@ -42,6 +42,7 @@ + #include "qapi/visitor.h" + #include "exec/address-spaces.h" + #include "hw/rtc/mc146818rtc_regs.h" ++#include "migration/migration.h" + + #ifdef TARGET_I386 + #include "qapi/qapi-commands-misc-target.h" +@@ -820,6 +821,11 @@ static int rtc_post_load(void *opaque, int version_id) + static bool rtc_irq_reinject_on_ack_count_needed(void *opaque) + { + RTCState *s = (RTCState *)opaque; ++ ++ if (migrate_pre_2_2) { ++ return false; ++ } ++ + return s->irq_reinject_on_ack_count != 0; + } + +diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c +index 11d476c4a2..e6e9355384 100644 +--- a/hw/smbios/smbios.c ++++ b/hw/smbios/smbios.c +@@ -777,6 +777,7 @@ void smbios_set_defaults(const char *manufacturer, const char *product, + SMBIOS_SET_DEFAULT(type1.manufacturer, manufacturer); + SMBIOS_SET_DEFAULT(type1.product, product); + SMBIOS_SET_DEFAULT(type1.version, version); ++ SMBIOS_SET_DEFAULT(type1.family, "Red Hat Enterprise Linux"); + SMBIOS_SET_DEFAULT(type2.manufacturer, manufacturer); + SMBIOS_SET_DEFAULT(type2.product, product); + SMBIOS_SET_DEFAULT(type2.version, version); +diff --git a/hw/timer/i8254_common.c b/hw/timer/i8254_common.c +index 050875b497..32935da46c 100644 +--- a/hw/timer/i8254_common.c ++++ b/hw/timer/i8254_common.c +@@ -231,7 +231,7 @@ static const VMStateDescription vmstate_pit_common = { + .pre_save = pit_dispatch_pre_save, + .post_load = pit_dispatch_post_load, + .fields = (VMStateField[]) { +- VMSTATE_UINT32_V(channels[0].irq_disabled, PITCommonState, 3), ++ VMSTATE_UINT32(channels[0].irq_disabled, PITCommonState), /* qemu-kvm's v2 had 'flags' here */ + VMSTATE_STRUCT_ARRAY(channels, PITCommonState, 3, 2, + vmstate_pit_channel, PITChannelState), + VMSTATE_INT64(channels[0].next_transition_time, +diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c +index 23507ad3b5..9fd87a7ad9 100644 +--- a/hw/usb/hcd-uhci.c ++++ b/hw/usb/hcd-uhci.c +@@ -1219,12 +1219,14 @@ static void usb_uhci_common_realize(PCIDevice *dev, Error **errp) + UHCIState *s = UHCI(dev); + uint8_t *pci_conf = s->dev.config; + int i; ++ int irq_pin; + + pci_conf[PCI_CLASS_PROG] = 0x00; + /* TODO: reset value should be 0. */ + pci_conf[USB_SBRN] = USB_RELEASE_1; // release number + +- pci_config_set_interrupt_pin(pci_conf, u->info.irq_pin + 1); ++ irq_pin = u->info.irq_pin; ++ pci_config_set_interrupt_pin(pci_conf, irq_pin + 1); + + if (s->masterbus) { + USBPort *ports[NB_PORTS]; +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index 80988bb305..8fed2eedd6 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -3590,9 +3590,27 @@ static const VMStateDescription vmstate_xhci_slot = { + } + }; + ++static int xhci_event_pre_save(void *opaque) ++{ ++ XHCIEvent *s = opaque; ++ ++ s->cve_2014_5263_a = ((uint8_t *)&s->type)[0]; ++ s->cve_2014_5263_b = ((uint8_t *)&s->type)[1]; ++ ++ return 0; ++} ++ ++bool migrate_cve_2014_5263_xhci_fields; ++ ++static bool xhci_event_cve_2014_5263(void *opaque, int version_id) ++{ ++ return migrate_cve_2014_5263_xhci_fields; ++} ++ + static const VMStateDescription vmstate_xhci_event = { + .name = "xhci-event", + .version_id = 1, ++ .pre_save = xhci_event_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(type, XHCIEvent), + VMSTATE_UINT32(ccode, XHCIEvent), +@@ -3601,6 +3619,8 @@ static const VMStateDescription vmstate_xhci_event = { + VMSTATE_UINT32(flags, XHCIEvent), + VMSTATE_UINT8(slotid, XHCIEvent), + VMSTATE_UINT8(epid, XHCIEvent), ++ VMSTATE_UINT8_TEST(cve_2014_5263_a, XHCIEvent, xhci_event_cve_2014_5263), ++ VMSTATE_UINT8_TEST(cve_2014_5263_b, XHCIEvent, xhci_event_cve_2014_5263), + VMSTATE_END_OF_LIST() + } + }; +diff --git a/hw/usb/hcd-xhci.h b/hw/usb/hcd-xhci.h +index 2fad4df2a7..f554b671e3 100644 +--- a/hw/usb/hcd-xhci.h ++++ b/hw/usb/hcd-xhci.h +@@ -157,6 +157,8 @@ typedef struct XHCIEvent { + uint32_t flags; + uint8_t slotid; + uint8_t epid; ++ uint8_t cve_2014_5263_a; ++ uint8_t cve_2014_5263_b; + } XHCIEvent; + + typedef struct XHCIInterrupter { +diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h +index 41568d1837..1a23ccc412 100644 +--- a/include/hw/acpi/ich9.h ++++ b/include/hw/acpi/ich9.h +@@ -61,6 +61,9 @@ typedef struct ICH9LPCPMRegs { + uint8_t smm_enabled; + bool enable_tco; + TCOIORegs tco_regs; ++ ++ /* RH addition, see bz 1489800 */ ++ bool force_rev1_fadt; + } ICH9LPCPMRegs; + + #define ACPI_PM_PROP_TCO_ENABLED "enable_tco" +diff --git a/include/hw/boards.h b/include/hw/boards.h +index de45087f34..6f85a0e032 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -377,4 +377,28 @@ extern const size_t hw_compat_2_2_len; + extern GlobalProperty hw_compat_2_1[]; + extern const size_t hw_compat_2_1_len; + ++extern GlobalProperty hw_compat_rhel_8_1[]; ++extern const size_t hw_compat_rhel_8_1_len; ++ ++extern GlobalProperty hw_compat_rhel_8_0[]; ++extern const size_t hw_compat_rhel_8_0_len; ++ ++extern GlobalProperty hw_compat_rhel_7_6[]; ++extern const size_t hw_compat_rhel_7_6_len; ++ ++extern GlobalProperty hw_compat_rhel_7_5[]; ++extern const size_t hw_compat_rhel_7_5_len; ++ ++extern GlobalProperty hw_compat_rhel_7_4[]; ++extern const size_t hw_compat_rhel_7_4_len; ++ ++extern GlobalProperty hw_compat_rhel_7_3[]; ++extern const size_t hw_compat_rhel_7_3_len; ++ ++extern GlobalProperty hw_compat_rhel_7_2[]; ++extern const size_t hw_compat_rhel_7_2_len; ++ ++extern GlobalProperty hw_compat_rhel_7_1[]; ++extern const size_t hw_compat_rhel_7_1_len; ++ + #endif +diff --git a/include/hw/usb.h b/include/hw/usb.h +index c24d968a19..b353438ea0 100644 +--- a/include/hw/usb.h ++++ b/include/hw/usb.h +@@ -605,4 +605,8 @@ int usb_get_quirks(uint16_t vendor_id, uint16_t product_id, + uint8_t interface_class, uint8_t interface_subclass, + uint8_t interface_protocol); + ++ ++/* hcd-xhci.c -- rhel7.0.0 machine type compatibility */ ++extern bool migrate_cve_2014_5263_xhci_fields; ++ + #endif +diff --git a/migration/migration.c b/migration/migration.c +index 354ad072fa..30c53c623b 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -121,6 +121,8 @@ enum mig_rp_message_type { + MIG_RP_MSG_MAX + }; + ++bool migrate_pre_2_2; ++ + /* When we add fault tolerance, we could have several + migrations at once. For now we don't need to add + dynamic creation of migration */ +diff --git a/migration/migration.h b/migration/migration.h +index 79b3dda146..0b1b0d4df5 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -335,6 +335,11 @@ void init_dirty_bitmap_incoming_migration(void); + void migrate_add_address(SocketAddress *address); + + int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque); ++/* ++ * Disables a load of subsections that were added in 2.2/rh7.2 for backwards ++ * migration compatibility. ++ */ ++extern bool migrate_pre_2_2; + + #define qemu_ram_foreach_block \ + #warning "Use foreach_not_ignored_block in migration code" +-- +2.21.0 + diff --git a/0008-Add-aarch64-machine-types.patch b/0008-Add-aarch64-machine-types.patch new file mode 100755 index 0000000000000000000000000000000000000000..5397c8bda3d98a3f07ec60605629042b54299424 --- /dev/null +++ b/0008-Add-aarch64-machine-types.patch @@ -0,0 +1,276 @@ +From 49164264d9928f73961acbbe4d56d8dfa23d8099 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 12:53:31 +0200 +Subject: Add aarch64 machine types + +Adding changes to add RHEL machine types for aarch64 architecture. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- Use upstream compat handling + +Rebase changes (4.1.0-rc0): +- Removed a15memmap (upstream) +- Use virt_flash_create in rhel800_virt_instance_init + +Rebase changes (4.2.0-rc0): +- Set numa_mem_supported + +Rebase notes (4.2.0-rc3): +- aarch64: Add virt-rhel8.2.0 machine type for ARM (patch 92246) +- aarch64: virt: Allow more than 1TB of RAM (patch 92249) +- aarch64: virt: Allow PCDIMM instantiation (patch 92247) +- aarch64: virt: Enhance the comment related to gic-version (patch 92248) + +Merged patches (4.0.0): +- 7bfdb4c aarch64: Add virt-rhel8.0.0 machine type for ARM +- 3433e69 aarch64: Set virt-rhel8.0.0 max_cpus to 512 +- 4d20863 aarch64: Use 256MB ECAM region by default + +Merged patches (4.1.0): +- c3e39ef aarch64: Add virt-rhel8.1.0 machine type for ARM +- 59a46d1 aarch64: Allow ARM VIRT iommu option in RHEL8.1 machine + +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/virt.c | 161 +++++++++++++++++++++++++++++++++++++++++- + include/hw/arm/virt.h | 11 +++ + 2 files changed, 171 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index d4bedc2607..e10839100e 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -72,6 +72,7 @@ + #include "hw/mem/nvdimm.h" + #include "hw/acpi/generic_event_device.h" + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ + static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ + void *data) \ +@@ -98,7 +99,49 @@ + DEFINE_VIRT_MACHINE_LATEST(major, minor, true) + #define DEFINE_VIRT_MACHINE(major, minor) \ + DEFINE_VIRT_MACHINE_LATEST(major, minor, false) +- ++#endif /* disabled for RHEL */ ++ ++#define DEFINE_RHEL_MACHINE_LATEST(m, n, s, latest) \ ++ static void rhel##m##n##s##_virt_class_init(ObjectClass *oc, \ ++ void *data) \ ++ { \ ++ MachineClass *mc = MACHINE_CLASS(oc); \ ++ rhel##m##n##s##_virt_options(mc); \ ++ mc->desc = "RHEL " # m "." # n "." # s " ARM Virtual Machine"; \ ++ if (latest) { \ ++ mc->alias = "virt"; \ ++ mc->is_default = 1; \ ++ } \ ++ } \ ++ static const TypeInfo rhel##m##n##s##_machvirt_info = { \ ++ .name = MACHINE_TYPE_NAME("virt-rhel" # m "." # n "." # s), \ ++ .parent = TYPE_RHEL_MACHINE, \ ++ .instance_init = rhel##m##n##s##_virt_instance_init, \ ++ .class_init = rhel##m##n##s##_virt_class_init, \ ++ }; \ ++ static void rhel##m##n##s##_machvirt_init(void) \ ++ { \ ++ type_register_static(&rhel##m##n##s##_machvirt_info); \ ++ } \ ++ type_init(rhel##m##n##s##_machvirt_init); ++ ++#define DEFINE_RHEL_MACHINE_AS_LATEST(major, minor, subminor) \ ++ DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, true) ++#define DEFINE_RHEL_MACHINE(major, minor, subminor) \ ++ DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, false) ++ ++/* This variable is for changes to properties that are RHEL specific, ++ * different to the current upstream and to be applied to the latest ++ * machine type. ++ */ ++GlobalProperty arm_rhel_compat[] = { ++ { ++ .driver = "virtio-net-pci", ++ .property = "romfile", ++ .value = "", ++ }, ++}; ++const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat); + + /* Number of external interrupt lines to configure the GIC with */ + #define NUM_IRQS 256 +@@ -1763,6 +1806,7 @@ static void machvirt_init(MachineState *machine) + qemu_add_machine_init_done_notifier(&vms->machine_done); + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static bool virt_get_secure(Object *obj, Error **errp) + { + VirtMachineState *vms = VIRT_MACHINE(obj); +@@ -1791,6 +1835,7 @@ static void virt_set_virt(Object *obj, bool value, Error **errp) + vms->virt = value; + } + ++#endif /* disabled for RHEL */ + static bool virt_get_highmem(Object *obj, Error **errp) + { + VirtMachineState *vms = VIRT_MACHINE(obj); +@@ -2022,6 +2067,7 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) + return requested_pa_size > 40 ? requested_pa_size : 0; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void virt_machine_class_init(ObjectClass *oc, void *data) + { + MachineClass *mc = MACHINE_CLASS(oc); +@@ -2258,3 +2304,116 @@ static void virt_machine_2_6_options(MachineClass *mc) + vmc->no_pmu = true; + } + DEFINE_VIRT_MACHINE(2, 6) ++#endif /* disabled for RHEL */ ++ ++static void rhel_machine_class_init(ObjectClass *oc, void *data) ++{ ++ MachineClass *mc = MACHINE_CLASS(oc); ++ HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); ++ ++ mc->family = "virt-rhel-Z"; ++ mc->init = machvirt_init; ++ /* Start with max_cpus set to 512, which is the maximum supported by KVM. ++ * The value may be reduced later when we have more information about the ++ * configuration of the particular instance. ++ */ ++ mc->max_cpus = 512; ++ mc->block_default_type = IF_VIRTIO; ++ mc->no_cdrom = 1; ++ mc->pci_allow_0_address = true; ++ /* We know we will never create a pre-ARMv7 CPU which needs 1K pages */ ++ mc->minimum_page_bits = 12; ++ mc->possible_cpu_arch_ids = virt_possible_cpu_arch_ids; ++ mc->cpu_index_to_instance_props = virt_cpu_index_to_props; ++ mc->default_cpu_type = ARM_CPU_TYPE_NAME("cortex-a57"); ++ mc->get_default_cpu_node_id = virt_get_default_cpu_node_id; ++ mc->kvm_type = virt_kvm_type; ++ assert(!mc->get_hotplug_handler); ++ mc->get_hotplug_handler = virt_machine_get_hotplug_handler; ++ hc->pre_plug = virt_machine_device_pre_plug_cb; ++ hc->plug = virt_machine_device_plug_cb; ++ hc->unplug_request = virt_machine_device_unplug_request_cb; ++ mc->numa_mem_supported = true; ++ mc->auto_enable_numa_with_memhp = true; ++} ++ ++static const TypeInfo rhel_machine_info = { ++ .name = TYPE_RHEL_MACHINE, ++ .parent = TYPE_MACHINE, ++ .abstract = true, ++ .instance_size = sizeof(VirtMachineState), ++ .class_size = sizeof(VirtMachineClass), ++ .class_init = rhel_machine_class_init, ++ .interfaces = (InterfaceInfo[]) { ++ { TYPE_HOTPLUG_HANDLER }, ++ { } ++ }, ++}; ++ ++static void rhel_machine_init(void) ++{ ++ type_register_static(&rhel_machine_info); ++} ++type_init(rhel_machine_init); ++ ++static void rhel820_virt_instance_init(Object *obj) ++{ ++ VirtMachineState *vms = VIRT_MACHINE(obj); ++ VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); ++ ++ /* EL3 is disabled by default and non-configurable for RHEL */ ++ vms->secure = false; ++ /* EL2 is disabled by default and non-configurable for RHEL */ ++ vms->virt = false; ++ /* High memory is enabled by default for RHEL */ ++ vms->highmem = true; ++ object_property_add_bool(obj, "highmem", virt_get_highmem, ++ virt_set_highmem, NULL); ++ object_property_set_description(obj, "highmem", ++ "Set on/off to enable/disable using " ++ "physical address space above 32 bits", ++ NULL); ++ /* ++ * Default GIC type is still v2, but became configurable for RHEL. We ++ * keep v2 instead of max as TCG CI test cases require an MSI controller ++ * and there is no userspace ITS MSI emulation available. ++ */ ++ vms->gic_version = 2; ++ object_property_add_str(obj, "gic-version", virt_get_gic_version, ++ virt_set_gic_version, NULL); ++ object_property_set_description(obj, "gic-version", ++ "Set GIC version. " ++ "Valid values are 2, 3 and host", NULL); ++ ++ vms->highmem_ecam = !vmc->no_highmem_ecam; ++ ++ if (vmc->no_its) { ++ vms->its = false; ++ } else { ++ /* Default allows ITS instantiation */ ++ vms->its = true; ++ object_property_add_bool(obj, "its", virt_get_its, ++ virt_set_its, NULL); ++ object_property_set_description(obj, "its", ++ "Set on/off to enable/disable " ++ "ITS instantiation", ++ NULL); ++ } ++ ++ /* Default disallows iommu instantiation */ ++ vms->iommu = VIRT_IOMMU_NONE; ++ object_property_add_str(obj, "iommu", virt_get_iommu, virt_set_iommu, NULL); ++ object_property_set_description(obj, "iommu", ++ "Set the IOMMU type. " ++ "Valid values are none and smmuv3", ++ NULL); ++ ++ vms->irqmap=a15irqmap; ++ virt_flash_create(vms); ++} ++ ++static void rhel820_virt_options(MachineClass *mc) ++{ ++ compat_props_add(mc->compat_props, arm_rhel_compat, arm_rhel_compat_len); ++} ++DEFINE_RHEL_MACHINE_AS_LATEST(8, 2, 0) +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index 0b41083e9d..53fdf16563 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -142,6 +142,7 @@ typedef struct { + + #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) + ++#if 0 /* disabled for Red Hat Enterprise Linux */ + #define TYPE_VIRT_MACHINE MACHINE_TYPE_NAME("virt") + #define VIRT_MACHINE(obj) \ + OBJECT_CHECK(VirtMachineState, (obj), TYPE_VIRT_MACHINE) +@@ -150,6 +151,16 @@ typedef struct { + #define VIRT_MACHINE_CLASS(klass) \ + OBJECT_CLASS_CHECK(VirtMachineClass, klass, TYPE_VIRT_MACHINE) + ++#else ++#define TYPE_RHEL_MACHINE MACHINE_TYPE_NAME("virt-rhel") ++#define VIRT_MACHINE(obj) \ ++ OBJECT_CHECK(VirtMachineState, (obj), TYPE_RHEL_MACHINE) ++#define VIRT_MACHINE_GET_CLASS(obj) \ ++ OBJECT_GET_CLASS(VirtMachineClass, obj, TYPE_RHEL_MACHINE) ++#define VIRT_MACHINE_CLASS(klass) \ ++ OBJECT_CLASS_CHECK(VirtMachineClass, klass, TYPE_RHEL_MACHINE) ++#endif ++ + void virt_acpi_setup(VirtMachineState *vms); + + /* Return the number of used redistributor regions */ +-- +2.21.0 + diff --git a/0009-Add-ppc64-machine-types.patch b/0009-Add-ppc64-machine-types.patch new file mode 100755 index 0000000000000000000000000000000000000000..a3f1a54e96c0f5847002e1cf153d9327550c6fb9 --- /dev/null +++ b/0009-Add-ppc64-machine-types.patch @@ -0,0 +1,463 @@ +From 136eae41007e2e5b0d693cc656f3ec36cbabf16f Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:27:13 +0200 +Subject: Add ppc64 machine types + +Adding changes to add RHEL machine types for ppc64 architecture. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- remove instance options and use upstream solution +- Use upstream compat handling +- Replace SPAPR_PCI_2_7_MMIO_WIN_SIZE with value (changed upstream) +- re-add handling of instance_options (removed upstream) +- Use p8 as default for rhel machine types (p9 default upstream) +- sPAPRMachineClass renamed to SpaprMachineClass (upstream) + +Rebase changes (4.1.0): +- Update format for compat structures + +Merged patches (4.0.0): +- 467d59a redhat: define pseries-rhel8.0.0 machine type + +Merged patches (4.1.0): +- f21757edc target/ppc/spapr: Enable mitigations by default for pseries-4.0 machine type +- 2511c63 redhat: sync pseries-rhel7.6.0 with rhel-av-8.0.1 +- 89f01da redhat: define pseries-rhel8.1.0 machine type + +Merged patches (4.2.0): +- bcba728 redhat: update pseries-rhel8.1.0 machine type +- redhat: update pseries-rhel-7.6.0 machine type (patch 93039) +- redhat: define pseries-rhel8.2.0 machine type (patch 93041) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr.c | 278 ++++++++++++++++++++++++++++++++++++++++ + hw/ppc/spapr_cpu_core.c | 13 ++ + include/hw/ppc/spapr.h | 1 + + target/ppc/compat.c | 13 +- + target/ppc/cpu.h | 1 + + 5 files changed, 305 insertions(+), 1 deletion(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index e076f6023c..8749c72066 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -4447,6 +4447,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) + smc->linux_pci_probe = true; + smc->smp_threads_vsmt = true; + smc->nr_xirqs = SPAPR_NR_XIRQS; ++ smc->has_power9_support = true; + } + + static const TypeInfo spapr_machine_info = { +@@ -4491,6 +4492,7 @@ static const TypeInfo spapr_machine_info = { + } \ + type_init(spapr_machine_register_##suffix) + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + /* + * pseries-4.2 + */ +@@ -4520,6 +4522,7 @@ static void spapr_machine_4_1_class_options(MachineClass *mc) + } + + DEFINE_SPAPR_MACHINE(4_1, "4.1", false); ++#endif + + /* + * pseries-4.0 +@@ -4536,6 +4539,7 @@ static void phb_placement_4_0(SpaprMachineState *spapr, uint32_t index, + *nv2atsd = 0; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void spapr_machine_4_0_class_options(MachineClass *mc) + { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); +@@ -4695,6 +4699,7 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", false); + /* + * pseries-2.7 + */ ++#endif + + static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index, + uint64_t *buid, hwaddr *pio, +@@ -4749,6 +4754,7 @@ static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index, + *nv2atsd = 0; + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void spapr_machine_2_7_class_options(MachineClass *mc) + { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); +@@ -4863,6 +4869,278 @@ static void spapr_machine_2_1_class_options(MachineClass *mc) + compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len); + } + DEFINE_SPAPR_MACHINE(2_1, "2.1", false); ++#endif ++ ++/* ++ * pseries-rhel8.2.0 ++ */ ++ ++static void spapr_machine_rhel820_class_options(MachineClass *mc) ++{ ++ /* Defaults for the latest behaviour inherited from the base class */ ++} ++ ++DEFINE_SPAPR_MACHINE(rhel820, "rhel8.2.0", true); ++ ++/* ++ * pseries-rhel8.1.0 ++ * like pseries-4.1 ++ */ ++ ++static void spapr_machine_rhel810_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ static GlobalProperty compat[] = { ++ /* Only allow 4kiB and 64kiB IOMMU pagesizes */ ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" }, ++ }; ++ ++ spapr_machine_rhel820_class_options(mc); ++ ++ /* from pseries-4.1 */ ++ smc->linux_pci_probe = false; ++ smc->smp_threads_vsmt = false; ++ compat_props_add(mc->compat_props, hw_compat_rhel_8_1, ++ hw_compat_rhel_8_1_len); ++ compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); ++ ++} ++ ++DEFINE_SPAPR_MACHINE(rhel810, "rhel8.1.0", false); ++ ++/* ++ * pseries-rhel8.0.0 ++ * like pseries-3.1 and pseries-4.0 ++ * except SPAPR_CAP_CFPC, SPAPR_CAP_SBBC and SPAPR_CAP_IBS ++ * that have been backported to pseries-rhel8.0.0 ++ */ ++ ++static void spapr_machine_rhel800_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel810_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_8_0, ++ hw_compat_rhel_8_0_len); ++ ++ /* pseries-4.0 */ ++ smc->phb_placement = phb_placement_4_0; ++ smc->irq = &spapr_irq_xics; ++ smc->pre_4_1_migration = true; ++ ++ /* pseries-3.1 */ ++ mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0"); ++ smc->update_dt_enabled = false; ++ smc->dr_phb_enabled = false; ++ smc->broken_host_serial_model = true; ++ smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel800, "rhel8.0.0", false); ++ ++/* ++ * pseries-rhel7.6.0 ++ * like spapr_compat_2_12 and spapr_compat_3_0 ++ * spapr_compat_0 is empty ++ */ ++GlobalProperty spapr_compat_rhel7_6[] = { ++ { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" }, ++ { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" }, ++}; ++const size_t spapr_compat_rhel7_6_len = G_N_ELEMENTS(spapr_compat_rhel7_6); ++ ++ ++static void spapr_machine_rhel760_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel800_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_6, spapr_compat_rhel7_6_len); ++ ++ /* from spapr_machine_3_0_class_options() */ ++ smc->legacy_irq_allocation = true; ++ smc->nr_xirqs = 0x400; ++ smc->irq = &spapr_irq_xics_legacy; ++ ++ /* from spapr_machine_2_12_class_options() */ ++ /* We depend on kvm_enabled() to choose a default value for the ++ * hpt-max-page-size capability. Of course we can't do it here ++ * because this is too early and the HW accelerator isn't initialzed ++ * yet. Postpone this to machine init (see default_caps_with_cpu()). ++ */ ++ smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0; ++ ++ /* SPAPR_CAP_WORKAROUND enabled in pseries-rhel800 by ++ * f21757edc554 ++ * "Enable mitigations by default for pseries-4.0 machine type") ++ */ ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel760, "rhel7.6.0", false); ++ ++/* ++ * pseries-rhel7.6.0-sxxm ++ * ++ * pseries-rhel7.6.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel760sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel760_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel760sxxm, "rhel7.6.0-sxxm", false); ++ ++static void spapr_machine_rhel750_class_options(MachineClass *mc) ++{ ++ spapr_machine_rhel760_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ ++} ++ ++DEFINE_SPAPR_MACHINE(rhel750, "rhel7.5.0", false); ++ ++/* ++ * pseries-rhel7.5.0-sxxm ++ * ++ * pseries-rhel7.5.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel750sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel750_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel750sxxm, "rhel7.5.0-sxxm", false); ++ ++/* ++ * pseries-rhel7.4.0 ++ * like spapr_compat_2_9 ++ */ ++GlobalProperty spapr_compat_rhel7_4[] = { ++ { TYPE_POWERPC_CPU, "pre-2.10-migration", "on" }, ++}; ++const size_t spapr_compat_rhel7_4_len = G_N_ELEMENTS(spapr_compat_rhel7_4); ++ ++static void spapr_machine_rhel740_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel750_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_4, hw_compat_rhel_7_4_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_4, spapr_compat_rhel7_4_len); ++ mc->numa_auto_assign_ram = numa_legacy_auto_assign_ram; ++ smc->has_power9_support = false; ++ smc->pre_2_10_has_unused_icps = true; ++ smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED; ++ smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel740, "rhel7.4.0", false); ++ ++/* ++ * pseries-rhel7.4.0-sxxm ++ * ++ * pseries-rhel7.4.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel740sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel740_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel740sxxm, "rhel7.4.0-sxxm", false); ++ ++/* ++ * pseries-rhel7.3.0 ++ * like spapr_compat_2_6/_2_7/_2_8 but "ddw" has been backported to RHEL7_3 ++ */ ++GlobalProperty spapr_compat_rhel7_3[] = { ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0" }, ++ { TYPE_POWERPC_CPU, "pre-2.8-migration", "on" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" }, ++}; ++const size_t spapr_compat_rhel7_3_len = G_N_ELEMENTS(spapr_compat_rhel7_3); ++ ++static void spapr_machine_rhel730_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel740_class_options(mc); ++ mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3"); ++ mc->default_machine_opts = "modern-hotplug-events=off"; ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_3, hw_compat_rhel_7_3_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_3, spapr_compat_rhel7_3_len); ++ ++ smc->phb_placement = phb_placement_2_7; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel730, "rhel7.3.0", false); ++ ++/* ++ * pseries-rhel7.3.0-sxxm ++ * ++ * pseries-rhel7.3.0 with speculative execution exploit mitigations enabled by default ++ */ ++ ++static void spapr_machine_rhel730sxxm_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel730_class_options(mc); ++ smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND; ++ smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD; ++} ++ ++DEFINE_SPAPR_MACHINE(rhel730sxxm, "rhel7.3.0-sxxm", false); ++ ++/* ++ * pseries-rhel7.2.0 ++ */ ++/* Should be like spapr_compat_2_5 + 2_4 + 2_3, but "dynamic-reconfiguration" ++ * has been backported to RHEL7_2 so we don't need it here. ++ */ ++ ++GlobalProperty spapr_compat_rhel7_2[] = { ++ { "spapr-vlan", "use-rx-buffer-pools", "off" }, ++ { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" }, ++}; ++const size_t spapr_compat_rhel7_2_len = G_N_ELEMENTS(spapr_compat_rhel7_2); ++ ++static void spapr_machine_rhel720_class_options(MachineClass *mc) ++{ ++ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); ++ ++ spapr_machine_rhel730_class_options(mc); ++ smc->use_ohci_by_default = true; ++ mc->has_hotpluggable_cpus = NULL; ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_2, hw_compat_rhel_7_2_len); ++ compat_props_add(mc->compat_props, spapr_compat_rhel7_2, spapr_compat_rhel7_2_len); ++} ++ ++DEFINE_SPAPR_MACHINE(rhel720, "rhel7.2.0", false); + + static void spapr_machine_register_types(void) + { +diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c +index 301cd7b4e4..ba5a8fb82b 100644 +--- a/hw/ppc/spapr_cpu_core.c ++++ b/hw/ppc/spapr_cpu_core.c +@@ -24,6 +24,7 @@ + #include "sysemu/reset.h" + #include "sysemu/hw_accel.h" + #include "qemu/error-report.h" ++#include "cpu-models.h" + + static void spapr_reset_vcpu(PowerPCCPU *cpu) + { +@@ -242,6 +243,7 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, + CPUPPCState *env = &cpu->env; + CPUState *cs = CPU(cpu); + Error *local_err = NULL; ++ SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + + object_property_set_bool(OBJECT(cpu), true, "realized", &local_err); + if (local_err) { +@@ -254,6 +256,17 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, + cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr)); + kvmppc_set_papr(cpu); + ++ if (!smc->has_power9_support && ++ (((spapr->max_compat_pvr && ++ ppc_compat_cmp(spapr->max_compat_pvr, ++ CPU_POWERPC_LOGICAL_3_00) >= 0)) || ++ (!spapr->max_compat_pvr && ++ ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, 0)))) { ++ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, ++ "POWER9 CPU is not supported by this machine class"); ++ return; ++ } ++ + if (spapr_irq_cpu_intc_create(spapr, cpu, &local_err) < 0) { + goto error_intc_create; + } +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index d5ab5ea7b2..aa89cc4a95 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -125,6 +125,7 @@ struct SpaprMachineClass { + bool linux_pci_probe; + bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ + ++ bool has_power9_support; + void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, + uint64_t *buid, hwaddr *pio, + hwaddr *mmio32, hwaddr *mmio64, +diff --git a/target/ppc/compat.c b/target/ppc/compat.c +index 7de4bf3122..3e2e35342d 100644 +--- a/target/ppc/compat.c ++++ b/target/ppc/compat.c +@@ -105,8 +105,19 @@ static const CompatInfo *compat_by_pvr(uint32_t pvr) + return NULL; + } + ++long ppc_compat_cmp(uint32_t pvr1, uint32_t pvr2) ++{ ++ const CompatInfo *compat1 = compat_by_pvr(pvr1); ++ const CompatInfo *compat2 = compat_by_pvr(pvr2); ++ ++ g_assert(compat1); ++ g_assert(compat2); ++ ++ return compat1 - compat2; ++} ++ + static bool pcc_compat(PowerPCCPUClass *pcc, uint32_t compat_pvr, +- uint32_t min_compat_pvr, uint32_t max_compat_pvr) ++ uint32_t min_compat_pvr, uint32_t max_compat_pvr) + { + const CompatInfo *compat = compat_by_pvr(compat_pvr); + const CompatInfo *min = compat_by_pvr(min_compat_pvr); +diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h +index e3e82327b7..5c53801cfd 100644 +--- a/target/ppc/cpu.h ++++ b/target/ppc/cpu.h +@@ -1367,6 +1367,7 @@ static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch) + + /* Compatibility modes */ + #if defined(TARGET_PPC64) ++long ppc_compat_cmp(uint32_t pvr1, uint32_t pvr2); + bool ppc_check_compat(PowerPCCPU *cpu, uint32_t compat_pvr, + uint32_t min_compat_pvr, uint32_t max_compat_pvr); + bool ppc_type_check_compat(const char *cputype, uint32_t compat_pvr, +-- +2.21.0 + diff --git a/0010-Add-s390x-machine-types.patch b/0010-Add-s390x-machine-types.patch new file mode 100755 index 0000000000000000000000000000000000000000..d0f666924463fc27bf4a969ae6db744582ba404c --- /dev/null +++ b/0010-Add-s390x-machine-types.patch @@ -0,0 +1,126 @@ +From 0842700b3a01891c316e9169fa651f26714cafa5 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:47:32 +0200 +Subject: Add s390x machine types + +Adding changes to add RHEL machine types for s390x architecture. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (weekly-4.1.0): +- Use upstream compat handling + +Merged patches (3.1.0): +- 29df663 s390x/cpumodel: default enable bpb and ppa15 for z196 and later + +Merged patches (4.1.0): +- 6c200d665b hw/s390x/s390-virtio-ccw: Add machine types for RHEL8.0.0 + +Merged patches (4.2.0): +- fb192e5 redhat: s390x: Rename s390-ccw-virtio-rhel8.0.0 to s390-ccw-virtio-rhel8.1.0 +- a9b22e8 redhat: s390x: Add proper compatibility options for the -rhel7.6.0 machine +- hw/s390x: Add the s390-ccw-virtio-rhel8.2.0 machine types (patch 92954) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 70 +++++++++++++++++++++++++++++++++++++- + 1 file changed, 69 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index d3edeef0ad..c2c83d2fce 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -615,7 +615,7 @@ bool css_migration_enabled(void) + { \ + MachineClass *mc = MACHINE_CLASS(oc); \ + ccw_machine_##suffix##_class_options(mc); \ +- mc->desc = "VirtIO-ccw based S390 machine v" verstr; \ ++ mc->desc = "VirtIO-ccw based S390 machine " verstr; \ + if (latest) { \ + mc->alias = "s390-ccw-virtio"; \ + mc->is_default = 1; \ +@@ -639,6 +639,7 @@ bool css_migration_enabled(void) + } \ + type_init(ccw_machine_register_##suffix) + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void ccw_machine_4_2_instance_options(MachineState *machine) + { + } +@@ -866,6 +867,73 @@ static void ccw_machine_2_4_class_options(MachineClass *mc) + compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); + } + DEFINE_CCW_MACHINE(2_4, "2.4", false); ++#endif ++ ++static void ccw_machine_rhel820_instance_options(MachineState *machine) ++{ ++} ++ ++static void ccw_machine_rhel820_class_options(MachineClass *mc) ++{ ++} ++DEFINE_CCW_MACHINE(rhel820, "rhel8.2.0", true); ++ ++static void ccw_machine_rhel760_instance_options(MachineState *machine) ++{ ++ static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V3_1 }; ++ ++ ccw_machine_rhel820_instance_options(machine); ++ ++ s390_set_qemu_cpu_model(0x2827, 12, 2, qemu_cpu_feat); ++ ++ /* The multiple-epoch facility was not available with rhel7.6.0 on z14GA1 */ ++ s390_cpudef_featoff(14, 1, S390_FEAT_MULTIPLE_EPOCH); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_QSIE); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_QTOUE); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_STOE); ++ s390_cpudef_featoff(14, 1, S390_FEAT_PTFF_STOUE); ++} ++ ++static void ccw_machine_rhel760_class_options(MachineClass *mc) ++{ ++ ccw_machine_rhel820_class_options(mc); ++ /* We never published the s390x version of RHEL-AV 8.0 and 8.1, so add this here */ ++ compat_props_add(mc->compat_props, hw_compat_rhel_8_0, hw_compat_rhel_8_0_len); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++} ++DEFINE_CCW_MACHINE(rhel760, "rhel7.6.0", false); ++ ++static void ccw_machine_rhel750_instance_options(MachineState *machine) ++{ ++ static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V2_11 }; ++ ccw_machine_rhel760_instance_options(machine); ++ ++ /* before 2.12 we emulated the very first z900, and RHEL 7.5 is ++ based on 2.10 */ ++ s390_set_qemu_cpu_model(0x2064, 7, 1, qemu_cpu_feat); ++ ++ /* bpb and ppa15 were only in the full model in RHEL 7.5 */ ++ s390_cpudef_featoff_greater(11, 1, S390_FEAT_PPA15); ++ s390_cpudef_featoff_greater(11, 1, S390_FEAT_BPB); ++} ++ ++GlobalProperty ccw_compat_rhel_7_5[] = { ++ { ++ .driver = TYPE_SCLP_EVENT_FACILITY, ++ .property = "allow_all_mask_sizes", ++ .value = "off", ++ }, ++}; ++const size_t ccw_compat_rhel_7_5_len = G_N_ELEMENTS(ccw_compat_rhel_7_5); ++ ++static void ccw_machine_rhel750_class_options(MachineClass *mc) ++{ ++ ccw_machine_rhel760_class_options(mc); ++ compat_props_add(mc->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ compat_props_add(mc->compat_props, ccw_compat_rhel_7_5, ccw_compat_rhel_7_5_len); ++ S390_MACHINE_CLASS(mc)->hpage_1m_allowed = false; ++} ++DEFINE_CCW_MACHINE(rhel750, "rhel7.5.0", false); + + static void ccw_machine_register_types(void) + { +-- +2.21.0 + diff --git a/0011-Add-x86_64-machine-types.patch b/0011-Add-x86_64-machine-types.patch new file mode 100755 index 0000000000000000000000000000000000000000..72a5159dee6aa23be2eae57f11c7a09b63a924b9 --- /dev/null +++ b/0011-Add-x86_64-machine-types.patch @@ -0,0 +1,897 @@ +From 2ebaeca6e26950f401a8169d1324be2bafd11741 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:10:31 +0200 +Subject: Add x86_64 machine types + +Adding changes to add RHEL machine types for x86_64 architecture. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (qemu-4.0.0): +- Use upstream compat handling + +Rebase notes (3.1.0): +- Removed xsave changes + +Rebase notes (4.1.0): +- Updated format for compat structures + +Rebase notes (4.2.0-rc2): +- Use X86MachineClass for save_tsc_khz (upstream change) + +Merged patches (4.1.0): +- f4dc802 pc: 7.5 compat entries +- 456ed3e pc: PC_RHEL7_6_COMPAT +- 04119ee pc: Add compat for pc-i440fx-rhel7.6.0 machine type +- b3b3687 pc: Add pc-q35-8.0.0 machine type +- 8d46fc6 pc: Add x-migrate-smi-count=off to PC_RHEL7_6_COMPAT +- 1de7949 kvm: clear out KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT for older machine types +- 18cf0d7 target/i386: Disable MPX support on named CPU models (partialy) +- 2660667 rhel: Set host-phys-bits-limit=48 on rhel machine-types + +Merged patches (4.2.0): +- 7d5c2ef pc: Don't make die-id mandatory unless necessary +- e42808c x86 machine types: pc_rhel_8_0_compat +- 9de83a8 x86 machine types: q35: Fixup units_per_default_bus +- 6df1559 x86 machine types: Fixup dynamic sysbus entries +- 0784125 x86 machine types: add pc-q35-rhel8.1.0 +- machines/x86: Add rhel 8.2 machine type (patch 92959) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/i386/acpi-build.c | 3 + + hw/i386/pc.c | 263 ++++++++++++++++++++++++++++++++++++++++++- + hw/i386/pc_piix.c | 210 +++++++++++++++++++++++++++++++++- + hw/i386/pc_q35.c | 156 ++++++++++++++++++++++++- + include/hw/boards.h | 2 + + include/hw/i386/pc.h | 33 ++++++ + target/i386/cpu.c | 9 +- + target/i386/kvm.c | 4 + + 8 files changed, 673 insertions(+), 7 deletions(-) + +diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c +index 12ff55fcfb..64001893ab 100644 +--- a/hw/i386/acpi-build.c ++++ b/hw/i386/acpi-build.c +@@ -204,6 +204,9 @@ static void acpi_get_pm_info(MachineState *machine, AcpiPmInfo *pm) + pm->fadt.reset_reg = r; + pm->fadt.reset_val = 0xf; + pm->fadt.flags |= 1 << ACPI_FADT_F_RESET_REG_SUP; ++ if (object_property_get_bool(lpc, ++ "__com.redhat_force-rev1-fadt", NULL)) ++ pm->fadt.rev = 1; + pm->cpu_hp_io_base = ICH9_CPU_HOTPLUG_IO_BASE; + } + +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index ac08e63604..61e70e4811 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -344,6 +344,261 @@ GlobalProperty pc_compat_1_4[] = { + }; + const size_t pc_compat_1_4_len = G_N_ELEMENTS(pc_compat_1_4); + ++/* This macro is for changes to properties that are RHEL specific, ++ * different to the current upstream and to be applied to the latest ++ * machine type. ++ */ ++GlobalProperty pc_rhel_compat[] = { ++ { TYPE_X86_CPU, "host-phys-bits", "on" }, ++ { TYPE_X86_CPU, "host-phys-bits-limit", "48" }, ++ /* bz 1508330 */ ++ { "vfio-pci", "x-no-geforce-quirks", "on" }, ++}; ++const size_t pc_rhel_compat_len = G_N_ELEMENTS(pc_rhel_compat); ++ ++/* pc_rhel_8_1_compat is empty since pc_4_1_compat is */ ++GlobalProperty pc_rhel_8_1_compat[] = { }; ++const size_t pc_rhel_8_1_compat_len = G_N_ELEMENTS(pc_rhel_8_1_compat); ++ ++GlobalProperty pc_rhel_8_0_compat[] = { ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "intel-iommu", "dma-drain", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G4" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G4" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G5" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Opteron_G5" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC-IBPB" "-" TYPE_X86_CPU, "npt", "off" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "EPYC-IBPB" "-" TYPE_X86_CPU, "nrip-save", "off" }, ++ /** The mpx=on entries from pc_compat_3_1 are in pc_rhel_7_6_compat **/ ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { "Cascadelake-Server" "-" TYPE_X86_CPU, "stepping", "5" }, ++ /* pc_rhel_8_0_compat from pc_compat_3_1 */ ++ { TYPE_X86_CPU, "x-intel-pt-auto-level", "off" }, ++}; ++const size_t pc_rhel_8_0_compat_len = G_N_ELEMENTS(pc_rhel_8_0_compat); ++ ++/* Similar to PC_COMPAT_3_0 + PC_COMPAT_2_12, but: ++ * all of the 2_12 stuff was already in 7.6 from bz 1481253 ++ * x-migrate-smi-count comes from PC_COMPAT_2_11 but ++ * is really tied to kernel version so keep it off on 7.x ++ * machine types irrespective of host. ++ */ ++GlobalProperty pc_rhel_7_6_compat[] = { ++ /* pc_rhel_7_6_compat from pc_compat_3_0 */ ++ { TYPE_X86_CPU, "x-hv-synic-kvm-only", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_3_0 */ ++ { "Skylake-Server" "-" TYPE_X86_CPU, "pku", "off" }, ++ /* pc_rhel_7_6_compat from pc_compat_3_0 */ ++ { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "pku", "off" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { TYPE_X86_CPU, "x-migrate-smi-count", "off" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Client-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Skylake-Server-IBRS" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Cascadelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Icelake-Client" "-" TYPE_X86_CPU, "mpx", "on" }, ++ /* pc_rhel_7_6_compat from pc_compat_2_11 */ ++ { "Icelake-Server" "-" TYPE_X86_CPU, "mpx", "on" }, ++}; ++const size_t pc_rhel_7_6_compat_len = G_N_ELEMENTS(pc_rhel_7_6_compat); ++ ++/* Similar to PC_COMPAT_2_11 + PC_COMPAT_2_10, but: ++ * - x-hv-max-vps was backported to 7.5 ++ * - x-pci-hole64-fix was backported to 7.5 ++ */ ++GlobalProperty pc_rhel_7_5_compat[] = { ++ /* pc_rhel_7_5_compat from pc_compat_2_11 */ ++ { "Skylake-Server" "-" TYPE_X86_CPU, "clflushopt", "off" }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { TYPE_X86_CPU, "legacy-cache", "on" }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { TYPE_X86_CPU, "topoext", "off" }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { "EPYC-" TYPE_X86_CPU, "xlevel", stringify(0x8000000a) }, ++ /* pc_rhel_7_5_compat from pc_compat_2_12 */ ++ { "EPYC-IBPB-" TYPE_X86_CPU, "xlevel", stringify(0x8000000a) }, ++}; ++const size_t pc_rhel_7_5_compat_len = G_N_ELEMENTS(pc_rhel_7_5_compat); ++ ++GlobalProperty pc_rhel_7_4_compat[] = { ++ /* pc_rhel_7_4_compat from pc_compat_2_9 */ ++ { "mch", "extended-tseg-mbytes", stringify(0) }, ++ /* bz 1489800 */ ++ { "ICH9-LPC", "__com.redhat_force-rev1-fadt", "on" }, ++ /* pc_rhel_7_4_compat from pc_compat_2_10 */ ++ { "i440FX-pcihost", "x-pci-hole64-fix", "off" }, ++ /* pc_rhel_7_4_compat from pc_compat_2_10 */ ++ { "q35-pcihost", "x-pci-hole64-fix", "off" }, ++ /* pc_rhel_7_4_compat from pc_compat_2_10 */ ++ { TYPE_X86_CPU, "x-hv-max-vps", "0x40" }, ++}; ++const size_t pc_rhel_7_4_compat_len = G_N_ELEMENTS(pc_rhel_7_4_compat); ++ ++GlobalProperty pc_rhel_7_3_compat[] = { ++ /* pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { "kvmclock", "x-mach-use-reliable-get-clock", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { TYPE_X86_CPU, "l3-cache", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { TYPE_X86_CPU, "full-cpuid-auto-level", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "family", "15" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "model", "6" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "Opteron_G3" "-" TYPE_X86_CPU, "stepping", "1" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_7 */ ++ { "isa-pcspk", "migrate", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_6 */ ++ { TYPE_X86_CPU, "cpuid-0xb", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { "ICH9-LPC", "x-smi-broadcast", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { TYPE_X86_CPU, "vmware-cpuid-freq", "off" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_8 */ ++ { "Haswell-" TYPE_X86_CPU, "stepping", "1" }, ++ /* pc_rhel_7_3_compat from pc_compat_2_3 added in 2.9*/ ++ { TYPE_X86_CPU, "kvm-no-smi-migration", "on" }, ++}; ++const size_t pc_rhel_7_3_compat_len = G_N_ELEMENTS(pc_rhel_7_3_compat); ++ ++GlobalProperty pc_rhel_7_2_compat[] = { ++ { "phenom" "-" TYPE_X86_CPU, "rdtscp", "off"}, ++ { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" }, ++ { "qemu64" "-" TYPE_X86_CPU, "abm", "on" }, ++ { "Haswell-" TYPE_X86_CPU, "abm", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" }, ++ { "Haswell-noTSX-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" }, ++ { "Broadwell-noTSX-IBRS" "-" TYPE_X86_CPU, "abm", "off" }, ++ { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" }, ++ { TYPE_X86_CPU, "check", "off" }, ++ { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" }, ++ { TYPE_X86_CPU, "arat", "off" }, ++ { "usb-redir", "streams", "off" }, ++ { TYPE_X86_CPU, "fill-mtrr-mask", "off" }, ++ { "apic-common", "legacy-instance-id", "on" }, ++}; ++const size_t pc_rhel_7_2_compat_len = G_N_ELEMENTS(pc_rhel_7_2_compat); ++ ++GlobalProperty pc_rhel_7_1_compat[] = { ++ { "kvm64" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "kvm32" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Conroe" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Penryn" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Nehalem" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Nehalem-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Westmere" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Westmere-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "SandyBridge" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "SandyBridge-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Haswell" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Broadwell" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G1" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G2" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G3" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G4" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Opteron_G5" "-" TYPE_X86_CPU, "vme", "off" }, ++ { "Haswell" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Haswell" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "Haswell-IBRS" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "Broadwell" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "f16c", "off" }, ++ { "Broadwell" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "Broadwell-IBRS" "-" TYPE_X86_CPU, "rdrand", "off" }, ++ { "coreduo" "-" TYPE_X86_CPU, "vmx", "on" }, ++ { "core2duo" "-" TYPE_X86_CPU, "vmx", "on" }, ++ { "qemu64" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "kvm64" "-" TYPE_X86_CPU, "min-level", stringify(5) }, ++ { "pentium3" "-" TYPE_X86_CPU, "min-level", stringify(2) }, ++ { "n270" "-" TYPE_X86_CPU, "min-level", stringify(5) }, ++ { "Conroe" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "Penryn" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "Nehalem" "-" TYPE_X86_CPU, "min-level", stringify(4) }, ++ { "n270" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Penryn" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Conroe" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Nehalem" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Westmere" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "SandyBridge" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "IvyBridge" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Haswell" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Haswell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Broadwell" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++ { "Broadwell-noTSX" "-" TYPE_X86_CPU, "min-xlevel", stringify(0x8000000a) }, ++}; ++const size_t pc_rhel_7_1_compat_len = G_N_ELEMENTS(pc_rhel_7_1_compat); ++ ++/* ++ * The PC_RHEL_*_COMPAT serve the same purpose for RHEL-7 machine ++ * types as the PC_COMPAT_* do for upstream types. ++ * PC_RHEL_7_*_COMPAT apply both to i440fx and q35 types. ++ */ ++ ++/* ++ * RHEL-7 is based on QEMU 1.5.3, so this needs the PC_COMPAT_* ++ * between our base and 1.5, less stuff backported to RHEL-7.0 ++ * (usb-device.msos-desc), less stuff for devices we changed ++ * (qemu64-x86_64-cpu) or don't support (hpet, pci-serial-2x, ++ * pci-serial-4x) in 7.0. ++ */ ++GlobalProperty pc_rhel_7_0_compat[] = { ++ { "virtio-scsi-pci", "any_layout", "off" }, ++ { "PIIX4_PM", "memory-hotplug-support", "off" }, ++ { "apic", "version", stringify(0x11) }, ++ { "nec-usb-xhci", "superspeed-ports-first", "off" }, ++ { "nec-usb-xhci", "force-pcie-endcap", "on" }, ++ { "pci-serial", "prog_if", stringify(0) }, ++ { "virtio-net-pci", "guest_announce", "off" }, ++ { "ICH9-LPC", "memory-hotplug-support", "off" }, ++ { "xio3130-downstream", COMPAT_PROP_PCP, "off" }, ++ { "ioh3420", COMPAT_PROP_PCP, "off" }, ++ { "PIIX4_PM", "acpi-pci-hotplug-with-bridge-support", "off" }, ++ { "e1000", "mitigation", "off" }, ++ { "virtio-net-pci", "ctrl_guest_offloads", "off" }, ++ { "Conroe" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Penryn" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Nehalem" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Nehalem-IBRS" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Westmere" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Westmere-IBRS" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G1" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G2" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G3" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G4" "-" TYPE_X86_CPU, "x2apic", "on" }, ++ { "Opteron_G5" "-" TYPE_X86_CPU, "x2apic", "on" }, ++}; ++const size_t pc_rhel_7_0_compat_len = G_N_ELEMENTS(pc_rhel_7_0_compat); ++ + void gsi_handler(void *opaque, int n, int level) + { + GSIState *s = opaque; +@@ -1225,7 +1480,8 @@ void pc_memory_init(PCMachineState *pcms, + option_rom_mr = g_malloc(sizeof(*option_rom_mr)); + memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); +- if (pcmc->pci_enabled) { ++ /* RH difference: See bz 1489800, explicitly make ROM ro */ ++ if (pcmc->pc_rom_ro) { + memory_region_set_readonly(option_rom_mr, true); + } + memory_region_add_subregion_overlap(rom_memory, +@@ -2198,6 +2454,8 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) + pcmc->linuxboot_dma_enabled = true; + pcmc->pvh_enabled = true; + assert(!mc->get_hotplug_handler); ++ pcmc->pc_rom_ro = true; ++ mc->async_pf_vmexit_disable = false; + mc->get_hotplug_handler = pc_get_hotplug_handler; + mc->hotplug_allowed = pc_hotplug_allowed; + mc->cpu_index_to_instance_props = x86_cpu_index_to_props; +@@ -2209,7 +2467,8 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) + mc->hot_add_cpu = pc_hot_add_cpu; + mc->smp_parse = pc_smp_parse; + mc->block_default_type = IF_IDE; +- mc->max_cpus = 255; ++ /* 240: max CPU count for RHEL */ ++ mc->max_cpus = 240; + mc->reset = pc_machine_reset; + mc->wakeup = pc_machine_wakeup; + hc->pre_plug = pc_machine_device_pre_plug_cb; +diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c +index 1bd70d1abb..bd7fdb99bb 100644 +--- a/hw/i386/pc_piix.c ++++ b/hw/i386/pc_piix.c +@@ -53,6 +53,7 @@ + #include "cpu.h" + #include "qapi/error.h" + #include "qemu/error-report.h" ++#include "migration/migration.h" + #ifdef CONFIG_XEN + #include + #include "hw/xen/xen_pt.h" +@@ -173,8 +174,8 @@ static void pc_init1(MachineState *machine, + if (pcmc->smbios_defaults) { + MachineClass *mc = MACHINE_GET_CLASS(machine); + /* These values are guest ABI, do not change */ +- smbios_set_defaults("QEMU", "Standard PC (i440FX + PIIX, 1996)", +- mc->name, pcmc->smbios_legacy_mode, ++ smbios_set_defaults("Red Hat", "KVM", ++ mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } +@@ -307,6 +308,7 @@ else { + * hw_compat_*, pc_compat_*, or * pc_*_machine_options(). + */ + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void pc_compat_2_3_fn(MachineState *machine) + { + PCMachineState *pcms = PC_MACHINE(machine); +@@ -1026,3 +1028,207 @@ static void xenfv_machine_options(MachineClass *m) + DEFINE_PC_MACHINE(xenfv, "xenfv", pc_xen_hvm_init, + xenfv_machine_options); + #endif ++#endif /* Disabled for Red Hat Enterprise Linux */ ++ ++/* Red Hat Enterprise Linux machine types */ ++ ++/* Options for the latest rhel7 machine type */ ++static void pc_machine_rhel7_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ m->family = "pc_piix_Y"; ++ m->default_machine_opts = "firmware=bios-256k.bin"; ++ pcmc->default_nic_model = "e1000"; ++ m->default_display = "std"; ++ m->no_parallel = 1; ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); ++ compat_props_add(m->compat_props, pc_rhel_compat, pc_rhel_compat_len); ++ m->alias = "pc"; ++ m->is_default = 1; ++} ++ ++static void pc_init_rhel760(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel760_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_machine_rhel7_options(m); ++ m->desc = "RHEL 7.6.0 PC (i440FX + PIIX, 1996)"; ++ m->async_pf_vmexit_disable = true; ++ m->smbus_no_migration_support = true; ++ pcmc->pvh_enabled = false; ++ pcmc->default_cpu_version = CPU_VERSION_LEGACY; ++ compat_props_add(m->compat_props, hw_compat_rhel_8_1, hw_compat_rhel_8_1_len); ++ compat_props_add(m->compat_props, pc_rhel_8_1_compat, pc_rhel_8_1_compat_len); ++ compat_props_add(m->compat_props, hw_compat_rhel_8_0, hw_compat_rhel_8_0_len); ++ compat_props_add(m->compat_props, pc_rhel_8_0_compat, pc_rhel_8_0_compat_len); ++ compat_props_add(m->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++ compat_props_add(m->compat_props, pc_rhel_7_6_compat, pc_rhel_7_6_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel760, "pc-i440fx-rhel7.6.0", pc_init_rhel760, ++ pc_machine_rhel760_options); ++ ++static void pc_init_rhel750(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel750_options(MachineClass *m) ++{ ++ pc_machine_rhel760_options(m); ++ m->alias = NULL; ++ m->is_default = 0; ++ m->desc = "RHEL 7.5.0 PC (i440FX + PIIX, 1996)"; ++ m->auto_enable_numa_with_memhp = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ compat_props_add(m->compat_props, pc_rhel_7_5_compat, pc_rhel_7_5_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel750, "pc-i440fx-rhel7.5.0", pc_init_rhel750, ++ pc_machine_rhel750_options); ++ ++static void pc_init_rhel740(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel740_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_machine_rhel750_options(m); ++ m->desc = "RHEL 7.4.0 PC (i440FX + PIIX, 1996)"; ++ m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; ++ pcmc->pc_rom_ro = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_4, hw_compat_rhel_7_4_len); ++ compat_props_add(m->compat_props, pc_rhel_7_4_compat, pc_rhel_7_4_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel740, "pc-i440fx-rhel7.4.0", pc_init_rhel740, ++ pc_machine_rhel740_options); ++ ++static void pc_init_rhel730(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel730_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_machine_rhel740_options(m); ++ m->desc = "RHEL 7.3.0 PC (i440FX + PIIX, 1996)"; ++ pcmc->linuxboot_dma_enabled = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_3, hw_compat_rhel_7_3_len); ++ compat_props_add(m->compat_props, pc_rhel_7_3_compat, pc_rhel_7_3_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel730, "pc-i440fx-rhel7.3.0", pc_init_rhel730, ++ pc_machine_rhel730_options); ++ ++ ++static void pc_init_rhel720(MachineState *machine) ++{ ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel720_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ X86MachineClass *x86mc = X86_MACHINE_CLASS(m); ++ pc_machine_rhel730_options(m); ++ m->desc = "RHEL 7.2.0 PC (i440FX + PIIX, 1996)"; ++ /* From pc_i440fx_2_5_machine_options */ ++ x86mc->save_tsc_khz = false; ++ m->legacy_fw_cfg_order = 1; ++ /* Note: broken_reserved_end was already in 7.2 */ ++ /* From pc_i440fx_2_6_machine_options */ ++ pcmc->legacy_cpu_hotplug = true; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_2, hw_compat_rhel_7_2_len); ++ compat_props_add(m->compat_props, pc_rhel_7_2_compat, pc_rhel_7_2_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel720, "pc-i440fx-rhel7.2.0", pc_init_rhel720, ++ pc_machine_rhel720_options); ++ ++static void pc_compat_rhel710(MachineState *machine) ++{ ++ PCMachineState *pcms = PC_MACHINE(machine); ++ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); ++ ++ /* From pc_compat_2_2 */ ++ pcmc->rsdp_in_ram = false; ++ machine->suppress_vmdesc = true; ++ ++ /* From pc_compat_2_1 */ ++ pcmc->smbios_uuid_encoded = false; ++ x86_cpu_change_kvm_default("svm", NULL); ++ pcmc->enforce_aligned_dimm = false; ++ ++ /* Disable all the extra subsections that were added in 2.2 */ ++ migrate_pre_2_2 = true; ++ ++ /* From pc_i440fx_2_4_machine_options */ ++ pcmc->broken_reserved_end = true; ++} ++ ++static void pc_init_rhel710(MachineState *machine) ++{ ++ pc_compat_rhel710(machine); ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel710_options(MachineClass *m) ++{ ++ pc_machine_rhel720_options(m); ++ m->family = "pc_piix_Y"; ++ m->desc = "RHEL 7.1.0 PC (i440FX + PIIX, 1996)"; ++ m->default_display = "cirrus"; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_1, hw_compat_rhel_7_1_len); ++ compat_props_add(m->compat_props, pc_rhel_7_1_compat, pc_rhel_7_1_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel710, "pc-i440fx-rhel7.1.0", pc_init_rhel710, ++ pc_machine_rhel710_options); ++ ++static void pc_compat_rhel700(MachineState *machine) ++{ ++ PCMachineState *pcms = PC_MACHINE(machine); ++ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); ++ ++ pc_compat_rhel710(machine); ++ ++ /* Upstream enables it for everyone, we're a little more selective */ ++ x86_cpu_change_kvm_default("x2apic", NULL); ++ x86_cpu_change_kvm_default("svm", NULL); ++ pcmc->legacy_acpi_table_size = 6418; /* see pc_compat_2_0() */ ++ pcmc->smbios_legacy_mode = true; ++ pcmc->has_reserved_memory = false; ++ migrate_cve_2014_5263_xhci_fields = true; ++} ++ ++static void pc_init_rhel700(MachineState *machine) ++{ ++ pc_compat_rhel700(machine); ++ pc_init1(machine, TYPE_I440FX_PCI_HOST_BRIDGE, \ ++ TYPE_I440FX_PCI_DEVICE); ++} ++ ++static void pc_machine_rhel700_options(MachineClass *m) ++{ ++ pc_machine_rhel710_options(m); ++ m->family = "pc_piix_Y"; ++ m->desc = "RHEL 7.0.0 PC (i440FX + PIIX, 1996)"; ++ compat_props_add(m->compat_props, pc_rhel_7_0_compat, pc_rhel_7_0_compat_len); ++} ++ ++DEFINE_PC_MACHINE(rhel700, "pc-i440fx-rhel7.0.0", pc_init_rhel700, ++ pc_machine_rhel700_options); +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 385e5cffb1..7531d8ed76 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -197,8 +197,8 @@ static void pc_q35_init(MachineState *machine) + + if (pcmc->smbios_defaults) { + /* These values are guest ABI, do not change */ +- smbios_set_defaults("QEMU", "Standard PC (Q35 + ICH9, 2009)", +- mc->name, pcmc->smbios_legacy_mode, ++ smbios_set_defaults("Red Hat", "KVM", ++ mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, + SMBIOS_ENTRY_POINT_21); + } +@@ -330,6 +330,7 @@ static void pc_q35_init(MachineState *machine) + DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn) + + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void pc_q35_machine_options(MachineClass *m) + { + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); +@@ -533,3 +534,154 @@ static void pc_q35_2_4_machine_options(MachineClass *m) + + DEFINE_Q35_MACHINE(v2_4, "pc-q35-2.4", NULL, + pc_q35_2_4_machine_options); ++#endif /* Disabled for Red Hat Enterprise Linux */ ++ ++/* Red Hat Enterprise Linux machine types */ ++ ++/* Options for the latest rhel q35 machine type */ ++static void pc_q35_machine_rhel_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pcmc->default_nic_model = "e1000e"; ++ m->family = "pc_q35_Z"; ++ m->units_per_default_bus = 1; ++ m->default_machine_opts = "firmware=bios-256k.bin"; ++ m->default_display = "std"; ++ m->no_floppy = 1; ++ m->no_parallel = 1; ++ pcmc->default_cpu_version = 1; ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); ++ machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); ++ m->alias = "q35"; ++ m->max_cpus = 384; ++ compat_props_add(m->compat_props, pc_rhel_compat, pc_rhel_compat_len); ++} ++ ++static void pc_q35_init_rhel820(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel820_options(MachineClass *m) ++{ ++ pc_q35_machine_rhel_options(m); ++ m->desc = "RHEL-8.2.0 PC (Q35 + ICH9, 2009)"; ++} ++ ++DEFINE_PC_MACHINE(q35_rhel820, "pc-q35-rhel8.2.0", pc_q35_init_rhel820, ++ pc_q35_machine_rhel820_options); ++ ++static void pc_q35_init_rhel810(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel810_options(MachineClass *m) ++{ ++ pc_q35_machine_rhel820_options(m); ++ m->desc = "RHEL-8.1.0 PC (Q35 + ICH9, 2009)"; ++ m->alias = NULL; ++ compat_props_add(m->compat_props, hw_compat_rhel_8_1, hw_compat_rhel_8_1_len); ++ compat_props_add(m->compat_props, pc_rhel_8_1_compat, pc_rhel_8_1_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel810, "pc-q35-rhel8.1.0", pc_q35_init_rhel810, ++ pc_q35_machine_rhel810_options); ++ ++static void pc_q35_init_rhel800(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel800_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel810_options(m); ++ m->desc = "RHEL-8.0.0 PC (Q35 + ICH9, 2009)"; ++ m->smbus_no_migration_support = true; ++ m->alias = NULL; ++ pcmc->pvh_enabled = false; ++ pcmc->default_cpu_version = CPU_VERSION_LEGACY; ++ compat_props_add(m->compat_props, hw_compat_rhel_8_0, hw_compat_rhel_8_0_len); ++ compat_props_add(m->compat_props, pc_rhel_8_0_compat, pc_rhel_8_0_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel800, "pc-q35-rhel8.0.0", pc_q35_init_rhel800, ++ pc_q35_machine_rhel800_options); ++ ++static void pc_q35_init_rhel760(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel760_options(MachineClass *m) ++{ ++ pc_q35_machine_rhel800_options(m); ++ m->alias = NULL; ++ m->desc = "RHEL-7.6.0 PC (Q35 + ICH9, 2009)"; ++ m->async_pf_vmexit_disable = true; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); ++ compat_props_add(m->compat_props, pc_rhel_7_6_compat, pc_rhel_7_6_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel760, "pc-q35-rhel7.6.0", pc_q35_init_rhel760, ++ pc_q35_machine_rhel760_options); ++ ++static void pc_q35_init_rhel750(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel750_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel760_options(m); ++ m->alias = NULL; ++ m->desc = "RHEL-7.5.0 PC (Q35 + ICH9, 2009)"; ++ m->auto_enable_numa_with_memhp = false; ++ pcmc->default_nic_model = "e1000"; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_5, hw_compat_rhel_7_5_len); ++ compat_props_add(m->compat_props, pc_rhel_7_5_compat, pc_rhel_7_5_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel750, "pc-q35-rhel7.5.0", pc_q35_init_rhel750, ++ pc_q35_machine_rhel750_options); ++ ++static void pc_q35_init_rhel740(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel740_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel750_options(m); ++ m->desc = "RHEL-7.4.0 PC (Q35 + ICH9, 2009)"; ++ m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; ++ pcmc->pc_rom_ro = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_4, hw_compat_rhel_7_4_len); ++ compat_props_add(m->compat_props, pc_rhel_7_4_compat, pc_rhel_7_4_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel740, "pc-q35-rhel7.4.0", pc_q35_init_rhel740, ++ pc_q35_machine_rhel740_options); ++ ++static void pc_q35_init_rhel730(MachineState *machine) ++{ ++ pc_q35_init(machine); ++} ++ ++static void pc_q35_machine_rhel730_options(MachineClass *m) ++{ ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); ++ pc_q35_machine_rhel740_options(m); ++ m->desc = "RHEL-7.3.0 PC (Q35 + ICH9, 2009)"; ++ m->max_cpus = 255; ++ pcmc->linuxboot_dma_enabled = false; ++ compat_props_add(m->compat_props, hw_compat_rhel_7_3, hw_compat_rhel_7_3_len); ++ compat_props_add(m->compat_props, pc_rhel_7_3_compat, pc_rhel_7_3_compat_len); ++} ++ ++DEFINE_PC_MACHINE(q35_rhel730, "pc-q35-rhel7.3.0", pc_q35_init_rhel730, ++ pc_q35_machine_rhel730_options); +diff --git a/include/hw/boards.h b/include/hw/boards.h +index 6f85a0e032..2920bdef5b 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -222,6 +222,8 @@ struct MachineClass { + const char **valid_cpu_types; + strList *allowed_dynamic_sysbus_devices; + bool auto_enable_numa_with_memhp; ++ /* RHEL only */ ++ bool async_pf_vmexit_disable; + void (*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size); + bool ignore_boot_device_suffixes; +diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h +index 1f86eba3f9..2e362c8faa 100644 +--- a/include/hw/i386/pc.h ++++ b/include/hw/i386/pc.h +@@ -124,6 +124,9 @@ typedef struct PCMachineClass { + + /* use PVH to load kernels that support this feature */ + bool pvh_enabled; ++ ++ /* RH only, see bz 1489800 */ ++ bool pc_rom_ro; + } PCMachineClass; + + #define TYPE_PC_MACHINE "generic-pc-machine" +@@ -300,6 +303,36 @@ extern const size_t pc_compat_1_5_len; + extern GlobalProperty pc_compat_1_4[]; + extern const size_t pc_compat_1_4_len; + ++extern GlobalProperty pc_rhel_compat[]; ++extern const size_t pc_rhel_compat_len; ++ ++extern GlobalProperty pc_rhel_8_1_compat[]; ++extern const size_t pc_rhel_8_1_compat_len; ++ ++extern GlobalProperty pc_rhel_8_0_compat[]; ++extern const size_t pc_rhel_8_0_compat_len; ++ ++extern GlobalProperty pc_rhel_7_6_compat[]; ++extern const size_t pc_rhel_7_6_compat_len; ++ ++extern GlobalProperty pc_rhel_7_5_compat[]; ++extern const size_t pc_rhel_7_5_compat_len; ++ ++extern GlobalProperty pc_rhel_7_4_compat[]; ++extern const size_t pc_rhel_7_4_compat_len; ++ ++extern GlobalProperty pc_rhel_7_3_compat[]; ++extern const size_t pc_rhel_7_3_compat_len; ++ ++extern GlobalProperty pc_rhel_7_2_compat[]; ++extern const size_t pc_rhel_7_2_compat_len; ++ ++extern GlobalProperty pc_rhel_7_1_compat[]; ++extern const size_t pc_rhel_7_1_compat_len; ++ ++extern GlobalProperty pc_rhel_7_0_compat[]; ++extern const size_t pc_rhel_7_0_compat_len; ++ + /* Helper for setting model-id for CPU models that changed model-id + * depending on QEMU versions up to QEMU 2.4. + */ +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 1b7880ae3a..790db778ab 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1829,11 +1829,17 @@ static CPUCaches epyc_cache_info = { + + static X86CPUDefinition builtin_x86_defs[] = { + { ++ /* qemu64 is the default CPU model for all *-rhel7.* machine-types. ++ * The default on RHEL-6 was cpu64-rhel6. ++ * libvirt assumes that qemu64 is the default for _all_ machine-types, ++ * so we should try to keep qemu64 and cpu64-rhel6 as similar as ++ * possible. ++ */ + .name = "qemu64", + .level = 0xd, + .vendor = CPUID_VENDOR_AMD, + .family = 6, +- .model = 6, ++ .model = 13, + .stepping = 3, + .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | + CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | +@@ -3932,6 +3938,7 @@ static PropValue kvm_default_props[] = { + { "acpi", "off" }, + { "monitor", "off" }, + { "svm", "off" }, ++ { "kvm-pv-unhalt", "on" }, + { NULL, NULL }, + }; + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 1d10046a6c..86d9a1f364 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -3079,6 +3079,7 @@ static int kvm_get_msrs(X86CPU *cpu) + struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries; + int ret, i; + uint64_t mtrr_top_bits; ++ MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); + + kvm_msr_buf_reset(cpu); + +@@ -3388,6 +3389,9 @@ static int kvm_get_msrs(X86CPU *cpu) + break; + case MSR_KVM_ASYNC_PF_EN: + env->async_pf_en_msr = msrs[i].data; ++ if (mc->async_pf_vmexit_disable) { ++ env->async_pf_en_msr &= ~(1ULL << 2); ++ } + break; + case MSR_KVM_PV_EOI_EN: + env->pv_eoi_en_msr = msrs[i].data; +-- +2.21.0 + diff --git a/0012-Enable-make-check.patch b/0012-Enable-make-check.patch new file mode 100755 index 0000000000000000000000000000000000000000..09f7b4e35dbb46c14030beecb41263dcaa5e9de9 --- /dev/null +++ b/0012-Enable-make-check.patch @@ -0,0 +1,307 @@ +From 154215041df085271a780a2989f4f481226e3e34 Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 19 Oct 2018 13:48:41 +0200 +Subject: Enable make check + +Fixing tests after device disabling and machine types changes and enabling +make check run during build. + +Signed-off-by: Miroslav Rezanina + +Rebase changes (4.0.0): +- Remove testing for pseries-2.7 in endianess test +- Disable device-plug-test on s390x as it use disabled device +- Do not run cpu-plug-tests on 7.3 and older machine types + +Rebase changes (4.1.0-rc0): +- removed iotests 068 + +Rebase changes (4.1.0-rc1): +- remove all 205 tests (unstable) + +Rebase changes (4.2.0-rc0): +- partially disable hd-geo-test (requires lsi53c895a) + +Merged patches (4.0.0): +- f7ffd13 Remove 7 qcow2 and luks iotests that are taking > 25 sec to run during the fast train build proce + +Merged patches (4.1.0-rc0): +- 41288ff redhat: Remove raw iotest 205 + +Signed-off-by: Danilo C. L. de Paula +--- + redhat/qemu-kvm.spec.template | 2 +- + tests/Makefile.include | 10 +++++----- + tests/boot-serial-test.c | 6 +++++- + tests/cpu-plug-test.c | 4 ++-- + tests/e1000-test.c | 2 ++ + tests/hd-geo-test.c | 4 ++++ + tests/prom-env-test.c | 4 ++++ + tests/qemu-iotests/051 | 12 ++++++------ + tests/qemu-iotests/group | 4 ++-- + tests/test-x86-cpuid-compat.c | 2 ++ + tests/usb-hcd-xhci-test.c | 4 ++++ + 11 files changed, 37 insertions(+), 17 deletions(-) + +diff --git a/tests/Makefile.include b/tests/Makefile.include +index b483790cf3..53bdbdfee0 100644 +--- a/tests/Makefile.include ++++ b/tests/Makefile.include +@@ -172,7 +172,7 @@ check-qtest-i386-y += tests/ide-test$(EXESUF) + check-qtest-i386-y += tests/ahci-test$(EXESUF) + check-qtest-i386-y += tests/hd-geo-test$(EXESUF) + check-qtest-i386-y += tests/boot-order-test$(EXESUF) +-check-qtest-i386-y += tests/bios-tables-test$(EXESUF) ++#check-qtest-i386-y += tests/bios-tables-test$(EXESUF) + check-qtest-i386-$(CONFIG_SGA) += tests/boot-serial-test$(EXESUF) + check-qtest-i386-$(CONFIG_SLIRP) += tests/pxe-test$(EXESUF) + check-qtest-i386-y += tests/rtc-test$(EXESUF) +@@ -230,7 +230,7 @@ check-qtest-mips64el-$(CONFIG_VGA) += tests/display-vga-test$(EXESUF) + check-qtest-moxie-y += tests/boot-serial-test$(EXESUF) + + check-qtest-ppc-$(CONFIG_ISA_TESTDEV) = tests/endianness-test$(EXESUF) +-check-qtest-ppc-y += tests/boot-order-test$(EXESUF) ++#check-qtest-ppc-y += tests/boot-order-test$(EXESUF) + check-qtest-ppc-y += tests/prom-env-test$(EXESUF) + check-qtest-ppc-y += tests/drive_del-test$(EXESUF) + check-qtest-ppc-y += tests/boot-serial-test$(EXESUF) +@@ -244,8 +244,8 @@ check-qtest-ppc64-$(CONFIG_PSERIES) += tests/rtas-test$(EXESUF) + check-qtest-ppc64-$(CONFIG_SLIRP) += tests/pxe-test$(EXESUF) + check-qtest-ppc64-$(CONFIG_USB_UHCI) += tests/usb-hcd-uhci-test$(EXESUF) + check-qtest-ppc64-$(CONFIG_USB_XHCI_NEC) += tests/usb-hcd-xhci-test$(EXESUF) +-check-qtest-ppc64-$(CONFIG_SLIRP) += tests/test-netfilter$(EXESUF) +-check-qtest-ppc64-$(CONFIG_POSIX) += tests/test-filter-mirror$(EXESUF) ++#check-qtest-ppc64-$(CONFIG_SLIRP) += tests/test-netfilter$(EXESUF) ++#check-qtest-ppc64-$(CONFIG_POSIX) += tests/test-filter-mirror$(EXESUF) + check-qtest-ppc64-$(CONFIG_RTL8139_PCI) += tests/test-filter-redirector$(EXESUF) + check-qtest-ppc64-$(CONFIG_VGA) += tests/display-vga-test$(EXESUF) + check-qtest-ppc64-y += tests/numa-test$(EXESUF) +@@ -291,7 +291,7 @@ check-qtest-s390x-$(CONFIG_SLIRP) += tests/test-netfilter$(EXESUF) + check-qtest-s390x-$(CONFIG_POSIX) += tests/test-filter-mirror$(EXESUF) + check-qtest-s390x-$(CONFIG_POSIX) += tests/test-filter-redirector$(EXESUF) + check-qtest-s390x-y += tests/drive_del-test$(EXESUF) +-check-qtest-s390x-y += tests/device-plug-test$(EXESUF) ++#check-qtest-s390x-y += tests/device-plug-test$(EXESUF) + check-qtest-s390x-y += tests/virtio-ccw-test$(EXESUF) + check-qtest-s390x-y += tests/cpu-plug-test$(EXESUF) + check-qtest-s390x-y += tests/migration-test$(EXESUF) +diff --git a/tests/boot-serial-test.c b/tests/boot-serial-test.c +index d3a54a0ba5..33ce72b89c 100644 +--- a/tests/boot-serial-test.c ++++ b/tests/boot-serial-test.c +@@ -108,19 +108,23 @@ static testdef_t tests[] = { + { "ppc", "g3beige", "", "PowerPC,750" }, + { "ppc", "mac99", "", "PowerPC,G4" }, + { "ppc", "sam460ex", "-m 256", "DRAM: 256 MiB" }, ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + { "ppc64", "ppce500", "", "U-Boot" }, + { "ppc64", "40p", "-m 192", "Memory: 192M" }, + { "ppc64", "mac99", "", "PowerPC,970FX" }, ++#endif + { "ppc64", "pseries", + "-machine cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken", + "Open Firmware" }, ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + { "ppc64", "powernv8", "", "OPAL" }, + { "ppc64", "powernv9", "", "OPAL" }, + { "ppc64", "sam460ex", "-device e1000", "8086 100e" }, ++#endif + { "i386", "isapc", "-cpu qemu32 -device sga", "SGABIOS" }, + { "i386", "pc", "-device sga", "SGABIOS" }, + { "i386", "q35", "-device sga", "SGABIOS" }, +- { "x86_64", "isapc", "-cpu qemu32 -device sga", "SGABIOS" }, ++ { "x86_64", "pc", "-cpu qemu32 -device sga", "SGABIOS" }, + { "x86_64", "q35", "-device sga", "SGABIOS" }, + { "sparc", "LX", "", "TMS390S10" }, + { "sparc", "SS-4", "", "MB86904" }, +diff --git a/tests/cpu-plug-test.c b/tests/cpu-plug-test.c +index 30e514bbfb..a04beae1c6 100644 +--- a/tests/cpu-plug-test.c ++++ b/tests/cpu-plug-test.c +@@ -185,8 +185,8 @@ static void add_pseries_test_case(const char *mname) + char *path; + PlugTestData *data; + +- if (!g_str_has_prefix(mname, "pseries-") || +- (g_str_has_prefix(mname, "pseries-2.") && atoi(&mname[10]) < 7)) { ++ if (!g_str_has_prefix(mname, "pseries-rhel") || ++ (g_str_has_prefix(mname, "pseries-rhel7.") && atoi(&mname[14]) < 4)) { + return; + } + data = g_new(PlugTestData, 1); +diff --git a/tests/e1000-test.c b/tests/e1000-test.c +index c387984ef6..c89112d6f8 100644 +--- a/tests/e1000-test.c ++++ b/tests/e1000-test.c +@@ -22,9 +22,11 @@ struct QE1000 { + + static const char *models[] = { + "e1000", ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + "e1000-82540em", + "e1000-82544gc", + "e1000-82545em", ++#endif + }; + + static void *e1000_get_driver(void *obj, const char *interface) +diff --git a/tests/hd-geo-test.c b/tests/hd-geo-test.c +index 7e86c5416c..cc068bad87 100644 +--- a/tests/hd-geo-test.c ++++ b/tests/hd-geo-test.c +@@ -732,6 +732,7 @@ static void test_override_ide(void) + test_override(args, expected); + } + ++#if 0 /* Require lsi53c895a - not supported on RHEL */ + static void test_override_scsi(void) + { + TestArgs *args = create_args(); +@@ -776,6 +777,7 @@ static void test_override_scsi_2_controllers(void) + add_scsi_disk(args, 3, 1, 0, 1, 2, 0, 1, 0); + test_override(args, expected); + } ++#endif + + static void test_override_virtio_blk(void) + { +@@ -951,9 +953,11 @@ int main(int argc, char **argv) + qtest_add_func("hd-geo/ide/device/user/chst", test_ide_device_user_chst); + if (have_qemu_img()) { + qtest_add_func("hd-geo/override/ide", test_override_ide); ++#if 0 /* Require lsi53c895a - not supported on RHEL */ + qtest_add_func("hd-geo/override/scsi", test_override_scsi); + qtest_add_func("hd-geo/override/scsi_2_controllers", + test_override_scsi_2_controllers); ++#endif + qtest_add_func("hd-geo/override/virtio_blk", test_override_virtio_blk); + qtest_add_func("hd-geo/override/zero_chs", test_override_zero_chs); + qtest_add_func("hd-geo/override/scsi_hot_unplug", +diff --git a/tests/prom-env-test.c b/tests/prom-env-test.c +index 61bc1d1e7b..028d45c7d7 100644 +--- a/tests/prom-env-test.c ++++ b/tests/prom-env-test.c +@@ -88,10 +88,14 @@ int main(int argc, char *argv[]) + if (!strcmp(arch, "ppc")) { + add_tests(ppc_machines); + } else if (!strcmp(arch, "ppc64")) { ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + add_tests(ppc_machines); + if (g_test_slow()) { ++#endif + qtest_add_data_func("prom-env/pseries", "pseries", test_machine); ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + } ++#endif + } else if (!strcmp(arch, "sparc")) { + add_tests(sparc_machines); + } else if (!strcmp(arch, "sparc64")) { +diff --git a/tests/qemu-iotests/051 b/tests/qemu-iotests/051 +index 53bcdbc911..b387e0c233 100755 +--- a/tests/qemu-iotests/051 ++++ b/tests/qemu-iotests/051 +@@ -181,11 +181,11 @@ run_qemu -drive if=virtio + case "$QEMU_DEFAULT_MACHINE" in + pc) + run_qemu -drive if=none,id=disk -device ide-cd,drive=disk +- run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-cd,drive=disk ++# run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-cd,drive=disk + run_qemu -drive if=none,id=disk -device ide-drive,drive=disk + run_qemu -drive if=none,id=disk -device ide-hd,drive=disk +- run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-disk,drive=disk +- run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-hd,drive=disk ++# run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-disk,drive=disk ++# run_qemu -drive if=none,id=disk -device lsi53c895a -device scsi-hd,drive=disk + ;; + *) + ;; +@@ -234,11 +234,11 @@ run_qemu -drive file="$TEST_IMG",if=virtio,readonly=on + case "$QEMU_DEFAULT_MACHINE" in + pc) + run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device ide-cd,drive=disk +- run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-cd,drive=disk ++# run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-cd,drive=disk + run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device ide-drive,drive=disk + run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device ide-hd,drive=disk +- run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-disk,drive=disk +- run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-hd,drive=disk ++# run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-disk,drive=disk ++# run_qemu -drive file="$TEST_IMG",if=none,id=disk,readonly=on -device lsi53c895a -device scsi-hd,drive=disk + ;; + *) + ;; +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 6b10a6a762..06cc734b26 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -92,7 +92,7 @@ + 068 rw quick + 069 rw auto quick + 070 rw quick +-071 rw auto quick ++# 071 rw auto quick -- requires whitelisted blkverify + 072 rw auto quick + 073 rw auto quick + 074 rw auto quick +@@ -120,7 +120,7 @@ + 096 rw quick + 097 rw auto backing + 098 rw auto backing quick +-099 rw auto quick ++# 099 rw auto quick -- requires whitelisted blkverify + # 100 was removed, do not reuse + 101 rw quick + 102 rw quick +diff --git a/tests/test-x86-cpuid-compat.c b/tests/test-x86-cpuid-compat.c +index 772287bdb4..e7c075ed98 100644 +--- a/tests/test-x86-cpuid-compat.c ++++ b/tests/test-x86-cpuid-compat.c +@@ -300,6 +300,7 @@ int main(int argc, char **argv) + "-cpu 486,xlevel2=0xC0000002,+xstore", + "xlevel2", 0xC0000002); + ++#if 0 /* Disabled in Red Hat Enterprise Linux */ + /* Check compatibility of old machine-types that didn't + * auto-increase level/xlevel/xlevel2: */ + +@@ -350,6 +351,7 @@ int main(int argc, char **argv) + add_cpuid_test("x86/cpuid/xlevel-compat/pc-i440fx-2.4/npt-on", + "-machine pc-i440fx-2.4 -cpu SandyBridge,+npt", + "xlevel", 0x80000008); ++#endif + + /* Test feature parsing */ + add_feature_test("x86/cpuid/features/plus", +diff --git a/tests/usb-hcd-xhci-test.c b/tests/usb-hcd-xhci-test.c +index 10ef9d2a91..3855873050 100644 +--- a/tests/usb-hcd-xhci-test.c ++++ b/tests/usb-hcd-xhci-test.c +@@ -21,6 +21,7 @@ static void test_xhci_hotplug(void) + usb_test_hotplug(global_qtest, "xhci", "1", NULL); + } + ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + static void test_usb_uas_hotplug(void) + { + QTestState *qts = global_qtest; +@@ -36,6 +37,7 @@ static void test_usb_uas_hotplug(void) + qtest_qmp_device_del(qts, "scsihd"); + qtest_qmp_device_del(qts, "uas"); + } ++#endif + + static void test_usb_ccid_hotplug(void) + { +@@ -56,7 +58,9 @@ int main(int argc, char **argv) + + qtest_add_func("/xhci/pci/init", test_xhci_init); + qtest_add_func("/xhci/pci/hotplug", test_xhci_hotplug); ++#if 0 /* Disabled for Red Hat Enterprise Linux */ + qtest_add_func("/xhci/pci/hotplug/usb-uas", test_usb_uas_hotplug); ++#endif + qtest_add_func("/xhci/pci/hotplug/usb-ccid", test_usb_ccid_hotplug); + + qtest_start("-device nec-usb-xhci,id=xhci" +-- +2.21.0 + diff --git a/0013-vfio-cap-number-of-devices-that-can-be-assigned.patch b/0013-vfio-cap-number-of-devices-that-can-be-assigned.patch new file mode 100755 index 0000000000000000000000000000000000000000..db776c4e4dafa98a978b099a1159efc0c118c6bd --- /dev/null +++ b/0013-vfio-cap-number-of-devices-that-can-be-assigned.patch @@ -0,0 +1,114 @@ +From de433da59448eaad4ac1b902d07d57b57f922aff Mon Sep 17 00:00:00 2001 +From: Bandan Das +Date: Tue, 3 Dec 2013 20:05:13 +0100 +Subject: vfio: cap number of devices that can be assigned + +RH-Author: Bandan Das +Message-id: <1386101113-31560-3-git-send-email-bsd@redhat.com> +Patchwork-id: 55984 +O-Subject: [PATCH RHEL7 qemu-kvm v2 2/2] vfio: cap number of devices that can be assigned +Bugzilla: 678368 +RH-Acked-by: Alex Williamson +RH-Acked-by: Marcelo Tosatti +RH-Acked-by: Michael S. Tsirkin + +Go through all groups to get count of total number of devices +active to enforce limit + +Reasoning from Alex for the limit(32) - Assuming 3 slots per +device, with 125 slots (number of memory slots for RHEL 7), +we can support almost 40 devices and still have few slots left +for other uses. Stepping down a bit, the number 32 arbitrarily +matches the number of slots on a PCI bus and is also a nice power +of two. + +Signed-off-by: Bandan Das + +Rebase notes (2.8.0): +- removed return value for vfio_realize (commit 1a22aca) + +Merged patches (2.9.0): +- 17eb774 vfio: Use error_setg when reporting max assigned device overshoot + + Merged patches (4.1.0-rc3): +- 2b89558 vfio: increase the cap on number of assigned devices to 64 + +(cherry picked from commit 9fa3c9fc6dfcde76d80db1aa601b2d577f72ceec) +(cherry picked from commit 3cb35556dc7d994f203d732fe952f95fcdb03c0a) +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/pci.c | 29 ++++++++++++++++++++++++++++- + hw/vfio/pci.h | 1 + + 2 files changed, 29 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index c8534d3035..309535f306 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -47,6 +47,9 @@ + + #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" + ++/* RHEL only: Set once for the first assigned dev */ ++static uint16_t device_limit; ++ + static void vfio_disable_interrupts(VFIOPCIDevice *vdev); + static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); + +@@ -2722,9 +2725,30 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + ssize_t len; + struct stat st; + int groupid; +- int i, ret; ++ int ret, i = 0; + bool is_mdev; + ++ if (device_limit && device_limit != vdev->assigned_device_limit) { ++ error_setg(errp, "Assigned device limit has been redefined. " ++ "Old:%d, New:%d", ++ device_limit, vdev->assigned_device_limit); ++ return; ++ } else { ++ device_limit = vdev->assigned_device_limit; ++ } ++ ++ QLIST_FOREACH(group, &vfio_group_list, next) { ++ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { ++ i++; ++ } ++ } ++ ++ if (i >= vdev->assigned_device_limit) { ++ error_setg(errp, "Maximum supported vfio devices (%d) " ++ "already attached", vdev->assigned_device_limit); ++ return; ++ } ++ + if (!vdev->vbasedev.sysfsdev) { + if (!(~vdev->host.domain || ~vdev->host.bus || + ~vdev->host.slot || ~vdev->host.function)) { +@@ -3167,6 +3191,9 @@ static Property vfio_pci_dev_properties[] = { + DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), + DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice, + no_geforce_quirks, false), ++ /* RHEL only */ ++ DEFINE_PROP_UINT16("x-assigned-device-limit", VFIOPCIDevice, ++ assigned_device_limit, 64), + DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd, + false), + DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd, +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 35626cd63e..0cd4803aee 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -135,6 +135,7 @@ typedef struct VFIOPCIDevice { + EventNotifier err_notifier; + EventNotifier req_notifier; + int (*resetfn)(struct VFIOPCIDevice *); ++ uint16_t assigned_device_limit; + uint32_t vendor_id; + uint32_t device_id; + uint32_t sub_vendor_id; +-- +2.21.0 + diff --git a/0014-Add-support-statement-to-help-output.patch b/0014-Add-support-statement-to-help-output.patch new file mode 100755 index 0000000000000000000000000000000000000000..cb77bfe69c89efc84bd8c4f38f7456817ee852d0 --- /dev/null +++ b/0014-Add-support-statement-to-help-output.patch @@ -0,0 +1,58 @@ +From 2754dd8da8975757753fd491985d5e7b36966106 Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Wed, 4 Dec 2013 18:53:17 +0100 +Subject: Add support statement to -help output + +RH-Author: Eduardo Habkost +Message-id: <1386183197-27761-1-git-send-email-ehabkost@redhat.com> +Patchwork-id: 55994 +O-Subject: [qemu-kvm RHEL7 PATCH] Add support statement to -help output +Bugzilla: 972773 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: knoel@redhat.com +RH-Acked-by: Paolo Bonzini + +Add support statement to -help output, reporting direct qemu-kvm usage +as unsupported by Red Hat, and advising users to use libvirt instead. + +Signed-off-by: Eduardo Habkost +(cherry picked from commit 2a07700936e39856cc9f149c6a6517f0715536a6) +(cherry picked from commit 5dd2f4706e2fef945771949e59a8fcc1b5452de9) +Signed-off-by: Danilo C. L. de Paula +--- + vl.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/vl.c b/vl.c +index 668a34577e..9f3e7e7733 100644 +--- a/vl.c ++++ b/vl.c +@@ -1822,9 +1822,17 @@ static void version(void) + QEMU_COPYRIGHT "\n"); + } + ++static void print_rh_warning(void) ++{ ++ printf("\nWARNING: Direct use of qemu-kvm from the command line is not supported by Red Hat.\n" ++ "WARNING: Use libvirt as the stable management interface.\n" ++ "WARNING: Some command line options listed here may not be available in future releases.\n\n"); ++} ++ + static void help(int exitcode) + { + version(); ++ print_rh_warning(); + printf("usage: %s [options] [disk_image]\n\n" + "'disk_image' is a raw hard disk image for IDE hard disk 0\n\n", + error_get_progname()); +@@ -1841,6 +1849,7 @@ static void help(int exitcode) + "\n" + QEMU_HELP_BOTTOM "\n"); + ++ print_rh_warning(); + exit(exitcode); + } + +-- +2.21.0 + diff --git a/0015-globally-limit-the-maximum-number-of-CPUs.patch b/0015-globally-limit-the-maximum-number-of-CPUs.patch new file mode 100755 index 0000000000000000000000000000000000000000..cec862d547d6b2d91a9eaa28a7d64293e616ce4f --- /dev/null +++ b/0015-globally-limit-the-maximum-number-of-CPUs.patch @@ -0,0 +1,152 @@ +From c9c3cf721b0e9e359418f64c2a5121c3f8b5d27a Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Tue, 21 Jan 2014 10:46:52 +0100 +Subject: globally limit the maximum number of CPUs + +We now globally limit the number of VCPUs. +Especially, there is no way one can specify more than +max_cpus VCPUs for a VM. + +This allows us the restore the ppc max_cpus limitation to the upstream +default and minimize the ppc hack in kvm-all.c. + +Signed-off-by: David Hildenbrand +Signed-off-by: Miroslav Rezanina +Signed-off-by: Danilo Cesar Lemes de Paula + +Rebase notes (2.11.0): +- Removed CONFIG_RHV reference +- Update commit log + +Merged patches (2.11.0): +- 92fef14623 redhat: remove manual max_cpus limitations for ppc +- bb722e9eff redhat: globally limit the maximum number of CPUs +- fdeef3c1c7 RHEL: Set vcpus hard limit to 240 for Power +- 0584216921 Match POWER max cpus to x86 + +Signed-off-by: Andrew Jones +(cherry picked from commit a4ceb63bdc5cbac19f5f633ec761b9de0dedb55e) +(cherry picked from commit a1f26d85171b4d554225150053700e93ba6eba10) + +redhat: globally limit the maximum number of CPUs + +RH-Author: David Hildenbrand +Message-id: <20180109103253.24517-2-david@redhat.com> +Patchwork-id: 78531 +O-Subject: [RHEL-7.5 qemu-kvm-ma PATCH v2 1/2] redhat: globally limit the maximum number of CPUs +Bugzilla: 1527449 +RH-Acked-by: David Gibson +RH-Acked-by: Thomas Huth +RH-Acked-by: Cornelia Huck + +Upstream-status: n/a + +For RHEL, we support 240, for RHV up to 384 VCPUs. Let's limit this +globally instead of fixing up all machines. This way, we can easily +change (increase) the product specific levels later. + +Signed-off-by: David Hildenbrand +Signed-off-by: Miroslav Rezanina + +redhat: remove manual max_cpus limitations for ppc + +RH-Author: David Hildenbrand +Message-id: <20180109103253.24517-3-david@redhat.com> +Patchwork-id: 78532 +O-Subject: [RHEL-7.5 qemu-kvm-ma PATCH v2 2/2] redhat: remove manual max_cpus limitations for ppc +Bugzilla: 1527449 +RH-Acked-by: David Gibson +RH-Acked-by: Thomas Huth +RH-Acked-by: Cornelia Huck + +Upstream-status: n/a + +RH-Author: Andrew Jones +Message-id: <1390301212-15344-1-git-send-email-drjones@redhat.com> +Patchwork-id: 56862 +O-Subject: [RHEL7.0 qemu-kvm PATCH v6] use recommended max vcpu count +Bugzilla: 998708 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Marcelo Tosatti + +The recommended vcpu max limit (KVM_CAP_NR_VCPUS) should be used instead +of the actual max vcpu limit (KVM_CAP_MAX_VCPUS) to give an error. + +This commit matches the limit to current KVM_CAP_NR_VCPUS value. + +Signed-off-by: Danilo C. L. de Paula +--- + accel/kvm/kvm-all.c | 12 ++++++++++++ + vl.c | 18 ++++++++++++++++++ + 2 files changed, 30 insertions(+) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index ca00daa2f5..dc3ed7f04e 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -1943,6 +1943,18 @@ static int kvm_init(MachineState *ms) + soft_vcpus_limit = kvm_recommended_vcpus(s); + hard_vcpus_limit = kvm_max_vcpus(s); + ++#ifdef HOST_PPC64 ++ /* ++ * On POWER, the kernel advertises a soft limit based on the ++ * number of CPU threads on the host. We want to allow exceeding ++ * this for testing purposes, so we don't want to set hard limit ++ * to soft limit as on x86. ++ */ ++#else ++ /* RHEL doesn't support nr_vcpus > soft_vcpus_limit */ ++ hard_vcpus_limit = soft_vcpus_limit; ++#endif ++ + while (nc->name) { + if (nc->num > soft_vcpus_limit) { + warn_report("Number of %s cpus requested (%d) exceeds " +diff --git a/vl.c b/vl.c +index 9f3e7e7733..1550aa2aaa 100644 +--- a/vl.c ++++ b/vl.c +@@ -134,6 +134,8 @@ int main(int argc, char **argv) + + #define MAX_VIRTIO_CONSOLES 1 + ++#define RHEL_MAX_CPUS 384 ++ + static const char *data_dir[16]; + static int data_dir_idx; + const char *bios_name = NULL; +@@ -1339,6 +1341,20 @@ static MachineClass *find_default_machine(GSList *machines) + return NULL; + } + ++/* Maximum number of CPUs limited for Red Hat Enterprise Linux */ ++static void limit_max_cpus_in_machines(void) ++{ ++ GSList *el, *machines = object_class_get_list(TYPE_MACHINE, false); ++ ++ for (el = machines; el; el = el->next) { ++ MachineClass *mc = el->data; ++ ++ if (mc->max_cpus > RHEL_MAX_CPUS) { ++ mc->max_cpus = RHEL_MAX_CPUS; ++ } ++ } ++} ++ + static int machine_help_func(QemuOpts *opts, MachineState *machine) + { + ObjectProperty *prop; +@@ -3857,6 +3873,8 @@ int main(int argc, char **argv, char **envp) + "mutually exclusive"); + exit(EXIT_FAILURE); + } ++ /* Maximum number of CPUs limited for Red Hat Enterprise Linux */ ++ limit_max_cpus_in_machines(); + + configure_rtc(qemu_find_opts_singleton("rtc")); + +-- +2.21.0 + diff --git a/0016-Add-support-for-simpletrace.patch b/0016-Add-support-for-simpletrace.patch new file mode 100755 index 0000000000000000000000000000000000000000..962485548f314548e0348a708197606088703db8 --- /dev/null +++ b/0016-Add-support-for-simpletrace.patch @@ -0,0 +1,121 @@ +From 26128b3ede339e292a3c50a84e3248af46ecd0ec Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Thu, 8 Oct 2015 09:50:17 +0200 +Subject: Add support for simpletrace + +As simpletrace is upstream, we just need to properly handle it during rpmbuild. + +Signed-off-by: Miroslav Rezanina + +Rebase notes (3.1.0): +- Fixed python 2 to python3 switch + +Rebase notes (2.9.0): +- Added group argument for tracetool.py (upstream) + +Rebase notes (2.8.0): +- Changed tracetool.py parameters + +Merged patches (2.3.0): +- db959d6 redhat/qemu-kvm.spec.template: Install qemu-kvm-simpletrace.stp +- 5292fc3 trace: add SystemTap init scripts for simpletrace bridge +- eda9e5e simpletrace: install simpletrace.py +- 85c4c8f trace: add systemtap-initscript README file to RPM + +Signed-off-by: Danilo C. L. de Paula +--- + .gitignore | 2 ++ + Makefile | 4 +++ + README.systemtap | 43 +++++++++++++++++++++++++ + redhat/qemu-kvm.spec.template | 26 ++++++++++++++- + scripts/systemtap/conf.d/qemu_kvm.conf | 4 +++ + scripts/systemtap/script.d/qemu_kvm.stp | 1 + + 6 files changed, 79 insertions(+), 1 deletion(-) + create mode 100644 README.systemtap + create mode 100644 scripts/systemtap/conf.d/qemu_kvm.conf + create mode 100644 scripts/systemtap/script.d/qemu_kvm.stp + +diff --git a/Makefile b/Makefile +index 086727dbb9..4254950f7f 100644 +--- a/Makefile ++++ b/Makefile +@@ -939,6 +939,10 @@ endif + $(INSTALL_DATA) $(SRC_PATH)/pc-bios/keymaps/$$x "$(DESTDIR)$(qemu_datadir)/keymaps"; \ + done + $(INSTALL_DATA) $(BUILD_DIR)/trace-events-all "$(DESTDIR)$(qemu_datadir)/trace-events-all" ++ $(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)/systemtap/script.d" ++ $(INSTALL_DATA) $(SRC_PATH)/scripts/systemtap/script.d/qemu_kvm.stp "$(DESTDIR)$(qemu_datadir)/systemtap/script.d/" ++ $(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)/systemtap/conf.d" ++ $(INSTALL_DATA) $(SRC_PATH)/scripts/systemtap/conf.d/qemu_kvm.conf "$(DESTDIR)$(qemu_datadir)/systemtap/conf.d/" + + .PHONY: ctags + ctags: +diff --git a/README.systemtap b/README.systemtap +new file mode 100644 +index 0000000000..ad913fc990 +--- /dev/null ++++ b/README.systemtap +@@ -0,0 +1,43 @@ ++QEMU tracing using systemtap-initscript ++--------------------------------------- ++ ++You can capture QEMU trace data all the time using systemtap-initscript. This ++uses SystemTap's flight recorder mode to trace all running guests to a ++fixed-size buffer on the host. Old trace entries are overwritten by new ++entries when the buffer size wraps. ++ ++1. Install the systemtap-initscript package: ++ # yum install systemtap-initscript ++ ++2. Install the systemtap scripts and the conf file: ++ # cp /usr/share/qemu-kvm/systemtap/script.d/qemu_kvm.stp /etc/systemtap/script.d/ ++ # cp /usr/share/qemu-kvm/systemtap/conf.d/qemu_kvm.conf /etc/systemtap/conf.d/ ++ ++The set of trace events to enable is given in qemu_kvm.stp. This SystemTap ++script can be customized to add or remove trace events provided in ++/usr/share/systemtap/tapset/qemu-kvm-simpletrace.stp. ++ ++SystemTap customizations can be made to qemu_kvm.conf to control the flight ++recorder buffer size and whether to store traces in memory only or disk too. ++See stap(1) for option documentation. ++ ++3. Start the systemtap service. ++ # service systemtap start qemu_kvm ++ ++4. Make the service start at boot time. ++ # chkconfig systemtap on ++ ++5. Confirm that the service works. ++ # service systemtap status qemu_kvm ++ qemu_kvm is running... ++ ++When you want to inspect the trace buffer, perform the following steps: ++ ++1. Dump the trace buffer. ++ # staprun -A qemu_kvm >/tmp/trace.log ++ ++2. Start the systemtap service because the preceding step stops the service. ++ # service systemtap start qemu_kvm ++ ++3. Translate the trace record to readable format. ++ # /usr/share/qemu-kvm/simpletrace.py --no-header /usr/share/qemu-kvm/trace-events /tmp/trace.log +diff --git a/scripts/systemtap/conf.d/qemu_kvm.conf b/scripts/systemtap/conf.d/qemu_kvm.conf +new file mode 100644 +index 0000000000..372d8160a4 +--- /dev/null ++++ b/scripts/systemtap/conf.d/qemu_kvm.conf +@@ -0,0 +1,4 @@ ++# Force load uprobes (see BZ#1118352) ++stap -e 'probe process("/usr/libexec/qemu-kvm").function("main") { printf("") }' -c true ++ ++qemu_kvm_OPT="-s4" # per-CPU buffer size, in megabytes +diff --git a/scripts/systemtap/script.d/qemu_kvm.stp b/scripts/systemtap/script.d/qemu_kvm.stp +new file mode 100644 +index 0000000000..c04abf9449 +--- /dev/null ++++ b/scripts/systemtap/script.d/qemu_kvm.stp +@@ -0,0 +1 @@ ++probe qemu.kvm.simpletrace.handle_qmp_command,qemu.kvm.simpletrace.monitor_protocol_*,qemu.kvm.simpletrace.migrate_set_state {} +-- +2.21.0 + diff --git a/0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch b/0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch new file mode 100755 index 0000000000000000000000000000000000000000..ef834452678679145af334cc45aedd08f88a0060 --- /dev/null +++ b/0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch @@ -0,0 +1,118 @@ +From 97ed62562b883c384346bfef3e1c7e379f03ccab Mon Sep 17 00:00:00 2001 +From: Miroslav Rezanina +Date: Fri, 30 Nov 2018 09:11:03 +0100 +Subject: Use qemu-kvm in documentation instead of qemu-system- + +Patchwork-id: 62380 +O-Subject: [RHEV-7.1 qemu-kvm-rhev PATCHv4] Use qemu-kvm in documentation instead of qemu-system-i386 +Bugzilla: 1140620 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Markus Armbruster +RH-Acked-by: Stefan Hajnoczi + +From: Miroslav Rezanina + +We change the name and location of qemu-kvm binaries. Update documentation +to reflect this change. Only architectures available in RHEL are updated. + +Signed-off-by: Miroslav Rezanina +Signed-off-by: Danilo C. L. de Paula +--- + docs/qemu-block-drivers.texi | 2 +- + docs/qemu-cpu-models.texi | 2 +- + qemu-doc.texi | 6 +++--- + qemu-options.hx | 16 ++++++++-------- + 4 files changed, 13 insertions(+), 13 deletions(-) + +diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi +index 2c7ea49c32..5d0afb3dee 100644 +--- a/docs/qemu-block-drivers.texi ++++ b/docs/qemu-block-drivers.texi +@@ -2,7 +2,7 @@ + QEMU block driver reference manual + @c man end + +-@set qemu_system qemu-system-x86_64 ++@set qemu_system qemu-kvm + + @c man begin DESCRIPTION + +diff --git a/docs/qemu-cpu-models.texi b/docs/qemu-cpu-models.texi +index f88a1def0d..c82cf8fab7 100644 +--- a/docs/qemu-cpu-models.texi ++++ b/docs/qemu-cpu-models.texi +@@ -2,7 +2,7 @@ + QEMU / KVM CPU model configuration + @c man end + +-@set qemu_system_x86 qemu-system-x86_64 ++@set qemu_system_x86 qemu-kvm + + @c man begin DESCRIPTION + +diff --git a/qemu-doc.texi b/qemu-doc.texi +index 3ddf5c0a68..d460f8d2c0 100644 +--- a/qemu-doc.texi ++++ b/qemu-doc.texi +@@ -11,8 +11,8 @@ + @paragraphindent 0 + @c %**end of header + +-@set qemu_system qemu-system-x86_64 +-@set qemu_system_x86 qemu-system-x86_64 ++@set qemu_system qemu-kvm ++@set qemu_system_x86 qemu-kvm + + @ifinfo + @direntry +@@ -1827,7 +1827,7 @@ Set the initial VGA graphic mode. The default is 800x600x32. + Set OpenBIOS variables in NVRAM, for example: + + @example +-qemu-system-ppc -prom-env 'auto-boot?=false' \ ++qemu-kvm -prom-env 'auto-boot?=false' \ + -prom-env 'boot-device=hd:2,\yaboot' \ + -prom-env 'boot-args=conf=hd:2,\yaboot.conf' + @end example +diff --git a/qemu-options.hx b/qemu-options.hx +index fc17aca631..df1d27b6f2 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -2737,11 +2737,11 @@ be created for multiqueue vhost-user. + + Example: + @example +-qemu -m 512 -object memory-backend-file,id=mem,size=512M,mem-path=/hugetlbfs,share=on \ +- -numa node,memdev=mem \ +- -chardev socket,id=chr0,path=/path/to/socket \ +- -netdev type=vhost-user,id=net0,chardev=chr0 \ +- -device virtio-net-pci,netdev=net0 ++qemu-kvm -m 512 -object memory-backend-file,id=mem,size=512M,mem-path=/hugetlbfs,share=on \ ++ -numa node,memdev=mem \ ++ -chardev socket,id=chr0,path=/path/to/socket \ ++ -netdev type=vhost-user,id=net0,chardev=chr0 \ ++ -device virtio-net-pci,netdev=net0 + @end example + + @item -netdev hubport,id=@var{id},hubid=@var{hubid}[,netdev=@var{nd}] +@@ -3631,14 +3631,14 @@ ETEXI + + DEF("realtime", HAS_ARG, QEMU_OPTION_realtime, + "-realtime [mlock=on|off]\n" +- " run qemu with realtime features\n" ++ " run qemu-kvm with realtime features\n" + " mlock=on|off controls mlock support (default: on)\n", + QEMU_ARCH_ALL) + STEXI + @item -realtime mlock=on|off + @findex -realtime +-Run qemu with realtime features. +-mlocking qemu and guest memory can be enabled via @option{mlock=on} ++Run qemu-kvm with realtime features. ++mlocking qemu-kvm and guest memory can be enabled via @option{mlock=on} + (enabled by default). + ETEXI + +-- +2.21.0 + diff --git a/0018-usb-xhci-Fix-PCI-capability-order.patch b/0018-usb-xhci-Fix-PCI-capability-order.patch new file mode 100755 index 0000000000000000000000000000000000000000..bc6146dc3e3ced289f10db9a918b641fa4834d61 --- /dev/null +++ b/0018-usb-xhci-Fix-PCI-capability-order.patch @@ -0,0 +1,96 @@ +From b13a7d3527c5c91e7a50236de30a2244b8453911 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 5 May 2017 19:06:14 +0200 +Subject: usb-xhci: Fix PCI capability order + +RH-Author: Dr. David Alan Gilbert +Message-id: <20170505190614.15987-2-dgilbert@redhat.com> +Patchwork-id: 75038 +O-Subject: [RHEL-7.4 qemu-kvm-rhev PATCH 1/1] usb-xhci: Fix PCI capability order +Bugzilla: 1447874 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Gerd Hoffmann +RH-Acked-by: Juan Quintela + +From: "Dr. David Alan Gilbert" + +Upstream commit 1108b2f8a9 in 2.7.0 changed the order +of the PCI capability chain in the XHCI pci device in the case +where the device has the PCIe endpoint capability (i.e. only +older machine types, pc-i440fx-2.0 upstream, pc-i440fx-rhel7.0.0 +apparently for us). + +Changing the order breaks migration compatibility; fixing this +upstream would mean breaking the same case going from 2.7.0->current +that currently works 2.7.0->2.9.0 - so upstream it's a choice +of two breakages. + +Since we never released 2.7.0/2.8.0 we can fix this downstream. + +This reverts the order so that we create the capabilities in the +order: + PCIe + MSI + MSI-X + +The symptom is: +qemu-kvm: get_pci_config_device: Bad config data: i=0x71 read: a0 device: 0 cmask: ff wmask: 0 w1cmask:0 +qemu-kvm: Failed to load PCIDevice:config +qemu-kvm: Failed to load xhci:parent_obj +qemu-kvm: error while loading state for instance 0x0 of device '0000:00:0d.0/xhci' +qemu-kvm: load of migration failed: Invalid argument + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Miroslav Rezanina + +-- +Rebase notes (2.9.0): +- Change in assert condition (upstream) + +(cherry picked from commit aad727a5ecde1ad4935eb8427604d4df5a1f1f35) +(cherry picked from commit 2dd7402227e77d748a7375233ac9e7feab244bda) + +Conflicts: + hw/usb/hcd-xhci.c + +(cherry picked from commit a42f86dc906cc7d2c16d02bf125ed76847b469cb) +(cherry picked from commit 992ab2e4f6e15d3e51bc716763aa8d6f45c6d29d) +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/hcd-xhci.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index 8fed2eedd6..d2b9744030 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -3403,6 +3403,12 @@ static void usb_xhci_realize(struct PCIDevice *dev, Error **errp) + xhci->max_pstreams_mask = 0; + } + ++ if (pci_bus_is_express(pci_get_bus(dev)) || ++ xhci_get_flag(xhci, XHCI_FLAG_FORCE_PCIE_ENDCAP)) { ++ ret = pcie_endpoint_cap_init(dev, 0xa0); ++ assert(ret > 0); ++ } ++ + if (xhci->msi != ON_OFF_AUTO_OFF) { + ret = msi_init(dev, 0x70, xhci->numintrs, true, false, &err); + /* Any error other than -ENOTSUP(board's MSI support is broken) +@@ -3451,12 +3457,6 @@ static void usb_xhci_realize(struct PCIDevice *dev, Error **errp) + PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64, + &xhci->mem); + +- if (pci_bus_is_express(pci_get_bus(dev)) || +- xhci_get_flag(xhci, XHCI_FLAG_FORCE_PCIE_ENDCAP)) { +- ret = pcie_endpoint_cap_init(dev, 0xa0); +- assert(ret > 0); +- } +- + if (xhci->msix != ON_OFF_AUTO_OFF) { + /* TODO check for errors, and should fail when msix=on */ + msix_init(dev, xhci->numintrs, +-- +2.21.0 + diff --git a/0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch b/0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch new file mode 100755 index 0000000000000000000000000000000000000000..e167b2e9224eecc57ffde913f356ea5a5b61026a --- /dev/null +++ b/0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch @@ -0,0 +1,69 @@ +From 3fab8f5e8a9e190c1ed6916ac13c7c4d65e874b7 Mon Sep 17 00:00:00 2001 +From: Fam Zheng +Date: Wed, 14 Jun 2017 15:37:01 +0200 +Subject: virtio-scsi: Reject scsi-cd if data plane enabled [RHEL only] + +RH-Author: Fam Zheng +Message-id: <20170614153701.14757-1-famz@redhat.com> +Patchwork-id: 75613 +O-Subject: [RHV-7.4 qemu-kvm-rhev PATCH v3] virtio-scsi: Reject scsi-cd if data plane enabled [RHEL only] +Bugzilla: 1378816 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +We need a fix for RHEL 7.4 and 7.3.z, but unfortunately upstream isn't +ready. If it were, the changes will be too invasive. To have an idea: + +https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg05400.html + +is an incomplete attempt to fix part of the issue, and the remaining +work unfortunately involve even more complex changes. + +As a band-aid, this partially reverts the effect of ef8875b +(virtio-scsi: Remove op blocker for dataplane, since v2.7). We cannot +simply revert that commit as a whole because we already shipped it in +qemu-kvm-rhev 7.3, since when, block jobs has been possible. We should +only block what has been broken. Also, faithfully reverting the above +commit means adding back the removed op blocker, but that is not enough, +because it still crashes when inserting media into an initially empty +scsi-cd. + +All in all, scsi-cd on virtio-scsi-dataplane has basically been unusable +unless the scsi-cd never enters an empty state, so, disable it +altogether. Otherwise it would be much more difficult to avoid +crashing. + +Signed-off-by: Fam Zheng +Signed-off-by: Miroslav Rezanina +(cherry picked from commit b0caf00bbc35c7d89e02999bdce86e1f867728e8) +(cherry picked from commit c9c4f117d8b507c2f86035c282d537c0a327364f) +(cherry picked from commit 5d586bb2543337f0ff172c6ce942dba3acbcedff) +Signed-off-by: Danilo C. L. de Paula +--- + hw/scsi/virtio-scsi.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c +index e8b2b64d09..54108c0056 100644 +--- a/hw/scsi/virtio-scsi.c ++++ b/hw/scsi/virtio-scsi.c +@@ -808,6 +808,15 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev, + SCSIDevice *sd = SCSI_DEVICE(dev); + int ret; + ++ /* XXX: Remove this check once block backend is capable of handling ++ * AioContext change upon eject/insert. ++ * s->ctx is NULL if ioeventfd is off, s->ctx is qemu_get_aio_context() if ++ * data plane is not used, both cases are safe for scsi-cd. */ ++ if (s->ctx && s->ctx != qemu_get_aio_context() && ++ object_dynamic_cast(OBJECT(dev), "scsi-cd")) { ++ error_setg(errp, "scsi-cd is not supported by data plane"); ++ return; ++ } + if (s->ctx && !s->dataplane_fenced) { + if (blk_op_is_blocked(sd->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) { + return; +-- +2.21.0 + diff --git a/0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch b/0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch new file mode 100755 index 0000000000000000000000000000000000000000..b3350da3adfee04a58c788182873e38fd715b51a --- /dev/null +++ b/0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch @@ -0,0 +1,60 @@ +From 148e9e80a3a430615b552075082fad22d007d851 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Wed, 6 Feb 2019 03:58:56 +0000 +Subject: BZ1653590: Require at least 64kiB pages for downstream guests & hosts + +RH-Author: David Gibson +Message-id: <20190206035856.19058-1-dgibson@redhat.com> +Patchwork-id: 84246 +O-Subject: [RHELAV-8.0/rhel qemu-kvm PATCH] BZ1653590: Require at least 64kiB pages for downstream guests & hosts +Bugzilla: 1653590 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Serhii Popovych +RH-Acked-by: Thomas Huth + +Most current POWER guests require 64kiB page support, so that's the default +for the cap-hpt-max-pagesize option in qemu which limits available guest +page sizes. We warn if the value is set smaller than that, but don't +outright fail upstream, because we need to allow for the possibility of +guest (and/or host) kernels configured for 4kiB page sizes. + +Downstream, however, we simply don't support 4kiB pagesize configured +kernels in guest or host, so we can have qemu simply error out in this +situation. + +Testing: Attempted to start a guest with cap-hpt-max-page-size=4k and verified + it failed immediately with a qemu error + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_caps.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c +index 481dfd2a27..805f38533e 100644 +--- a/hw/ppc/spapr_caps.c ++++ b/hw/ppc/spapr_caps.c +@@ -351,12 +351,19 @@ void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, + static void cap_hpt_maxpagesize_apply(SpaprMachineState *spapr, + uint8_t val, Error **errp) + { ++#if 0 /* disabled for RHEL */ + if (val < 12) { + error_setg(errp, "Require at least 4kiB hpt-max-page-size"); + return; + } else if (val < 16) { + warn_report("Many guests require at least 64kiB hpt-max-page-size"); + } ++#else /* Only page sizes >=64kiB supported for RHEL */ ++ if (val < 16) { ++ error_setg(errp, "Require at least 64kiB hpt-max-page-size"); ++ return; ++ } ++#endif + + spapr_check_pagesize(spapr, qemu_minrampagesize(), errp); + } +-- +2.21.0 + diff --git a/0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch b/0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch new file mode 100755 index 0000000000000000000000000000000000000000..a2a800b08fbfacc8f72866aa7ae6514effd11e79 --- /dev/null +++ b/0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch @@ -0,0 +1,61 @@ +From ab9ebc29bb9bb142e73a160750a451d40bfe9746 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Mon, 16 Sep 2019 17:07:00 +0100 +Subject: Using ip_deq after m_free might read pointers from an allocation + reuse. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Philippe Mathieu-Daudé +Message-id: <20190916170700.647-2-philmd@redhat.com> +Patchwork-id: 90470 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm PATCH 1/1] Using ip_deq after m_free might read pointers from an allocation reuse. +Bugzilla: 1749737 +RH-Acked-by: Danilo de Paula +RH-Acked-by: John Snow + +From: Samuel Thibault + +This would be difficult to exploit, but that is still related with +CVE-2019-14378 which generates fragmented IP packets that would trigger this +issue and at least produce a DoS. + +Signed-off-by: Samuel Thibault +(cherry picked from libslirp commit c59279437eda91841b9d26079c70b8a540d41204) +Signed-off-by: Philippe Mathieu-Daudé + +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/ip_input.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/slirp/src/ip_input.c b/slirp/src/ip_input.c +index 8c75d91495..df1c846ade 100644 +--- a/slirp/src/ip_input.c ++++ b/slirp/src/ip_input.c +@@ -292,6 +292,7 @@ static struct ip *ip_reass(Slirp *slirp, struct ip *ip, struct ipq *fp) + */ + while (q != (struct ipasfrag *)&fp->frag_link && + ip->ip_off + ip->ip_len > q->ipf_off) { ++ struct ipasfrag *prev; + i = (ip->ip_off + ip->ip_len) - q->ipf_off; + if (i < q->ipf_len) { + q->ipf_len -= i; +@@ -299,9 +300,11 @@ static struct ip *ip_reass(Slirp *slirp, struct ip *ip, struct ipq *fp) + m_adj(dtom(slirp, q), i); + break; + } ++ prev = q; + q = q->ipf_next; +- m_free(dtom(slirp, q->ipf_prev)); +- ip_deq(q->ipf_prev); ++ ip_deq(prev); ++ m_free(dtom(slirp, prev)); ++ + } + + insert: +-- +2.21.0 + diff --git a/81-kvm-rhel.rules b/81-kvm-rhel.rules new file mode 100755 index 0000000000000000000000000000000000000000..787cad622f798e3ba06886fe89f055bb8c457aca --- /dev/null +++ b/81-kvm-rhel.rules @@ -0,0 +1 @@ +DEVPATH=="*/kvm", ACTION=="change", RUN+="/lib/udev/udev-kvm-check $env{COUNT} $env{EVENT}" diff --git a/85-kvm.preset b/85-kvm.preset new file mode 100755 index 0000000000000000000000000000000000000000..8024052ecd938c6f5965db8d6847492e17cac730 --- /dev/null +++ b/85-kvm.preset @@ -0,0 +1,5 @@ +# Enable kvm-setup by default. This can have odd side effects on +# PowerNV systems that aren't intended as KVM hosts, but at present we +# only support RHEL on PowerNV for the purpose of being a RHEV host. + +enable kvm-setup.service diff --git a/95-kvm-memlock.conf b/95-kvm-memlock.conf new file mode 100755 index 0000000000000000000000000000000000000000..fc59dbe05ef88f9fef375ce9e723585c8e03a339 --- /dev/null +++ b/95-kvm-memlock.conf @@ -0,0 +1,10 @@ +# The KVM HV implementation on Power can require a significant amount +# of unswappable memory (about half of which also needs to be host +# physically contiguous) to hold the guest's Hash Page Table (HPT) - +# roughly 1/64th of the guest's RAM size, minimum 16MiB. +# +# These limits allow unprivileged users to start smallish VMs, such as +# those used by libguestfs. +# +* hard memlock 65536 +* soft memlock 65536 diff --git a/99-qemu-guest-agent.rules b/99-qemu-guest-agent.rules new file mode 100755 index 0000000000000000000000000000000000000000..8a290abbd3e372c417b7cc3ccef37afb31b8935a --- /dev/null +++ b/99-qemu-guest-agent.rules @@ -0,0 +1,2 @@ +SUBSYSTEM=="virtio-ports", ATTR{name}=="org.qemu.guest_agent.0", \ + TAG+="systemd" ENV{SYSTEMD_WANTS}="qemu-guest-agent.service" diff --git a/README.md b/README.md deleted file mode 100644 index 7342728d557c602f51c6d278bba9f3dd9faaf356..0000000000000000000000000000000000000000 --- a/README.md +++ /dev/null @@ -1,11 +0,0 @@ -Anolis OS -======================================= -# 代码仓库说明 -## 分支说明 ->进行代码开发工作时,请注意选择当前版本对应的分支 -* aX分支为对应大版本的主分支,如a8分支对应当前最新版本 -* aX.Y分支为对应小版本的维护分支,如a8.2分支对应8.2版本 -## 开发流程 -1. 首先fork目标分支到自己的namespace -2. 在自己的fork分支上做出修改 -3. 向对应的仓库中提交merge request,源分支为fork分支 diff --git a/README.tests b/README.tests new file mode 100755 index 0000000000000000000000000000000000000000..9932773c9e6ee8a4803e9c2d6116aabf31275e75 --- /dev/null +++ b/README.tests @@ -0,0 +1,39 @@ +qemu-kvm-tests README +===================== + +The qemu-kvm-tests rpm contains tests that can be used to verify the +functionality of the installed qemu-kvm package + +When installed, the files from this rpm will be arranged in the following +directory structure + +tests-src/ +├── README +├── scripts +│   ├── qemu.py +│   └── qmp +└── tests + ├── acceptance + ├── Makefile.include + └── qemu-iotests + +The tests/ directory within the tests-src/ directory is setup to remain a copy +of a subset of the tests/ directory from the QEMU source tree + +The avocado_qemu tests and qemu-iotests, along with files required for the +execution of the avocado_qemu tests (scripts/qemu.py and scripts/qmp/) will be +installed in a new location - /usr/lib64/qemu-kvm/tests-src/ + +avocado_qemu tests: +The avocado_qemu tests can be executed by running the following avocado command: +avocado run -p qemu_bin=/usr/libexec/qemu-kvm /usr/lib64/qemu-kvm/tests/acceptance/ +Avocado needs to be installed separately using either pip or from source as +Avocado is not being packaged for RHEL-8. + +qemu-iotests: +symlinks to corresponding binaries need to be created for QEMU_PROG, +QEMU_IO_PROG, QEMU_IMG_PROG, and QEMU_NBD_PROG before the iotests can be +executed. + +The primary purpose of this package is to make these tests available to be +executed as gating tests for the virt module in the RHEL-8 OSCI environment. diff --git a/bridge.conf b/bridge.conf new file mode 100755 index 0000000000000000000000000000000000000000..a573665d37fbc5246f9200e3a2de8be729d3e177 --- /dev/null +++ b/bridge.conf @@ -0,0 +1 @@ +allow virbr0 diff --git a/ksm.service b/ksm.service new file mode 100755 index 0000000000000000000000000000000000000000..35c6f1de80bcbe2b695ce1b092991a2c6ba56520 --- /dev/null +++ b/ksm.service @@ -0,0 +1,13 @@ +[Unit] +Description=Kernel Samepage Merging +ConditionPathExists=/sys/kernel/mm/ksm + +[Service] +Type=oneshot +RemainAfterExit=yes +EnvironmentFile=-/etc/sysconfig/ksm +ExecStart=/usr/libexec/ksmctl start +ExecStop=/usr/libexec/ksmctl stop + +[Install] +WantedBy=multi-user.target diff --git a/ksm.sysconfig b/ksm.sysconfig new file mode 100755 index 0000000000000000000000000000000000000000..d99656d70c6179df593db57581cda595e8ec52ea --- /dev/null +++ b/ksm.sysconfig @@ -0,0 +1,4 @@ +# The maximum number of unswappable kernel pages +# which may be allocated by ksm (0 for unlimited) +# If unset, defaults to half of total memory +# KSM_MAX_KERNEL_PAGES= diff --git a/ksmctl.c b/ksmctl.c new file mode 100755 index 0000000000000000000000000000000000000000..af3959102d7173f460c09e802410998d45513a41 --- /dev/null +++ b/ksmctl.c @@ -0,0 +1,77 @@ +/* Start/stop KSM, for systemd. + * Copyright (C) 2009, 2011 Red Hat, Inc. + * Written by Paolo Bonzini . + * Based on the original sysvinit script by Dan Kenigsberg + * This file is distributed under the GNU General Public License, version 2 + * or later. */ + +#include +#include +#include +#include +#include +#include + +#define KSM_MAX_KERNEL_PAGES_FILE "/sys/kernel/mm/ksm/max_kernel_pages" +#define KSM_RUN_FILE "/sys/kernel/mm/ksm/run" + +char *program_name; + +int usage(void) +{ + fprintf(stderr, "Usage: %s {start|stop}\n", program_name); + return 1; +} + +int write_value(uint64_t value, char *filename) +{ + FILE *fp; + if (!(fp = fopen(filename, "w")) || + fprintf(fp, "%llu\n", (unsigned long long) value) == EOF || + fflush(fp) == EOF || + fclose(fp) == EOF) + return 1; + + return 0; +} + +uint64_t ksm_max_kernel_pages() +{ + char *var = getenv("KSM_MAX_KERNEL_PAGES"); + char *endptr; + uint64_t value; + if (var && *var) { + value = strtoll(var, &endptr, 0); + if (value < LLONG_MAX && !*endptr) + return value; + } + /* Unless KSM_MAX_KERNEL_PAGES is set, let KSM munch up to half of + * total memory. */ + return sysconf(_SC_PHYS_PAGES) / 2; +} + +int start(void) +{ + if (access(KSM_MAX_KERNEL_PAGES_FILE, R_OK) >= 0) + write_value(ksm_max_kernel_pages(), KSM_MAX_KERNEL_PAGES_FILE); + return write_value(1, KSM_RUN_FILE); +} + +int stop(void) +{ + return write_value(0, KSM_RUN_FILE); +} + +int main(int argc, char **argv) +{ + program_name = argv[0]; + if (argc < 2) { + return usage(); + } else if (!strcmp(argv[1], "start")) { + return start(); + } else if (!strcmp(argv[1], "stop")) { + return stop(); + } else { + return usage(); + } +} diff --git a/ksmtuned b/ksmtuned new file mode 100755 index 0000000000000000000000000000000000000000..7bc57432a382d231712d18ad7675fac891612499 --- /dev/null +++ b/ksmtuned @@ -0,0 +1,139 @@ +#!/bin/bash +# +# Copyright 2009 Red Hat, Inc. and/or its affiliates. +# Released under the GPL +# +# Author: Dan Kenigsberg +# +# ksmtuned - a simple script that controls whether (and with what vigor) ksm +# should search for duplicated pages. +# +# starts ksm when memory commited to qemu processes exceeds a threshold, and +# make ksm work harder and harder untill memory load falls below that +# threshold. +# +# send SIGUSR1 to this process right after a new qemu process is started, or +# following its death, to retune ksm accordingly +# +# needs testing and ironing. contact danken@redhat.com if something breaks. + +if [ -f /etc/ksmtuned.conf ]; then + . /etc/ksmtuned.conf +fi + +debug() { + if [ -n "$DEBUG" ]; then + s="`/bin/date`: $*" + [ -n "$LOGFILE" ] && echo "$s" >> "$LOGFILE" || echo "$s" + fi +} + + +KSM_MONITOR_INTERVAL=${KSM_MONITOR_INTERVAL:-60} +KSM_NPAGES_BOOST=${KSM_NPAGES_BOOST:-300} +KSM_NPAGES_DECAY=${KSM_NPAGES_DECAY:--50} + +KSM_NPAGES_MIN=${KSM_NPAGES_MIN:-64} +KSM_NPAGES_MAX=${KSM_NPAGES_MAX:-1250} +# millisecond sleep between ksm scans for 16Gb server. Smaller servers sleep +# more, bigger sleep less. +KSM_SLEEP_MSEC=${KSM_SLEEP_MSEC:-10} + +KSM_THRES_COEF=${KSM_THRES_COEF:-20} +KSM_THRES_CONST=${KSM_THRES_CONST:-2048} + +total=`awk '/^MemTotal:/ {print $2}' /proc/meminfo` +debug total $total + +npages=0 +sleep=$[KSM_SLEEP_MSEC * 16 * 1024 * 1024 / total] +[ $sleep -le 10 ] && sleep=10 +debug sleep $sleep +thres=$[total * KSM_THRES_COEF / 100] +if [ $KSM_THRES_CONST -gt $thres ]; then + thres=$KSM_THRES_CONST +fi +debug thres $thres + +KSMCTL () { + case x$1 in + xstop) + echo 0 > /sys/kernel/mm/ksm/run + ;; + xstart) + echo $2 > /sys/kernel/mm/ksm/pages_to_scan + echo $3 > /sys/kernel/mm/ksm/sleep_millisecs + echo 1 > /sys/kernel/mm/ksm/run + ;; + esac +} + +committed_memory () { + # calculate how much memory is committed to running qemu processes + local pidlist + pidlist=$(pgrep -d ' ' -- '^qemu(-(kvm|system-.+)|:.{1,11})$') + if [ -n "$pidlist" ]; then + ps -p "$pidlist" -o rsz= + fi | awk '{ sum += $1 }; END { print 0+sum }' +} + +free_memory () { + awk '/^(MemFree|Buffers|Cached):/ {free += $2}; END {print free}' \ + /proc/meminfo +} + +increase_npages() { + local delta + delta=${1:-0} + npages=$[npages + delta] + if [ $npages -lt $KSM_NPAGES_MIN ]; then + npages=$KSM_NPAGES_MIN + elif [ $npages -gt $KSM_NPAGES_MAX ]; then + npages=$KSM_NPAGES_MAX + fi + echo $npages +} + + +adjust () { + local free committed + free=`free_memory` + committed=`committed_memory` + debug committed $committed free $free + if [ $[committed + thres] -lt $total -a $free -gt $thres ]; then + KSMCTL stop + debug "$[committed + thres] < $total and free > $thres, stop ksm" + return 1 + fi + debug "$[committed + thres] > $total, start ksm" + if [ $free -lt $thres ]; then + npages=`increase_npages $KSM_NPAGES_BOOST` + debug "$free < $thres, boost" + else + npages=`increase_npages $KSM_NPAGES_DECAY` + debug "$free > $thres, decay" + fi + KSMCTL start $npages $sleep + debug "KSMCTL start $npages $sleep" + return 0 +} + +function nothing () { + : +} + +loop () { + trap nothing SIGUSR1 + while true + do + sleep $KSM_MONITOR_INTERVAL & + wait $! + adjust + done +} + +PIDFILE=${PIDFILE-/var/run/ksmtune.pid} +if touch "$PIDFILE"; then + loop & + echo $! > "$PIDFILE" +fi diff --git a/ksmtuned.conf b/ksmtuned.conf new file mode 100755 index 0000000000000000000000000000000000000000..fc4518cf9799b04c312e5e8a928f4424146816ea --- /dev/null +++ b/ksmtuned.conf @@ -0,0 +1,21 @@ +# Configuration file for ksmtuned. + +# How long ksmtuned should sleep between tuning adjustments +# KSM_MONITOR_INTERVAL=60 + +# Millisecond sleep between ksm scans for 16Gb server. +# Smaller servers sleep more, bigger sleep less. +# KSM_SLEEP_MSEC=10 + +# KSM_NPAGES_BOOST=300 +# KSM_NPAGES_DECAY=-50 +# KSM_NPAGES_MIN=64 +# KSM_NPAGES_MAX=1250 + +# KSM_THRES_COEF=20 +# KSM_THRES_CONST=2048 + +# uncomment the following if you want ksmtuned debug info + +# LOGFILE=/var/log/ksmtuned +# DEBUG=1 diff --git a/ksmtuned.service b/ksmtuned.service new file mode 100755 index 0000000000000000000000000000000000000000..39febcca7d4f6da2acad2a4e3a3f8fc5de4b3907 --- /dev/null +++ b/ksmtuned.service @@ -0,0 +1,12 @@ +[Unit] +Description=Kernel Samepage Merging (KSM) Tuning Daemon +After=ksm.service +Requires=ksm.service + +[Service] +ExecStart=/usr/sbin/ksmtuned +ExecReload=/bin/kill -USR1 $MAINPID +Type=forking + +[Install] +WantedBy=multi-user.target diff --git a/kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch b/kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch new file mode 100755 index 0000000000000000000000000000000000000000..7310f1775061c6ad6709e68e1fad3f50974d2529 --- /dev/null +++ b/kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch @@ -0,0 +1,41 @@ +From ff8529dcbf86b3a086d64dd630cf6a687603c571 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:55 +0100 +Subject: [PATCH 12/12] ACPI: add expected files for HMAT tests (acpihmat) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-12-plai@redhat.com> +Patchwork-id: 96742 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 11/11] ACPI: add expected files for HMAT tests (acpihmat) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: "Michael S. Tsirkin" + +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 48892c6c8def6624a0ed57e2bd6c2a0a9878b973) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + tests/bios-tables-test-allowed-diff.h | 8 -------- + 1 file changed, 8 deletions(-) + +diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h +index 3c9e0c9..dfb8523 100644 +--- a/tests/bios-tables-test-allowed-diff.h ++++ b/tests/bios-tables-test-allowed-diff.h +@@ -1,9 +1 @@ + /* List of comma-separated changed AML files to ignore */ +-"tests/data/acpi/pc/APIC.acpihmat", +-"tests/data/acpi/pc/SRAT.acpihmat", +-"tests/data/acpi/pc/HMAT.acpihmat", +-"tests/data/acpi/pc/DSDT.acpihmat", +-"tests/data/acpi/q35/APIC.acpihmat", +-"tests/data/acpi/q35/SRAT.acpihmat", +-"tests/data/acpi/q35/HMAT.acpihmat", +-"tests/data/acpi/q35/DSDT.acpihmat", +-- +1.8.3.1 + diff --git a/kvm-Add-mtod_check.patch b/kvm-Add-mtod_check.patch new file mode 100755 index 0000000000000000000000000000000000000000..0b2e7101cd41279e5fb53e799daf7e67692cbf5f --- /dev/null +++ b/kvm-Add-mtod_check.patch @@ -0,0 +1,68 @@ +From 52bf635da30c75d0fdb0a3e7e7b9a2483ca033fc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:59 -0400 +Subject: [PATCH 05/14] Add mtod_check() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-2-marcandre.lureau@redhat.com> +Patchwork-id: 101819 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/8] Add mtod_check() +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Recent security issues demonstrate the lack of safety care when casting +a mbuf to a particular structure type. At least, it should check that +the buffer is large enough. The following patches will make use of this +function. + +Signed-off-by: Marc-André Lureau + +(cherry picked from commit 93e645e72a056ec0b2c16e0299fc5c6b94e4ca17) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/mbuf.c | 11 +++++++++++ + slirp/src/mbuf.h | 1 + + 2 files changed, 12 insertions(+) + +diff --git a/slirp/src/mbuf.c b/slirp/src/mbuf.c +index 4fd62282a9..6d0653ed3d 100644 +--- a/slirp/src/mbuf.c ++++ b/slirp/src/mbuf.c +@@ -222,3 +222,14 @@ struct mbuf *dtom(Slirp *slirp, void *dat) + + return (struct mbuf *)0; + } ++ ++void *mtod_check(struct mbuf *m, size_t len) ++{ ++ if (m->m_len >= len) { ++ return m->m_data; ++ } ++ ++ DEBUG_ERROR("mtod failed"); ++ ++ return NULL; ++} +diff --git a/slirp/src/mbuf.h b/slirp/src/mbuf.h +index 546e7852c5..2015e3232f 100644 +--- a/slirp/src/mbuf.h ++++ b/slirp/src/mbuf.h +@@ -118,6 +118,7 @@ void m_inc(struct mbuf *, int); + void m_adj(struct mbuf *, int); + int m_copy(struct mbuf *, struct mbuf *, int, int); + struct mbuf *dtom(Slirp *, void *); ++void *mtod_check(struct mbuf *, size_t len); + + static inline void ifs_init(struct mbuf *ifm) + { +-- +2.27.0 + diff --git a/kvm-Compress-lines-for-immediate-return.patch b/kvm-Compress-lines-for-immediate-return.patch new file mode 100755 index 0000000000000000000000000000000000000000..aed51494798f3bf0f9d890a7a55a47fc45e92030 --- /dev/null +++ b/kvm-Compress-lines-for-immediate-return.patch @@ -0,0 +1,242 @@ +From 5cf6dd33456c4e7e2a8849f458ce234fb5bb290c Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 25 Jun 2021 17:41:03 -0400 +Subject: [PATCH 3/4] Compress lines for immediate return +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Kevin Wolf +Message-id: <20210625174104.44313-2-kwolf@redhat.com> +Patchwork-id: 101777 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] Compress lines for immediate return +Bugzilla: 1970912 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Simran Singhal + +Compress two lines into a single line if immediate return statement is found. + +It also remove variables progress, val, data, ret and sock +as they are no longer needed. + +Remove space between function "mixer_load" and '(' to fix the +checkpatch.pl error:- +ERROR: space prohibited between function name and open parenthesis '(' + +Done using following coccinelle script: +@@ +local idexpression ret; +expression e; +@@ + +-ret = ++return + e; +-return ret; + +Signed-off-by: Simran Singhal +Reviewed-by: Stefan Hajnoczi +Message-Id: <20200401165314.GA3213@simran-Inspiron-5558> +[lv: in handle_aiocb_write_zeroes_unmap() move "int ret" inside the #ifdef] +Signed-off-by: Laurent Vivier +(cherry picked from commit b3ac2b94cdc939a90d5a22338ae507689e2cfab0) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 8 +++----- + block/nfs.c | 3 +-- + block/nvme.c | 4 +--- + block/vhdx.c | 3 +-- + hw/audio/ac97.c | 4 +--- + hw/audio/adlib.c | 5 +---- + hw/display/cirrus_vga.c | 4 +--- + migration/ram.c | 4 +--- + ui/gtk.c | 3 +-- + util/qemu-sockets.c | 5 +---- + 10 files changed, 12 insertions(+), 31 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 371572f1b0..837edcf027 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1626,13 +1626,12 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque) + { + RawPosixAIOData *aiocb = opaque; + BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque; +- int ret; + + /* First try to write zeros and unmap at the same time */ + + #ifdef CONFIG_FALLOCATE_PUNCH_HOLE +- ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, +- aiocb->aio_offset, aiocb->aio_nbytes); ++ int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, ++ aiocb->aio_offset, aiocb->aio_nbytes); + if (ret != -ENOTSUP) { + return ret; + } +@@ -1640,8 +1639,7 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque) + + /* If we couldn't manage to unmap while guaranteed that the area reads as + * all-zero afterwards, just write zeroes without unmapping */ +- ret = handle_aiocb_write_zeroes(aiocb); +- return ret; ++ return handle_aiocb_write_zeroes(aiocb); + } + + #ifndef HAVE_COPY_FILE_RANGE +diff --git a/block/nfs.c b/block/nfs.c +index 2393fbfe6b..18c0a73694 100644 +--- a/block/nfs.c ++++ b/block/nfs.c +@@ -623,8 +623,7 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, + } + + bs->total_sectors = ret; +- ret = 0; +- return ret; ++ return 0; + } + + static QemuOptsList nfs_create_opts = { +diff --git a/block/nvme.c b/block/nvme.c +index 7b7c0cc5d6..eb2f54dd9d 100644 +--- a/block/nvme.c ++++ b/block/nvme.c +@@ -575,11 +575,9 @@ static bool nvme_poll_cb(void *opaque) + { + EventNotifier *e = opaque; + BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier); +- bool progress = false; + + trace_nvme_poll_cb(s); +- progress = nvme_poll_queues(s); +- return progress; ++ return nvme_poll_queues(s); + } + + static int nvme_init(BlockDriverState *bs, const char *device, int namespace, +diff --git a/block/vhdx.c b/block/vhdx.c +index 21497f7318..a427e47f10 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -411,8 +411,7 @@ int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, + if (ret < 0) { + return ret; + } +- ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); +- return ret; ++ return vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + } + + /* opens the specified header block from the VHDX file header section */ +diff --git a/hw/audio/ac97.c b/hw/audio/ac97.c +index a136b97f68..a2cfae52b3 100644 +--- a/hw/audio/ac97.c ++++ b/hw/audio/ac97.c +@@ -574,11 +574,9 @@ static uint32_t nam_readb (void *opaque, uint32_t addr) + static uint32_t nam_readw (void *opaque, uint32_t addr) + { + AC97LinkState *s = opaque; +- uint32_t val = ~0U; + uint32_t index = addr; + s->cas = 0; +- val = mixer_load (s, index); +- return val; ++ return mixer_load(s, index); + } + + static uint32_t nam_readl (void *opaque, uint32_t addr) +diff --git a/hw/audio/adlib.c b/hw/audio/adlib.c +index cb4178d861..5779d09815 100644 +--- a/hw/audio/adlib.c ++++ b/hw/audio/adlib.c +@@ -120,13 +120,10 @@ static void adlib_write(void *opaque, uint32_t nport, uint32_t val) + static uint32_t adlib_read(void *opaque, uint32_t nport) + { + AdlibState *s = opaque; +- uint8_t data; + int a = nport & 3; + + adlib_kill_timers (s); +- data = OPLRead (s->opl, a); +- +- return data; ++ return OPLRead (s->opl, a); + } + + static void timer_handler (void *opaque, int c, double interval_Sec) +diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c +index 93afa26fda..a52d3094b9 100644 +--- a/hw/display/cirrus_vga.c ++++ b/hw/display/cirrus_vga.c +@@ -2411,12 +2411,10 @@ static uint64_t cirrus_linear_bitblt_read(void *opaque, + unsigned size) + { + CirrusVGAState *s = opaque; +- uint32_t ret; + + /* XXX handle bitblt */ + (void)s; +- ret = 0xff; +- return ret; ++ return 0xff; + } + + static void cirrus_linear_bitblt_write(void *opaque, +diff --git a/migration/ram.c b/migration/ram.c +index 5344c7d59e..92c506d13c 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3101,9 +3101,7 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms) + } + trace_ram_postcopy_send_discard_bitmap(); + +- ret = postcopy_each_ram_send_discard(ms); +- +- return ret; ++ return postcopy_each_ram_send_discard(ms); + } + + /** +diff --git a/ui/gtk.c b/ui/gtk.c +index 692ccc7bbb..e032e3c36f 100644 +--- a/ui/gtk.c ++++ b/ui/gtk.c +@@ -1649,8 +1649,7 @@ static GSList *gd_vc_menu_init(GtkDisplayState *s, VirtualConsole *vc, + G_CALLBACK(gd_menu_switch_vc), s); + gtk_menu_shell_append(GTK_MENU_SHELL(view_menu), vc->menu_item); + +- group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(vc->menu_item)); +- return group; ++ return gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(vc->menu_item)); + } + + #if defined(CONFIG_VTE) +diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c +index bcc06d0e01..86c48b9fa5 100644 +--- a/util/qemu-sockets.c ++++ b/util/qemu-sockets.c +@@ -765,15 +765,12 @@ static int vsock_connect_addr(const struct sockaddr_vm *svm, Error **errp) + static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp) + { + struct sockaddr_vm svm; +- int sock = -1; + + if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) { + return -1; + } + +- sock = vsock_connect_addr(&svm, errp); +- +- return sock; ++ return vsock_connect_addr(&svm, errp); + } + + static int vsock_listen_saddr(VsockSocketAddress *vaddr, +-- +2.27.0 + diff --git a/kvm-Don-t-leak-memory-when-reallocation-fails.patch b/kvm-Don-t-leak-memory-when-reallocation-fails.patch new file mode 100755 index 0000000000000000000000000000000000000000..5747672cfb0c9a18b8c9be3cab70bfa43caf7ade --- /dev/null +++ b/kvm-Don-t-leak-memory-when-reallocation-fails.patch @@ -0,0 +1,58 @@ +From bcb6107f98d7b1edf687d7afd552a4528b7e673b Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Tue, 12 May 2020 21:15:13 +0100 +Subject: [PATCH 2/7] Don't leak memory when reallocation fails. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200512211514.1398384-2-jmaloy@redhat.com> +Patchwork-id: 96412 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] Don't leak memory when reallocation fails. +Bugzilla: 1749737 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Jindrich Novy + +Signed-off-by: Jindrich Novy +[ Marc-André - modified to use a temporary variable ] +Signed-off-by: Marc-André Lureau +(cherry picked from libslirp commit d171af3732a0610a25334b06b77fa547bd677918) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/sbuf.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/slirp/src/sbuf.c b/slirp/src/sbuf.c +index abced48..0569c34 100644 +--- a/slirp/src/sbuf.c ++++ b/slirp/src/sbuf.c +@@ -39,13 +39,16 @@ void sbreserve(struct sbuf *sb, int size) + if (sb->sb_data) { + /* Already alloced, realloc if necessary */ + if (sb->sb_datalen != size) { +- sb->sb_wptr = sb->sb_rptr = sb->sb_data = +- (char *)realloc(sb->sb_data, size); ++ char *new = realloc(sb->sb_data, size); + sb->sb_cc = 0; +- if (sb->sb_wptr) ++ if (new) { ++ sb->sb_data = sb->sb_wptr = sb->sb_rptr = new; + sb->sb_datalen = size; +- else ++ } else { ++ free(sb->sb_data); ++ sb->sb_data = sb->sb_wptr = sb->sb_rptr = NULL; + sb->sb_datalen = 0; ++ } + } + } else { + sb->sb_wptr = sb->sb_rptr = sb->sb_data = (char *)malloc(size); +-- +1.8.3.1 + diff --git a/kvm-Drop-bogus-IPv6-messages.patch b/kvm-Drop-bogus-IPv6-messages.patch new file mode 100755 index 0000000000000000000000000000000000000000..4c30a3b44f487d0921333da82cef61f25f9e41ee --- /dev/null +++ b/kvm-Drop-bogus-IPv6-messages.patch @@ -0,0 +1,51 @@ +From 89c4300c97739aa3291f0322037bb65068e08d41 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 19 Jan 2021 23:34:33 -0500 +Subject: [PATCH] Drop bogus IPv6 messages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210119233433.1352902-2-jmaloy@redhat.com> +Patchwork-id: 100695 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] Drop bogus IPv6 messages +Bugzilla: 1918054 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Ralf Haferkamp + +Drop IPv6 message shorter than what's mentioned in the payload +length header (+ the size of the IPv6 header). They're invalid an could +lead to data leakage in icmp6_send_echoreply(). + +(cherry picked from libslirp commit c7ede54cbd2e2b25385325600958ba0124e31cc0) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/ip6_input.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/slirp/src/ip6_input.c b/slirp/src/ip6_input.c +index d9d2b7e9cd4..0f2b17853ad 100644 +--- a/slirp/src/ip6_input.c ++++ b/slirp/src/ip6_input.c +@@ -49,6 +49,13 @@ void ip6_input(struct mbuf *m) + goto bad; + } + ++ // Check if the message size is big enough to hold what's ++ // set in the payload length header. If not this is an invalid ++ // packet ++ if (m->m_len < ntohs(ip6->ip_pl) + sizeof(struct ip6)) { ++ goto bad; ++ } ++ + /* check ip_ttl for a correct ICMP reply */ + if (ip6->ip_hl == 0) { + icmp6_send_error(m, ICMP6_TIMXCEED, ICMP6_TIMXCEED_INTRANS); +-- +2.27.0 + diff --git a/kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch b/kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch new file mode 100755 index 0000000000000000000000000000000000000000..2dd4457321d00aea2bbd244fe38b0aaec30bb900 --- /dev/null +++ b/kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch @@ -0,0 +1,59 @@ +From d0c668aa0ad255c3598267816154874541ac2943 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:42 -0400 +Subject: [PATCH 12/14] Fix "DHCP broken in libslirp v4.6.0" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-9-marcandre.lureau@redhat.com> +Patchwork-id: 101824 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 8/8] Fix "DHCP broken in libslirp v4.6.0" +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Akihiro Suda + +Fix issue 48 + +Signed-off-by: Akihiro Suda + +(cherry picked from commit c9f314f6e315a5518432761fea864196a290f799) +[ minor conflict fix due to indentation change ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/bootp.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/slirp/src/bootp.c b/slirp/src/bootp.c +index 5789187166..3e4af075f1 100644 +--- a/slirp/src/bootp.c ++++ b/slirp/src/bootp.c +@@ -354,14 +354,14 @@ static void bootp_reply(Slirp *slirp, + q += sizeof(nak_msg) - 1; + } + assert(q < end); +- *q = +-RFC1533_END +-; ++ *q = RFC1533_END; + +-daddr.sin_addr.s_addr = 0xffffffffu; ++ daddr.sin_addr.s_addr = 0xffffffffu; + +-m->m_len = sizeof(struct bootp_t) - sizeof(struct ip) - sizeof(struct udphdr); +-udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); ++ assert ((q - rbp->bp_vend + 1) <= DHCP_OPT_LEN); ++ ++ m->m_len = sizeof(struct bootp_t) + (q - rbp->bp_vend + 1) - sizeof(struct ip) - sizeof(struct udphdr); ++ udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); + } + + void bootp_input(struct mbuf *m) +-- +2.27.0 + diff --git a/kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch b/kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch new file mode 100755 index 0000000000000000000000000000000000000000..535c3af1b8c39e2d62642a9efff398074d4c3dcc --- /dev/null +++ b/kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch @@ -0,0 +1,60 @@ +From a33ea192428d9c9307f1140f3e25631a6ef7657c Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Sat, 20 Jun 2020 15:02:59 -0400 +Subject: [PATCH 12/12] Fix use-afte-free in ip_reass() (CVE-2020-1983) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20200620150259.3352467-2-jmaloy@redhat.com> +Patchwork-id: 97678 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] Fix use-afte-free in ip_reass() (CVE-2020-1983) +Bugzilla: 1838070 +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +The q pointer is updated when the mbuf data is moved from m_dat to +m_ext. + +m_ext buffer may also be realloc()'ed and moved during m_cat(): +q should also be updated in this case. + +Reported-by: Aviv Sasson +Signed-off-by: Marc-André Lureau +Reviewed-by: Samuel Thibault + +(cherry picked from libslirp commit 9bd6c5913271eabcb7768a58197ed3301fe19f2d) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/ip_input.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/slirp/src/ip_input.c b/slirp/src/ip_input.c +index df1c846ade..0f5d522ec1 100644 +--- a/slirp/src/ip_input.c ++++ b/slirp/src/ip_input.c +@@ -329,7 +329,7 @@ insert: + q = fp->frag_link.next; + m = dtom(slirp, q); + +- int was_ext = m->m_flags & M_EXT; ++ int delta = (char *)q - (m->m_flags & M_EXT ? m->m_ext : m->m_dat); + + q = (struct ipasfrag *)q->ipf_next; + while (q != (struct ipasfrag *)&fp->frag_link) { +@@ -353,8 +353,7 @@ insert: + * the old buffer (in the mbuf), so we must point ip + * into the new buffer. + */ +- if (!was_ext && m->m_flags & M_EXT) { +- int delta = (char *)q - m->m_dat; ++ if (m->m_flags & M_EXT) { + q = (struct ipasfrag *)(m->m_ext + delta); + } + +-- +2.27.0 + diff --git a/kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch b/kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch new file mode 100755 index 0000000000000000000000000000000000000000..dce89d9e3c525a4dc32a74ad68a9d6e7b5bd3c29 --- /dev/null +++ b/kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch @@ -0,0 +1,55 @@ +From e3bec8c83459a68ae0c08e2ae0f1dbef24872d59 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:09 +0100 +Subject: [PATCH 04/26] MAINTAINERS: fix qcow2-bitmap.c under Dirty Bitmaps + header + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-2-eblake@redhat.com> +Patchwork-id: 97068 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 01/12] MAINTAINERS: fix qcow2-bitmap.c under Dirty Bitmaps header +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +From: Vladimir Sementsov-Ogievskiy + +Somehow I wrote not full path to the file. Fix that. + +Also, while being here, rearrange entries, so that includes go first, +then block, than migration, than util. + +Fixes: 052db8e71444d +Signed-off-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 00637c6b0b67694127cc01dd75f3626da23acdaa) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + MAINTAINERS | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/MAINTAINERS b/MAINTAINERS +index d1b3e26..3a81ac9 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1873,12 +1873,12 @@ M: John Snow + R: Vladimir Sementsov-Ogievskiy + L: qemu-block@nongnu.org + S: Supported +-F: util/hbitmap.c +-F: block/dirty-bitmap.c + F: include/qemu/hbitmap.h + F: include/block/dirty-bitmap.h +-F: qcow2-bitmap.c ++F: block/dirty-bitmap.c ++F: block/qcow2-bitmap.c + F: migration/block-dirty-bitmap.c ++F: util/hbitmap.c + F: tests/test-hbitmap.c + F: docs/interop/bitmaps.rst + T: git https://github.com/jnsnow/qemu.git bitmaps +-- +1.8.3.1 + diff --git a/kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch b/kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch new file mode 100755 index 0000000000000000000000000000000000000000..14350174532b11affa1daf793deb7c94f4533d8e --- /dev/null +++ b/kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch @@ -0,0 +1,53 @@ +From 481357ea8ae32b6894860c296cf6a2898260195f Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 17 Jan 2020 13:18:27 +0100 +Subject: [PATCH 4/4] RHEL: hw/i386: disable nested PERF_GLOBAL_CTRL MSR + support + +RH-Author: Paolo Bonzini +Message-id: <20200117131827.20361-1-pbonzini@redhat.com> +Patchwork-id: 93405 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v3] RHEL: hw/i386: disable nested PERF_GLOBAL_CTRL MSR support +Bugzilla: 1559846 +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Miroslav Rezanina + +BZ: 1559846 +BRANCH: rhel-av-8.2.0 +BREW: 25775160 +UPSTREAM: RHEL only + +Nested PERF_GLOBAL_CTRL support is not present in the 8.2 kernel. Drop the +features via compat properties, they will be moved to 8.2 machine type compat +properties in the 8.3 timeframe. + +Signed-off-by: Paolo Bonzini +--- + No change, for v2 I mistakenly wrote "origin/rhel-av-8.2.0" as the + branch. :( + + hw/i386/pc.c | 2 ++ + 1 file changed, 2 insertions(+) + +Signed-off-by: Miroslav Rezanina +--- + hw/i386/pc.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index 61e70e4..73a0f11 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -351,6 +351,8 @@ const size_t pc_compat_1_4_len = G_N_ELEMENTS(pc_compat_1_4); + GlobalProperty pc_rhel_compat[] = { + { TYPE_X86_CPU, "host-phys-bits", "on" }, + { TYPE_X86_CPU, "host-phys-bits-limit", "48" }, ++ { TYPE_X86_CPU, "vmx-entry-load-perf-global-ctrl", "off" }, ++ { TYPE_X86_CPU, "vmx-exit-load-perf-global-ctrl", "off" }, + /* bz 1508330 */ + { "vfio-pci", "x-no-geforce-quirks", "on" }, + }; +-- +1.8.3.1 + diff --git a/kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch b/kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch new file mode 100755 index 0000000000000000000000000000000000000000..d717ae2c5d66cdd8fef6fe2e4615178b2b5592c7 --- /dev/null +++ b/kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch @@ -0,0 +1,115 @@ +From c477581ccc6962651d4d6c702a6c3e2fcc5e4205 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Thu, 2 Jan 2020 11:56:51 +0000 +Subject: [PATCH 2/2] kvm: Reallocate dirty_bmap when we change a slot + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200102115651.140177-1-dgilbert@redhat.com> +Patchwork-id: 93256 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] kvm: Reallocate dirty_bmap when we change a slot +Bugzilla: 1772774 +RH-Acked-by: Peter Xu +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Laszlo Ersek + +From: "Dr. David Alan Gilbert" + +bz: https://bugzilla.redhat.com/show_bug.cgi?id=1772774 +brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=25575691 +branch: rhel-av-8.2.0 + +kvm_set_phys_mem can be called to reallocate a slot by something the +guest does (e.g. writing to PAM and other chipset registers). +This can happen in the middle of a migration, and if we're unlucky +it can now happen between the split 'sync' and 'clear'; the clear +asserts if there's no bmap to clear. Recreate the bmap whenever +we change the slot, keeping the clear path happy. + +Typically this is triggered by the guest rebooting during a migrate. + +Corresponds to: +https://bugzilla.redhat.com/show_bug.cgi?id=1772774 +https://bugzilla.redhat.com/show_bug.cgi?id=1771032 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Peter Xu +(cherry picked from commit 9b3a31c745b61758aaa5466a3a9fc0526d409188) +Signed-off-by: Danilo C. L. de Paula +--- + accel/kvm/kvm-all.c | 44 +++++++++++++++++++++++++++++--------------- + 1 file changed, 29 insertions(+), 15 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index dc3ed7f..5007bda 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -518,6 +518,27 @@ static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, + + #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) + ++/* Allocate the dirty bitmap for a slot */ ++static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) ++{ ++ /* ++ * XXX bad kernel interface alert ++ * For dirty bitmap, kernel allocates array of size aligned to ++ * bits-per-long. But for case when the kernel is 64bits and ++ * the userspace is 32bits, userspace can't align to the same ++ * bits-per-long, since sizeof(long) is different between kernel ++ * and user space. This way, userspace will provide buffer which ++ * may be 4 bytes less than the kernel will use, resulting in ++ * userspace memory corruption (which is not detectable by valgrind ++ * too, in most cases). ++ * So for now, let's align to 64 instead of HOST_LONG_BITS here, in ++ * a hope that sizeof(long) won't become >8 any time soon. ++ */ ++ hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), ++ /*HOST_LONG_BITS*/ 64) / 8; ++ mem->dirty_bmap = g_malloc0(bitmap_size); ++} ++ + /** + * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space + * +@@ -550,23 +571,9 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, + goto out; + } + +- /* XXX bad kernel interface alert +- * For dirty bitmap, kernel allocates array of size aligned to +- * bits-per-long. But for case when the kernel is 64bits and +- * the userspace is 32bits, userspace can't align to the same +- * bits-per-long, since sizeof(long) is different between kernel +- * and user space. This way, userspace will provide buffer which +- * may be 4 bytes less than the kernel will use, resulting in +- * userspace memory corruption (which is not detectable by valgrind +- * too, in most cases). +- * So for now, let's align to 64 instead of HOST_LONG_BITS here, in +- * a hope that sizeof(long) won't become >8 any time soon. +- */ + if (!mem->dirty_bmap) { +- hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), +- /*HOST_LONG_BITS*/ 64) / 8; + /* Allocate on the first log_sync, once and for all */ +- mem->dirty_bmap = g_malloc0(bitmap_size); ++ kvm_memslot_init_dirty_bitmap(mem); + } + + d.dirty_bitmap = mem->dirty_bmap; +@@ -1067,6 +1074,13 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, + mem->ram = ram; + mem->flags = kvm_mem_flags(mr); + ++ if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { ++ /* ++ * Reallocate the bmap; it means it doesn't disappear in ++ * middle of a migrate. ++ */ ++ kvm_memslot_init_dirty_bitmap(mem); ++ } + err = kvm_set_user_memory_region(kml, mem, true); + if (err) { + fprintf(stderr, "%s: error registering slot: %s\n", __func__, +-- +1.8.3.1 + diff --git a/kvm-Replace-remaining-malloc-free-user-with-glib.patch b/kvm-Replace-remaining-malloc-free-user-with-glib.patch new file mode 100755 index 0000000000000000000000000000000000000000..71e6e472b95ef84bc004544e448dad01c9fd98f4 --- /dev/null +++ b/kvm-Replace-remaining-malloc-free-user-with-glib.patch @@ -0,0 +1,118 @@ +From c012dc9b501d96a2ff54a8a7a182726043b69aeb Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Tue, 12 May 2020 21:15:14 +0100 +Subject: [PATCH 3/7] Replace remaining malloc/free user with glib +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200512211514.1398384-3-jmaloy@redhat.com> +Patchwork-id: 96413 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] Replace remaining malloc/free user with glib +Bugzilla: 1749737 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Marc-André Lureau + +glib mem functions are already used in various places. Let's not mix +the two, and instead abort on OOM conditions. + +Signed-off-by: Marc-André Lureau +(cherry picked from libslirp commit 3a494648526be4eb96cba739a816a60e933ffd14) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/sbuf.c | 21 ++++++--------------- + slirp/src/socket.c | 2 +- + slirp/src/tcp_subr.c | 8 ++------ + 3 files changed, 9 insertions(+), 22 deletions(-) + +diff --git a/slirp/src/sbuf.c b/slirp/src/sbuf.c +index 0569c34..eab87f3 100644 +--- a/slirp/src/sbuf.c ++++ b/slirp/src/sbuf.c +@@ -9,7 +9,7 @@ static void sbappendsb(struct sbuf *sb, struct mbuf *m); + + void sbfree(struct sbuf *sb) + { +- free(sb->sb_data); ++ g_free(sb->sb_data); + } + + bool sbdrop(struct sbuf *sb, int num) +@@ -39,24 +39,15 @@ void sbreserve(struct sbuf *sb, int size) + if (sb->sb_data) { + /* Already alloced, realloc if necessary */ + if (sb->sb_datalen != size) { +- char *new = realloc(sb->sb_data, size); ++ char *new = g_realloc(sb->sb_data, size); + sb->sb_cc = 0; +- if (new) { +- sb->sb_data = sb->sb_wptr = sb->sb_rptr = new; +- sb->sb_datalen = size; +- } else { +- free(sb->sb_data); +- sb->sb_data = sb->sb_wptr = sb->sb_rptr = NULL; +- sb->sb_datalen = 0; +- } ++ sb->sb_data = sb->sb_wptr = sb->sb_rptr = new; ++ sb->sb_datalen = size; + } + } else { +- sb->sb_wptr = sb->sb_rptr = sb->sb_data = (char *)malloc(size); ++ sb->sb_wptr = sb->sb_rptr = sb->sb_data = g_malloc(size); + sb->sb_cc = 0; +- if (sb->sb_wptr) +- sb->sb_datalen = size; +- else +- sb->sb_datalen = 0; ++ sb->sb_datalen = size; + } + } + +diff --git a/slirp/src/socket.c b/slirp/src/socket.c +index 34daffc..ace18bf 100644 +--- a/slirp/src/socket.c ++++ b/slirp/src/socket.c +@@ -95,7 +95,7 @@ void sofree(struct socket *so) + remque(so); /* crashes if so is not in a queue */ + + if (so->so_tcpcb) { +- free(so->so_tcpcb); ++ g_free(so->so_tcpcb); + } + g_free(so); + } +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index 26d4ead..4e5a801 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -255,11 +255,7 @@ struct tcpcb *tcp_newtcpcb(struct socket *so) + { + register struct tcpcb *tp; + +- tp = (struct tcpcb *)malloc(sizeof(*tp)); +- if (tp == NULL) +- return ((struct tcpcb *)0); +- +- memset((char *)tp, 0, sizeof(struct tcpcb)); ++ tp = g_new0(struct tcpcb, 1); + tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp; + tp->t_maxseg = (so->so_ffamily == AF_INET) ? TCP_MSS : TCP6_MSS; + +@@ -330,7 +326,7 @@ struct tcpcb *tcp_close(struct tcpcb *tp) + remque(tcpiphdr2qlink(tcpiphdr_prev(t))); + m_free(m); + } +- free(tp); ++ g_free(tp); + so->so_tcpcb = NULL; + /* clobber input socket cache if we're closing the cached connection */ + if (so == slirp->tcp_last_so) +-- +1.8.3.1 + diff --git a/kvm-Revert-RHEL-disable-hostmem-memfd.patch b/kvm-Revert-RHEL-disable-hostmem-memfd.patch new file mode 100755 index 0000000000000000000000000000000000000000..f9597523a173f9ccf387ddb49751c077a5ad1da7 --- /dev/null +++ b/kvm-Revert-RHEL-disable-hostmem-memfd.patch @@ -0,0 +1,58 @@ +From 559d5899473dea180ced39a32bfbfbf2310c6e04 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Mon, 25 May 2020 15:33:06 +0100 +Subject: [PATCH 4/7] Revert "RHEL: disable hostmem-memfd" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200525153306.15373-1-marcandre.lureau@redhat.com> +Patchwork-id: 96747 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH] Revert "RHEL: disable hostmem-memfd" +Bugzilla: 1839030 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefano Garzarella + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1839030 +BRANCH: rhel-av-8.2.1 +UPSTREAM: RHEL-only +BREW: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=28817132 + +This reverts commit f7587ddb9a2731bf678a24156b6285dda79a4b2b. + +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + backends/Makefile.objs | 3 +-- + util/memfd.c | 2 +- + 2 files changed, 2 insertions(+), 3 deletions(-) + +diff --git a/backends/Makefile.objs b/backends/Makefile.objs +index f328d40..f069111 100644 +--- a/backends/Makefile.objs ++++ b/backends/Makefile.objs +@@ -16,5 +16,4 @@ endif + + common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o + +-# RHEL: disable memfd +-# common-obj-$(CONFIG_LINUX) += hostmem-memfd.o ++common-obj-$(CONFIG_LINUX) += hostmem-memfd.o +diff --git a/util/memfd.c b/util/memfd.c +index 3303ec9..4a3c07e 100644 +--- a/util/memfd.c ++++ b/util/memfd.c +@@ -193,7 +193,7 @@ bool qemu_memfd_alloc_check(void) + */ + bool qemu_memfd_check(unsigned int flags) + { +-#if 0 /* RHEL: memfd support disabled */ ++#ifdef CONFIG_LINUX + int mfd = memfd_create("test", flags | MFD_CLOEXEC); + + if (mfd >= 0) { +-- +1.8.3.1 + diff --git a/kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch b/kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch new file mode 100755 index 0000000000000000000000000000000000000000..0c1c37fe86368373ceb8e35249d9a266cb43820e --- /dev/null +++ b/kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch @@ -0,0 +1,121 @@ +From 71b5267ed33f9e60bc98acbabcbed62f01a96ff4 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 30 Mar 2020 11:19:23 +0100 +Subject: [PATCH 3/4] Revert "mirror: Don't let an operation wait for itself" + +RH-Author: Kevin Wolf +Message-id: <20200330111924.22938-2-kwolf@redhat.com> +Patchwork-id: 94464 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] Revert "mirror: Don't let an operation wait for itself" +Bugzilla: 1794692 +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +This reverts commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca. + +The fix was incomplete as it only protected against requests waiting for +themselves, but not against requests waiting for each other. We need a +different solution. + +Signed-off-by: Kevin Wolf +Message-Id: <20200326153628.4869-2-kwolf@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit 9178f4fe5f083064f5c91f04d98c815ce5a5af1c) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 21 +++++++++------------ + 1 file changed, 9 insertions(+), 12 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index cacbc70..8959e42 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -283,14 +283,11 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, + } + + static inline void coroutine_fn +-mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) ++mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) + { + MirrorOp *op; + + QTAILQ_FOREACH(op, &s->ops_in_flight, next) { +- if (self == op) { +- continue; +- } + /* Do not wait on pseudo ops, because it may in turn wait on + * some other operation to start, which may in fact be the + * caller of this function. Since there is only one pseudo op +@@ -305,10 +302,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) + } + + static inline void coroutine_fn +-mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self) ++mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) + { + /* Only non-active operations use up in-flight slots */ +- mirror_wait_for_any_operation(s, self, false); ++ mirror_wait_for_any_operation(s, false); + } + + /* Perform a mirror copy operation. +@@ -351,7 +348,7 @@ static void coroutine_fn mirror_co_read(void *opaque) + + while (s->buf_free_count < nb_chunks) { + trace_mirror_yield_in_flight(s, op->offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, op); ++ mirror_wait_for_free_in_flight_slot(s); + } + + /* Now make a QEMUIOVector taking enough granularity-sized chunks +@@ -558,7 +555,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) + + while (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield_in_flight(s, offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, pseudo_op); ++ mirror_wait_for_free_in_flight_slot(s); + } + + if (s->ret < 0) { +@@ -612,7 +609,7 @@ static void mirror_free_init(MirrorBlockJob *s) + static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) + { + while (s->in_flight > 0) { +- mirror_wait_for_free_in_flight_slot(s, NULL); ++ mirror_wait_for_free_in_flight_slot(s); + } + } + +@@ -797,7 +794,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) + if (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, + s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, NULL); ++ mirror_wait_for_free_in_flight_slot(s); + continue; + } + +@@ -950,7 +947,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + /* Do not start passive operations while there are active + * writes in progress */ + while (s->in_active_write_counter) { +- mirror_wait_for_any_operation(s, NULL, true); ++ mirror_wait_for_any_operation(s, true); + } + + if (s->ret < 0) { +@@ -976,7 +973,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || + (cnt == 0 && s->in_flight > 0)) { + trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s, NULL); ++ mirror_wait_for_free_in_flight_slot(s); + continue; + } else if (cnt != 0) { + delay_ns = mirror_iteration(s); +-- +1.8.3.1 + diff --git a/kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch b/kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch new file mode 100755 index 0000000000000000000000000000000000000000..dc65c26c1930209ba3e128a06ef1bd6469479bfd --- /dev/null +++ b/kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch @@ -0,0 +1,63 @@ +From ceb6d97674b8bc9a072db1be4167411bc0ee48d7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:02 +0100 +Subject: [PATCH 091/116] Virtiofsd: fix memory leak on fuse queueinfo +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-88-dgilbert@redhat.com> +Patchwork-id: 93542 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 087/112] Virtiofsd: fix memory leak on fuse queueinfo +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +For fuse's queueinfo, both queueinfo array and queueinfos are allocated in +fv_queue_set_started() but not cleaned up when the daemon process quits. + +This fixes the leak in proper places. + +Signed-off-by: Liu Bo +Signed-off-by: Eric Ren +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 740b0b700a6338a1cf60c26229651ac5f6724944) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index b7948de..fb8d6d1 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -625,6 +625,8 @@ static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) + } + close(ourqi->kill_fd); + ourqi->kick_fd = -1; ++ free(vud->qi[qidx]); ++ vud->qi[qidx] = NULL; + } + + /* Callback from libvhost-user on start or stop of a queue */ +@@ -884,6 +886,12 @@ int virtio_session_mount(struct fuse_session *se) + void virtio_session_close(struct fuse_session *se) + { + close(se->vu_socketfd); ++ ++ if (!se->virtio_dev) { ++ return; ++ } ++ ++ free(se->virtio_dev->qi); + free(se->virtio_dev); + se->virtio_dev = NULL; + } +-- +1.8.3.1 + diff --git a/kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch b/kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch new file mode 100755 index 0000000000000000000000000000000000000000..1538d11f23ec753745952d658fe1190de845f554 --- /dev/null +++ b/kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch @@ -0,0 +1,82 @@ +From dcac680adb6b8624f14eda3e812521bddbe8ecea Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:04 -0400 +Subject: [PATCH 5/7] acpi: accept byte and word access to core ACPI registers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-5-jmaloy@redhat.com> +Patchwork-id: 101482 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 4/6] acpi: accept byte and word access to core ACPI registers +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Michael Tokarev + +All ISA registers should be accessible as bytes, words or dwords +(if wide enough). Fix the access constraints for acpi-pm-evt, +acpi-pm-tmr & acpi-cnt registers. + +Fixes: 5d971f9e67 (memory: Revert "memory: accept mismatching sizes in memory_region_access_valid") +Fixes: afafe4bbe0 (apci: switch cnt to memory api) +Fixes: 77d58b1e47 (apci: switch timer to memory api) +Fixes: b5a7c024d2 (apci: switch evt to memory api) +Buglink: https://lore.kernel.org/xen-devel/20200630170913.123646-1-anthony.perard@citrix.com/T/ +Buglink: https://bugs.debian.org/964793 +BugLink: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=964247 +BugLink: https://bugs.launchpad.net/bugs/1886318 +Reported-By: Simon John +Signed-off-by: Michael Tokarev +Message-Id: <20200720160627.15491-1-mjt@msgid.tls.msk.ru> +Cc: qemu-stable@nongnu.org +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin + +(cherry picked from commit dba04c3488c4699f5afe96f66e448b1d447cf3fb) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/core.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/hw/acpi/core.c b/hw/acpi/core.c +index 45cbed49ab..d85052c34a 100644 +--- a/hw/acpi/core.c ++++ b/hw/acpi/core.c +@@ -461,7 +461,8 @@ static void acpi_pm_evt_write(void *opaque, hwaddr addr, uint64_t val, + static const MemoryRegionOps acpi_pm_evt_ops = { + .read = acpi_pm_evt_read, + .write = acpi_pm_evt_write, +- .valid.min_access_size = 2, ++ .impl.min_access_size = 2, ++ .valid.min_access_size = 1, + .valid.max_access_size = 2, + .endianness = DEVICE_LITTLE_ENDIAN, + }; +@@ -530,7 +531,8 @@ static void acpi_pm_tmr_write(void *opaque, hwaddr addr, uint64_t val, + static const MemoryRegionOps acpi_pm_tmr_ops = { + .read = acpi_pm_tmr_read, + .write = acpi_pm_tmr_write, +- .valid.min_access_size = 4, ++ .impl.min_access_size = 4, ++ .valid.min_access_size = 1, + .valid.max_access_size = 4, + .endianness = DEVICE_LITTLE_ENDIAN, + }; +@@ -602,7 +604,8 @@ static void acpi_pm_cnt_write(void *opaque, hwaddr addr, uint64_t val, + static const MemoryRegionOps acpi_pm_cnt_ops = { + .read = acpi_pm_cnt_read, + .write = acpi_pm_cnt_write, +- .valid.min_access_size = 2, ++ .impl.min_access_size = 2, ++ .valid.min_access_size = 1, + .valid.max_access_size = 2, + .endianness = DEVICE_LITTLE_ENDIAN, + }; +-- +2.27.0 + diff --git a/kvm-aio-posix-completely-stop-polling-when-disabled.patch b/kvm-aio-posix-completely-stop-polling-when-disabled.patch new file mode 100755 index 0000000000000000000000000000000000000000..39931811e006fc20ccccfe3dc0d73de44e20c15f --- /dev/null +++ b/kvm-aio-posix-completely-stop-polling-when-disabled.patch @@ -0,0 +1,104 @@ +From 4b4fb1cccb8e0307658cee3bc90c77e5f1dde60a Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:49 -0400 +Subject: [PATCH 13/14] aio-posix: completely stop polling when disabled + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-10-thuth@redhat.com> +Patchwork-id: 98603 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 9/9] aio-posix: completely stop polling when disabled +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +From: Stefan Hajnoczi + +One iteration of polling is always performed even when polling is +disabled. This is done because: +1. Userspace polling is cheaper than making a syscall. We might get + lucky. +2. We must poll once more after polling has stopped in case an event + occurred while stopping polling. + +However, there are downsides: +1. Polling becomes a bottleneck when the number of event sources is very + high. It's more efficient to monitor fds in that case. +2. A high-frequency polling event source can starve non-polling event + sources because ppoll(2)/epoll(7) is never invoked. + +This patch removes the forced polling iteration so that poll_ns=0 really +means no polling. + +IOPS increases from 10k to 60k when the guest has 100 +virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 +device because the large number of event sources being polled slows down +the event loop. + +Signed-off-by: Stefan Hajnoczi +Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com +Message-Id: <20200305170806.1313245-2-stefanha@redhat.com> +(cherry picked from commit e4346192f1c2e1683a807b46efac47ef0cf9b545) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + util/aio-posix.c | 22 +++++++++++++++------- + 1 file changed, 15 insertions(+), 7 deletions(-) + +diff --git a/util/aio-posix.c b/util/aio-posix.c +index a4977f538e..abc396d030 100644 +--- a/util/aio-posix.c ++++ b/util/aio-posix.c +@@ -340,12 +340,13 @@ void aio_set_event_notifier_poll(AioContext *ctx, + (IOHandler *)io_poll_end); + } + +-static void poll_set_started(AioContext *ctx, bool started) ++static bool poll_set_started(AioContext *ctx, bool started) + { + AioHandler *node; ++ bool progress = false; + + if (started == ctx->poll_started) { +- return; ++ return false; + } + + ctx->poll_started = started; +@@ -367,8 +368,15 @@ static void poll_set_started(AioContext *ctx, bool started) + if (fn) { + fn(node->opaque); + } ++ ++ /* Poll one last time in case ->io_poll_end() raced with the event */ ++ if (!started) { ++ progress = node->io_poll(node->opaque) || progress; ++ } + } + qemu_lockcnt_dec(&ctx->list_lock); ++ ++ return progress; + } + + +@@ -599,12 +607,12 @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) + } + } + +- poll_set_started(ctx, false); ++ if (poll_set_started(ctx, false)) { ++ *timeout = 0; ++ return true; ++ } + +- /* Even if we don't run busy polling, try polling once in case it can make +- * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2). +- */ +- return run_poll_handlers_once(ctx, timeout); ++ return false; + } + + bool aio_poll(AioContext *ctx, bool blocking) +-- +2.27.0 + diff --git a/kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch b/kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch new file mode 100755 index 0000000000000000000000000000000000000000..a234140b92b0b7ced6322275c76d16128cfc7f12 --- /dev/null +++ b/kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch @@ -0,0 +1,132 @@ +From b474155fdc38f86f516c14ba9a6f934616d589ef Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Wed, 4 Aug 2021 03:27:22 -0400 +Subject: [PATCH 1/2] aio-wait: delegate polling of main AioContext if BQL not + held + +RH-Author: Andrew Jones +Message-id: <20210729134448.4995-2-drjones@redhat.com> +Patchwork-id: 101935 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 1/2] aio-wait: delegate polling of main AioContext if BQL not held +Bugzilla: 1969848 +RH-Acked-by: Gavin Shan +RH-Acked-by: Auger Eric +RH-Acked-by: Stefan Hajnoczi + +From: Paolo Bonzini + +Any thread that is not a iothread returns NULL for qemu_get_current_aio_context(). +As a result, it would also return true for +in_aio_context_home_thread(qemu_get_aio_context()), causing +AIO_WAIT_WHILE to invoke aio_poll() directly. This is incorrect +if the BQL is not held, because aio_poll() does not expect to +run concurrently from multiple threads, and it can actually +happen when savevm writes to the vmstate file from the +migration thread. + +Therefore, restrict in_aio_context_home_thread to return true +for the main AioContext only if the BQL is held. + +The function is moved to aio-wait.h because it is mostly used +there and to avoid a circular reference between main-loop.h +and block/aio.h. + +Signed-off-by: Paolo Bonzini +Message-Id: <20200407140746.8041-5-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 3c18a92dc4b55ca8cc37a755ed119f11c0f34099) +Signed-off-by: Andrew Jones +Signed-off-by: Miroslav Rezanina +--- + include/block/aio-wait.h | 22 ++++++++++++++++++++++ + include/block/aio.h | 29 ++++++++++------------------- + 2 files changed, 32 insertions(+), 19 deletions(-) + +diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h +index afeeb18f95..716d2639df 100644 +--- a/include/block/aio-wait.h ++++ b/include/block/aio-wait.h +@@ -26,6 +26,7 @@ + #define QEMU_AIO_WAIT_H + + #include "block/aio.h" ++#include "qemu/main-loop.h" + + /** + * AioWait: +@@ -124,4 +125,25 @@ void aio_wait_kick(void); + */ + void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); + ++/** ++ * in_aio_context_home_thread: ++ * @ctx: the aio context ++ * ++ * Return whether we are running in the thread that normally runs @ctx. Note ++ * that acquiring/releasing ctx does not affect the outcome, each AioContext ++ * still only has one home thread that is responsible for running it. ++ */ ++static inline bool in_aio_context_home_thread(AioContext *ctx) ++{ ++ if (ctx == qemu_get_current_aio_context()) { ++ return true; ++ } ++ ++ if (ctx == qemu_get_aio_context()) { ++ return qemu_mutex_iothread_locked(); ++ } else { ++ return false; ++ } ++} ++ + #endif /* QEMU_AIO_WAIT_H */ +diff --git a/include/block/aio.h b/include/block/aio.h +index 6b0d52f732..9d28e247df 100644 +--- a/include/block/aio.h ++++ b/include/block/aio.h +@@ -60,12 +60,16 @@ struct AioContext { + QLIST_HEAD(, AioHandler) aio_handlers; + + /* Used to avoid unnecessary event_notifier_set calls in aio_notify; +- * accessed with atomic primitives. If this field is 0, everything +- * (file descriptors, bottom halves, timers) will be re-evaluated +- * before the next blocking poll(), thus the event_notifier_set call +- * can be skipped. If it is non-zero, you may need to wake up a +- * concurrent aio_poll or the glib main event loop, making +- * event_notifier_set necessary. ++ * only written from the AioContext home thread, or under the BQL in ++ * the case of the main AioContext. However, it is read from any ++ * thread so it is still accessed with atomic primitives. ++ * ++ * If this field is 0, everything (file descriptors, bottom halves, ++ * timers) will be re-evaluated before the next blocking poll() or ++ * io_uring wait; therefore, the event_notifier_set call can be ++ * skipped. If it is non-zero, you may need to wake up a concurrent ++ * aio_poll or the glib main event loop, making event_notifier_set ++ * necessary. + * + * Bit 0 is reserved for GSource usage of the AioContext, and is 1 + * between a call to aio_ctx_prepare and the next call to aio_ctx_check. +@@ -580,19 +584,6 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co); + */ + AioContext *qemu_get_current_aio_context(void); + +-/** +- * in_aio_context_home_thread: +- * @ctx: the aio context +- * +- * Return whether we are running in the thread that normally runs @ctx. Note +- * that acquiring/releasing ctx does not affect the outcome, each AioContext +- * still only has one home thread that is responsible for running it. +- */ +-static inline bool in_aio_context_home_thread(AioContext *ctx) +-{ +- return ctx == qemu_get_current_aio_context(); +-} +- + /** + * aio_context_setup: + * @ctx: the aio context +-- +2.27.0 + diff --git a/kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch b/kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch new file mode 100755 index 0000000000000000000000000000000000000000..becba216f025c9e6440ac28657667d6a2cc823ff --- /dev/null +++ b/kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch @@ -0,0 +1,62 @@ +From 0d5a09173eb75b7e56122c2aefb2646a2be58400 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 31 Jan 2020 17:12:57 +0000 +Subject: [PATCH 15/15] apic: Use 32bit APIC ID for migration instance ID + +RH-Author: Peter Xu +Message-id: <20200131171257.1066593-4-peterx@redhat.com> +Patchwork-id: 93628 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/3] apic: Use 32bit APIC ID for migration instance ID +Bugzilla: 1529231 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +Migration is silently broken now with x2apic config like this: + + -smp 200,maxcpus=288,sockets=2,cores=72,threads=2 \ + -device intel-iommu,intremap=on,eim=on + +After migration, the guest kernel could hang at anything, due to +x2apic bit not migrated correctly in IA32_APIC_BASE on some vcpus, so +any operations related to x2apic could be broken then (e.g., RDMSR on +x2apic MSRs could fail because KVM would think that the vcpu hasn't +enabled x2apic at all). + +The issue is that the x2apic bit was never applied correctly for vcpus +whose ID > 255 when migrate completes, and that's because when we +migrate APIC we use the APICCommonState.id as instance ID of the +migration stream, while that's too short for x2apic. + +Let's use the newly introduced initial_apic_id for that. + +Signed-off-by: Peter Xu +Reviewed-by: Juan Quintela +Reviewed-by: Eduardo Habkost +Signed-off-by: Juan Quintela +(cherry picked from commit 0ab994867c365db21e15f9503922c79234d8e40e) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/apic_common.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c +index 54b8731..b5dbeb6 100644 +--- a/hw/intc/apic_common.c ++++ b/hw/intc/apic_common.c +@@ -268,7 +268,10 @@ static void apic_common_realize(DeviceState *dev, Error **errp) + APICCommonState *s = APIC_COMMON(dev); + APICCommonClass *info; + static DeviceState *vapic; +- uint32_t instance_id = s->id; ++ uint32_t instance_id = s->initial_apic_id; ++ ++ /* Normally initial APIC ID should be no more than hundreds */ ++ assert(instance_id != VMSTATE_INSTANCE_ID_ANY); + + info = APIC_COMMON_GET_CLASS(s); + info->realize(dev, errp); +-- +1.8.3.1 + diff --git a/kvm-async-use-explicit-memory-barriers.patch b/kvm-async-use-explicit-memory-barriers.patch new file mode 100755 index 0000000000000000000000000000000000000000..2bf72454ab08b15dfd1f909581595ca8fc86a134 --- /dev/null +++ b/kvm-async-use-explicit-memory-barriers.patch @@ -0,0 +1,183 @@ +From 82a02aec3a8b3c2ac925d0b71ea4c35aa5d6463b Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Wed, 4 Aug 2021 03:27:24 -0400 +Subject: [PATCH 2/2] async: use explicit memory barriers + +RH-Author: Andrew Jones +Message-id: <20210729134448.4995-3-drjones@redhat.com> +Patchwork-id: 101937 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 2/2] async: use explicit memory barriers +Bugzilla: 1969848 +RH-Acked-by: Gavin Shan +RH-Acked-by: Auger Eric +RH-Acked-by: Stefan Hajnoczi + +From: Paolo Bonzini + +When using C11 atomics, non-seqcst reads and writes do not participate +in the total order of seqcst operations. In util/async.c and util/aio-posix.c, +in particular, the pattern that we use + + write ctx->notify_me write bh->scheduled + read bh->scheduled read ctx->notify_me + if !bh->scheduled, sleep if ctx->notify_me, notify + +needs to use seqcst operations for both the write and the read. In +general this is something that we do not want, because there can be +many sources that are polled in addition to bottom halves. The +alternative is to place a seqcst memory barrier between the write +and the read. This also comes with a disadvantage, in that the +memory barrier is implicit on strongly-ordered architectures and +it wastes a few dozen clock cycles. + +Fortunately, ctx->notify_me is never written concurrently by two +threads, so we can assert that and relax the writes to ctx->notify_me. +The resulting solution works and performs well on both aarch64 and x86. + +Note that the atomic_set/atomic_read combination is not an atomic +read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED; +on x86, ATOMIC_RELAXED compiles to a locked operation. + +Analyzed-by: Ying Fang +Signed-off-by: Paolo Bonzini +Tested-by: Ying Fang +Message-Id: <20200407140746.8041-6-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 5710a3e09f9b85801e5ce70797a4a511e5fc9e2c) +Signed-off-by: Andrew Jones +Signed-off-by: Miroslav Rezanina +--- + util/aio-posix.c | 16 ++++++++++++++-- + util/aio-win32.c | 17 ++++++++++++++--- + util/async.c | 16 ++++++++++++---- + 3 files changed, 40 insertions(+), 9 deletions(-) + +diff --git a/util/aio-posix.c b/util/aio-posix.c +index abc396d030..8cfb25650d 100644 +--- a/util/aio-posix.c ++++ b/util/aio-posix.c +@@ -624,6 +624,11 @@ bool aio_poll(AioContext *ctx, bool blocking) + int64_t timeout; + int64_t start = 0; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). ++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ + assert(in_aio_context_home_thread(ctx)); + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -634,7 +639,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -679,7 +690,8 @@ bool aio_poll(AioContext *ctx, bool blocking) + } + + if (blocking) { +- atomic_sub(&ctx->notify_me, 2); ++ /* Finish the poll before clearing the flag. */ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/aio-win32.c b/util/aio-win32.c +index a23b9c364d..729d533faf 100644 +--- a/util/aio-win32.c ++++ b/util/aio-win32.c +@@ -321,6 +321,12 @@ bool aio_poll(AioContext *ctx, bool blocking) + int count; + int timeout; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). ++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ ++ assert(in_aio_context_home_thread(ctx)); + progress = false; + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -331,7 +337,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -364,8 +376,7 @@ bool aio_poll(AioContext *ctx, bool blocking) + ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (blocking) { + assert(first); +- assert(in_aio_context_home_thread(ctx)); +- atomic_sub(&ctx->notify_me, 2); ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/async.c b/util/async.c +index b1fa5319e5..c65c58bbc9 100644 +--- a/util/async.c ++++ b/util/async.c +@@ -220,7 +220,14 @@ aio_ctx_prepare(GSource *source, gint *timeout) + { + AioContext *ctx = (AioContext *) source; + +- atomic_or(&ctx->notify_me, 1); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1); ++ ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + + /* We assume there is no timeout already supplied */ + *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); +@@ -238,7 +245,8 @@ aio_ctx_check(GSource *source) + AioContext *ctx = (AioContext *) source; + QEMUBH *bh; + +- atomic_and(&ctx->notify_me, ~1); ++ /* Finish computing the timeout before clearing the flag. */ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1); + aio_notify_accept(ctx); + + for (bh = ctx->first_bh; bh; bh = bh->next) { +@@ -343,10 +351,10 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) + void aio_notify(AioContext *ctx) + { + /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs +- * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll. ++ * with smp_mb in aio_ctx_prepare or aio_poll. + */ + smp_mb(); +- if (ctx->notify_me) { ++ if (atomic_read(&ctx->notify_me)) { + event_notifier_set(&ctx->notifier); + atomic_mb_set(&ctx->notified, true); + } +-- +2.27.0 + diff --git a/kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch b/kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..1a20688b6f6836d53b4295a42b3126aa4762a351 --- /dev/null +++ b/kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch @@ -0,0 +1,53 @@ +From 96c8fcafa7325cd0e8a23a743a55f0ad0aa9f79b Mon Sep 17 00:00:00 2001 +From: Gerd Hoffmann +Date: Thu, 18 Mar 2021 09:13:42 -0400 +Subject: [PATCH 5/5] audio: audio_generic_get_buffer_in should honor *size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Gerd Hoffmann +Message-id: <20210318091342.3232471-2-kraxel@redhat.com> +Patchwork-id: 101352 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] audio: audio_generic_get_buffer_in should honor *size +Bugzilla: 1932823 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé + +From: Volker Rümelin + +The function generic_get_buffer_in currently ignores the *size +parameter and may return a buffer larger than *size. + +As a result the variable samples in function +audio_pcm_hw_run_in may underflow. The while loop then most +likely will never termiate. + +Buglink: http://bugs.debian.org/948658 +Signed-off-by: Volker Rümelin +Message-Id: <20200123074943.6699-9-vr_qemu@t-online.de> +Signed-off-by: Gerd Hoffmann +(cherry picked from commit 599eac4e5a41e828645594097daee39373acc3c0) +Signed-off-by: Danilo C. L. de Paula +--- + audio/audio.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/audio/audio.c b/audio/audio.c +index 56fae55047..39a62fc62a 100644 +--- a/audio/audio.c ++++ b/audio/audio.c +@@ -1402,7 +1402,8 @@ void *audio_generic_get_buffer_in(HWVoiceIn *hw, size_t *size) + } + assert(start >= 0 && start < hw->size_emul); + +- *size = MIN(hw->pending_emul, hw->size_emul - start); ++ *size = MIN(*size, hw->pending_emul); ++ *size = MIN(*size, hw->size_emul - start); + return hw->buf_emul + start; + } + +-- +2.27.0 + diff --git a/kvm-backup-Improve-error-for-bdrv_getlength-failure.patch b/kvm-backup-Improve-error-for-bdrv_getlength-failure.patch new file mode 100755 index 0000000000000000000000000000000000000000..8fa2629f9e2f8a06ee6ed976d3dd8f9a6a252b2b --- /dev/null +++ b/kvm-backup-Improve-error-for-bdrv_getlength-failure.patch @@ -0,0 +1,51 @@ +From fba183faf8ce819262a1a47f8531ea68051cdce7 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:19 +0100 +Subject: [PATCH 20/26] backup: Improve error for bdrv_getlength() failure + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-6-kwolf@redhat.com> +Patchwork-id: 97103 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 05/11] backup: Improve error for bdrv_getlength() failure +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +bdrv_get_device_name() will be an empty string with modern management +tools that don't use -drive. Use bdrv_get_device_or_node_name() instead +so that the node name is used if the BlockBackend is anonymous. + +While at it, start with upper case to make the message consistent with +the rest of the function. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Message-Id: <20200430142755.315494-3-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 58226634c4b02af7b10862f7fbd3610a344bfb7f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/backup.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/backup.c b/block/backup.c +index ec50946..7c6ddd2 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -408,8 +408,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + + len = bdrv_getlength(bs); + if (len < 0) { +- error_setg_errno(errp, -len, "unable to get length for '%s'", +- bdrv_get_device_name(bs)); ++ error_setg_errno(errp, -len, "Unable to get length for '%s'", ++ bdrv_get_device_or_node_name(bs)); + goto error; + } + +-- +1.8.3.1 + diff --git a/kvm-backup-Make-sure-that-source-and-target-size-match.patch b/kvm-backup-Make-sure-that-source-and-target-size-match.patch new file mode 100755 index 0000000000000000000000000000000000000000..05b5d1071ce199e7b1c853587453cf34ff1a8f29 --- /dev/null +++ b/kvm-backup-Make-sure-that-source-and-target-size-match.patch @@ -0,0 +1,124 @@ +From e56abd782be8bb41bb07c0317d008f95ec9a8ee5 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:20 +0100 +Subject: [PATCH 21/26] backup: Make sure that source and target size match + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-7-kwolf@redhat.com> +Patchwork-id: 97107 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 06/11] backup: Make sure that source and target size match +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +Since the introduction of a backup filter node in commit 00e30f05d, the +backup block job crashes when the target image is smaller than the +source image because it will try to write after the end of the target +node without having BLK_PERM_RESIZE. (Previously, the BlockBackend layer +would have caught this and errored out gracefully.) + +We can fix this and even do better than the old behaviour: Check that +source and target have the same image size at the start of the block job +and unshare BLK_PERM_RESIZE. (This permission was already unshared +before the same commit 00e30f05d, but the BlockBackend that was used to +make the restriction was removed without a replacement.) This will +immediately error out when starting the job instead of only when writing +to a block that doesn't exist in the target. + +Longer target than source would technically work because we would never +write to blocks that don't exist, but semantically these are invalid, +too, because a backup is supposed to create a copy, not just an image +that starts with a copy. + +Fixes: 00e30f05de1d19586345ec373970ef4c192c6270 +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1778593 +Cc: qemu-stable@nongnu.org +Signed-off-by: Kevin Wolf +Message-Id: <20200430142755.315494-4-kwolf@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 958a04bd32af18d9a207bcc78046e56a202aebc2) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/backup-top.c | 14 +++++++++----- + block/backup.c | 14 +++++++++++++- + 2 files changed, 22 insertions(+), 6 deletions(-) + +diff --git a/block/backup-top.c b/block/backup-top.c +index b8d863f..6756091 100644 +--- a/block/backup-top.c ++++ b/block/backup-top.c +@@ -143,8 +143,10 @@ static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c, + * + * Share write to target (child_file), to not interfere + * with guest writes to its disk which may be in target backing chain. ++ * Can't resize during a backup block job because we check the size ++ * only upfront. + */ +- *nshared = BLK_PERM_ALL; ++ *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE; + *nperm = BLK_PERM_WRITE; + } else { + /* Source child */ +@@ -154,7 +156,7 @@ static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c, + if (perm & BLK_PERM_WRITE) { + *nperm = *nperm | BLK_PERM_CONSISTENT_READ; + } +- *nshared &= ~BLK_PERM_WRITE; ++ *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); + } + } + +@@ -187,10 +189,12 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState *source, + { + Error *local_err = NULL; + BDRVBackupTopState *state; +- BlockDriverState *top = bdrv_new_open_driver(&bdrv_backup_top_filter, +- filter_node_name, +- BDRV_O_RDWR, errp); ++ BlockDriverState *top; ++ ++ assert(source->total_sectors == target->total_sectors); + ++ top = bdrv_new_open_driver(&bdrv_backup_top_filter, filter_node_name, ++ BDRV_O_RDWR, errp); + if (!top) { + return NULL; + } +diff --git a/block/backup.c b/block/backup.c +index 7c6ddd2..821c9fb 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -348,7 +348,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque, + JobTxn *txn, Error **errp) + { +- int64_t len; ++ int64_t len, target_len; + BackupBlockJob *job = NULL; + int64_t cluster_size; + BdrvRequestFlags write_flags; +@@ -413,6 +413,18 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + goto error; + } + ++ target_len = bdrv_getlength(target); ++ if (target_len < 0) { ++ error_setg_errno(errp, -target_len, "Unable to get length for '%s'", ++ bdrv_get_device_or_node_name(bs)); ++ goto error; ++ } ++ ++ if (target_len != len) { ++ error_setg(errp, "Source and target image have different sizes"); ++ goto error; ++ } ++ + cluster_size = backup_calculate_cluster_size(target, errp); + if (cluster_size < 0) { + goto error; +-- +1.8.3.1 + diff --git a/kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch b/kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch new file mode 100755 index 0000000000000000000000000000000000000000..7fb76c1eaa7d3f886f436b2d6fb03d1f0d3e4bf4 --- /dev/null +++ b/kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch @@ -0,0 +1,57 @@ +From 619b3aac9790a7ca7c01846144395a318a9ab250 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:14 +0100 +Subject: [PATCH 3/6] backup: don't acquire aio_context in backup_clean + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-4-kwolf@redhat.com> +Patchwork-id: 94596 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/6] backup: don't acquire aio_context in backup_clean +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +From: Stefan Reiter + +All code-paths leading to backup_clean (via job_clean) have the job's +context already acquired. The job's context is guaranteed to be the same +as the one used by backup_top via backup_job_create. + +Since the previous logic effectively acquired the lock twice, this +broke cleanup of backups for disks using IO threads, since the BDRV_POLL_WHILE +in bdrv_backup_top_drop -> bdrv_do_drained_begin would only release the lock +once, thus deadlocking with the IO thread. + +This is a partial revert of 0abf2581717a19. + +Signed-off-by: Stefan Reiter +Reviewed-by: Max Reitz +Message-Id: <20200407115651.69472-4-s.reiter@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit eca0f3524a4eb57d03a56b0cbcef5527a0981ce4) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/backup.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/block/backup.c b/block/backup.c +index 1383e21..ec50946 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -135,11 +135,7 @@ static void backup_abort(Job *job) + static void backup_clean(Job *job) + { + BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); +- AioContext *aio_context = bdrv_get_aio_context(s->backup_top); +- +- aio_context_acquire(aio_context); + bdrv_backup_top_drop(s->backup_top); +- aio_context_release(aio_context); + } + + void backup_do_checkpoint(BlockJob *job, Error **errp) +-- +1.8.3.1 + diff --git a/kvm-backup-top-Begin-drain-earlier.patch b/kvm-backup-top-Begin-drain-earlier.patch new file mode 100755 index 0000000000000000000000000000000000000000..ef289b76e3bd1780b574dfc82e8bda2ea6f0c864 --- /dev/null +++ b/kvm-backup-top-Begin-drain-earlier.patch @@ -0,0 +1,56 @@ +From bc78ee07bf400cbff0021367e05d308870471710 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:45 +0000 +Subject: [PATCH 12/18] backup-top: Begin drain earlier + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-6-slp@redhat.com> +Patchwork-id: 93757 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 5/9] backup-top: Begin drain earlier +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +From: Max Reitz + +When dropping backup-top, we need to drain the node before freeing the +BlockCopyState. Otherwise, requests may still be in flight and then the +assertion in shres_destroy() will fail. + +(This becomes visible in intermittent failure of 056.) + +Cc: qemu-stable@nongnu.org +Signed-off-by: Max Reitz +Message-id: 20191219182638.104621-1-mreitz@redhat.com +Signed-off-by: Max Reitz +(cherry picked from commit 503ca1262bab2c11c533a4816d1ff4297d4f58a6) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + block/backup-top.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/backup-top.c b/block/backup-top.c +index 7cdb1f8..818d3f2 100644 +--- a/block/backup-top.c ++++ b/block/backup-top.c +@@ -257,12 +257,12 @@ void bdrv_backup_top_drop(BlockDriverState *bs) + BDRVBackupTopState *s = bs->opaque; + AioContext *aio_context = bdrv_get_aio_context(bs); + +- block_copy_state_free(s->bcs); +- + aio_context_acquire(aio_context); + + bdrv_drained_begin(bs); + ++ block_copy_state_free(s->bcs); ++ + s->active = false; + bdrv_child_refresh_perms(bs, bs->backing, &error_abort); + bdrv_replace_node(bs, backing_bs(bs), &error_abort); +-- +1.8.3.1 + diff --git a/kvm-block-Activate-recursively-even-for-already-active-n.patch b/kvm-block-Activate-recursively-even-for-already-active-n.patch new file mode 100755 index 0000000000000000000000000000000000000000..d6cad0649428cf88a0703a97b47d1e21afa0418c --- /dev/null +++ b/kvm-block-Activate-recursively-even-for-already-active-n.patch @@ -0,0 +1,116 @@ +From 0ef6691ce8964bb2bbd677756c4e594793ca3ad8 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:01 +0000 +Subject: [PATCH 04/18] block: Activate recursively even for already active + nodes + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-4-kwolf@redhat.com> +Patchwork-id: 93749 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 3/6] block: Activate recursively even for already active nodes +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +bdrv_invalidate_cache_all() assumes that all nodes in a given subtree +are either active or inactive when it starts. Therefore, as soon as it +arrives at an already active node, it stops. + +However, this assumption is wrong. For example, it's possible to take a +snapshot of an inactive node, which results in an active overlay over an +inactive backing file. The active overlay is probably also the root node +of an inactive BlockBackend (blk->disable_perm == true). + +In this case, bdrv_invalidate_cache_all() does not need to do anything +to activate the overlay node, but it still needs to recurse into the +children and the parents to make sure that after returning success, +really everything is activated. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +(cherry picked from commit 7bb4941ace471fc7dd6ded4749b95b9622baa6ed) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 50 ++++++++++++++++++++++++-------------------------- + 1 file changed, 24 insertions(+), 26 deletions(-) + +diff --git a/block.c b/block.c +index 473eb6e..2e5e8b6 100644 +--- a/block.c ++++ b/block.c +@@ -5335,10 +5335,6 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, + return; + } + +- if (!(bs->open_flags & BDRV_O_INACTIVE)) { +- return; +- } +- + QLIST_FOREACH(child, &bs->children, next) { + bdrv_co_invalidate_cache(child->bs, &local_err); + if (local_err) { +@@ -5360,34 +5356,36 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, + * just keep the extended permissions for the next time that an activation + * of the image is tried. + */ +- bs->open_flags &= ~BDRV_O_INACTIVE; +- bdrv_get_cumulative_perm(bs, &perm, &shared_perm); +- ret = bdrv_check_perm(bs, NULL, perm, shared_perm, NULL, NULL, &local_err); +- if (ret < 0) { +- bs->open_flags |= BDRV_O_INACTIVE; +- error_propagate(errp, local_err); +- return; +- } +- bdrv_set_perm(bs, perm, shared_perm); +- +- if (bs->drv->bdrv_co_invalidate_cache) { +- bs->drv->bdrv_co_invalidate_cache(bs, &local_err); +- if (local_err) { ++ if (bs->open_flags & BDRV_O_INACTIVE) { ++ bs->open_flags &= ~BDRV_O_INACTIVE; ++ bdrv_get_cumulative_perm(bs, &perm, &shared_perm); ++ ret = bdrv_check_perm(bs, NULL, perm, shared_perm, NULL, NULL, &local_err); ++ if (ret < 0) { + bs->open_flags |= BDRV_O_INACTIVE; + error_propagate(errp, local_err); + return; + } +- } ++ bdrv_set_perm(bs, perm, shared_perm); + +- FOR_EACH_DIRTY_BITMAP(bs, bm) { +- bdrv_dirty_bitmap_skip_store(bm, false); +- } ++ if (bs->drv->bdrv_co_invalidate_cache) { ++ bs->drv->bdrv_co_invalidate_cache(bs, &local_err); ++ if (local_err) { ++ bs->open_flags |= BDRV_O_INACTIVE; ++ error_propagate(errp, local_err); ++ return; ++ } ++ } + +- ret = refresh_total_sectors(bs, bs->total_sectors); +- if (ret < 0) { +- bs->open_flags |= BDRV_O_INACTIVE; +- error_setg_errno(errp, -ret, "Could not refresh total sector count"); +- return; ++ FOR_EACH_DIRTY_BITMAP(bs, bm) { ++ bdrv_dirty_bitmap_skip_store(bm, false); ++ } ++ ++ ret = refresh_total_sectors(bs, bs->total_sectors); ++ if (ret < 0) { ++ bs->open_flags |= BDRV_O_INACTIVE; ++ error_setg_errno(errp, -ret, "Could not refresh total sector count"); ++ return; ++ } + } + + QLIST_FOREACH(parent, &bs->parents, next_parent) { +-- +1.8.3.1 + diff --git a/kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch b/kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch new file mode 100755 index 0000000000000000000000000000000000000000..bc672797e5695168f91d9182fd030e2dc312f869 --- /dev/null +++ b/kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch @@ -0,0 +1,283 @@ +From 13e2076f5c4adbc9a3f96c8978150aa5e423e14a Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:30 +0100 +Subject: [PATCH 02/17] block: Add flags to BlockDriver.bdrv_co_truncate() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-2-kwolf@redhat.com> +Patchwork-id: 97448 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 01/11] block: Add flags to BlockDriver.bdrv_co_truncate() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +This adds a new BdrvRequestFlags parameter to the .bdrv_co_truncate() +driver callbacks, and a supported_truncate_flags field in +BlockDriverState that allows drivers to advertise support for request +flags in the context of truncate. + +For now, we always pass 0 and no drivers declare support for any flag. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-2-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 92b92799dc8662b6f71809100a4aabc1ae408ebb) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/crypto.c | 3 ++- + block/file-posix.c | 2 +- + block/file-win32.c | 2 +- + block/gluster.c | 1 + + block/io.c | 8 +++++++- + block/iscsi.c | 2 +- + block/nfs.c | 3 ++- + block/qcow2.c | 2 +- + block/qed.c | 1 + + block/raw-format.c | 2 +- + block/rbd.c | 1 + + block/sheepdog.c | 4 ++-- + block/ssh.c | 2 +- + include/block/block_int.h | 10 +++++++++- + tests/test-block-iothread.c | 3 ++- + 15 files changed, 33 insertions(+), 13 deletions(-) + +diff --git a/block/crypto.c b/block/crypto.c +index 5e3b15c..6e4b726 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -299,7 +299,8 @@ static int block_crypto_co_create_generic(BlockDriverState *bs, + + static int coroutine_fn + block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + BlockCrypto *crypto = bs->opaque; + uint64_t payload_offset = +diff --git a/block/file-posix.c b/block/file-posix.c +index 1609598..7551e8d 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -2021,7 +2021,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, + + static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVRawState *s = bs->opaque; + struct stat st; +diff --git a/block/file-win32.c b/block/file-win32.c +index 1585983..a6b0dda 100644 +--- a/block/file-win32.c ++++ b/block/file-win32.c +@@ -469,7 +469,7 @@ static void raw_close(BlockDriverState *bs) + + static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVRawState *s = bs->opaque; + LONG low, high; +diff --git a/block/gluster.c b/block/gluster.c +index 0aa1f2c..d06df90 100644 +--- a/block/gluster.c ++++ b/block/gluster.c +@@ -1228,6 +1228,7 @@ static coroutine_fn int qemu_gluster_co_truncate(BlockDriverState *bs, + int64_t offset, + bool exact, + PreallocMode prealloc, ++ BdrvRequestFlags flags, + Error **errp) + { + BDRVGlusterState *s = bs->opaque; +diff --git a/block/io.c b/block/io.c +index f75777f..549e5a4 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -3320,6 +3320,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + BlockDriverState *bs = child->bs; + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; ++ BdrvRequestFlags flags = 0; + int64_t old_size, new_bytes; + int ret; + +@@ -3370,7 +3371,12 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + } + + if (drv->bdrv_co_truncate) { +- ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp); ++ if (flags & ~bs->supported_truncate_flags) { ++ error_setg(errp, "Block driver does not support requested flags"); ++ ret = -ENOTSUP; ++ goto out; ++ } ++ ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); + } else if (bs->file && drv->is_filter) { + ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); + } else { +diff --git a/block/iscsi.c b/block/iscsi.c +index 16b0716..0bea2d3 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -2125,7 +2125,7 @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state) + + static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + IscsiLun *iscsilun = bs->opaque; + int64_t cur_length; +diff --git a/block/nfs.c b/block/nfs.c +index cc2413d..2393fbf 100644 +--- a/block/nfs.c ++++ b/block/nfs.c +@@ -755,7 +755,8 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) + + static int coroutine_fn + nfs_file_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + NFSClient *client = bs->opaque; + int ret; +diff --git a/block/qcow2.c b/block/qcow2.c +index dbd870a..977445e 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -3948,7 +3948,7 @@ fail: + + static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVQcow2State *s = bs->opaque; + uint64_t old_length; +diff --git a/block/qed.c b/block/qed.c +index 1af9b3c..fb6100b 100644 +--- a/block/qed.c ++++ b/block/qed.c +@@ -1467,6 +1467,7 @@ static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs, + int64_t offset, + bool exact, + PreallocMode prealloc, ++ BdrvRequestFlags flags, + Error **errp) + { + BDRVQEDState *s = bs->opaque; +diff --git a/block/raw-format.c b/block/raw-format.c +index 4bb54f4..f994c4a 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -371,7 +371,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) + + static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVRawState *s = bs->opaque; + +diff --git a/block/rbd.c b/block/rbd.c +index 8847259..fcdb60a 100644 +--- a/block/rbd.c ++++ b/block/rbd.c +@@ -1090,6 +1090,7 @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs, + int64_t offset, + bool exact, + PreallocMode prealloc, ++ BdrvRequestFlags flags, + Error **errp) + { + int r; +diff --git a/block/sheepdog.c b/block/sheepdog.c +index a8a7e32..077aed8 100644 +--- a/block/sheepdog.c ++++ b/block/sheepdog.c +@@ -2288,7 +2288,7 @@ static int64_t sd_getlength(BlockDriverState *bs) + + static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVSheepdogState *s = bs->opaque; + int ret, fd; +@@ -2604,7 +2604,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, + + assert(!flags); + if (offset > s->inode.vdi_size) { +- ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL); ++ ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + return ret; + } +diff --git a/block/ssh.c b/block/ssh.c +index 84e9282..9eb33df 100644 +--- a/block/ssh.c ++++ b/block/ssh.c +@@ -1298,7 +1298,7 @@ static int64_t ssh_getlength(BlockDriverState *bs) + + static int coroutine_fn ssh_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp) ++ BdrvRequestFlags flags, Error **errp) + { + BDRVSSHState *s = bs->opaque; + +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 876a83d..41f13ec 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -356,7 +356,7 @@ struct BlockDriver { + */ + int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, +- Error **errp); ++ BdrvRequestFlags flags, Error **errp); + + int64_t (*bdrv_getlength)(BlockDriverState *bs); + bool has_variable_length; +@@ -849,6 +849,14 @@ struct BlockDriverState { + /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, + * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */ + unsigned int supported_zero_flags; ++ /* ++ * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE). ++ * ++ * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure ++ * that any added space reads as all zeros. If this can't be guaranteed, ++ * the operation must fail. ++ */ ++ unsigned int supported_truncate_flags; + + /* the following member gives a name to every node on the bs graph. */ + char node_name[32]; +diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c +index 0c86180..2f3b763 100644 +--- a/tests/test-block-iothread.c ++++ b/tests/test-block-iothread.c +@@ -46,7 +46,8 @@ static int coroutine_fn bdrv_test_co_pdiscard(BlockDriverState *bs, + + static int coroutine_fn + bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + return 0; + } +-- +1.8.3.1 + diff --git a/kvm-block-Add-flags-to-bdrv-_co-_truncate.patch b/kvm-block-Add-flags-to-bdrv-_co-_truncate.patch new file mode 100755 index 0000000000000000000000000000000000000000..3da05ffe2770e7bc2d51097180baa056582763ec --- /dev/null +++ b/kvm-block-Add-flags-to-bdrv-_co-_truncate.patch @@ -0,0 +1,353 @@ +From 50127f0ff9e13a15fd5bfeb2662e2404ff20f364 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:31 +0100 +Subject: [PATCH 03/17] block: Add flags to bdrv(_co)_truncate() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-3-kwolf@redhat.com> +Patchwork-id: 97445 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 02/11] block: Add flags to bdrv(_co)_truncate() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Now that block drivers can support flags for .bdrv_co_truncate, expose +the parameter in the node level interfaces bdrv_co_truncate() and +bdrv_truncate(). + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-3-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 7b8e4857426f2e2de2441749996c6161b550bada) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/block-backend.c | 2 +- + block/crypto.c | 2 +- + block/io.c | 12 +++++++----- + block/parallels.c | 6 +++--- + block/qcow.c | 4 ++-- + block/qcow2-refcount.c | 2 +- + block/qcow2.c | 15 +++++++++------ + block/raw-format.c | 2 +- + block/vhdx-log.c | 2 +- + block/vhdx.c | 2 +- + block/vmdk.c | 2 +- + include/block/block.h | 5 +++-- + tests/test-block-iothread.c | 6 +++--- + 13 files changed, 34 insertions(+), 28 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 38ae413..8be2006 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -2144,7 +2144,7 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, + return -ENOMEDIUM; + } + +- return bdrv_truncate(blk->root, offset, exact, prealloc, errp); ++ return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp); + } + + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, +diff --git a/block/crypto.c b/block/crypto.c +index 6e4b726..fcb4a97 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -313,7 +313,7 @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, + + offset += payload_offset; + +- return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); ++ return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp); + } + + static void block_crypto_close(BlockDriverState *bs) +diff --git a/block/io.c b/block/io.c +index 549e5a4..3235ce5 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -3315,12 +3315,12 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs) + * 'offset' bytes in length. + */ + int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp) + { + BlockDriverState *bs = child->bs; + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; +- BdrvRequestFlags flags = 0; + int64_t old_size, new_bytes; + int ret; + +@@ -3378,7 +3378,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + } + ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp); + } else if (bs->file && drv->is_filter) { +- ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); ++ ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + } else { + error_setg(errp, "Image format driver does not support resize"); + ret = -ENOTSUP; +@@ -3411,6 +3411,7 @@ typedef struct TruncateCo { + int64_t offset; + bool exact; + PreallocMode prealloc; ++ BdrvRequestFlags flags; + Error **errp; + int ret; + } TruncateCo; +@@ -3419,12 +3420,12 @@ static void coroutine_fn bdrv_truncate_co_entry(void *opaque) + { + TruncateCo *tco = opaque; + tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact, +- tco->prealloc, tco->errp); ++ tco->prealloc, tco->flags, tco->errp); + aio_wait_kick(); + } + + int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) + { + Coroutine *co; + TruncateCo tco = { +@@ -3432,6 +3433,7 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, + .offset = offset, + .exact = exact, + .prealloc = prealloc, ++ .flags = flags, + .errp = errp, + .ret = NOT_DONE, + }; +diff --git a/block/parallels.c b/block/parallels.c +index 6d4ed77..2be92cf 100644 +--- a/block/parallels.c ++++ b/block/parallels.c +@@ -203,7 +203,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, + } else { + ret = bdrv_truncate(bs->file, + (s->data_end + space) << BDRV_SECTOR_BITS, +- false, PREALLOC_MODE_OFF, NULL); ++ false, PREALLOC_MODE_OFF, 0, NULL); + } + if (ret < 0) { + return ret; +@@ -493,7 +493,7 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, + * That means we have to pass exact=true. + */ + ret = bdrv_truncate(bs->file, res->image_end_offset, true, +- PREALLOC_MODE_OFF, &local_err); ++ PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + res->check_errors++; +@@ -889,7 +889,7 @@ static void parallels_close(BlockDriverState *bs) + + /* errors are ignored, so we might as well pass exact=true */ + bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, true, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + } + + g_free(s->bat_dirty_bmap); +diff --git a/block/qcow.c b/block/qcow.c +index 8973e4e..6b5f226 100644 +--- a/block/qcow.c ++++ b/block/qcow.c +@@ -480,7 +480,7 @@ static int get_cluster_offset(BlockDriverState *bs, + return -E2BIG; + } + ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size, +- false, PREALLOC_MODE_OFF, NULL); ++ false, PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + return ret; + } +@@ -1035,7 +1035,7 @@ static int qcow_make_empty(BlockDriverState *bs) + l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) + return ret; + +diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c +index f67ac6b..3a90d75 100644 +--- a/block/qcow2-refcount.c ++++ b/block/qcow2-refcount.c +@@ -2017,7 +2017,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, + } + + ret = bdrv_truncate(bs->file, offset + s->cluster_size, false, +- PREALLOC_MODE_OFF, &local_err); ++ PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + goto resize_fail; +diff --git a/block/qcow2.c b/block/qcow2.c +index 977445e..c0fdcb9 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -3082,7 +3082,7 @@ static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset, + mode = PREALLOC_MODE_OFF; + } + ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false, +- mode, errp); ++ mode, 0, errp); + if (ret < 0) { + return ret; + } +@@ -4044,7 +4044,7 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + * always fulfilled, so there is no need to pass it on.) + */ + bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size, +- false, PREALLOC_MODE_OFF, &local_err); ++ false, PREALLOC_MODE_OFF, 0, &local_err); + if (local_err) { + warn_reportf_err(local_err, + "Failed to truncate the tail of the image: "); +@@ -4066,7 +4066,8 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + * file should be resized to the exact target size, too, + * so we pass @exact here. + */ +- ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp); ++ ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, 0, ++ errp); + if (ret < 0) { + goto fail; + } +@@ -4152,7 +4153,8 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + new_file_size = allocation_start + + nb_new_data_clusters * s->cluster_size; + /* Image file grows, so @exact does not matter */ +- ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp); ++ ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0, ++ errp); + if (ret < 0) { + error_prepend(errp, "Failed to resize underlying file: "); + qcow2_free_clusters(bs, allocation_start, +@@ -4255,7 +4257,8 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs, + if (len < 0) { + return len; + } +- return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL); ++ return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, 0, ++ NULL); + } + + if (offset_into_cluster(s, offset)) { +@@ -4493,7 +4496,7 @@ static int make_completely_empty(BlockDriverState *bs) + } + + ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false, +- PREALLOC_MODE_OFF, &local_err); ++ PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + goto fail; +diff --git a/block/raw-format.c b/block/raw-format.c +index f994c4a..c3acf9a 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -387,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + + s->size = offset; + offset += s->offset; +- return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp); ++ return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp); + } + + static void raw_eject(BlockDriverState *bs, bool eject_flag) +diff --git a/block/vhdx-log.c b/block/vhdx-log.c +index 13a49c2..404fb5f 100644 +--- a/block/vhdx-log.c ++++ b/block/vhdx-log.c +@@ -558,7 +558,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, + goto exit; + } + ret = bdrv_truncate(bs->file, new_file_size, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + goto exit; + } +diff --git a/block/vhdx.c b/block/vhdx.c +index 33e57cd..5dfbb20 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -1264,7 +1264,7 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, + } + + return bdrv_truncate(bs->file, *new_offset + s->block_size, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + } + + /* +diff --git a/block/vmdk.c b/block/vmdk.c +index eb726f2..1bbf937 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -2077,7 +2077,7 @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, + } + length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE); + ret = bdrv_truncate(s->extents[i].file, length, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + return ret; + } +diff --git a/include/block/block.h b/include/block/block.h +index b2a3074..4913596 100644 +--- a/include/block/block.h ++++ b/include/block/block.h +@@ -348,9 +348,10 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + void bdrv_refresh_filename(BlockDriverState *bs); + + int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp); ++ PreallocMode prealloc, BdrvRequestFlags flags, ++ Error **errp); + int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp); ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + + int64_t bdrv_nb_sectors(BlockDriverState *bs); + int64_t bdrv_getlength(BlockDriverState *bs); +diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c +index 2f3b763..71e9bce 100644 +--- a/tests/test-block-iothread.c ++++ b/tests/test-block-iothread.c +@@ -186,18 +186,18 @@ static void test_sync_op_truncate(BdrvChild *c) + int ret; + + /* Normal success path */ +- ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL); ++ ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL); + g_assert_cmpint(ret, ==, 0); + + /* Early error: Negative offset */ +- ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, NULL); ++ ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, 0, NULL); + g_assert_cmpint(ret, ==, -EINVAL); + + /* Error: Read-only image */ + c->bs->read_only = true; + c->bs->open_flags &= ~BDRV_O_RDWR; + +- ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL); ++ ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, 0, NULL); + g_assert_cmpint(ret, ==, -EACCES); + + c->bs->read_only = false; +-- +1.8.3.1 + diff --git a/kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch b/kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch new file mode 100755 index 0000000000000000000000000000000000000000..190826fdfd5e3804096b8bd9037deaa644387b39 --- /dev/null +++ b/kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch @@ -0,0 +1,105 @@ +From c8ecaea34f03b8ddda7d2b41b0d6f397469c8959 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Wed, 10 Jun 2020 18:32:02 -0400 +Subject: [PATCH 2/2] block: Call attention to truncation of long NBD exports + +RH-Author: Eric Blake +Message-id: <20200610183202.3780750-3-eblake@redhat.com> +Patchwork-id: 97495 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] block: Call attention to truncation of long NBD exports +Bugzilla: 1845384 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Commit 93676c88 relaxed our NBD client code to request export names up +to the NBD protocol maximum of 4096 bytes without NUL terminator, even +though the block layer can't store anything longer than 4096 bytes +including NUL terminator for display to the user. Since this means +there are some export names where we have to truncate things, we can +at least try to make the truncation a bit more obvious for the user. +Note that in spite of the truncated display name, we can still +communicate with an NBD server using such a long export name; this was +deemed nicer than refusing to even connect to such a server (since the +server may not be under our control, and since determining our actual +length limits gets tricky when nbd://host:port/export and +nbd+unix:///export?socket=/path are themselves variable-length +expansions beyond the export name but count towards the block layer +name length). + +Reported-by: Xueqiang Wei +Fixes: https://bugzilla.redhat.com/1843684 +Signed-off-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200610163741.3745251-3-eblake@redhat.com> +(cherry picked from commit 5c86bdf1208916ece0b87e1151c9b48ee54faa3e) +Signed-off-by: Eric Blake +Signed-off-by: Eduardo Lima (Etrunko) +--- + block.c | 7 +++++-- + block/nbd.c | 21 +++++++++++++-------- + 2 files changed, 18 insertions(+), 10 deletions(-) + +diff --git a/block.c b/block.c +index 12c8941879..57740d312e 100644 +--- a/block.c ++++ b/block.c +@@ -6683,8 +6683,11 @@ void bdrv_refresh_filename(BlockDriverState *bs) + pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename); + } else { + QString *json = qobject_to_json(QOBJECT(bs->full_open_options)); +- snprintf(bs->filename, sizeof(bs->filename), "json:%s", +- qstring_get_str(json)); ++ if (snprintf(bs->filename, sizeof(bs->filename), "json:%s", ++ qstring_get_str(json)) >= sizeof(bs->filename)) { ++ /* Give user a hint if we truncated things. */ ++ strcpy(bs->filename + sizeof(bs->filename) - 4, "..."); ++ } + qobject_unref(json); + } + } +diff --git a/block/nbd.c b/block/nbd.c +index 927915d93d..5bb154017d 100644 +--- a/block/nbd.c ++++ b/block/nbd.c +@@ -1978,6 +1978,7 @@ static void nbd_refresh_filename(BlockDriverState *bs) + { + BDRVNBDState *s = bs->opaque; + const char *host = NULL, *port = NULL, *path = NULL; ++ size_t len = 0; + + if (s->saddr->type == SOCKET_ADDRESS_TYPE_INET) { + const InetSocketAddress *inet = &s->saddr->u.inet; +@@ -1990,17 +1991,21 @@ static void nbd_refresh_filename(BlockDriverState *bs) + } /* else can't represent as pseudo-filename */ + + if (path && s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd+unix:///%s?socket=%s", s->export, path); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd+unix:///%s?socket=%s", s->export, path); + } else if (path && !s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd+unix://?socket=%s", path); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd+unix://?socket=%s", path); + } else if (host && s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd://%s:%s/%s", host, port, s->export); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd://%s:%s/%s", host, port, s->export); + } else if (host && !s->export) { +- snprintf(bs->exact_filename, sizeof(bs->exact_filename), +- "nbd://%s:%s", host, port); ++ len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), ++ "nbd://%s:%s", host, port); ++ } ++ if (len > sizeof(bs->exact_filename)) { ++ /* Name is too long to represent exactly, so leave it empty. */ ++ bs->exact_filename[0] = '\0'; + } + } + +-- +2.27.0 + diff --git a/kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch b/kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch new file mode 100755 index 0000000000000000000000000000000000000000..b16c0b73ab7afa05007c5f042ad41889fa6fc6b2 --- /dev/null +++ b/kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch @@ -0,0 +1,84 @@ +From f17b37b58a57d849d2ff5fa04f149d9415803a39 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:17 +0100 +Subject: [PATCH 6/6] block: Fix blk->in_flight during blk_wait_while_drained() + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-7-kwolf@redhat.com> +Patchwork-id: 94599 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 6/6] block: Fix blk->in_flight during blk_wait_while_drained() +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +Waiting in blk_wait_while_drained() while blk->in_flight is increased +for the current request is wrong because it will cause the drain +operation to deadlock. + +This patch makes sure that blk_wait_while_drained() is called with +blk->in_flight increased exactly once for the current request, and that +it temporarily decreases the counter while it waits. + +Fixes: cf3129323f900ef5ddbccbe86e4fa801e88c566e +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Max Reitz +Message-Id: <20200407121259.21350-4-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 7f16476fab14fc32388e0ebae793f64673848efa) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/block-backend.c | 17 +++++------------ + 1 file changed, 5 insertions(+), 12 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 610dbfa..38ae413 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -1140,10 +1140,15 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset, + return 0; + } + ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ + static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) + { ++ assert(blk->in_flight > 0); ++ + if (blk->quiesce_counter && !blk->disable_request_queuing) { ++ blk_dec_in_flight(blk); + qemu_co_queue_wait(&blk->queued_requests, NULL); ++ blk_inc_in_flight(blk); + } + } + +@@ -1418,12 +1423,6 @@ static void blk_aio_read_entry(void *opaque) + BlkRwCo *rwco = &acb->rwco; + QEMUIOVector *qiov = rwco->iobuf; + +- if (rwco->blk->quiesce_counter) { +- blk_dec_in_flight(rwco->blk); +- blk_wait_while_drained(rwco->blk); +- blk_inc_in_flight(rwco->blk); +- } +- + assert(qiov->size == acb->bytes); + rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes, + qiov, rwco->flags); +@@ -1436,12 +1435,6 @@ static void blk_aio_write_entry(void *opaque) + BlkRwCo *rwco = &acb->rwco; + QEMUIOVector *qiov = rwco->iobuf; + +- if (rwco->blk->quiesce_counter) { +- blk_dec_in_flight(rwco->blk); +- blk_wait_while_drained(rwco->blk); +- blk_inc_in_flight(rwco->blk); +- } +- + assert(!qiov || qiov->size == acb->bytes); + rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes, + qiov, 0, rwco->flags); +-- +1.8.3.1 + diff --git a/kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch b/kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch new file mode 100755 index 0000000000000000000000000000000000000000..0bad89063596f44e492ba662f48ae6ca54f1a438 --- /dev/null +++ b/kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch @@ -0,0 +1,91 @@ +From 5774af5a3c713d0c93010c30453812eae6a749cd Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:37 +0000 +Subject: [PATCH 17/20] block: Fix cross-AioContext blockdev-snapshot + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-12-kwolf@redhat.com> +Patchwork-id: 94286 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 11/13] block: Fix cross-AioContext blockdev-snapshot +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +external_snapshot_prepare() tries to move the overlay to the AioContext +of the backing file (the snapshotted node). However, it's possible that +this doesn't work, but the backing file can instead be moved to the +overlay's AioContext (e.g. opening the backing chain for a mirror +target). + +bdrv_append() already indirectly uses bdrv_attach_node(), which takes +care to move nodes to make sure they use the same AioContext and which +tries both directions. + +So the problem has a simple fix: Just delete the unnecessary extra +bdrv_try_set_aio_context() call in external_snapshot_prepare() and +instead assert in bdrv_append() that both nodes were indeed moved to the +same AioContext. + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-6-kwolf@redhat.com> +Tested-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit 30dd65f307b647eef8156c4a33bd007823ef85cb) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 1 + + blockdev.c | 16 ---------------- + 2 files changed, 1 insertion(+), 16 deletions(-) + +diff --git a/block.c b/block.c +index 354d388..ec29b1e 100644 +--- a/block.c ++++ b/block.c +@@ -4327,6 +4327,7 @@ void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, + bdrv_ref(from); + + assert(qemu_get_current_aio_context() == qemu_get_aio_context()); ++ assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to)); + bdrv_drained_begin(from); + + /* Put all parents into @list and calculate their cumulative permissions */ +diff --git a/blockdev.c b/blockdev.c +index 7918533..c8d4b51 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1535,9 +1535,7 @@ static void external_snapshot_prepare(BlkActionState *common, + DO_UPCAST(ExternalSnapshotState, common, common); + TransactionAction *action = common->action; + AioContext *aio_context; +- AioContext *old_context; + uint64_t perm, shared; +- int ret; + + /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar + * purpose but a different set of parameters */ +@@ -1678,20 +1676,6 @@ static void external_snapshot_prepare(BlkActionState *common, + goto out; + } + +- /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ +- old_context = bdrv_get_aio_context(state->new_bs); +- aio_context_release(aio_context); +- aio_context_acquire(old_context); +- +- ret = bdrv_try_set_aio_context(state->new_bs, aio_context, errp); +- +- aio_context_release(old_context); +- aio_context_acquire(aio_context); +- +- if (ret < 0) { +- goto out; +- } +- + /* This removes our old bs and adds the new bs. This is an operation that + * can fail, so we need to do it in .prepare; undoing it for abort is + * always possible. */ +-- +1.8.3.1 + diff --git a/kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch b/kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch new file mode 100755 index 0000000000000000000000000000000000000000..1735dc0f53cba39bd487e7660f63e65188635c0f --- /dev/null +++ b/kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch @@ -0,0 +1,60 @@ +From 05452efd7e0fb0522099ae09a396f8f97e66014a Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:47 +0000 +Subject: [PATCH 06/20] block: Fix leak in bdrv_create_file_fallback() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-7-mlevitsk@redhat.com> +Patchwork-id: 94229 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 6/6] block: Fix leak in bdrv_create_file_fallback() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +@options is leaked by the first two return statements in this function. + +Note that blk_new_open() takes the reference to @options even on +failure, so all we need to do to fix the leak is to move the QDict +allocation down to where we actually need it. + +Reported-by: Coverity (CID 1419884) +Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd + ("block: Generic file creation fallback") +Signed-off-by: Max Reitz +Message-Id: <20200225155618.133412-1-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit eeea1faa099f82328f5831cf252f8ce0a59a9287) +Signed-off-by: Maxim Levitsky + +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/block.c b/block.c +index 3beec7f..e1a4e38 100644 +--- a/block.c ++++ b/block.c +@@ -600,7 +600,7 @@ static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, + QemuOpts *opts, Error **errp) + { + BlockBackend *blk; +- QDict *options = qdict_new(); ++ QDict *options; + int64_t size = 0; + char *buf = NULL; + PreallocMode prealloc; +@@ -623,6 +623,7 @@ static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, + return -ENOTSUP; + } + ++ options = qdict_new(); + qdict_put_str(options, "driver", drv->format_name); + + blk = blk_new_open(filename, NULL, options, +-- +1.8.3.1 + diff --git a/kvm-block-Generic-file-creation-fallback.patch b/kvm-block-Generic-file-creation-fallback.patch new file mode 100755 index 0000000000000000000000000000000000000000..a5dd1d7843647983f26b8036fe4f7c95aea722e8 --- /dev/null +++ b/kvm-block-Generic-file-creation-fallback.patch @@ -0,0 +1,227 @@ +From 882d09226b7f45b72c5b7763c4c4aba182e0f8a1 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:43 +0000 +Subject: [PATCH 02/20] block: Generic file creation fallback + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-3-mlevitsk@redhat.com> +Patchwork-id: 94227 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 2/6] block: Generic file creation fallback +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +If a protocol driver does not support image creation, we can see whether +maybe the file exists already. If so, just truncating it will be +sufficient. + +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-3-mreitz@redhat.com> +Signed-off-by: Max Reitz +(cherry picked from commit fd17146cd93d1704cd96d7c2757b325fc7aac6fd) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 147 insertions(+), 12 deletions(-) + +diff --git a/block.c b/block.c +index 2e5e8b6..3beec7f 100644 +--- a/block.c ++++ b/block.c +@@ -532,20 +532,139 @@ out: + return ret; + } + +-int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) ++/** ++ * Helper function for bdrv_create_file_fallback(): Resize @blk to at ++ * least the given @minimum_size. ++ * ++ * On success, return @blk's actual length. ++ * Otherwise, return -errno. ++ */ ++static int64_t create_file_fallback_truncate(BlockBackend *blk, ++ int64_t minimum_size, Error **errp) + { +- BlockDriver *drv; ++ Error *local_err = NULL; ++ int64_t size; ++ int ret; ++ ++ ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, &local_err); ++ if (ret < 0 && ret != -ENOTSUP) { ++ error_propagate(errp, local_err); ++ return ret; ++ } ++ ++ size = blk_getlength(blk); ++ if (size < 0) { ++ error_free(local_err); ++ error_setg_errno(errp, -size, ++ "Failed to inquire the new image file's length"); ++ return size; ++ } ++ ++ if (size < minimum_size) { ++ /* Need to grow the image, but we failed to do that */ ++ error_propagate(errp, local_err); ++ return -ENOTSUP; ++ } ++ ++ error_free(local_err); ++ local_err = NULL; ++ ++ return size; ++} ++ ++/** ++ * Helper function for bdrv_create_file_fallback(): Zero the first ++ * sector to remove any potentially pre-existing image header. ++ */ ++static int create_file_fallback_zero_first_sector(BlockBackend *blk, ++ int64_t current_size, ++ Error **errp) ++{ ++ int64_t bytes_to_clear; ++ int ret; ++ ++ bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE); ++ if (bytes_to_clear) { ++ ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, ++ "Failed to clear the new image's first sector"); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, ++ QemuOpts *opts, Error **errp) ++{ ++ BlockBackend *blk; ++ QDict *options = qdict_new(); ++ int64_t size = 0; ++ char *buf = NULL; ++ PreallocMode prealloc; + Error *local_err = NULL; + int ret; + ++ size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); ++ buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); ++ prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, ++ PREALLOC_MODE_OFF, &local_err); ++ g_free(buf); ++ if (local_err) { ++ error_propagate(errp, local_err); ++ return -EINVAL; ++ } ++ ++ if (prealloc != PREALLOC_MODE_OFF) { ++ error_setg(errp, "Unsupported preallocation mode '%s'", ++ PreallocMode_str(prealloc)); ++ return -ENOTSUP; ++ } ++ ++ qdict_put_str(options, "driver", drv->format_name); ++ ++ blk = blk_new_open(filename, NULL, options, ++ BDRV_O_RDWR | BDRV_O_RESIZE, errp); ++ if (!blk) { ++ error_prepend(errp, "Protocol driver '%s' does not support image " ++ "creation, and opening the image failed: ", ++ drv->format_name); ++ return -EINVAL; ++ } ++ ++ size = create_file_fallback_truncate(blk, size, errp); ++ if (size < 0) { ++ ret = size; ++ goto out; ++ } ++ ++ ret = create_file_fallback_zero_first_sector(blk, size, errp); ++ if (ret < 0) { ++ goto out; ++ } ++ ++ ret = 0; ++out: ++ blk_unref(blk); ++ return ret; ++} ++ ++int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) ++{ ++ BlockDriver *drv; ++ + drv = bdrv_find_protocol(filename, true, errp); + if (drv == NULL) { + return -ENOENT; + } + +- ret = bdrv_create(drv, filename, opts, &local_err); +- error_propagate(errp, local_err); +- return ret; ++ if (drv->bdrv_co_create_opts) { ++ return bdrv_create(drv, filename, opts, errp); ++ } else { ++ return bdrv_create_file_fallback(filename, drv, opts, errp); ++ } + } + + /** +@@ -1422,6 +1541,24 @@ QemuOptsList bdrv_runtime_opts = { + }, + }; + ++static QemuOptsList fallback_create_opts = { ++ .name = "fallback-create-opts", ++ .head = QTAILQ_HEAD_INITIALIZER(fallback_create_opts.head), ++ .desc = { ++ { ++ .name = BLOCK_OPT_SIZE, ++ .type = QEMU_OPT_SIZE, ++ .help = "Virtual disk size" ++ }, ++ { ++ .name = BLOCK_OPT_PREALLOC, ++ .type = QEMU_OPT_STRING, ++ .help = "Preallocation mode (allowed values: off)" ++ }, ++ { /* end of list */ } ++ } ++}; ++ + /* + * Common part for opening disk images and files + * +@@ -5743,14 +5880,12 @@ void bdrv_img_create(const char *filename, const char *fmt, + return; + } + +- if (!proto_drv->create_opts) { +- error_setg(errp, "Protocol driver '%s' does not support image creation", +- proto_drv->format_name); +- return; +- } +- + create_opts = qemu_opts_append(create_opts, drv->create_opts); +- create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); ++ if (proto_drv->create_opts) { ++ create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); ++ } else { ++ create_opts = qemu_opts_append(create_opts, &fallback_create_opts); ++ } + + /* Create parameter list with default values */ + opts = qemu_opts_create(create_opts, NULL, 0, &error_abort); +-- +1.8.3.1 + diff --git a/kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch b/kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch new file mode 100755 index 0000000000000000000000000000000000000000..463501aabb39fbec081c4bd6d2dfc5b722cecacd --- /dev/null +++ b/kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch @@ -0,0 +1,295 @@ +From 52cc1d1cd2f695c5761d65baec961d14552a79ed Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:16 +0100 +Subject: [PATCH 5/6] block: Increase BB.in_flight for coroutine and sync + interfaces + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-6-kwolf@redhat.com> +Patchwork-id: 94600 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/6] block: Increase BB.in_flight for coroutine and sync interfaces +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +External callers of blk_co_*() and of the synchronous blk_*() functions +don't currently increase the BlockBackend.in_flight counter, but calls +from blk_aio_*() do, so there is an inconsistency whether the counter +has been increased or not. + +This patch moves the actual operations to static functions that can +later know they will always be called with in_flight increased exactly +once, even for external callers using the blk_co_*() coroutine +interfaces. + +If the public blk_co_*() interface is unused, remove it. + +Signed-off-by: Kevin Wolf +Message-Id: <20200407121259.21350-3-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit fbb92b6798894d3bf62fe3578d99fa62c720b242) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/block-backend.c | 103 ++++++++++++++++++++++++++++++++--------- + include/sysemu/block-backend.h | 1 - + 2 files changed, 80 insertions(+), 24 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 17b2e87..610dbfa 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -1147,9 +1147,10 @@ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) + } + } + +-int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, +- unsigned int bytes, QEMUIOVector *qiov, +- BdrvRequestFlags flags) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes, ++ QEMUIOVector *qiov, BdrvRequestFlags flags) + { + int ret; + BlockDriverState *bs; +@@ -1178,10 +1179,24 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + return ret; + } + +-int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, +- unsigned int bytes, +- QEMUIOVector *qiov, size_t qiov_offset, +- BdrvRequestFlags flags) ++int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, ++ unsigned int bytes, QEMUIOVector *qiov, ++ BdrvRequestFlags flags) ++{ ++ int ret; ++ ++ blk_inc_in_flight(blk); ++ ret = blk_do_preadv(blk, offset, bytes, qiov, flags); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes, ++ QEMUIOVector *qiov, size_t qiov_offset, ++ BdrvRequestFlags flags) + { + int ret; + BlockDriverState *bs; +@@ -1214,6 +1229,20 @@ int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, + return ret; + } + ++int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset, ++ unsigned int bytes, ++ QEMUIOVector *qiov, size_t qiov_offset, ++ BdrvRequestFlags flags) ++{ ++ int ret; ++ ++ blk_inc_in_flight(blk); ++ ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ + int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +@@ -1234,7 +1263,7 @@ static void blk_read_entry(void *opaque) + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size, ++ rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size, + qiov, rwco->flags); + aio_wait_kick(); + } +@@ -1244,8 +1273,8 @@ static void blk_write_entry(void *opaque) + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size, +- qiov, rwco->flags); ++ rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size, ++ qiov, 0, rwco->flags); + aio_wait_kick(); + } + +@@ -1262,6 +1291,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, + .ret = NOT_DONE, + }; + ++ blk_inc_in_flight(blk); + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + co_entry(&rwco); +@@ -1270,6 +1300,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, + bdrv_coroutine_enter(blk_bs(blk), co); + BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE); + } ++ blk_dec_in_flight(blk); + + return rwco.ret; + } +@@ -1394,7 +1425,7 @@ static void blk_aio_read_entry(void *opaque) + } + + assert(qiov->size == acb->bytes); +- rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes, ++ rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes, + qiov, rwco->flags); + blk_aio_complete(acb); + } +@@ -1412,8 +1443,8 @@ static void blk_aio_write_entry(void *opaque) + } + + assert(!qiov || qiov->size == acb->bytes); +- rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes, +- qiov, rwco->flags); ++ rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes, ++ qiov, 0, rwco->flags); + blk_aio_complete(acb); + } + +@@ -1498,7 +1529,9 @@ void blk_aio_cancel_async(BlockAIOCB *acb) + bdrv_aio_cancel_async(acb); + } + +-int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf) + { + blk_wait_while_drained(blk); + +@@ -1514,8 +1547,7 @@ static void blk_ioctl_entry(void *opaque) + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, +- qiov->iov[0].iov_base); ++ rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base); + aio_wait_kick(); + } + +@@ -1529,7 +1561,7 @@ static void blk_aio_ioctl_entry(void *opaque) + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + +- rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, rwco->iobuf); ++ rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf); + + blk_aio_complete(acb); + } +@@ -1540,7 +1572,9 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, + return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque); + } + +-int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn ++blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes) + { + int ret; + +@@ -1559,7 +1593,7 @@ static void blk_aio_pdiscard_entry(void *opaque) + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); ++ rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes); + blk_aio_complete(acb); + } + +@@ -1571,12 +1605,23 @@ BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, + cb, opaque); + } + ++int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) ++{ ++ int ret; ++ ++ blk_inc_in_flight(blk); ++ ret = blk_do_pdiscard(blk, offset, bytes); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ + static void blk_pdiscard_entry(void *opaque) + { + BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); ++ rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size); + aio_wait_kick(); + } + +@@ -1585,7 +1630,8 @@ int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) + return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); + } + +-int blk_co_flush(BlockBackend *blk) ++/* To be called between exactly one pair of blk_inc/dec_in_flight() */ ++static int coroutine_fn blk_do_flush(BlockBackend *blk) + { + blk_wait_while_drained(blk); + +@@ -1601,7 +1647,7 @@ static void blk_aio_flush_entry(void *opaque) + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + +- rwco->ret = blk_co_flush(rwco->blk); ++ rwco->ret = blk_do_flush(rwco->blk); + blk_aio_complete(acb); + } + +@@ -1611,10 +1657,21 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk, + return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); + } + ++int coroutine_fn blk_co_flush(BlockBackend *blk) ++{ ++ int ret; ++ ++ blk_inc_in_flight(blk); ++ ret = blk_do_flush(blk); ++ blk_dec_in_flight(blk); ++ ++ return ret; ++} ++ + static void blk_flush_entry(void *opaque) + { + BlkRwCo *rwco = opaque; +- rwco->ret = blk_co_flush(rwco->blk); ++ rwco->ret = blk_do_flush(rwco->blk); + aio_wait_kick(); + } + +diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h +index b198dec..9bbdbd6 100644 +--- a/include/sysemu/block-backend.h ++++ b/include/sysemu/block-backend.h +@@ -171,7 +171,6 @@ BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int bytes, + BlockCompletionFunc *cb, void *opaque); + void blk_aio_cancel(BlockAIOCB *acb); + void blk_aio_cancel_async(BlockAIOCB *acb); +-int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf); + int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf); + BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque); +-- +1.8.3.1 + diff --git a/kvm-block-Introduce-bdrv_reopen_commit_post-step.patch b/kvm-block-Introduce-bdrv_reopen_commit_post-step.patch new file mode 100755 index 0000000000000000000000000000000000000000..72c8986d55f294b40301ca3d3a27abd33d04e453 --- /dev/null +++ b/kvm-block-Introduce-bdrv_reopen_commit_post-step.patch @@ -0,0 +1,65 @@ +From f7dd953c2d0380cef3c351afb03d68c6fcda1dca Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:28 +0000 +Subject: [PATCH 08/20] block: Introduce 'bdrv_reopen_commit_post' step + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-3-kwolf@redhat.com> +Patchwork-id: 94278 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 02/13] block: Introduce 'bdrv_reopen_commit_post' step +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Peter Krempa + +Add another step in the reopen process where driver can execute code +after permission changes are comitted. + +Signed-off-by: Peter Krempa +Message-Id: +Signed-off-by: Kevin Wolf +(cherry picked from commit 17e1e2be5f9e84e0298e28e70675655b43e225ea) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 9 +++++++++ + include/block/block_int.h | 1 + + 2 files changed, 10 insertions(+) + +diff --git a/block.c b/block.c +index e1a4e38..a744bb5 100644 +--- a/block.c ++++ b/block.c +@@ -3657,6 +3657,15 @@ cleanup_perm: + } + } + } ++ ++ if (ret == 0) { ++ QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) { ++ BlockDriverState *bs = bs_entry->state.bs; ++ ++ if (bs->drv->bdrv_reopen_commit_post) ++ bs->drv->bdrv_reopen_commit_post(&bs_entry->state); ++ } ++ } + cleanup: + QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { + if (ret) { +diff --git a/include/block/block_int.h b/include/block/block_int.h +index dd033d0..c168690 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -123,6 +123,7 @@ struct BlockDriver { + int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp); + void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state); ++ void (*bdrv_reopen_commit_post)(BDRVReopenState *reopen_state); + void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state); + void (*bdrv_join_options)(QDict *options, QDict *old_options); + +-- +1.8.3.1 + diff --git a/kvm-block-Make-bdrv_get_cumulative_perm-public.patch b/kvm-block-Make-bdrv_get_cumulative_perm-public.patch new file mode 100755 index 0000000000000000000000000000000000000000..2f0f999791177ece4d255ddf03b5c6b52492b2ac --- /dev/null +++ b/kvm-block-Make-bdrv_get_cumulative_perm-public.patch @@ -0,0 +1,67 @@ +From 294ab4c4963295556d12ac15150b48c8536175a7 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:33 +0000 +Subject: [PATCH 13/20] block: Make bdrv_get_cumulative_perm() public + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-8-kwolf@redhat.com> +Patchwork-id: 94287 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 07/13] block: Make bdrv_get_cumulative_perm() public +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-2-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit c7a0f2be8f95b220cdadbba9a9236eaf115951dc) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 6 ++---- + include/block/block_int.h | 3 +++ + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/block.c b/block.c +index 39e4647..354d388 100644 +--- a/block.c ++++ b/block.c +@@ -1850,8 +1850,6 @@ static int bdrv_child_check_perm(BdrvChild *c, BlockReopenQueue *q, + bool *tighten_restrictions, Error **errp); + static void bdrv_child_abort_perm_update(BdrvChild *c); + static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared); +-static void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, +- uint64_t *shared_perm); + + typedef struct BlockReopenQueueEntry { + bool prepared; +@@ -2075,8 +2073,8 @@ static void bdrv_set_perm(BlockDriverState *bs, uint64_t cumulative_perms, + } + } + +-static void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, +- uint64_t *shared_perm) ++void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, ++ uint64_t *shared_perm) + { + BdrvChild *c; + uint64_t cumulative_perms = 0; +diff --git a/include/block/block_int.h b/include/block/block_int.h +index c168690..96e327b 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -1228,6 +1228,9 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + void *opaque, Error **errp); + void bdrv_root_unref_child(BdrvChild *child); + ++void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm, ++ uint64_t *shared_perm); ++ + /** + * Sets a BdrvChild's permissions. Avoid if the parent is a BDS; use + * bdrv_child_refresh_perms() instead and make the parent's +-- +1.8.3.1 + diff --git a/kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch b/kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch new file mode 100755 index 0000000000000000000000000000000000000000..0d4a000b5ab13c19816303d64c3b8a0e392d15a7 --- /dev/null +++ b/kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch @@ -0,0 +1,145 @@ +From 41d6c207c482093df8669f7cdcdb49bb25dba741 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:12 +0100 +Subject: [PATCH 07/26] block: Make it easier to learn which BDS support + bitmaps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-5-eblake@redhat.com> +Patchwork-id: 97071 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 04/12] block: Make it easier to learn which BDS support bitmaps +Bugzilla: 1779893 1779904 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Upcoming patches will enhance bitmap support in qemu-img, but in doing +so, it turns out to be nice to suppress output when persistent bitmaps +make no sense (such as on a qcow2 v2 image). Add a hook to make this +easier to query. + +This patch adds a new callback .bdrv_supports_persistent_dirty_bitmap, +rather than trying to shoehorn the answer in via existing callbacks. +In particular, while it might have been possible to overload +.bdrv_co_can_store_new_dirty_bitmap to special-case a NULL input to +answer whether any persistent bitmaps are supported, that is at odds +with whether a particular bitmap can be stored (for example, even on +an image that supports persistent bitmaps but has currently filled up +the maximum number of bitmaps, attempts to store another one should +fail); and the new functionality doesn't require coroutine safety. +Similarly, we could have added one more piece of information to +.bdrv_get_info, but then again, most callers to that function tend to +already discard extraneous information, and making it a catch-all +rather than a series of dedicated scalar queries hasn't really +simplified life. + +In the future, when we improve the ability to look up bitmaps through +a filter, we will probably also want to teach the block layer to +automatically let filters pass this request on through. + +Signed-off-by: Eric Blake +Message-Id: <20200513011648.166876-4-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit ef893b5c84f3199d777e33966dc28839f71b1a5c) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + block/dirty-bitmap.c | 9 +++++++++ + block/qcow2-bitmap.c | 7 +++++++ + block/qcow2.c | 2 ++ + block/qcow2.h | 1 + + include/block/block_int.h | 1 + + include/block/dirty-bitmap.h | 1 + + 6 files changed, 21 insertions(+) + +diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c +index 7039e82..2f96acc 100644 +--- a/block/dirty-bitmap.c ++++ b/block/dirty-bitmap.c +@@ -478,6 +478,15 @@ int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name, + } + } + ++bool ++bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs) ++{ ++ if (bs->drv && bs->drv->bdrv_supports_persistent_dirty_bitmap) { ++ return bs->drv->bdrv_supports_persistent_dirty_bitmap(bs); ++ } ++ return false; ++} ++ + static bool coroutine_fn + bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, + uint32_t granularity, Error **errp) +diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c +index c6c8ebb..cbac905 100644 +--- a/block/qcow2-bitmap.c ++++ b/block/qcow2-bitmap.c +@@ -1759,3 +1759,10 @@ fail: + name, bdrv_get_device_or_node_name(bs)); + return false; + } ++ ++bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs) ++{ ++ BDRVQcow2State *s = bs->opaque; ++ ++ return s->qcow_version >= 3; ++} +diff --git a/block/qcow2.c b/block/qcow2.c +index af0ad4a..36b0f7d 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -5551,6 +5551,8 @@ BlockDriver bdrv_qcow2 = { + .bdrv_detach_aio_context = qcow2_detach_aio_context, + .bdrv_attach_aio_context = qcow2_attach_aio_context, + ++ .bdrv_supports_persistent_dirty_bitmap = ++ qcow2_supports_persistent_dirty_bitmap, + .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap, + .bdrv_co_remove_persistent_dirty_bitmap = + qcow2_co_remove_persistent_dirty_bitmap, +diff --git a/block/qcow2.h b/block/qcow2.h +index 0942126..ceb1ceb 100644 +--- a/block/qcow2.h ++++ b/block/qcow2.h +@@ -767,6 +767,7 @@ bool qcow2_co_can_store_new_dirty_bitmap(BlockDriverState *bs, + int qcow2_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp); ++bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs); + + ssize_t coroutine_fn + qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 562dca1..cc18e8d 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -568,6 +568,7 @@ struct BlockDriver { + uint64_t parent_perm, uint64_t parent_shared, + uint64_t *nperm, uint64_t *nshared); + ++ bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); + bool (*bdrv_co_can_store_new_dirty_bitmap)(BlockDriverState *bs, + const char *name, + uint32_t granularity, +diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h +index e2b20ec..f6e9a38 100644 +--- a/include/block/dirty-bitmap.h ++++ b/include/block/dirty-bitmap.h +@@ -16,6 +16,7 @@ typedef enum BitmapCheckFlags { + + #define BDRV_BITMAP_MAX_NAME_SIZE 1023 + ++bool bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs); + BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, +-- +1.8.3.1 + diff --git a/kvm-block-Relax-restrictions-for-blockdev-snapshot.patch b/kvm-block-Relax-restrictions-for-blockdev-snapshot.patch new file mode 100755 index 0000000000000000000000000000000000000000..de85205f9d1eaaa44585ff9ae5de750255ee48eb --- /dev/null +++ b/kvm-block-Relax-restrictions-for-blockdev-snapshot.patch @@ -0,0 +1,117 @@ +From 9ba321e18a357c1a3a238ceee301bbb174f96eee Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:34 +0000 +Subject: [PATCH 14/20] block: Relax restrictions for blockdev-snapshot + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-9-kwolf@redhat.com> +Patchwork-id: 94285 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 08/13] block: Relax restrictions for blockdev-snapshot +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +blockdev-snapshot returned an error if the overlay was already in use, +which it defined as having any BlockBackend parent. This is in fact both +too strict (some parents can tolerate the change of visible data caused +by attaching a backing file) and too loose (some non-BlockBackend +parents may not be happy with it). + +One important use case that is prevented by the too strict check is live +storage migration with blockdev-mirror. Here, the target node is +usually opened without a backing file so that the active layer is +mirrored while its backing chain can be copied in the background. + +The backing chain should be attached to the mirror target node when +finalising the job, just before switching the users of the source node +to the new copy (at which point the mirror job still has a reference to +the node). drive-mirror did this automatically, but with blockdev-mirror +this is the job of the QMP client, so it needs a way to do this. + +blockdev-snapshot is the obvious way, so this patch makes it work in +this scenario. The new condition is that no parent uses CONSISTENT_READ +permissions. This will ensure that the operation will still be blocked +when the node is attached to the guest device, so blockdev-snapshot +remains safe. + +(For the sake of completeness, x-blockdev-reopen can be used to achieve +the same, however it is a big hammer, performs the graph change +completely unchecked and is still experimental. So even with the option +of using x-blockdev-reopen, there are reasons why blockdev-snapshot +should be able to perform this operation.) + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-3-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Tested-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit d29d3d1f80b3947fb26e7139645c83de66d146a9) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 14 ++++++++------ + tests/qemu-iotests/085.out | 4 ++-- + 2 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 4cd9a58..7918533 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1536,6 +1536,7 @@ static void external_snapshot_prepare(BlkActionState *common, + TransactionAction *action = common->action; + AioContext *aio_context; + AioContext *old_context; ++ uint64_t perm, shared; + int ret; + + /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar +@@ -1656,16 +1657,17 @@ static void external_snapshot_prepare(BlkActionState *common, + goto out; + } + +- if (bdrv_has_blk(state->new_bs)) { ++ /* ++ * Allow attaching a backing file to an overlay that's already in use only ++ * if the parents don't assume that they are already seeing a valid image. ++ * (Specifically, allow it as a mirror target, which is write-only access.) ++ */ ++ bdrv_get_cumulative_perm(state->new_bs, &perm, &shared); ++ if (perm & BLK_PERM_CONSISTENT_READ) { + error_setg(errp, "The overlay is already in use"); + goto out; + } + +- if (bdrv_op_is_blocked(state->new_bs, BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT, +- errp)) { +- goto out; +- } +- + if (state->new_bs->backing != NULL) { + error_setg(errp, "The overlay already has a backing image"); + goto out; +diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out +index bb50227..487d920 100644 +--- a/tests/qemu-iotests/085.out ++++ b/tests/qemu-iotests/085.out +@@ -82,7 +82,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/ + === Invalid command - cannot create a snapshot using a file BDS === + + { 'execute': 'blockdev-snapshot', 'arguments': { 'node':'virtio0', 'overlay':'file_12' } } +-{"error": {"class": "GenericError", "desc": "The overlay does not support backing images"}} ++{"error": {"class": "GenericError", "desc": "The overlay is already in use"}} + + === Invalid command - snapshot node used as active layer === + +@@ -96,7 +96,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/ + === Invalid command - snapshot node used as backing hd === + + { 'execute': 'blockdev-snapshot', 'arguments': { 'node': 'virtio0', 'overlay':'snap_11' } } +-{"error": {"class": "GenericError", "desc": "Node 'snap_11' is busy: node is used as backing hd of 'snap_12'"}} ++{"error": {"class": "GenericError", "desc": "The overlay is already in use"}} + + === Invalid command - snapshot node has a backing image === + +-- +1.8.3.1 + diff --git a/kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch b/kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch new file mode 100755 index 0000000000000000000000000000000000000000..1188911547527dec0d54c72da8d9a1336af3b566 --- /dev/null +++ b/kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch @@ -0,0 +1,77 @@ +From e191ab6358b656764374ff1b3c7224a744dc902a Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Tue, 26 Jan 2021 17:21:02 -0500 +Subject: [PATCH 7/9] block: Require aligned image size to avoid assertion + failure + +RH-Author: Kevin Wolf +Message-id: <20210126172103.136060-2-kwolf@redhat.com> +Patchwork-id: 100786 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/2] block: Require aligned image size to avoid assertion failure +Bugzilla: 1834281 +RH-Acked-by: Markus Armbruster +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +Unaligned requests will automatically be aligned to bl.request_alignment +and we can't extend write requests to access space beyond the end of the +image without resizing the image, so if we have the WRITE permission, +but not the RESIZE one, it's required that the image size is aligned. + +Failing to meet this requirement could cause assertion failures like +this if RESIZE permissions weren't requested: + +qemu-img: block/io.c:1910: bdrv_co_write_req_prepare: Assertion `end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE' failed. + +This was e.g. triggered by qemu-img converting to a target image with 4k +request alignment when the image was only aligned to 512 bytes, but not +to 4k. + +Turn this into a graceful error in bdrv_check_perm() so that WRITE +without RESIZE can only be taken if the image size is aligned. If a user +holds both permissions and drops only RESIZE, the function will return +an error, but bdrv_child_try_set_perm() will ignore the failure silently +if permissions are only requested to be relaxed and just keep both +permissions while returning success. + +Signed-off-by: Kevin Wolf +Message-Id: <20200716142601.111237-2-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 9c60a5d1978e6dcf85c0e01b50e6f7f54ca09104) +Signed-off-by: Kevin Wolf +Signed-off-by: Jon Maloy +--- + block.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/block.c b/block.c +index 57740d312e..e9579ddf84 100644 +--- a/block.c ++++ b/block.c +@@ -2009,6 +2009,22 @@ static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q, + return -EPERM; + } + ++ /* ++ * Unaligned requests will automatically be aligned to bl.request_alignment ++ * and without RESIZE we can't extend requests to write to space beyond the ++ * end of the image, so it's required that the image size is aligned. ++ */ ++ if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) && ++ !(cumulative_perms & BLK_PERM_RESIZE)) ++ { ++ if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) { ++ error_setg(errp, "Cannot get 'write' permission without 'resize': " ++ "Image size is not a multiple of request " ++ "alignment"); ++ return -EPERM; ++ } ++ } ++ + /* Check this node */ + if (!drv) { + return 0; +-- +2.18.2 + diff --git a/kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch b/kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch new file mode 100755 index 0000000000000000000000000000000000000000..ea796d5f49cd981ee84862888a206bba82c89fb8 --- /dev/null +++ b/kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch @@ -0,0 +1,57 @@ +From 371d312300251c0dc24522607b06b7e47e760b53 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:32 +0000 +Subject: [PATCH 12/20] block: Versioned x-blockdev-reopen API with feature + flag + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-7-kwolf@redhat.com> +Patchwork-id: 94283 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 06/13] block: Versioned x-blockdev-reopen API with feature flag +Bugzilla: 1790482 1805143 +RH-Acked-by: Eric Blake +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +x-blockdev-reopen is still considered unstable upstream. libvirt needs +(a small subset of) it for incremental backups, though. + +Add a downstream-only feature flag that effectively makes this a +versioned interface. As long as the feature is present, we promise that +we won't change the interface incompatibly. Incompatible changes to the +command will require us to drop the feature flag (and possibly introduce +a new one if the new version is still not stable upstream). + +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + qapi/block-core.json | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/qapi/block-core.json b/qapi/block-core.json +index 0cf68fe..a1e85b0 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -4202,10 +4202,17 @@ + # image does not have a default backing file name as part of its + # metadata. + # ++# Features: ++# @__com.redhat_rhel-av-8_2_0-api: Versioning the downstream interface while ++# it's still unstable upstream. As long as ++# this flag is present, this command will not ++# change incompatibly. ++# + # Since: 4.0 + ## + { 'command': 'x-blockdev-reopen', +- 'data': 'BlockdevOptions', 'boxed': true } ++ 'data': 'BlockdevOptions', 'boxed': true, ++ 'features': [ '__com.redhat_rhel-av-8_2_0-api' ] } + + ## + # @blockdev-del: +-- +1.8.3.1 + diff --git a/kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch b/kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch new file mode 100755 index 0000000000000000000000000000000000000000..d1511d29211a3a3a9990bb2d21d7d74e8671b2e5 --- /dev/null +++ b/kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch @@ -0,0 +1,308 @@ +From 67f36d057aa71ca56ebc17ef28a7cb70bac6c6b6 Mon Sep 17 00:00:00 2001 +From: "Daniel P. Berrange" +Date: Tue, 5 May 2020 16:46:01 +0100 +Subject: [PATCH 01/12] block: always fill entire LUKS header space with zeros +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Daniel P. Berrange +Message-id: <20200505164601.1059974-2-berrange@redhat.com> +Patchwork-id: 96277 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] block: always fill entire LUKS header space with zeros +Bugzilla: 1775462 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi + +When initializing the LUKS header the size with default encryption +parameters will currently be 2068480 bytes. This is rounded up to +a multiple of the cluster size, 2081792, with 64k sectors. If the +end of the header is not the same as the end of the cluster we fill +the extra space with zeros. This was forgetting that not even the +space allocated for the header will be fully initialized, as we +only write key material for the first key slot. The space left +for the other 7 slots is never written to. + +An optimization to the ref count checking code: + + commit a5fff8d4b4d928311a5005efa12d0991fe3b66f9 (refs/bisect/bad) + Author: Vladimir Sementsov-Ogievskiy + Date: Wed Feb 27 16:14:30 2019 +0300 + + qcow2-refcount: avoid eating RAM + +made the assumption that every cluster which was allocated would +have at least some data written to it. This was violated by way +the LUKS header is only partially written, with much space simply +reserved for future use. + +Depending on the cluster size this problem was masked by the +logic which wrote zeros between the end of the LUKS header and +the end of the cluster. + +$ qemu-img create --object secret,id=cluster_encrypt0,data=123456 \ + -f qcow2 -o cluster_size=2k,encrypt.iter-time=1,\ + encrypt.format=luks,encrypt.key-secret=cluster_encrypt0 \ + cluster_size_check.qcow2 100M + Formatting 'cluster_size_check.qcow2', fmt=qcow2 size=104857600 + encrypt.format=luks encrypt.key-secret=cluster_encrypt0 + encrypt.iter-time=1 cluster_size=2048 lazy_refcounts=off refcount_bits=16 + +$ qemu-img check --object secret,id=cluster_encrypt0,data=redhat \ + 'json:{"driver": "qcow2", "encrypt.format": "luks", \ + "encrypt.key-secret": "cluster_encrypt0", \ + "file.driver": "file", "file.filename": "cluster_size_check.qcow2"}' +ERROR: counting reference for region exceeding the end of the file by one cluster or more: offset 0x2000 size 0x1f9000 +Leaked cluster 4 refcount=1 reference=0 +...snip... +Leaked cluster 130 refcount=1 reference=0 + +1 errors were found on the image. +Data may be corrupted, or further writes to the image may corrupt it. + +127 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. +Image end offset: 268288 + +The problem only exists when the disk image is entirely empty. Writing +data to the disk image payload will solve the problem by causing the +end of the file to be extended further. + +The change fixes it by ensuring that the entire allocated LUKS header +region is fully initialized with zeros. The qemu-img check will still +fail for any pre-existing disk images created prior to this change, +unless at least 1 byte of the payload is written to. + +Fully writing zeros to the entire LUKS header is a good idea regardless +as it ensures that space has been allocated on the host filesystem (or +whatever block storage backend is used). + +Signed-off-by: Daniel P. Berrangé +Message-Id: <20200207135520.2669430-1-berrange@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Max Reitz +(cherry picked from commit 087ab8e775f48766068e65de1bc99d03b40d1670) +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + tests/qemu-iotests/group: no test 283 in downstream + +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2.c | 11 ++++-- + tests/qemu-iotests/284 | 97 ++++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/284.out | 62 +++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 4 files changed, 167 insertions(+), 4 deletions(-) + create mode 100755 tests/qemu-iotests/284 + create mode 100644 tests/qemu-iotests/284.out + +diff --git a/block/qcow2.c b/block/qcow2.c +index 71067c6..af0ad4a 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -135,13 +135,16 @@ static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, + s->crypto_header.length = headerlen; + s->crypto_header.offset = ret; + +- /* Zero fill remaining space in cluster so it has predictable +- * content in case of future spec changes */ ++ /* ++ * Zero fill all space in cluster so it has predictable ++ * content, as we may not initialize some regions of the ++ * header (eg only 1 out of 8 key slots will be initialized) ++ */ + clusterlen = size_to_clusters(s, headerlen) * s->cluster_size; + assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen, false) == 0); + ret = bdrv_pwrite_zeroes(bs->file, +- ret + headerlen, +- clusterlen - headerlen, 0); ++ ret, ++ clusterlen, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not zero fill encryption header"); + return -1; +diff --git a/tests/qemu-iotests/284 b/tests/qemu-iotests/284 +new file mode 100755 +index 0000000..071e89b +--- /dev/null ++++ b/tests/qemu-iotests/284 +@@ -0,0 +1,97 @@ ++#!/usr/bin/env bash ++# ++# Test ref count checks on encrypted images ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++# creator ++owner=berrange@redhat.com ++ ++seq=`basename $0` ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. ./common.filter ++ ++_supported_fmt qcow2 ++_supported_proto generic ++_supported_os Linux ++ ++ ++size=1M ++ ++SECRET="secret,id=sec0,data=astrochicken" ++ ++IMGSPEC="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0" ++QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT ++ ++_run_test() ++{ ++ IMGOPTSSYNTAX=true ++ OLD_TEST_IMG="$TEST_IMG" ++ TEST_IMG="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0" ++ QEMU_IMG_EXTRA_ARGS="--image-opts --object $SECRET" ++ ++ echo ++ echo "== cluster size $csize" ++ echo "== checking image refcounts ==" ++ _check_test_img ++ ++ echo ++ echo "== writing some data ==" ++ $QEMU_IO -c "write -P 0x9 0 1" $QEMU_IMG_EXTRA_ARGS $TEST_IMG | _filter_qemu_io | _filter_testdir ++ echo ++ echo "== rechecking image refcounts ==" ++ _check_test_img ++ ++ echo ++ echo "== writing some more data ==" ++ $QEMU_IO -c "write -P 0x9 $csize 1" $QEMU_IMG_EXTRA_ARGS $TEST_IMG | _filter_qemu_io | _filter_testdir ++ echo ++ echo "== rechecking image refcounts ==" ++ _check_test_img ++ ++ TEST_IMG="$OLD_TEST_IMG" ++ QEMU_IMG_EXTRA_ARGS= ++ IMGOPTSSYNTAX= ++} ++ ++ ++echo ++echo "testing LUKS qcow2 encryption" ++echo ++ ++for csize in 512 2048 32768 ++do ++ _make_test_img --object $SECRET -o "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,cluster_size=$csize" $size ++ _run_test ++ _cleanup_test_img ++done ++ ++# success, all done ++echo "*** done" ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/284.out b/tests/qemu-iotests/284.out +new file mode 100644 +index 0000000..48216f5 +--- /dev/null ++++ b/tests/qemu-iotests/284.out +@@ -0,0 +1,62 @@ ++QA output created by 284 ++ ++testing LUKS qcow2 encryption ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10 ++ ++== cluster size 512 ++== checking image refcounts == ++No errors were found on the image. ++ ++== writing some data == ++wrote 1/1 bytes at offset 0 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++ ++== writing some more data == ++wrote 1/1 bytes at offset 512 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10 ++ ++== cluster size 2048 ++== checking image refcounts == ++No errors were found on the image. ++ ++== writing some data == ++wrote 1/1 bytes at offset 0 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++ ++== writing some more data == ++wrote 1/1 bytes at offset 2048 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10 ++ ++== cluster size 32768 ++== checking image refcounts == ++No errors were found on the image. ++ ++== writing some data == ++wrote 1/1 bytes at offset 0 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++ ++== writing some more data == ++wrote 1/1 bytes at offset 32768 ++1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== rechecking image refcounts == ++No errors were found on the image. ++*** done +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index e47cbfc..9c565cf 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -289,3 +289,4 @@ + 277 rw quick + 280 rw migration quick + 281 rw quick ++284 rw +-- +1.8.3.1 + diff --git a/kvm-block-backend-Add-flags-to-blk_truncate.patch b/kvm-block-backend-Add-flags-to-blk_truncate.patch new file mode 100755 index 0000000000000000000000000000000000000000..5b212fc36622615bcb4c518ac105ee45100e0204 --- /dev/null +++ b/kvm-block-backend-Add-flags-to-blk_truncate.patch @@ -0,0 +1,294 @@ +From 07a93e74efa4861f54dd3d4bec01885f7af2fee3 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 17:01:32 +0200 +Subject: [PATCH 04/17] block-backend: Add flags to blk_truncate() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-4-kwolf@redhat.com> +Patchwork-id: 97450 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 03/11] block-backend: Add flags to blk_truncate() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Now that node level interface bdrv_truncate() supports passing request +flags to the block driver, expose this on the BlockBackend level, too. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-4-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 8c6242b6f383e43fd11d2c50f8bcdd2bba1100fc) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 3 ++- + block/block-backend.c | 4 ++-- + block/commit.c | 4 ++-- + block/crypto.c | 2 +- + block/mirror.c | 2 +- + block/qcow2.c | 4 ++-- + block/qed.c | 2 +- + block/vdi.c | 2 +- + block/vhdx.c | 4 ++-- + block/vmdk.c | 6 +++--- + block/vpc.c | 2 +- + blockdev.c | 2 +- + include/sysemu/block-backend.h | 2 +- + qemu-img.c | 2 +- + qemu-io-cmds.c | 2 +- + 15 files changed, 22 insertions(+), 21 deletions(-) + +diff --git a/block.c b/block.c +index d6a05da..12c8941 100644 +--- a/block.c ++++ b/block.c +@@ -547,7 +547,8 @@ static int64_t create_file_fallback_truncate(BlockBackend *blk, + int64_t size; + int ret; + +- ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, &local_err); ++ ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0, ++ &local_err); + if (ret < 0 && ret != -ENOTSUP) { + error_propagate(errp, local_err); + return ret; +diff --git a/block/block-backend.c b/block/block-backend.c +index 8be2006..17ed6d8 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -2137,14 +2137,14 @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf, + } + + int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp) ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) + { + if (!blk_is_available(blk)) { + error_setg(errp, "No medium inserted"); + return -ENOMEDIUM; + } + +- return bdrv_truncate(blk->root, offset, exact, prealloc, 0, errp); ++ return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp); + } + + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, +diff --git a/block/commit.c b/block/commit.c +index 23c90b3..075ebf8 100644 +--- a/block/commit.c ++++ b/block/commit.c +@@ -155,7 +155,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) + } + + if (base_len < len) { +- ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, NULL); ++ ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, 0, NULL); + if (ret) { + goto out; + } +@@ -471,7 +471,7 @@ int bdrv_commit(BlockDriverState *bs) + * grow the backing file image if possible. If not possible, + * we must return an error */ + if (length > backing_length) { +- ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, ++ ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0, + &local_err); + if (ret < 0) { + error_report_err(local_err); +diff --git a/block/crypto.c b/block/crypto.c +index fcb4a97..83a8fc0 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -115,7 +115,7 @@ static ssize_t block_crypto_init_func(QCryptoBlock *block, + * which will be used by the crypto header + */ + return blk_truncate(data->blk, data->size + headerlen, false, +- data->prealloc, errp); ++ data->prealloc, 0, errp); + } + + +diff --git a/block/mirror.c b/block/mirror.c +index 0d32fca..c8028cd 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -886,7 +886,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + if (s->base == blk_bs(s->target)) { + if (s->bdev_length > target_length) { + ret = blk_truncate(s->target, s->bdev_length, false, +- PREALLOC_MODE_OFF, NULL); ++ PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + goto immediate_exit; + } +diff --git a/block/qcow2.c b/block/qcow2.c +index c0fdcb9..86aa74a 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -3497,7 +3497,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) + + /* Okay, now that we have a valid image, let's give it the right size */ + ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation, +- errp); ++ 0, errp); + if (ret < 0) { + error_prepend(errp, "Could not resize image: "); + goto out; +@@ -5347,7 +5347,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, + * Amending image options should ensure that the image has + * exactly the given new values, so pass exact=true here. + */ +- ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, 0, errp); + blk_unref(blk); + if (ret < 0) { + return ret; +diff --git a/block/qed.c b/block/qed.c +index fb6100b..b0fdb8f 100644 +--- a/block/qed.c ++++ b/block/qed.c +@@ -677,7 +677,7 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts, + * The QED format associates file length with allocation status, + * so a new file (which is empty) must have a length of 0. + */ +- ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto out; + } +diff --git a/block/vdi.c b/block/vdi.c +index e1a11f2..0c7835a 100644 +--- a/block/vdi.c ++++ b/block/vdi.c +@@ -875,7 +875,7 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options, + + if (image_type == VDI_TYPE_STATIC) { + ret = blk_truncate(blk, offset + blocks * block_size, false, +- PREALLOC_MODE_OFF, errp); ++ PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + error_prepend(errp, "Failed to statically allocate file"); + goto exit; +diff --git a/block/vhdx.c b/block/vhdx.c +index 5dfbb20..21497f7 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -1703,13 +1703,13 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, + /* All zeroes, so we can just extend the file - the end of the BAT + * is the furthest thing we have written yet */ + ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF, +- errp); ++ 0, errp); + if (ret < 0) { + goto exit; + } + } else if (type == VHDX_TYPE_FIXED) { + ret = blk_truncate(blk, data_file_offset + image_size, false, +- PREALLOC_MODE_OFF, errp); ++ PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto exit; + } +diff --git a/block/vmdk.c b/block/vmdk.c +index 1bbf937..1bd3991 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -2118,7 +2118,7 @@ static int vmdk_init_extent(BlockBackend *blk, + int gd_buf_size; + + if (flat) { +- ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp); + goto exit; + } + magic = cpu_to_be32(VMDK4_MAGIC); +@@ -2182,7 +2182,7 @@ static int vmdk_init_extent(BlockBackend *blk, + } + + ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false, +- PREALLOC_MODE_OFF, errp); ++ PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto exit; + } +@@ -2523,7 +2523,7 @@ static int coroutine_fn vmdk_co_do_create(int64_t size, + /* bdrv_pwrite write padding zeros to align to sector, we don't need that + * for description file */ + if (desc_offset == 0) { +- ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + goto exit; + } +diff --git a/block/vpc.c b/block/vpc.c +index 6df75e2..d5e7dc8 100644 +--- a/block/vpc.c ++++ b/block/vpc.c +@@ -898,7 +898,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, + /* Add footer to total size */ + total_size += HEADER_SIZE; + +- ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + return ret; + } +diff --git a/blockdev.c b/blockdev.c +index 5128c9b..6dde52a 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -3055,7 +3055,7 @@ void qmp_block_resize(bool has_device, const char *device, + } + + bdrv_drained_begin(bs); +- ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, errp); ++ ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp); + bdrv_drained_end(bs); + + out: +diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h +index 9bbdbd6..34de7fa 100644 +--- a/include/sysemu/block-backend.h ++++ b/include/sysemu/block-backend.h +@@ -237,7 +237,7 @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, + int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf, + int bytes); + int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, +- PreallocMode prealloc, Error **errp); ++ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes); + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, + int64_t pos, int size); +diff --git a/qemu-img.c b/qemu-img.c +index 6dc881b..a27ad70 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -3939,7 +3939,7 @@ static int img_resize(int argc, char **argv) + * resizing, so pass @exact=true. It is of no use to report + * success when the image has not actually been resized. + */ +- ret = blk_truncate(blk, total_size, true, prealloc, &err); ++ ret = blk_truncate(blk, total_size, true, prealloc, 0, &err); + if (!ret) { + qprintf(quiet, "Image resized.\n"); + } else { +diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c +index 1b7e700..851f07e 100644 +--- a/qemu-io-cmds.c ++++ b/qemu-io-cmds.c +@@ -1715,7 +1715,7 @@ static int truncate_f(BlockBackend *blk, int argc, char **argv) + * exact=true. It is better to err on the "emit more errors" side + * than to be overly permissive. + */ +- ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, &local_err); ++ ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, 0, &local_err); + if (ret < 0) { + error_report_err(local_err); + return ret; +-- +1.8.3.1 + diff --git a/kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch b/kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch new file mode 100755 index 0000000000000000000000000000000000000000..9d49cfa550e2856e67dfb9e62a5df2df0f765eab --- /dev/null +++ b/kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch @@ -0,0 +1,158 @@ +From 6cc456c4c1e6557fdc7e138e8ef8171b71609222 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:15 +0100 +Subject: [PATCH 4/6] block-backend: Reorder flush/pdiscard function + definitions + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-5-kwolf@redhat.com> +Patchwork-id: 94598 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/6] block-backend: Reorder flush/pdiscard function definitions +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +Move all variants of the flush/pdiscard functions to a single place and +put the blk_co_*() version first because it is called by all other +variants (and will become static in the next patch). + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Max Reitz +Message-Id: <20200407121259.21350-2-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 564806c529d4e0acad209b1e5b864a8886092f1f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/block-backend.c | 92 +++++++++++++++++++++++++-------------------------- + 1 file changed, 46 insertions(+), 46 deletions(-) + +diff --git a/block/block-backend.c b/block/block-backend.c +index 8b8f2a8..17b2e87 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -1488,38 +1488,6 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, + blk_aio_write_entry, flags, cb, opaque); + } + +-static void blk_aio_flush_entry(void *opaque) +-{ +- BlkAioEmAIOCB *acb = opaque; +- BlkRwCo *rwco = &acb->rwco; +- +- rwco->ret = blk_co_flush(rwco->blk); +- blk_aio_complete(acb); +-} +- +-BlockAIOCB *blk_aio_flush(BlockBackend *blk, +- BlockCompletionFunc *cb, void *opaque) +-{ +- return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); +-} +- +-static void blk_aio_pdiscard_entry(void *opaque) +-{ +- BlkAioEmAIOCB *acb = opaque; +- BlkRwCo *rwco = &acb->rwco; +- +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); +- blk_aio_complete(acb); +-} +- +-BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, +- int64_t offset, int bytes, +- BlockCompletionFunc *cb, void *opaque) +-{ +- return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0, +- cb, opaque); +-} +- + void blk_aio_cancel(BlockAIOCB *acb) + { + bdrv_aio_cancel(acb); +@@ -1586,6 +1554,37 @@ int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) + return bdrv_co_pdiscard(blk->root, offset, bytes); + } + ++static void blk_aio_pdiscard_entry(void *opaque) ++{ ++ BlkAioEmAIOCB *acb = opaque; ++ BlkRwCo *rwco = &acb->rwco; ++ ++ rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes); ++ blk_aio_complete(acb); ++} ++ ++BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, ++ int64_t offset, int bytes, ++ BlockCompletionFunc *cb, void *opaque) ++{ ++ return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0, ++ cb, opaque); ++} ++ ++static void blk_pdiscard_entry(void *opaque) ++{ ++ BlkRwCo *rwco = opaque; ++ QEMUIOVector *qiov = rwco->iobuf; ++ ++ rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); ++ aio_wait_kick(); ++} ++ ++int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) ++{ ++ return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); ++} ++ + int blk_co_flush(BlockBackend *blk) + { + blk_wait_while_drained(blk); +@@ -1597,6 +1596,21 @@ int blk_co_flush(BlockBackend *blk) + return bdrv_co_flush(blk_bs(blk)); + } + ++static void blk_aio_flush_entry(void *opaque) ++{ ++ BlkAioEmAIOCB *acb = opaque; ++ BlkRwCo *rwco = &acb->rwco; ++ ++ rwco->ret = blk_co_flush(rwco->blk); ++ blk_aio_complete(acb); ++} ++ ++BlockAIOCB *blk_aio_flush(BlockBackend *blk, ++ BlockCompletionFunc *cb, void *opaque) ++{ ++ return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque); ++} ++ + static void blk_flush_entry(void *opaque) + { + BlkRwCo *rwco = opaque; +@@ -2083,20 +2097,6 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, + return bdrv_truncate(blk->root, offset, exact, prealloc, errp); + } + +-static void blk_pdiscard_entry(void *opaque) +-{ +- BlkRwCo *rwco = opaque; +- QEMUIOVector *qiov = rwco->iobuf; +- +- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); +- aio_wait_kick(); +-} +- +-int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) +-{ +- return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0); +-} +- + int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, + int64_t pos, int size) + { +-- +1.8.3.1 + diff --git a/kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch b/kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch new file mode 100755 index 0000000000000000000000000000000000000000..45f506c78bc44710efd9c9501b47bf0bfd5d60d4 --- /dev/null +++ b/kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch @@ -0,0 +1,130 @@ +From aefff389c4d11bd69180db7177135c4645a9b1bd Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:46 +0000 +Subject: [PATCH 13/18] block/backup-top: Don't acquire context while dropping + top + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-7-slp@redhat.com> +Patchwork-id: 93759 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 6/9] block/backup-top: Don't acquire context while dropping top +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +All paths that lead to bdrv_backup_top_drop(), except for the call +from backup_clean(), imply that the BDS AioContext has already been +acquired, so doing it there too can potentially lead to QEMU hanging +on AIO_WAIT_WHILE(). + +An easy way to trigger this situation is by issuing a two actions +transaction, with a proper and a bogus blockdev-backup, so the second +one will trigger a rollback. This will trigger a hang with an stack +trace like this one: + + #0 0x00007fb680c75016 in __GI_ppoll (fds=0x55e74580f7c0, nfds=1, timeout=, + timeout@entry=0x0, sigmask=sigmask@entry=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:39 + #1 0x000055e743386e09 in ppoll (__ss=0x0, __timeout=0x0, __nfds=, __fds=) + at /usr/include/bits/poll2.h:77 + #2 0x000055e743386e09 in qemu_poll_ns + (fds=, nfds=, timeout=) at util/qemu-timer.c:336 + #3 0x000055e743388dc4 in aio_poll (ctx=0x55e7458925d0, blocking=blocking@entry=true) + at util/aio-posix.c:669 + #4 0x000055e743305dea in bdrv_flush (bs=bs@entry=0x55e74593c0d0) at block/io.c:2878 + #5 0x000055e7432be58e in bdrv_close (bs=0x55e74593c0d0) at block.c:4017 + #6 0x000055e7432be58e in bdrv_delete (bs=) at block.c:4262 + #7 0x000055e7432be58e in bdrv_unref (bs=bs@entry=0x55e74593c0d0) at block.c:5644 + #8 0x000055e743316b9b in bdrv_backup_top_drop (bs=bs@entry=0x55e74593c0d0) at block/backup-top.c:273 + #9 0x000055e74331461f in backup_job_create + (job_id=0x0, bs=bs@entry=0x55e7458d5820, target=target@entry=0x55e74589f640, speed=0, sync_mode=MIRROR_SYNC_MODE_FULL, sync_bitmap=sync_bitmap@entry=0x0, bitmap_mode=BITMAP_SYNC_MODE_ON_SUCCESS, compress=false, filter_node_name=0x0, on_source_error=BLOCKDEV_ON_ERROR_REPORT, on_target_error=BLOCKDEV_ON_ERROR_REPORT, creation_flags=0, cb=0x0, opaque=0x0, txn=0x0, errp=0x7ffddfd1efb0) at block/backup.c:478 + #10 0x000055e74315bc52 in do_backup_common + (backup=backup@entry=0x55e746c066d0, bs=bs@entry=0x55e7458d5820, target_bs=target_bs@entry=0x55e74589f640, aio_context=aio_context@entry=0x55e7458a91e0, txn=txn@entry=0x0, errp=errp@entry=0x7ffddfd1efb0) + at blockdev.c:3580 + #11 0x000055e74315c37c in do_blockdev_backup + (backup=backup@entry=0x55e746c066d0, txn=0x0, errp=errp@entry=0x7ffddfd1efb0) + at /usr/src/debug/qemu-kvm-4.2.0-2.module+el8.2.0+5135+ed3b2489.x86_64/./qapi/qapi-types-block-core.h:1492 + #12 0x000055e74315c449 in blockdev_backup_prepare (common=0x55e746a8de90, errp=0x7ffddfd1f018) + at blockdev.c:1885 + #13 0x000055e743160152 in qmp_transaction + (dev_list=, has_props=, props=0x55e7467fe2c0, errp=errp@entry=0x7ffddfd1f088) at blockdev.c:2340 + #14 0x000055e743287ff5 in qmp_marshal_transaction + (args=, ret=, errp=0x7ffddfd1f0f8) + at qapi/qapi-commands-transaction.c:44 + #15 0x000055e74333de6c in do_qmp_dispatch + (errp=0x7ffddfd1f0f0, allow_oob=, request=, cmds=0x55e743c28d60 ) at qapi/qmp-dispatch.c:132 + #16 0x000055e74333de6c in qmp_dispatch + (cmds=0x55e743c28d60 , request=, allow_oob=) + at qapi/qmp-dispatch.c:175 + #17 0x000055e74325c061 in monitor_qmp_dispatch (mon=0x55e745908030, req=) + at monitor/qmp.c:145 + #18 0x000055e74325c6fa in monitor_qmp_bh_dispatcher (data=) at monitor/qmp.c:234 + #19 0x000055e743385866 in aio_bh_call (bh=0x55e745807ae0) at util/async.c:117 + #20 0x000055e743385866 in aio_bh_poll (ctx=ctx@entry=0x55e7458067a0) at util/async.c:117 + #21 0x000055e743388c54 in aio_dispatch (ctx=0x55e7458067a0) at util/aio-posix.c:459 + #22 0x000055e743385742 in aio_ctx_dispatch + (source=, callback=, user_data=) at util/async.c:260 + #23 0x00007fb68543e67d in g_main_dispatch (context=0x55e745893a40) at gmain.c:3176 + #24 0x00007fb68543e67d in g_main_context_dispatch (context=context@entry=0x55e745893a40) at gmain.c:3829 + #25 0x000055e743387d08 in glib_pollfds_poll () at util/main-loop.c:219 + #26 0x000055e743387d08 in os_host_main_loop_wait (timeout=) at util/main-loop.c:242 + #27 0x000055e743387d08 in main_loop_wait (nonblocking=) at util/main-loop.c:518 + #28 0x000055e74316a3c1 in main_loop () at vl.c:1828 + #29 0x000055e743016a72 in main (argc=, argv=, envp=) + at vl.c:4504 + +Fix this by not acquiring the AioContext there, and ensuring all paths +leading to it have it already acquired (backup_clean()). + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1782111 +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 0abf2581717a19d9749d5c2ff8acd0ac203452c2) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + block/backup-top.c | 5 ----- + block/backup.c | 3 +++ + 2 files changed, 3 insertions(+), 5 deletions(-) + +diff --git a/block/backup-top.c b/block/backup-top.c +index 818d3f2..b8d863f 100644 +--- a/block/backup-top.c ++++ b/block/backup-top.c +@@ -255,9 +255,6 @@ append_failed: + void bdrv_backup_top_drop(BlockDriverState *bs) + { + BDRVBackupTopState *s = bs->opaque; +- AioContext *aio_context = bdrv_get_aio_context(bs); +- +- aio_context_acquire(aio_context); + + bdrv_drained_begin(bs); + +@@ -271,6 +268,4 @@ void bdrv_backup_top_drop(BlockDriverState *bs) + bdrv_drained_end(bs); + + bdrv_unref(bs); +- +- aio_context_release(aio_context); + } +diff --git a/block/backup.c b/block/backup.c +index cf62b1a..1383e21 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -135,8 +135,11 @@ static void backup_abort(Job *job) + static void backup_clean(Job *job) + { + BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); ++ AioContext *aio_context = bdrv_get_aio_context(s->backup_top); + ++ aio_context_acquire(aio_context); + bdrv_backup_top_drop(s->backup_top); ++ aio_context_release(aio_context); + } + + void backup_do_checkpoint(BlockJob *job, Error **errp) +-- +1.8.3.1 + diff --git a/kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch b/kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch new file mode 100755 index 0000000000000000000000000000000000000000..745be9f31656dc9b3c9aae1613ea7bc7d4153b3e --- /dev/null +++ b/kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch @@ -0,0 +1,114 @@ +From 1e0582ad34e77a060e2067a35992979c9eae82c9 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:31 +0000 +Subject: [PATCH 11/20] block: bdrv_reopen() with backing file in different + AioContext + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-6-kwolf@redhat.com> +Patchwork-id: 94282 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 05/13] block: bdrv_reopen() with backing file in different AioContext +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +This patch allows bdrv_reopen() (and therefore the x-blockdev-reopen QMP +command) to attach a node as the new backing file even if the node is in +a different AioContext than the parent if one of both nodes can be moved +to the AioContext of the other node. + +Signed-off-by: Kevin Wolf +Tested-by: Peter Krempa +Message-Id: <20200306141413.30705-3-kwolf@redhat.com> +Reviewed-by: Alberto Garcia +Signed-off-by: Kevin Wolf +(cherry picked from commit 1de6b45fb5c1489b450df7d1a4c692bba9678ce6) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 32 ++++++++++++++++++++++++++------ + tests/qemu-iotests/245 | 8 +++----- + 2 files changed, 29 insertions(+), 11 deletions(-) + +diff --git a/block.c b/block.c +index a744bb5..39e4647 100644 +--- a/block.c ++++ b/block.c +@@ -3749,6 +3749,29 @@ static void bdrv_reopen_perm(BlockReopenQueue *q, BlockDriverState *bs, + *shared = cumulative_shared_perms; + } + ++static bool bdrv_reopen_can_attach(BlockDriverState *parent, ++ BdrvChild *child, ++ BlockDriverState *new_child, ++ Error **errp) ++{ ++ AioContext *parent_ctx = bdrv_get_aio_context(parent); ++ AioContext *child_ctx = bdrv_get_aio_context(new_child); ++ GSList *ignore; ++ bool ret; ++ ++ ignore = g_slist_prepend(NULL, child); ++ ret = bdrv_can_set_aio_context(new_child, parent_ctx, &ignore, NULL); ++ g_slist_free(ignore); ++ if (ret) { ++ return ret; ++ } ++ ++ ignore = g_slist_prepend(NULL, child); ++ ret = bdrv_can_set_aio_context(parent, child_ctx, &ignore, errp); ++ g_slist_free(ignore); ++ return ret; ++} ++ + /* + * Take a BDRVReopenState and check if the value of 'backing' in the + * reopen_state->options QDict is valid or not. +@@ -3800,14 +3823,11 @@ static int bdrv_reopen_parse_backing(BDRVReopenState *reopen_state, + } + + /* +- * TODO: before removing the x- prefix from x-blockdev-reopen we +- * should move the new backing file into the right AioContext +- * instead of returning an error. ++ * Check AioContext compatibility so that the bdrv_set_backing_hd() call in ++ * bdrv_reopen_commit() won't fail. + */ + if (new_backing_bs) { +- if (bdrv_get_aio_context(new_backing_bs) != bdrv_get_aio_context(bs)) { +- error_setg(errp, "Cannot use a new backing file " +- "with a different AioContext"); ++ if (!bdrv_reopen_can_attach(bs, bs->backing, new_backing_bs, errp)) { + return -EINVAL; + } + } +diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 +index f69c2fa..919131d 100644 +--- a/tests/qemu-iotests/245 ++++ b/tests/qemu-iotests/245 +@@ -1013,18 +1013,16 @@ class TestBlockdevReopen(iotests.QMPTestCase): + # neither of them can switch to the other AioContext + def test_iothreads_error(self): + self.run_test_iothreads('iothread0', 'iothread1', +- "Cannot use a new backing file with a different AioContext") ++ "Cannot change iothread of active block backend") + + def test_iothreads_compatible_users(self): + self.run_test_iothreads('iothread0', 'iothread0') + + def test_iothreads_switch_backing(self): +- self.run_test_iothreads('iothread0', None, +- "Cannot use a new backing file with a different AioContext") ++ self.run_test_iothreads('iothread0', None) + + def test_iothreads_switch_overlay(self): +- self.run_test_iothreads(None, 'iothread0', +- "Cannot use a new backing file with a different AioContext") ++ self.run_test_iothreads(None, 'iothread0') + + if __name__ == '__main__': + iotests.main(supported_fmts=["qcow2"], +-- +1.8.3.1 + diff --git a/kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch b/kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch new file mode 100755 index 0000000000000000000000000000000000000000..a974a18646d9a594f610c00f6413a02ba505dbe3 --- /dev/null +++ b/kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch @@ -0,0 +1,55 @@ +From 5e5ca17e1e09cfe9a780c556528bbde23c93fc4e Mon Sep 17 00:00:00 2001 +From: Richard Jones +Date: Thu, 28 May 2020 14:27:37 +0100 +Subject: [PATCH 03/26] block/curl: HTTP header field names are case + insensitive +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Richard Jones +Message-id: <20200528142737.17318-3-rjones@redhat.com> +Patchwork-id: 96895 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] block/curl: HTTP header field names are case insensitive +Bugzilla: 1841038 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Philippe Mathieu-Daudé + +From: David Edmondson + +RFC 7230 section 3.2 indicates that HTTP header field names are case +insensitive. + +Signed-off-by: David Edmondson +Message-Id: <20200224101310.101169-3-david.edmondson@oracle.com> +Reviewed-by: Max Reitz +Signed-off-by: Max Reitz +(cherry picked from commit 69032253c33ae1774233c63cedf36d32242a85fc) +Signed-off-by: Danilo C. L. de Paula +--- + block/curl.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/block/curl.c b/block/curl.c +index f9ffb7f..6e32590 100644 +--- a/block/curl.c ++++ b/block/curl.c +@@ -216,11 +216,12 @@ static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque) + size_t realsize = size * nmemb; + const char *header = (char *)ptr; + const char *end = header + realsize; +- const char *accept_ranges = "Accept-Ranges:"; ++ const char *accept_ranges = "accept-ranges:"; + const char *bytes = "bytes"; + + if (realsize >= strlen(accept_ranges) +- && strncmp(header, accept_ranges, strlen(accept_ranges)) == 0) { ++ && g_ascii_strncasecmp(header, accept_ranges, ++ strlen(accept_ranges)) == 0) { + + char *p = strchr(header, ':') + 1; + +-- +1.8.3.1 + diff --git a/kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch b/kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch new file mode 100755 index 0000000000000000000000000000000000000000..c09a1e29b7881064409325202d0e7a61729d1313 --- /dev/null +++ b/kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch @@ -0,0 +1,76 @@ +From e5ac775de83d3d22f13c74ab198780b8b579f684 Mon Sep 17 00:00:00 2001 +From: Richard Jones +Date: Thu, 28 May 2020 14:27:36 +0100 +Subject: [PATCH 02/26] block/curl: HTTP header fields allow whitespace around + values + +RH-Author: Richard Jones +Message-id: <20200528142737.17318-2-rjones@redhat.com> +Patchwork-id: 96894 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] block/curl: HTTP header fields allow whitespace around values +Bugzilla: 1841038 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Danilo de Paula + +From: David Edmondson + +RFC 7230 section 3.2 indicates that whitespace is permitted between +the field name and field value and after the field value. + +Signed-off-by: David Edmondson +Message-Id: <20200224101310.101169-2-david.edmondson@oracle.com> +Reviewed-by: Max Reitz +Signed-off-by: Max Reitz +(cherry picked from commit 7788a319399f17476ff1dd43164c869e320820a2) +Signed-off-by: Danilo C. L. de Paula +--- + block/curl.c | 31 +++++++++++++++++++++++++++---- + 1 file changed, 27 insertions(+), 4 deletions(-) + +diff --git a/block/curl.c b/block/curl.c +index f862993..f9ffb7f 100644 +--- a/block/curl.c ++++ b/block/curl.c +@@ -214,11 +214,34 @@ static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque) + { + BDRVCURLState *s = opaque; + size_t realsize = size * nmemb; +- const char *accept_line = "Accept-Ranges: bytes"; ++ const char *header = (char *)ptr; ++ const char *end = header + realsize; ++ const char *accept_ranges = "Accept-Ranges:"; ++ const char *bytes = "bytes"; + +- if (realsize >= strlen(accept_line) +- && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) { +- s->accept_range = true; ++ if (realsize >= strlen(accept_ranges) ++ && strncmp(header, accept_ranges, strlen(accept_ranges)) == 0) { ++ ++ char *p = strchr(header, ':') + 1; ++ ++ /* Skip whitespace between the header name and value. */ ++ while (p < end && *p && g_ascii_isspace(*p)) { ++ p++; ++ } ++ ++ if (end - p >= strlen(bytes) ++ && strncmp(p, bytes, strlen(bytes)) == 0) { ++ ++ /* Check that there is nothing but whitespace after the value. */ ++ p += strlen(bytes); ++ while (p < end && *p && g_ascii_isspace(*p)) { ++ p++; ++ } ++ ++ if (p == end || !*p) { ++ s->accept_range = true; ++ } ++ } + } + + return realsize; +-- +1.8.3.1 + diff --git a/kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch b/kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch new file mode 100755 index 0000000000000000000000000000000000000000..60b1b0a60011ad393fcd528e59446100e860aeef --- /dev/null +++ b/kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch @@ -0,0 +1,76 @@ +From 8c339c3535728179acc94deb5b922aebcfac9ab6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 3 Jun 2021 16:13:34 -0400 +Subject: [PATCH 2/4] block/file-posix: Fix problem with fallocate(PUNCH_HOLE) + on GPFS + +RH-Author: Thomas Huth +Message-id: <20210603161334.607005-2-thuth@redhat.com> +Patchwork-id: 101673 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] block/file-posix: Fix problem with fallocate(PUNCH_HOLE) on GPFS +Bugzilla: 1944861 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Max Reitz +RH-Acked-by: Cornelia Huck +RH-Acked-by: Laszlo Ersek + +A customer reported that running + + qemu-img convert -t none -O qcow2 -f qcow2 input.qcow2 output.qcow2 + +fails for them with the following error message when the images are +stored on a GPFS file system : + + qemu-img: error while writing sector 0: Invalid argument + +After analyzing the strace output, it seems like the problem is in +handle_aiocb_write_zeroes(): The call to fallocate(FALLOC_FL_PUNCH_HOLE) +returns EINVAL, which can apparently happen if the file system has +a different idea of the granularity of the operation. It's arguably +a bug in GPFS, since the PUNCH_HOLE mode should not result in EINVAL +according to the man-page of fallocate(), but the file system is out +there in production and so we have to deal with it. In commit 294682cc3a +("block: workaround for unaligned byte range in fallocate()") we also +already applied the a work-around for the same problem to the earlier +fallocate(FALLOC_FL_ZERO_RANGE) call, so do it now similar with the +PUNCH_HOLE call. But instead of silently catching and returning +-ENOTSUP (which causes the caller to fall back to writing zeroes), +let's rather inform the user once about the buggy file system and +try the other fallback instead. + +Signed-off-by: Thomas Huth +Message-Id: <20210527172020.847617-2-thuth@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 73ebf29729d1a40feaa9f8ab8951b6ee6dbfbede) +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1944861 +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 62a463229f..371572f1b0 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1587,6 +1587,17 @@ static int handle_aiocb_write_zeroes(void *opaque) + return ret; + } + s->has_fallocate = false; ++ } else if (ret == -EINVAL) { ++ /* ++ * Some file systems like older versions of GPFS do not like un- ++ * aligned byte ranges, and return EINVAL in such a case, though ++ * they should not do it according to the man-page of fallocate(). ++ * Warn about the bad filesystem and try the final fallback instead. ++ */ ++ warn_report_once("Your file system is misbehaving: " ++ "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. " ++ "Please report this bug to your file sytem " ++ "vendor."); + } else if (ret != -ENOTSUP) { + return ret; + } else { +-- +2.27.0 + diff --git a/kvm-block-introducing-bdrv_co_delete_file-interface.patch b/kvm-block-introducing-bdrv_co_delete_file-interface.patch new file mode 100755 index 0000000000000000000000000000000000000000..9d5e65932d9ef04effd16df712b7fb57277f4ada --- /dev/null +++ b/kvm-block-introducing-bdrv_co_delete_file-interface.patch @@ -0,0 +1,99 @@ +From 9581770f48911cbe68cfa1a7fa125df2a0a27d02 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Sun, 31 May 2020 16:40:33 +0100 +Subject: [PATCH 5/7] block: introducing 'bdrv_co_delete_file' interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Maxim Levitsky +Message-id: <20200531164035.34188-2-mlevitsk@redhat.com> +Patchwork-id: 97057 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/3] block: introducing 'bdrv_co_delete_file' interface +Bugzilla: 1827630 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: John Snow +RH-Acked-by: Eric Blake + +From: Daniel Henrique Barboza + +Adding to Block Drivers the capability of being able to clean up +its created files can be useful in certain situations. For the +LUKS driver, for instance, a failure in one of its authentication +steps can leave files in the host that weren't there before. + +This patch adds the 'bdrv_co_delete_file' interface to block +drivers and add it to the 'file' driver in file-posix.c. The +implementation is given by 'raw_co_delete_file'. + +Suggested-by: Daniel P. Berrangé +Signed-off-by: Daniel Henrique Barboza +Message-Id: <20200130213907.2830642-2-danielhb413@gmail.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 9bffae14df879255329473a7bd578643af2d4c9c) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 23 +++++++++++++++++++++++ + include/block/block_int.h | 4 ++++ + 2 files changed, 27 insertions(+) + +diff --git a/block/file-posix.c b/block/file-posix.c +index dd18d40..1609598 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -2388,6 +2388,28 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + return raw_co_create(&options, errp); + } + ++static int coroutine_fn raw_co_delete_file(BlockDriverState *bs, ++ Error **errp) ++{ ++ struct stat st; ++ int ret; ++ ++ if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) { ++ error_setg_errno(errp, ENOENT, "%s is not a regular file", ++ bs->filename); ++ return -ENOENT; ++ } ++ ++ ret = unlink(bs->filename); ++ if (ret < 0) { ++ ret = -errno; ++ error_setg_errno(errp, -ret, "Error when deleting file %s", ++ bs->filename); ++ } ++ ++ return ret; ++} ++ + /* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. +@@ -3019,6 +3041,7 @@ BlockDriver bdrv_file = { + .bdrv_co_block_status = raw_co_block_status, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes, ++ .bdrv_co_delete_file = raw_co_delete_file, + + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 529f153..562dca1 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -316,6 +316,10 @@ struct BlockDriver { + */ + int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs); + ++ /* Delete a created file. */ ++ int coroutine_fn (*bdrv_co_delete_file)(BlockDriverState *bs, ++ Error **errp); ++ + /* + * Flushes all data that was already written to the OS all the way down to + * the disk (for example file-posix.c calls fsync()). +-- +1.8.3.1 + diff --git a/kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch b/kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch new file mode 100755 index 0000000000000000000000000000000000000000..fe8c49b579f45691480d9ef7161b69d251061a90 --- /dev/null +++ b/kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch @@ -0,0 +1,100 @@ +From b9b77159567283628645943b5367d39b558e8faa Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 26 Jan 2021 20:07:59 -0500 +Subject: [PATCH 9/9] block/iscsi:fix heap-buffer-overflow in + iscsi_aio_ioctl_cb +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210126200759.245891-2-jmaloy@redhat.com> +Patchwork-id: 100787 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] block/iscsi:fix heap-buffer-overflow in iscsi_aio_ioctl_cb +Bugzilla: 1912974 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf +RH-Acked-by: Laszlo Ersek + +From: Chen Qun + +There is an overflow, the source 'datain.data[2]' is 100 bytes, + but the 'ss' is 252 bytes.This may cause a security issue because + we can access a lot of unrelated memory data. + +The len for sbp copy data should take the minimum of mx_sb_len and + sb_len_wr, not the maximum. + +If we use iscsi device for VM backend storage, ASAN show stack: + +READ of size 252 at 0xfffd149dcfc4 thread T0 + #0 0xaaad433d0d34 in __asan_memcpy (aarch64-softmmu/qemu-system-aarch64+0x2cb0d34) + #1 0xaaad45f9d6d0 in iscsi_aio_ioctl_cb /qemu/block/iscsi.c:996:9 + #2 0xfffd1af0e2dc (/usr/lib64/iscsi/libiscsi.so.8+0xe2dc) + #3 0xfffd1af0d174 (/usr/lib64/iscsi/libiscsi.so.8+0xd174) + #4 0xfffd1af19fac (/usr/lib64/iscsi/libiscsi.so.8+0x19fac) + #5 0xaaad45f9acc8 in iscsi_process_read /qemu/block/iscsi.c:403:5 + #6 0xaaad4623733c in aio_dispatch_handler /qemu/util/aio-posix.c:467:9 + #7 0xaaad4622f350 in aio_dispatch_handlers /qemu/util/aio-posix.c:510:20 + #8 0xaaad4622f350 in aio_dispatch /qemu/util/aio-posix.c:520 + #9 0xaaad46215944 in aio_ctx_dispatch /qemu/util/async.c:298:5 + #10 0xfffd1bed12f4 in g_main_context_dispatch (/lib64/libglib-2.0.so.0+0x512f4) + #11 0xaaad46227de0 in glib_pollfds_poll /qemu/util/main-loop.c:219:9 + #12 0xaaad46227de0 in os_host_main_loop_wait /qemu/util/main-loop.c:242 + #13 0xaaad46227de0 in main_loop_wait /qemu/util/main-loop.c:518 + #14 0xaaad43d9d60c in qemu_main_loop /qemu/softmmu/vl.c:1662:9 + #15 0xaaad4607a5b0 in main /qemu/softmmu/main.c:49:5 + #16 0xfffd1a460b9c in __libc_start_main (/lib64/libc.so.6+0x20b9c) + #17 0xaaad43320740 in _start (aarch64-softmmu/qemu-system-aarch64+0x2c00740) + +0xfffd149dcfc4 is located 0 bytes to the right of 100-byte region [0xfffd149dcf60,0xfffd149dcfc4) +allocated by thread T0 here: + #0 0xaaad433d1e70 in __interceptor_malloc (aarch64-softmmu/qemu-system-aarch64+0x2cb1e70) + #1 0xfffd1af0e254 (/usr/lib64/iscsi/libiscsi.so.8+0xe254) + #2 0xfffd1af0d174 (/usr/lib64/iscsi/libiscsi.so.8+0xd174) + #3 0xfffd1af19fac (/usr/lib64/iscsi/libiscsi.so.8+0x19fac) + #4 0xaaad45f9acc8 in iscsi_process_read /qemu/block/iscsi.c:403:5 + #5 0xaaad4623733c in aio_dispatch_handler /qemu/util/aio-posix.c:467:9 + #6 0xaaad4622f350 in aio_dispatch_handlers /qemu/util/aio-posix.c:510:20 + #7 0xaaad4622f350 in aio_dispatch /qemu/util/aio-posix.c:520 + #8 0xaaad46215944 in aio_ctx_dispatch /qemu/util/async.c:298:5 + #9 0xfffd1bed12f4 in g_main_context_dispatch (/lib64/libglib-2.0.so.0+0x512f4) + #10 0xaaad46227de0 in glib_pollfds_poll /qemu/util/main-loop.c:219:9 + #11 0xaaad46227de0 in os_host_main_loop_wait /qemu/util/main-loop.c:242 + #12 0xaaad46227de0 in main_loop_wait /qemu/util/main-loop.c:518 + #13 0xaaad43d9d60c in qemu_main_loop /qemu/softmmu/vl.c:1662:9 + #14 0xaaad4607a5b0 in main /qemu/softmmu/main.c:49:5 + #15 0xfffd1a460b9c in __libc_start_main (/lib64/libc.so.6+0x20b9c) + #16 0xaaad43320740 in _start (aarch64-softmmu/qemu-system-aarch64+0x2c00740) + +Reported-by: Euler Robot +Signed-off-by: Chen Qun +Reviewed-by: Stefan Hajnoczi +Message-id: 20200418062602.10776-1-kuhn.chenqun@huawei.com +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Peter Maydell + +(cherry picked from ff0507c239a246fd7215b31c5658fc6a3ee1e4c5) +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + block/iscsi.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/block/iscsi.c b/block/iscsi.c +index 0bea2d3a93..06915655b3 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -991,8 +991,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, + acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE; + + acb->ioh->sb_len_wr = acb->task->datain.size - 2; +- ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ? +- acb->ioh->mx_sb_len : acb->ioh->sb_len_wr; ++ ss = MIN(acb->ioh->mx_sb_len, acb->ioh->sb_len_wr); + memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss); + } + +-- +2.18.2 + diff --git a/kvm-block-nbd-Fix-hang-in-.bdrv_close.patch b/kvm-block-nbd-Fix-hang-in-.bdrv_close.patch new file mode 100755 index 0000000000000000000000000000000000000000..378ae1abba7ee49bdde92f21e28c7eca7f76af34 --- /dev/null +++ b/kvm-block-nbd-Fix-hang-in-.bdrv_close.patch @@ -0,0 +1,78 @@ +From 4ef2c464a54b0b618d933641ac0a7012e629fed9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:42 +0000 +Subject: [PATCH 01/20] block/nbd: Fix hang in .bdrv_close() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-2-mlevitsk@redhat.com> +Patchwork-id: 94224 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 1/6] block/nbd: Fix hang in .bdrv_close() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +When nbd_close() is called from a coroutine, the connection_co never +gets to run, and thus nbd_teardown_connection() hangs. + +This is because aio_co_enter() only puts the connection_co into the main +coroutine's wake-up queue, so this main coroutine needs to yield and +wait for connection_co to terminate. + +Suggested-by: Kevin Wolf +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-2-mreitz@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Maxim Levitsky +Signed-off-by: Max Reitz +(cherry picked from commit 78c81a3f108870d325b0a39d88711366afe6f703) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block/nbd.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/block/nbd.c b/block/nbd.c +index 5f18f78..a73f0d9 100644 +--- a/block/nbd.c ++++ b/block/nbd.c +@@ -70,6 +70,7 @@ typedef struct BDRVNBDState { + CoMutex send_mutex; + CoQueue free_sema; + Coroutine *connection_co; ++ Coroutine *teardown_co; + QemuCoSleepState *connection_co_sleep_ns_state; + bool drained; + bool wait_drained_end; +@@ -203,7 +204,15 @@ static void nbd_teardown_connection(BlockDriverState *bs) + qemu_co_sleep_wake(s->connection_co_sleep_ns_state); + } + } +- BDRV_POLL_WHILE(bs, s->connection_co); ++ if (qemu_in_coroutine()) { ++ s->teardown_co = qemu_coroutine_self(); ++ /* connection_co resumes us when it terminates */ ++ qemu_coroutine_yield(); ++ s->teardown_co = NULL; ++ } else { ++ BDRV_POLL_WHILE(bs, s->connection_co); ++ } ++ assert(!s->connection_co); + } + + static bool nbd_client_connecting(BDRVNBDState *s) +@@ -395,6 +404,9 @@ static coroutine_fn void nbd_connection_entry(void *opaque) + s->ioc = NULL; + } + ++ if (s->teardown_co) { ++ aio_co_wake(s->teardown_co); ++ } + aio_wait_kick(); + } + +-- +1.8.3.1 + diff --git a/kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch b/kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch new file mode 100755 index 0000000000000000000000000000000000000000..43f9ffc90c6ba77b21817e76ccb64f51caa1fd35 --- /dev/null +++ b/kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch @@ -0,0 +1,328 @@ +From 25c528b30f8774f33e957d14060805398da524d9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 26 Mar 2020 20:23:06 +0000 +Subject: [PATCH 1/4] block: pass BlockDriver reference to the .bdrv_co_create + +RH-Author: Maxim Levitsky +Message-id: <20200326202307.9264-2-mlevitsk@redhat.com> +Patchwork-id: 94447 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create +Bugzilla: 1816007 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Kevin Wolf +RH-Acked-by: Max Reitz + +This will allow the reuse of a single generic .bdrv_co_create +implementation for several drivers. +No functional changes. + +Signed-off-by: Maxim Levitsky +Message-Id: <20200326011218.29230-2-mlevitsk@redhat.com> +Reviewed-by: Denis V. Lunev +Signed-off-by: Max Reitz +(cherry picked from commit b92902dfeaafbceaf744ab7473f2d070284f6172) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 3 ++- + block/crypto.c | 3 ++- + block/file-posix.c | 4 +++- + block/file-win32.c | 4 +++- + block/gluster.c | 3 ++- + block/nfs.c | 4 +++- + block/parallels.c | 3 ++- + block/qcow.c | 3 ++- + block/qcow2.c | 4 +++- + block/qed.c | 3 ++- + block/raw-format.c | 4 +++- + block/rbd.c | 3 ++- + block/sheepdog.c | 4 +++- + block/ssh.c | 4 +++- + block/vdi.c | 4 +++- + block/vhdx.c | 3 ++- + block/vmdk.c | 4 +++- + block/vpc.c | 6 ++++-- + include/block/block_int.h | 3 ++- + 19 files changed, 49 insertions(+), 20 deletions(-) + +diff --git a/block.c b/block.c +index ec29b1e..f9a1c5b 100644 +--- a/block.c ++++ b/block.c +@@ -482,7 +482,8 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque) + CreateCo *cco = opaque; + assert(cco->drv); + +- ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, &local_err); ++ ret = cco->drv->bdrv_co_create_opts(cco->drv, ++ cco->filename, cco->opts, &local_err); + error_propagate(&cco->err, local_err); + cco->ret = ret; + } +diff --git a/block/crypto.c b/block/crypto.c +index 2482383..970d463 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -539,7 +539,8 @@ fail: + return ret; + } + +-static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename, ++static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/file-posix.c b/block/file-posix.c +index fd29372..a2e0a74 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -2346,7 +2346,9 @@ out: + return result; + } + +-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn raw_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions options; +diff --git a/block/file-win32.c b/block/file-win32.c +index 77e8ff7..1585983 100644 +--- a/block/file-win32.c ++++ b/block/file-win32.c +@@ -588,7 +588,9 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp) + return 0; + } + +-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn raw_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions options; +diff --git a/block/gluster.c b/block/gluster.c +index 4fa4a77..0aa1f2c 100644 +--- a/block/gluster.c ++++ b/block/gluster.c +@@ -1130,7 +1130,8 @@ out: + return ret; + } + +-static int coroutine_fn qemu_gluster_co_create_opts(const char *filename, ++static int coroutine_fn qemu_gluster_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/nfs.c b/block/nfs.c +index 9a6311e..cc2413d 100644 +--- a/block/nfs.c ++++ b/block/nfs.c +@@ -662,7 +662,9 @@ out: + return ret; + } + +-static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts *opts, ++static int coroutine_fn nfs_file_co_create_opts(BlockDriver *drv, ++ const char *url, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options; +diff --git a/block/parallels.c b/block/parallels.c +index 7a01997..6d4ed77 100644 +--- a/block/parallels.c ++++ b/block/parallels.c +@@ -609,7 +609,8 @@ exit: + goto out; + } + +-static int coroutine_fn parallels_co_create_opts(const char *filename, ++static int coroutine_fn parallels_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/qcow.c b/block/qcow.c +index fce8989..8973e4e 100644 +--- a/block/qcow.c ++++ b/block/qcow.c +@@ -934,7 +934,8 @@ exit: + return ret; + } + +-static int coroutine_fn qcow_co_create_opts(const char *filename, ++static int coroutine_fn qcow_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, Error **errp) + { + BlockdevCreateOptions *create_options = NULL; +diff --git a/block/qcow2.c b/block/qcow2.c +index 83b1fc0..71067c6 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -3558,7 +3558,9 @@ out: + return ret; + } + +-static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn qcow2_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options = NULL; +diff --git a/block/qed.c b/block/qed.c +index d8c4e5f..1af9b3c 100644 +--- a/block/qed.c ++++ b/block/qed.c +@@ -720,7 +720,8 @@ out: + return ret; + } + +-static int coroutine_fn bdrv_qed_co_create_opts(const char *filename, ++static int coroutine_fn bdrv_qed_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/raw-format.c b/block/raw-format.c +index 3a76ec7..93b25e1 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -419,7 +419,9 @@ static int raw_has_zero_init_truncate(BlockDriverState *bs) + return bdrv_has_zero_init_truncate(bs->file->bs); + } + +-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn raw_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + return bdrv_create_file(filename, opts, errp); +diff --git a/block/rbd.c b/block/rbd.c +index 027cbcc..8847259 100644 +--- a/block/rbd.c ++++ b/block/rbd.c +@@ -425,7 +425,8 @@ static int qemu_rbd_co_create(BlockdevCreateOptions *options, Error **errp) + return qemu_rbd_do_create(options, NULL, NULL, errp); + } + +-static int coroutine_fn qemu_rbd_co_create_opts(const char *filename, ++static int coroutine_fn qemu_rbd_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/sheepdog.c b/block/sheepdog.c +index cfa8433..a8a7e32 100644 +--- a/block/sheepdog.c ++++ b/block/sheepdog.c +@@ -2157,7 +2157,9 @@ out: + return ret; + } + +-static int coroutine_fn sd_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn sd_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options = NULL; +diff --git a/block/ssh.c b/block/ssh.c +index b4375cf..84e9282 100644 +--- a/block/ssh.c ++++ b/block/ssh.c +@@ -963,7 +963,9 @@ fail: + return ret; + } + +-static int coroutine_fn ssh_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn ssh_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + BlockdevCreateOptions *create_options; +diff --git a/block/vdi.c b/block/vdi.c +index 0142da7..e1a11f2 100644 +--- a/block/vdi.c ++++ b/block/vdi.c +@@ -896,7 +896,9 @@ static int coroutine_fn vdi_co_create(BlockdevCreateOptions *create_options, + return vdi_co_do_create(create_options, DEFAULT_CLUSTER_SIZE, errp); + } + +-static int coroutine_fn vdi_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn vdi_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + QDict *qdict = NULL; +diff --git a/block/vhdx.c b/block/vhdx.c +index f02d261..33e57cd 100644 +--- a/block/vhdx.c ++++ b/block/vhdx.c +@@ -2046,7 +2046,8 @@ delete_and_exit: + return ret; + } + +-static int coroutine_fn vhdx_co_create_opts(const char *filename, ++static int coroutine_fn vhdx_co_create_opts(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp) + { +diff --git a/block/vmdk.c b/block/vmdk.c +index 20e909d..eb726f2 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -2588,7 +2588,9 @@ exit: + return blk; + } + +-static int coroutine_fn vmdk_co_create_opts(const char *filename, QemuOpts *opts, ++static int coroutine_fn vmdk_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, + Error **errp) + { + Error *local_err = NULL; +diff --git a/block/vpc.c b/block/vpc.c +index a655502..6df75e2 100644 +--- a/block/vpc.c ++++ b/block/vpc.c +@@ -1089,8 +1089,10 @@ out: + return ret; + } + +-static int coroutine_fn vpc_co_create_opts(const char *filename, +- QemuOpts *opts, Error **errp) ++static int coroutine_fn vpc_co_create_opts(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, ++ Error **errp) + { + BlockdevCreateOptions *create_options = NULL; + QDict *qdict; +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 96e327b..7ff81be 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -136,7 +136,8 @@ struct BlockDriver { + void (*bdrv_close)(BlockDriverState *bs); + int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts, + Error **errp); +- int coroutine_fn (*bdrv_co_create_opts)(const char *filename, ++ int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv, ++ const char *filename, + QemuOpts *opts, + Error **errp); + int (*bdrv_make_empty)(BlockDriverState *bs); +-- +1.8.3.1 + diff --git a/kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch b/kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch new file mode 100755 index 0000000000000000000000000000000000000000..2c27fd2f5b5e73e00152e11dcf623056106de6e5 --- /dev/null +++ b/kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch @@ -0,0 +1,78 @@ +From ec5408763c49cd0b63ee324bdc38a429ed1adeee Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:29 +0000 +Subject: [PATCH 09/20] block/qcow2: Move bitmap reopen into + bdrv_reopen_commit_post + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-4-kwolf@redhat.com> +Patchwork-id: 94280 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 03/13] block/qcow2: Move bitmap reopen into bdrv_reopen_commit_post +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Peter Krempa + +The bitmap code requires writing the 'file' child when the qcow2 driver +is reopened in read-write mode. + +If the 'file' child is being reopened due to a permissions change, the +modification is commited yet when qcow2_reopen_commit is called. This +means that any attempt to write the 'file' child will end with EBADFD +as the original fd was already closed. + +Moving bitmap reopening to the new callback which is called after +permission modifications are commited fixes this as the file descriptor +will be replaced with the correct one. + +The above problem manifests itself when reopening 'qcow2' format layer +which uses a 'file-posix' file child which was opened with the +'auto-read-only' property set. + +Signed-off-by: Peter Krempa +Message-Id: +Signed-off-by: Kevin Wolf +(cherry picked from commit 65eb7c85a3e62529e2bad782e94d5a7b11dd5a92) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/block/qcow2.c b/block/qcow2.c +index 7c18721..83b1fc0 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -1881,6 +1881,11 @@ fail: + static void qcow2_reopen_commit(BDRVReopenState *state) + { + qcow2_update_options_commit(state->bs, state->opaque); ++ g_free(state->opaque); ++} ++ ++static void qcow2_reopen_commit_post(BDRVReopenState *state) ++{ + if (state->flags & BDRV_O_RDWR) { + Error *local_err = NULL; + +@@ -1895,7 +1900,6 @@ static void qcow2_reopen_commit(BDRVReopenState *state) + bdrv_get_node_name(state->bs)); + } + } +- g_free(state->opaque); + } + + static void qcow2_reopen_abort(BDRVReopenState *state) +@@ -5492,6 +5496,7 @@ BlockDriver bdrv_qcow2 = { + .bdrv_close = qcow2_close, + .bdrv_reopen_prepare = qcow2_reopen_prepare, + .bdrv_reopen_commit = qcow2_reopen_commit, ++ .bdrv_reopen_commit_post = qcow2_reopen_commit_post, + .bdrv_reopen_abort = qcow2_reopen_abort, + .bdrv_join_options = qcow2_join_options, + .bdrv_child_perm = bdrv_format_default_perms, +-- +1.8.3.1 + diff --git a/kvm-block-trickle-down-the-fallback-image-creation-funct.patch b/kvm-block-trickle-down-the-fallback-image-creation-funct.patch new file mode 100755 index 0000000000000000000000000000000000000000..5ba15213ef3b11a59e35f776aa3104f7d52e49d2 --- /dev/null +++ b/kvm-block-trickle-down-the-fallback-image-creation-funct.patch @@ -0,0 +1,296 @@ +From a1f7b929ae1fe6fa424c520c3a5eb497333b0fd9 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Thu, 26 Mar 2020 20:23:07 +0000 +Subject: [PATCH 2/4] block: trickle down the fallback image creation function + use to the block drivers + +RH-Author: Maxim Levitsky +Message-id: <20200326202307.9264-3-mlevitsk@redhat.com> +Patchwork-id: 94446 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers +Bugzilla: 1816007 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Kevin Wolf +RH-Acked-by: Max Reitz + +Instead of checking the .bdrv_co_create_opts to see if we need the +fallback, just implement the .bdrv_co_create_opts in the drivers that +need it. + +This way we don't break various places that need to know if the +underlying protocol/format really supports image creation, and this way +we still allow some drivers to not support image creation. + +Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 + +Note that technically this driver reverts the image creation fallback +for the vxhs driver since I don't have a means to test it, and IMHO it +is better to leave it not supported as it was prior to generic image +creation patches. + +Also drop iscsi_create_opts which was left accidentally. + +Signed-off-by: Maxim Levitsky +Message-Id: <20200326011218.29230-3-mlevitsk@redhat.com> +Reviewed-by: Denis V. Lunev +[mreitz: Fixed alignment, and moved bdrv_co_create_opts_simple() and + bdrv_create_opts_simple from block.h into block_int.h] +Signed-off-by: Max Reitz +(cherry picked from commit 5a5e7f8cd86b7ced0732b1b6e28c82baa65b09c9) + +Contextual conflicts in block.c and include/block/block_int.h + +(conflict in block.c by default shows as functional but +with --diff-algorithm=patience it becomes a contextual conflict) + +... +001/2:[----] [--] 'block: pass BlockDriver reference to the .bdrv_co_create' +002/2:[0014] [FC] 'block: trickle down the fallback image creation function use to the block drivers' +... +002/2: 'meld <(git show 5a5e7f8^\!) <(git show 6d3bca5^\!)' + +So now running: +meld <(git show 5a5e7f8^\! --diff-algorithm=patience) <(git show 6d3bca5^\! --diff-algorithm=patience) + +shows no contextual conflicts +It is mostly due to missing commit f6dc1c31d3801dcbdf0c56574f9ff4f05180810c +Thanks to Max Reitz for helping me with this. + +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 35 ++++++++++++++++++++--------------- + block/file-posix.c | 7 ++++++- + block/iscsi.c | 16 ++++------------ + block/nbd.c | 6 ++++++ + block/nvme.c | 3 +++ + include/block/block.h | 1 + + include/block/block_int.h | 11 +++++++++++ + 7 files changed, 51 insertions(+), 28 deletions(-) + +diff --git a/block.c b/block.c +index f9a1c5b..ba3b40d7 100644 +--- a/block.c ++++ b/block.c +@@ -597,8 +597,15 @@ static int create_file_fallback_zero_first_sector(BlockBackend *blk, + return 0; + } + +-static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, +- QemuOpts *opts, Error **errp) ++/** ++ * Simple implementation of bdrv_co_create_opts for protocol drivers ++ * which only support creation via opening a file ++ * (usually existing raw storage device) ++ */ ++int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, ++ Error **errp) + { + BlockBackend *blk; + QDict *options; +@@ -662,11 +669,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) + return -ENOENT; + } + +- if (drv->bdrv_co_create_opts) { +- return bdrv_create(drv, filename, opts, errp); +- } else { +- return bdrv_create_file_fallback(filename, drv, opts, errp); +- } ++ return bdrv_create(drv, filename, opts, errp); + } + + /** +@@ -1543,9 +1546,9 @@ QemuOptsList bdrv_runtime_opts = { + }, + }; + +-static QemuOptsList fallback_create_opts = { +- .name = "fallback-create-opts", +- .head = QTAILQ_HEAD_INITIALIZER(fallback_create_opts.head), ++QemuOptsList bdrv_create_opts_simple = { ++ .name = "simple-create-opts", ++ .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, +@@ -5910,13 +5913,15 @@ void bdrv_img_create(const char *filename, const char *fmt, + return; + } + +- create_opts = qemu_opts_append(create_opts, drv->create_opts); +- if (proto_drv->create_opts) { +- create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); +- } else { +- create_opts = qemu_opts_append(create_opts, &fallback_create_opts); ++ if (!proto_drv->create_opts) { ++ error_setg(errp, "Protocol driver '%s' does not support image creation", ++ proto_drv->format_name); ++ return; + } + ++ create_opts = qemu_opts_append(create_opts, drv->create_opts); ++ create_opts = qemu_opts_append(create_opts, proto_drv->create_opts); ++ + /* Create parameter list with default values */ + opts = qemu_opts_create(create_opts, NULL, 0, &error_abort); + qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort); +diff --git a/block/file-posix.c b/block/file-posix.c +index a2e0a74..dd18d40 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -3432,6 +3432,8 @@ static BlockDriver bdrv_host_device = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, +@@ -3558,10 +3560,11 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + +- + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, +@@ -3690,6 +3693,8 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + + .bdrv_co_preadv = raw_co_preadv, +diff --git a/block/iscsi.c b/block/iscsi.c +index b45da65..16b0716 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -2399,18 +2399,6 @@ out_unlock: + return r; + } + +-static QemuOptsList iscsi_create_opts = { +- .name = "iscsi-create-opts", +- .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), +- .desc = { +- { +- .name = BLOCK_OPT_SIZE, +- .type = QEMU_OPT_SIZE, +- .help = "Virtual disk size" +- }, +- { /* end of list */ } +- } +-}; + + static const char *const iscsi_strong_runtime_opts[] = { + "transport", +@@ -2434,6 +2422,8 @@ static BlockDriver bdrv_iscsi = { + .bdrv_parse_filename = iscsi_parse_filename, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +@@ -2471,6 +2461,8 @@ static BlockDriver bdrv_iser = { + .bdrv_parse_filename = iscsi_parse_filename, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +diff --git a/block/nbd.c b/block/nbd.c +index a73f0d9..927915d 100644 +--- a/block/nbd.c ++++ b/block/nbd.c +@@ -2030,6 +2030,8 @@ static BlockDriver bdrv_nbd = { + .protocol_name = "nbd", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_file_open = nbd_open, + .bdrv_reopen_prepare = nbd_client_reopen_prepare, + .bdrv_co_preadv = nbd_client_co_preadv, +@@ -2055,6 +2057,8 @@ static BlockDriver bdrv_nbd_tcp = { + .protocol_name = "nbd+tcp", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_file_open = nbd_open, + .bdrv_reopen_prepare = nbd_client_reopen_prepare, + .bdrv_co_preadv = nbd_client_co_preadv, +@@ -2080,6 +2084,8 @@ static BlockDriver bdrv_nbd_unix = { + .protocol_name = "nbd+unix", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, + .bdrv_file_open = nbd_open, + .bdrv_reopen_prepare = nbd_client_reopen_prepare, + .bdrv_co_preadv = nbd_client_co_preadv, +diff --git a/block/nvme.c b/block/nvme.c +index d41c4bd..7b7c0cc 100644 +--- a/block/nvme.c ++++ b/block/nvme.c +@@ -1333,6 +1333,9 @@ static BlockDriver bdrv_nvme = { + .protocol_name = "nvme", + .instance_size = sizeof(BDRVNVMeState), + ++ .bdrv_co_create_opts = bdrv_co_create_opts_simple, ++ .create_opts = &bdrv_create_opts_simple, ++ + .bdrv_parse_filename = nvme_parse_filename, + .bdrv_file_open = nvme_file_open, + .bdrv_close = nvme_close, +diff --git a/include/block/block.h b/include/block/block.h +index 1df9848..92685d2 100644 +--- a/include/block/block.h ++++ b/include/block/block.h +@@ -293,6 +293,7 @@ BlockDriver *bdrv_find_format(const char *format_name); + int bdrv_create(BlockDriver *drv, const char* filename, + QemuOpts *opts, Error **errp); + int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); ++ + BlockDriverState *bdrv_new(void); + void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, + Error **errp); +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 7ff81be..529f153 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -1325,4 +1325,15 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, + + int refresh_total_sectors(BlockDriverState *bs, int64_t hint); + ++/** ++ * Simple implementation of bdrv_co_create_opts for protocol drivers ++ * which only support creation via opening a file ++ * (usually existing raw storage device) ++ */ ++int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, ++ const char *filename, ++ QemuOpts *opts, ++ Error **errp); ++extern QemuOptsList bdrv_create_opts_simple; ++ + #endif /* BLOCK_INT_H */ +-- +1.8.3.1 + diff --git a/kvm-block-truncate-Don-t-make-backing-file-data-visible.patch b/kvm-block-truncate-Don-t-make-backing-file-data-visible.patch new file mode 100755 index 0000000000000000000000000000000000000000..114e1b7648fce7909fa2aa79cac6146cb4087ea0 --- /dev/null +++ b/kvm-block-truncate-Don-t-make-backing-file-data-visible.patch @@ -0,0 +1,94 @@ +From d84b9b93755ece6618ed98fa84386beeb1a0e40b Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:36 +0100 +Subject: [PATCH 08/17] block: truncate: Don't make backing file data visible + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-8-kwolf@redhat.com> +Patchwork-id: 97454 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 07/11] block: truncate: Don't make backing file data visible +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +When extending the size of an image that has a backing file larger than +its old size, make sure that the backing file data doesn't become +visible in the guest, but the added area is properly zeroed out. + +Consider the following scenario where the overlay is shorter than its +backing file: + + base.qcow2: AAAAAAAA + overlay.qcow2: BBBB + +When resizing (extending) overlay.qcow2, the new blocks should not stay +unallocated and make the additional As from base.qcow2 visible like +before this patch, but zeros should be read. + +A similar case happens with the various variants of a commit job when an +intermediate file is short (- for unallocated): + + base.qcow2: A-A-AAAA + mid.qcow2: BB-B + top.qcow2: C--C--C- + +After commit top.qcow2 to mid.qcow2, the following happens: + + mid.qcow2: CB-C00C0 (correct result) + mid.qcow2: CB-C--C- (before this fix) + +Without the fix, blocks that previously read as zeros on top.qcow2 +suddenly turn into A. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200424125448.63318-8-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 955c7d6687fefcd903900a1e597fcbc896c661cd) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/io.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/block/io.c b/block/io.c +index 3235ce5..6c70b56 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -3370,6 +3370,31 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + goto out; + } + ++ /* ++ * If the image has a backing file that is large enough that it would ++ * provide data for the new area, we cannot leave it unallocated because ++ * then the backing file content would become visible. Instead, zero-fill ++ * the new area. ++ * ++ * Note that if the image has a backing file, but was opened without the ++ * backing file, taking care of keeping things consistent with that backing ++ * file is the user's responsibility. ++ */ ++ if (new_bytes && bs->backing) { ++ int64_t backing_len; ++ ++ backing_len = bdrv_getlength(backing_bs(bs)); ++ if (backing_len < 0) { ++ ret = backing_len; ++ error_setg_errno(errp, -ret, "Could not get backing file size"); ++ goto out; ++ } ++ ++ if (backing_len > old_size) { ++ flags |= BDRV_REQ_ZERO_WRITE; ++ } ++ } ++ + if (drv->bdrv_co_truncate) { + if (flags & ~bs->supported_truncate_flags) { + error_setg(errp, "Block driver does not support requested flags"); +-- +1.8.3.1 + diff --git a/kvm-block.c-adding-bdrv_co_delete_file.patch b/kvm-block.c-adding-bdrv_co_delete_file.patch new file mode 100755 index 0000000000000000000000000000000000000000..91c3cd1b5dbfee3ff0581dfefaa4db3420eb697c --- /dev/null +++ b/kvm-block.c-adding-bdrv_co_delete_file.patch @@ -0,0 +1,92 @@ +From 23b92512d7f11b3a38cf24a5c2fe7848f1e550e8 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Sun, 31 May 2020 16:40:34 +0100 +Subject: [PATCH 6/7] block.c: adding bdrv_co_delete_file +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Maxim Levitsky +Message-id: <20200531164035.34188-3-mlevitsk@redhat.com> +Patchwork-id: 97058 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/3] block.c: adding bdrv_co_delete_file +Bugzilla: 1827630 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: John Snow +RH-Acked-by: Eric Blake + +From: Daniel Henrique Barboza + +Using the new 'bdrv_co_delete_file' interface, a pure co_routine function +'bdrv_co_delete_file' inside block.c can can be used in a way similar of +the existing bdrv_create_file to to clean up a created file. + +We're creating a pure co_routine because the only caller of +'bdrv_co_delete_file' will be already in co_routine context, thus there +is no need to add all the machinery to check for qemu_in_coroutine() and +create a separated co_routine to do the job. + +Suggested-by: Daniel P. Berrangé +Signed-off-by: Daniel Henrique Barboza +Message-Id: <20200130213907.2830642-3-danielhb413@gmail.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit e1d7f8bb1ec0c6911dcea81641ce6139dbded02d) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block.c | 26 ++++++++++++++++++++++++++ + include/block/block.h | 1 + + 2 files changed, 27 insertions(+) + +diff --git a/block.c b/block.c +index ba3b40d7..d6a05da 100644 +--- a/block.c ++++ b/block.c +@@ -672,6 +672,32 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) + return bdrv_create(drv, filename, opts, errp); + } + ++int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp) ++{ ++ Error *local_err = NULL; ++ int ret; ++ ++ assert(bs != NULL); ++ ++ if (!bs->drv) { ++ error_setg(errp, "Block node '%s' is not opened", bs->filename); ++ return -ENOMEDIUM; ++ } ++ ++ if (!bs->drv->bdrv_co_delete_file) { ++ error_setg(errp, "Driver '%s' does not support image deletion", ++ bs->drv->format_name); ++ return -ENOTSUP; ++ } ++ ++ ret = bs->drv->bdrv_co_delete_file(bs, &local_err); ++ if (ret < 0) { ++ error_propagate(errp, local_err); ++ } ++ ++ return ret; ++} ++ + /** + * Try to get @bs's logical and physical block size. + * On success, store them in @bsz struct and return 0. +diff --git a/include/block/block.h b/include/block/block.h +index 92685d2..b2a3074 100644 +--- a/include/block/block.h ++++ b/include/block/block.h +@@ -373,6 +373,7 @@ bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, + int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, + Error **errp); + void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base); ++int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp); + + + typedef struct BdrvCheckResult { +-- +1.8.3.1 + diff --git a/kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch b/kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch new file mode 100755 index 0000000000000000000000000000000000000000..9a691308128b43c4a47cb4928814e5415b1b648f --- /dev/null +++ b/kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch @@ -0,0 +1,176 @@ +From dc2654f2319ad6c379e0ba10be143726c6f0e9e0 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:47 +0000 +Subject: [PATCH 14/18] blockdev: Acquire AioContext on dirty bitmap functions + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-8-slp@redhat.com> +Patchwork-id: 93760 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 7/9] blockdev: Acquire AioContext on dirty bitmap functions +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Dirty map addition and removal functions are not acquiring to BDS +AioContext, while they may call to code that expects it to be +acquired. + +This may trigger a crash with a stack trace like this one: + + #0 0x00007f0ef146370f in __GI_raise (sig=sig@entry=6) + at ../sysdeps/unix/sysv/linux/raise.c:50 + #1 0x00007f0ef144db25 in __GI_abort () at abort.c:79 + #2 0x0000565022294dce in error_exit + (err=, msg=msg@entry=0x56502243a730 <__func__.16350> "qemu_mutex_unlock_impl") at util/qemu-thread-posix.c:36 + #3 0x00005650222950ba in qemu_mutex_unlock_impl + (mutex=mutex@entry=0x5650244b0240, file=file@entry=0x565022439adf "util/async.c", line=line@entry=526) at util/qemu-thread-posix.c:108 + #4 0x0000565022290029 in aio_context_release + (ctx=ctx@entry=0x5650244b01e0) at util/async.c:526 + #5 0x000056502221cd08 in bdrv_can_store_new_dirty_bitmap + (bs=bs@entry=0x5650244dc820, name=name@entry=0x56502481d360 "bitmap1", granularity=granularity@entry=65536, errp=errp@entry=0x7fff22831718) + at block/dirty-bitmap.c:542 + #6 0x000056502206ae53 in qmp_block_dirty_bitmap_add + (errp=0x7fff22831718, disabled=false, has_disabled=, persistent=, has_persistent=true, granularity=65536, has_granularity=, name=0x56502481d360 "bitmap1", node=) at blockdev.c:2894 + #7 0x000056502206ae53 in qmp_block_dirty_bitmap_add + (node=, name=0x56502481d360 "bitmap1", has_granularity=, granularity=, has_persistent=true, persistent=, has_disabled=false, disabled=false, errp=0x7fff22831718) at blockdev.c:2856 + #8 0x00005650221847a3 in qmp_marshal_block_dirty_bitmap_add + (args=, ret=, errp=0x7fff22831798) + at qapi/qapi-commands-block-core.c:651 + #9 0x0000565022247e6c in do_qmp_dispatch + (errp=0x7fff22831790, allow_oob=, request=, cmds=0x565022b32d60 ) at qapi/qmp-dispatch.c:132 + #10 0x0000565022247e6c in qmp_dispatch + (cmds=0x565022b32d60 , request=, allow_oob=) at qapi/qmp-dispatch.c:175 + #11 0x0000565022166061 in monitor_qmp_dispatch + (mon=0x56502450faa0, req=) at monitor/qmp.c:145 + #12 0x00005650221666fa in monitor_qmp_bh_dispatcher + (data=) at monitor/qmp.c:234 + #13 0x000056502228f866 in aio_bh_call (bh=0x56502440eae0) + at util/async.c:117 + #14 0x000056502228f866 in aio_bh_poll (ctx=ctx@entry=0x56502440d7a0) + at util/async.c:117 + #15 0x0000565022292c54 in aio_dispatch (ctx=0x56502440d7a0) + at util/aio-posix.c:459 + #16 0x000056502228f742 in aio_ctx_dispatch + (source=, callback=, user_data=) at util/async.c:260 + #17 0x00007f0ef5ce667d in g_main_dispatch (context=0x56502449aa40) + at gmain.c:3176 + #18 0x00007f0ef5ce667d in g_main_context_dispatch + (context=context@entry=0x56502449aa40) at gmain.c:3829 + #19 0x0000565022291d08 in glib_pollfds_poll () at util/main-loop.c:219 + #20 0x0000565022291d08 in os_host_main_loop_wait + (timeout=) at util/main-loop.c:242 + #21 0x0000565022291d08 in main_loop_wait (nonblocking=) + at util/main-loop.c:518 + #22 0x00005650220743c1 in main_loop () at vl.c:1828 + #23 0x0000565021f20a72 in main + (argc=, argv=, envp=) + at vl.c:4504 + +Fix this by acquiring the AioContext at qmp_block_dirty_bitmap_add() +and qmp_block_dirty_bitmap_add(). + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1782175 +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 91005a495e228ebd7e5e173cd18f952450eef82d) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 22 ++++++++++++++++++---- + 1 file changed, 18 insertions(+), 4 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 1dacbc2..d4ef6cd 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -2984,6 +2984,7 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; + + if (!name || name[0] == '\0') { + error_setg(errp, "Bitmap name cannot be empty"); +@@ -2995,11 +2996,14 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + return; + } + ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ + if (has_granularity) { + if (granularity < 512 || !is_power_of_2(granularity)) { + error_setg(errp, "Granularity must be power of 2 " + "and at least 512"); +- return; ++ goto out; + } + } else { + /* Default to cluster size, if available: */ +@@ -3017,12 +3021,12 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + if (persistent && + !bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp)) + { +- return; ++ goto out; + } + + bitmap = bdrv_create_dirty_bitmap(bs, granularity, name, errp); + if (bitmap == NULL) { +- return; ++ goto out; + } + + if (disabled) { +@@ -3030,6 +3034,9 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, + } + + bdrv_dirty_bitmap_set_persistence(bitmap, persistent); ++ ++out: ++ aio_context_release(aio_context); + } + + static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( +@@ -3038,21 +3045,27 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); + if (!bitmap || !bs) { + return NULL; + } + ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ + if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_BUSY | BDRV_BITMAP_RO, + errp)) { ++ aio_context_release(aio_context); + return NULL; + } + + if (bdrv_dirty_bitmap_get_persistence(bitmap) && + bdrv_remove_persistent_dirty_bitmap(bs, name, errp) < 0) + { +- return NULL; ++ aio_context_release(aio_context); ++ return NULL; + } + + if (release) { +@@ -3063,6 +3076,7 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( + *bitmap_bs = bs; + } + ++ aio_context_release(aio_context); + return release ? NULL : bitmap; + } + +-- +1.8.3.1 + diff --git a/kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch b/kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch new file mode 100755 index 0000000000000000000000000000000000000000..8cb170081b794fa3a088ab563c2235b7ce8934b8 --- /dev/null +++ b/kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch @@ -0,0 +1,179 @@ +From 0c8ba0a96a7d0cbf371f1a5fbee543e8b2cb2595 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:13 +0100 +Subject: [PATCH 08/26] blockdev: Promote several bitmap functions to + non-static +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-6-eblake@redhat.com> +Patchwork-id: 97077 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 05/12] blockdev: Promote several bitmap functions to non-static +Bugzilla: 1779893 1779904 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +The next patch will split blockdev.c, which will require accessing +some previously-static functions from more than one .c file. But part +of promoting a function to public is picking a naming scheme that does +not reek of exposing too many internals (two of the three functions +were named starting with 'do_'). To make future code motion easier, +perform the function rename and non-static promotion into its own +patch. + +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Message-Id: <20200513011648.166876-5-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit c6996cf9a6c759c29919642be9a73ac64b38301b) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 47 +++++++++++++++++++---------------------------- + include/block/block_int.h | 12 ++++++++++++ + 2 files changed, 31 insertions(+), 28 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 86eb115..3958058 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1262,10 +1262,10 @@ out_aio_context: + * + * @return: A bitmap object on success, or NULL on failure. + */ +-static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, +- const char *name, +- BlockDriverState **pbs, +- Error **errp) ++BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, ++ const char *name, ++ BlockDriverState **pbs, ++ Error **errp) + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; +@@ -2241,11 +2241,6 @@ static void block_dirty_bitmap_disable_abort(BlkActionState *common) + } + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_merge( +- const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bitmaps, +- HBitmap **backup, Error **errp); +- + static void block_dirty_bitmap_merge_prepare(BlkActionState *common, + Error **errp) + { +@@ -2259,15 +2254,11 @@ static void block_dirty_bitmap_merge_prepare(BlkActionState *common, + + action = common->action->u.block_dirty_bitmap_merge.data; + +- state->bitmap = do_block_dirty_bitmap_merge(action->node, action->target, +- action->bitmaps, &state->backup, +- errp); ++ state->bitmap = block_dirty_bitmap_merge(action->node, action->target, ++ action->bitmaps, &state->backup, ++ errp); + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( +- const char *node, const char *name, bool release, +- BlockDriverState **bitmap_bs, Error **errp); +- + static void block_dirty_bitmap_remove_prepare(BlkActionState *common, + Error **errp) + { +@@ -2281,8 +2272,8 @@ static void block_dirty_bitmap_remove_prepare(BlkActionState *common, + + action = common->action->u.block_dirty_bitmap_remove.data; + +- state->bitmap = do_block_dirty_bitmap_remove(action->node, action->name, +- false, &state->bs, errp); ++ state->bitmap = block_dirty_bitmap_remove(action->node, action->name, ++ false, &state->bs, errp); + if (state->bitmap) { + bdrv_dirty_bitmap_skip_store(state->bitmap, true); + bdrv_dirty_bitmap_set_busy(state->bitmap, true); +@@ -3046,9 +3037,10 @@ out: + aio_context_release(aio_context); + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( +- const char *node, const char *name, bool release, +- BlockDriverState **bitmap_bs, Error **errp) ++BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, ++ bool release, ++ BlockDriverState **bitmap_bs, ++ Error **errp) + { + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; +@@ -3090,7 +3082,7 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_remove( + void qmp_block_dirty_bitmap_remove(const char *node, const char *name, + Error **errp) + { +- do_block_dirty_bitmap_remove(node, name, true, NULL, errp); ++ block_dirty_bitmap_remove(node, name, true, NULL, errp); + } + + /** +@@ -3151,10 +3143,9 @@ void qmp_block_dirty_bitmap_disable(const char *node, const char *name, + bdrv_disable_dirty_bitmap(bitmap); + } + +-static BdrvDirtyBitmap *do_block_dirty_bitmap_merge( +- const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bitmaps, +- HBitmap **backup, Error **errp) ++BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bms, ++ HBitmap **backup, Error **errp) + { + BlockDriverState *bs; + BdrvDirtyBitmap *dst, *src, *anon; +@@ -3172,7 +3163,7 @@ static BdrvDirtyBitmap *do_block_dirty_bitmap_merge( + return NULL; + } + +- for (lst = bitmaps; lst; lst = lst->next) { ++ for (lst = bms; lst; lst = lst->next) { + switch (lst->value->type) { + const char *name, *node; + case QTYPE_QSTRING: +@@ -3217,7 +3208,7 @@ void qmp_block_dirty_bitmap_merge(const char *node, const char *target, + BlockDirtyBitmapMergeSourceList *bitmaps, + Error **errp) + { +- do_block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); ++ block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); + } + + BlockDirtyBitmapSha256 *qmp_x_debug_block_dirty_bitmap_sha256(const char *node, +diff --git a/include/block/block_int.h b/include/block/block_int.h +index cc18e8d..876a83d 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -1341,4 +1341,16 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, + Error **errp); + extern QemuOptsList bdrv_create_opts_simple; + ++BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, ++ const char *name, ++ BlockDriverState **pbs, ++ Error **errp); ++BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bms, ++ HBitmap **backup, Error **errp); ++BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, ++ bool release, ++ BlockDriverState **bitmap_bs, ++ Error **errp); ++ + #endif /* BLOCK_INT_H */ +-- +1.8.3.1 + diff --git a/kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch b/kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch new file mode 100755 index 0000000000000000000000000000000000000000..b2dd453ae0b2193b6e74e502dd02cc241d5a3826 --- /dev/null +++ b/kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch @@ -0,0 +1,107 @@ +From 24e5eca4218b294bd013e2d85a38345045506bec Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:48 +0000 +Subject: [PATCH 15/18] blockdev: Return bs to the proper context on snapshot + abort + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-9-slp@redhat.com> +Patchwork-id: 93761 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 8/9] blockdev: Return bs to the proper context on snapshot abort +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +external_snapshot_abort() calls to bdrv_set_backing_hd(), which +returns state->old_bs to the main AioContext, as it's intended to be +used then the BDS is going to be released. As that's not the case when +aborting an external snapshot, return it to the AioContext it was +before the call. + +This issue can be triggered by issuing a transaction with two actions, +a proper blockdev-snapshot-sync and a bogus one, so the second will +trigger a transaction abort. This results in a crash with an stack +trace like this one: + + #0 0x00007fa1048b28df in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50 + #1 0x00007fa10489ccf5 in __GI_abort () at abort.c:79 + #2 0x00007fa10489cbc9 in __assert_fail_base + (fmt=0x7fa104a03300 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x5572240b44d8 "bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)", file=0x557224014d30 "block.c", line=2240, function=) at assert.c:92 + #3 0x00007fa1048aae96 in __GI___assert_fail + (assertion=assertion@entry=0x5572240b44d8 "bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)", file=file@entry=0x557224014d30 "block.c", line=line@entry=2240, function=function@entry=0x5572240b5d60 <__PRETTY_FUNCTION__.31620> "bdrv_replace_child_noperm") at assert.c:101 + #4 0x0000557223e631f8 in bdrv_replace_child_noperm (child=0x557225b9c980, new_bs=new_bs@entry=0x557225c42e40) at block.c:2240 + #5 0x0000557223e68be7 in bdrv_replace_node (from=0x557226951a60, to=0x557225c42e40, errp=0x5572247d6138 ) at block.c:4196 + #6 0x0000557223d069c4 in external_snapshot_abort (common=0x557225d7e170) at blockdev.c:1731 + #7 0x0000557223d069c4 in external_snapshot_abort (common=0x557225d7e170) at blockdev.c:1717 + #8 0x0000557223d09013 in qmp_transaction (dev_list=, has_props=, props=0x557225cc7d70, errp=errp@entry=0x7ffe704c0c98) at blockdev.c:2360 + #9 0x0000557223e32085 in qmp_marshal_transaction (args=, ret=, errp=0x7ffe704c0d08) at qapi/qapi-commands-transaction.c:44 + #10 0x0000557223ee798c in do_qmp_dispatch (errp=0x7ffe704c0d00, allow_oob=, request=, cmds=0x5572247d3cc0 ) at qapi/qmp-dispatch.c:132 + #11 0x0000557223ee798c in qmp_dispatch (cmds=0x5572247d3cc0 , request=, allow_oob=) at qapi/qmp-dispatch.c:175 + #12 0x0000557223e06141 in monitor_qmp_dispatch (mon=0x557225c69ff0, req=) at monitor/qmp.c:120 + #13 0x0000557223e0678a in monitor_qmp_bh_dispatcher (data=) at monitor/qmp.c:209 + #14 0x0000557223f2f366 in aio_bh_call (bh=0x557225b9dc60) at util/async.c:117 + #15 0x0000557223f2f366 in aio_bh_poll (ctx=ctx@entry=0x557225b9c840) at util/async.c:117 + #16 0x0000557223f32754 in aio_dispatch (ctx=0x557225b9c840) at util/aio-posix.c:459 + #17 0x0000557223f2f242 in aio_ctx_dispatch (source=, callback=, user_data=) at util/async.c:260 + #18 0x00007fa10913467d in g_main_dispatch (context=0x557225c28e80) at gmain.c:3176 + #19 0x00007fa10913467d in g_main_context_dispatch (context=context@entry=0x557225c28e80) at gmain.c:3829 + #20 0x0000557223f31808 in glib_pollfds_poll () at util/main-loop.c:219 + #21 0x0000557223f31808 in os_host_main_loop_wait (timeout=) at util/main-loop.c:242 + #22 0x0000557223f31808 in main_loop_wait (nonblocking=) at util/main-loop.c:518 + #23 0x0000557223d13201 in main_loop () at vl.c:1828 + #24 0x0000557223bbfb82 in main (argc=, argv=, envp=) at vl.c:4504 + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1779036 +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 377410f6fb4f6b0d26d4a028c20766fae05de17e) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/blockdev.c b/blockdev.c +index d4ef6cd..4cd9a58 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1731,6 +1731,8 @@ static void external_snapshot_abort(BlkActionState *common) + if (state->new_bs) { + if (state->overlay_appended) { + AioContext *aio_context; ++ AioContext *tmp_context; ++ int ret; + + aio_context = bdrv_get_aio_context(state->old_bs); + aio_context_acquire(aio_context); +@@ -1738,6 +1740,25 @@ static void external_snapshot_abort(BlkActionState *common) + bdrv_ref(state->old_bs); /* we can't let bdrv_set_backind_hd() + close state->old_bs; we need it */ + bdrv_set_backing_hd(state->new_bs, NULL, &error_abort); ++ ++ /* ++ * The call to bdrv_set_backing_hd() above returns state->old_bs to ++ * the main AioContext. As we're still going to be using it, return ++ * it to the AioContext it was before. ++ */ ++ tmp_context = bdrv_get_aio_context(state->old_bs); ++ if (aio_context != tmp_context) { ++ aio_context_release(aio_context); ++ aio_context_acquire(tmp_context); ++ ++ ret = bdrv_try_set_aio_context(state->old_bs, ++ aio_context, NULL); ++ assert(ret == 0); ++ ++ aio_context_release(tmp_context); ++ aio_context_acquire(aio_context); ++ } ++ + bdrv_replace_node(state->new_bs, state->old_bs, &error_abort); + bdrv_unref(state->old_bs); /* bdrv_replace_node() ref'ed old_bs */ + +-- +1.8.3.1 + diff --git a/kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch b/kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch new file mode 100755 index 0000000000000000000000000000000000000000..d977922aefa73d41992d17c53b87966141fbd638 --- /dev/null +++ b/kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch @@ -0,0 +1,720 @@ +From 2afa718d59ef86879a9e34b4601a1f2658afa9ba Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:14 +0100 +Subject: [PATCH 09/26] blockdev: Split off basic bitmap operations for + qemu-img + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-7-eblake@redhat.com> +Patchwork-id: 97073 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 06/12] blockdev: Split off basic bitmap operations for qemu-img +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Upcoming patches want to add some basic bitmap manipulation abilities +to qemu-img. But blockdev.o is too heavyweight to link into qemu-img +(among other things, it would drag in block jobs and transaction +support - qemu-img does offline manipulation, where atomicity is less +important because there are no concurrent modifications to compete +with), so it's time to split off the bare bones of what we will need +into a new file block/monitor/bitmap-qmp-cmds.o. + +This is sufficient to expose 6 QMP commands for use by qemu-img (add, +remove, clear, enable, disable, merge), as well as move the three +helper functions touched in the previous patch. Regarding +MAINTAINERS, the new file is automatically part of block core, but +also makes sense as related to other dirty bitmap files. + +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Message-Id: <20200513011648.166876-6-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit bb4e58c6137e80129b955789dd4b66c1504f20dc) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + Makefile.objs - comment context + block/monitor/Makefile.objs - context: a2dde2f2 not backported + blockdev.c - context +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + MAINTAINERS | 1 + + Makefile.objs | 3 +- + block/monitor/Makefile.objs | 1 + + block/monitor/bitmap-qmp-cmds.c | 321 ++++++++++++++++++++++++++++++++++++++++ + blockdev.c | 284 ----------------------------------- + 5 files changed, 324 insertions(+), 286 deletions(-) + create mode 100644 block/monitor/Makefile.objs + create mode 100644 block/monitor/bitmap-qmp-cmds.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index 3a81ac9..49d5d44 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1875,6 +1875,7 @@ L: qemu-block@nongnu.org + S: Supported + F: include/qemu/hbitmap.h + F: include/block/dirty-bitmap.h ++F: block/monitor/bitmap-qmp-cmds.c + F: block/dirty-bitmap.c + F: block/qcow2-bitmap.c + F: migration/block-dirty-bitmap.c +diff --git a/Makefile.objs b/Makefile.objs +index 1a8f288..7404ef0 100644 +--- a/Makefile.objs ++++ b/Makefile.objs +@@ -13,9 +13,8 @@ authz-obj-y = authz/ + ####################################################################### + # block-obj-y is code used by both qemu system emulation and qemu-img + +-block-obj-y = nbd/ ++block-obj-y = block/ block/monitor/ nbd/ scsi/ + block-obj-y += block.o blockjob.o job.o +-block-obj-y += block/ scsi/ + block-obj-y += qemu-io-cmds.o + block-obj-$(CONFIG_REPLICATION) += replication.o + +diff --git a/block/monitor/Makefile.objs b/block/monitor/Makefile.objs +new file mode 100644 +index 0000000..f0c7642 +--- /dev/null ++++ b/block/monitor/Makefile.objs +@@ -0,0 +1 @@ ++block-obj-y += bitmap-qmp-cmds.o +diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c +new file mode 100644 +index 0000000..9f11dee +--- /dev/null ++++ b/block/monitor/bitmap-qmp-cmds.c +@@ -0,0 +1,321 @@ ++/* ++ * QEMU block dirty bitmap QMP commands ++ * ++ * Copyright (c) 2003-2008 Fabrice Bellard ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or ++ * later. See the COPYING file in the top-level directory. ++ * ++ * This file incorporates work covered by the following copyright and ++ * permission notice: ++ * ++ * Copyright (c) 2003-2008 Fabrice Bellard ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to deal ++ * in the Software without restriction, including without limitation the rights ++ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++ * copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++ * THE SOFTWARE. ++ */ ++ ++#include "qemu/osdep.h" ++ ++#include "block/block_int.h" ++#include "qapi/qapi-commands-block.h" ++#include "qapi/error.h" ++ ++/** ++ * block_dirty_bitmap_lookup: ++ * Return a dirty bitmap (if present), after validating ++ * the node reference and bitmap names. ++ * ++ * @node: The name of the BDS node to search for bitmaps ++ * @name: The name of the bitmap to search for ++ * @pbs: Output pointer for BDS lookup, if desired. Can be NULL. ++ * @errp: Output pointer for error information. Can be NULL. ++ * ++ * @return: A bitmap object on success, or NULL on failure. ++ */ ++BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, ++ const char *name, ++ BlockDriverState **pbs, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ ++ if (!node) { ++ error_setg(errp, "Node cannot be NULL"); ++ return NULL; ++ } ++ if (!name) { ++ error_setg(errp, "Bitmap name cannot be NULL"); ++ return NULL; ++ } ++ bs = bdrv_lookup_bs(node, node, NULL); ++ if (!bs) { ++ error_setg(errp, "Node '%s' not found", node); ++ return NULL; ++ } ++ ++ bitmap = bdrv_find_dirty_bitmap(bs, name); ++ if (!bitmap) { ++ error_setg(errp, "Dirty bitmap '%s' not found", name); ++ return NULL; ++ } ++ ++ if (pbs) { ++ *pbs = bs; ++ } ++ ++ return bitmap; ++} ++ ++void qmp_block_dirty_bitmap_add(const char *node, const char *name, ++ bool has_granularity, uint32_t granularity, ++ bool has_persistent, bool persistent, ++ bool has_disabled, bool disabled, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; ++ ++ if (!name || name[0] == '\0') { ++ error_setg(errp, "Bitmap name cannot be empty"); ++ return; ++ } ++ ++ bs = bdrv_lookup_bs(node, node, errp); ++ if (!bs) { ++ return; ++ } ++ ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ ++ if (has_granularity) { ++ if (granularity < 512 || !is_power_of_2(granularity)) { ++ error_setg(errp, "Granularity must be power of 2 " ++ "and at least 512"); ++ goto out; ++ } ++ } else { ++ /* Default to cluster size, if available: */ ++ granularity = bdrv_get_default_bitmap_granularity(bs); ++ } ++ ++ if (!has_persistent) { ++ persistent = false; ++ } ++ ++ if (!has_disabled) { ++ disabled = false; ++ } ++ ++ if (persistent && ++ !bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp)) ++ { ++ goto out; ++ } ++ ++ bitmap = bdrv_create_dirty_bitmap(bs, granularity, name, errp); ++ if (bitmap == NULL) { ++ goto out; ++ } ++ ++ if (disabled) { ++ bdrv_disable_dirty_bitmap(bitmap); ++ } ++ ++ bdrv_dirty_bitmap_set_persistence(bitmap, persistent); ++ ++out: ++ aio_context_release(aio_context); ++} ++ ++BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, ++ bool release, ++ BlockDriverState **bitmap_bs, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ AioContext *aio_context; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap || !bs) { ++ return NULL; ++ } ++ ++ aio_context = bdrv_get_aio_context(bs); ++ aio_context_acquire(aio_context); ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_BUSY | BDRV_BITMAP_RO, ++ errp)) { ++ aio_context_release(aio_context); ++ return NULL; ++ } ++ ++ if (bdrv_dirty_bitmap_get_persistence(bitmap) && ++ bdrv_remove_persistent_dirty_bitmap(bs, name, errp) < 0) ++ { ++ aio_context_release(aio_context); ++ return NULL; ++ } ++ ++ if (release) { ++ bdrv_release_dirty_bitmap(bitmap); ++ } ++ ++ if (bitmap_bs) { ++ *bitmap_bs = bs; ++ } ++ ++ aio_context_release(aio_context); ++ return release ? NULL : bitmap; ++} ++ ++void qmp_block_dirty_bitmap_remove(const char *node, const char *name, ++ Error **errp) ++{ ++ block_dirty_bitmap_remove(node, name, true, NULL, errp); ++} ++ ++/** ++ * Completely clear a bitmap, for the purposes of synchronizing a bitmap ++ * immediately after a full backup operation. ++ */ ++void qmp_block_dirty_bitmap_clear(const char *node, const char *name, ++ Error **errp) ++{ ++ BdrvDirtyBitmap *bitmap; ++ BlockDriverState *bs; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap || !bs) { ++ return; ++ } ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_DEFAULT, errp)) { ++ return; ++ } ++ ++ bdrv_clear_dirty_bitmap(bitmap, NULL); ++} ++ ++void qmp_block_dirty_bitmap_enable(const char *node, const char *name, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap) { ++ return; ++ } ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { ++ return; ++ } ++ ++ bdrv_enable_dirty_bitmap(bitmap); ++} ++ ++void qmp_block_dirty_bitmap_disable(const char *node, const char *name, ++ Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *bitmap; ++ ++ bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); ++ if (!bitmap) { ++ return; ++ } ++ ++ if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { ++ return; ++ } ++ ++ bdrv_disable_dirty_bitmap(bitmap); ++} ++ ++BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bms, ++ HBitmap **backup, Error **errp) ++{ ++ BlockDriverState *bs; ++ BdrvDirtyBitmap *dst, *src, *anon; ++ BlockDirtyBitmapMergeSourceList *lst; ++ Error *local_err = NULL; ++ ++ dst = block_dirty_bitmap_lookup(node, target, &bs, errp); ++ if (!dst) { ++ return NULL; ++ } ++ ++ anon = bdrv_create_dirty_bitmap(bs, bdrv_dirty_bitmap_granularity(dst), ++ NULL, errp); ++ if (!anon) { ++ return NULL; ++ } ++ ++ for (lst = bms; lst; lst = lst->next) { ++ switch (lst->value->type) { ++ const char *name, *node; ++ case QTYPE_QSTRING: ++ name = lst->value->u.local; ++ src = bdrv_find_dirty_bitmap(bs, name); ++ if (!src) { ++ error_setg(errp, "Dirty bitmap '%s' not found", name); ++ dst = NULL; ++ goto out; ++ } ++ break; ++ case QTYPE_QDICT: ++ node = lst->value->u.external.node; ++ name = lst->value->u.external.name; ++ src = block_dirty_bitmap_lookup(node, name, NULL, errp); ++ if (!src) { ++ dst = NULL; ++ goto out; ++ } ++ break; ++ default: ++ abort(); ++ } ++ ++ bdrv_merge_dirty_bitmap(anon, src, NULL, &local_err); ++ if (local_err) { ++ error_propagate(errp, local_err); ++ dst = NULL; ++ goto out; ++ } ++ } ++ ++ /* Merge into dst; dst is unchanged on failure. */ ++ bdrv_merge_dirty_bitmap(dst, anon, backup, errp); ++ ++ out: ++ bdrv_release_dirty_bitmap(anon); ++ return dst; ++} ++ ++void qmp_block_dirty_bitmap_merge(const char *node, const char *target, ++ BlockDirtyBitmapMergeSourceList *bitmaps, ++ Error **errp) ++{ ++ block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); ++} +diff --git a/blockdev.c b/blockdev.c +index 3958058..5128c9b 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1250,53 +1250,6 @@ out_aio_context: + return NULL; + } + +-/** +- * block_dirty_bitmap_lookup: +- * Return a dirty bitmap (if present), after validating +- * the node reference and bitmap names. +- * +- * @node: The name of the BDS node to search for bitmaps +- * @name: The name of the bitmap to search for +- * @pbs: Output pointer for BDS lookup, if desired. Can be NULL. +- * @errp: Output pointer for error information. Can be NULL. +- * +- * @return: A bitmap object on success, or NULL on failure. +- */ +-BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, +- const char *name, +- BlockDriverState **pbs, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- +- if (!node) { +- error_setg(errp, "Node cannot be NULL"); +- return NULL; +- } +- if (!name) { +- error_setg(errp, "Bitmap name cannot be NULL"); +- return NULL; +- } +- bs = bdrv_lookup_bs(node, node, NULL); +- if (!bs) { +- error_setg(errp, "Node '%s' not found", node); +- return NULL; +- } +- +- bitmap = bdrv_find_dirty_bitmap(bs, name); +- if (!bitmap) { +- error_setg(errp, "Dirty bitmap '%s' not found", name); +- return NULL; +- } +- +- if (pbs) { +- *pbs = bs; +- } +- +- return bitmap; +-} +- + /* New and old BlockDriverState structs for atomic group operations */ + + typedef struct BlkActionState BlkActionState; +@@ -2974,243 +2927,6 @@ out: + aio_context_release(aio_context); + } + +-void qmp_block_dirty_bitmap_add(const char *node, const char *name, +- bool has_granularity, uint32_t granularity, +- bool has_persistent, bool persistent, +- bool has_disabled, bool disabled, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- AioContext *aio_context; +- +- if (!name || name[0] == '\0') { +- error_setg(errp, "Bitmap name cannot be empty"); +- return; +- } +- +- bs = bdrv_lookup_bs(node, node, errp); +- if (!bs) { +- return; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- if (has_granularity) { +- if (granularity < 512 || !is_power_of_2(granularity)) { +- error_setg(errp, "Granularity must be power of 2 " +- "and at least 512"); +- goto out; +- } +- } else { +- /* Default to cluster size, if available: */ +- granularity = bdrv_get_default_bitmap_granularity(bs); +- } +- +- if (!has_persistent) { +- persistent = false; +- } +- +- if (!has_disabled) { +- disabled = false; +- } +- +- if (persistent && +- !bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp)) +- { +- goto out; +- } +- +- bitmap = bdrv_create_dirty_bitmap(bs, granularity, name, errp); +- if (bitmap == NULL) { +- goto out; +- } +- +- if (disabled) { +- bdrv_disable_dirty_bitmap(bitmap); +- } +- +- bdrv_dirty_bitmap_set_persistence(bitmap, persistent); +- +-out: +- aio_context_release(aio_context); +-} +- +-BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name, +- bool release, +- BlockDriverState **bitmap_bs, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- AioContext *aio_context; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap || !bs) { +- return NULL; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_BUSY | BDRV_BITMAP_RO, +- errp)) { +- aio_context_release(aio_context); +- return NULL; +- } +- +- if (bdrv_dirty_bitmap_get_persistence(bitmap) && +- bdrv_remove_persistent_dirty_bitmap(bs, name, errp) < 0) +- { +- aio_context_release(aio_context); +- return NULL; +- } +- +- if (release) { +- bdrv_release_dirty_bitmap(bitmap); +- } +- +- if (bitmap_bs) { +- *bitmap_bs = bs; +- } +- +- aio_context_release(aio_context); +- return release ? NULL : bitmap; +-} +- +-void qmp_block_dirty_bitmap_remove(const char *node, const char *name, +- Error **errp) +-{ +- block_dirty_bitmap_remove(node, name, true, NULL, errp); +-} +- +-/** +- * Completely clear a bitmap, for the purposes of synchronizing a bitmap +- * immediately after a full backup operation. +- */ +-void qmp_block_dirty_bitmap_clear(const char *node, const char *name, +- Error **errp) +-{ +- BdrvDirtyBitmap *bitmap; +- BlockDriverState *bs; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap || !bs) { +- return; +- } +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_DEFAULT, errp)) { +- return; +- } +- +- bdrv_clear_dirty_bitmap(bitmap, NULL); +-} +- +-void qmp_block_dirty_bitmap_enable(const char *node, const char *name, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap) { +- return; +- } +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { +- return; +- } +- +- bdrv_enable_dirty_bitmap(bitmap); +-} +- +-void qmp_block_dirty_bitmap_disable(const char *node, const char *name, +- Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *bitmap; +- +- bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); +- if (!bitmap) { +- return; +- } +- +- if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { +- return; +- } +- +- bdrv_disable_dirty_bitmap(bitmap); +-} +- +-BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bms, +- HBitmap **backup, Error **errp) +-{ +- BlockDriverState *bs; +- BdrvDirtyBitmap *dst, *src, *anon; +- BlockDirtyBitmapMergeSourceList *lst; +- Error *local_err = NULL; +- +- dst = block_dirty_bitmap_lookup(node, target, &bs, errp); +- if (!dst) { +- return NULL; +- } +- +- anon = bdrv_create_dirty_bitmap(bs, bdrv_dirty_bitmap_granularity(dst), +- NULL, errp); +- if (!anon) { +- return NULL; +- } +- +- for (lst = bms; lst; lst = lst->next) { +- switch (lst->value->type) { +- const char *name, *node; +- case QTYPE_QSTRING: +- name = lst->value->u.local; +- src = bdrv_find_dirty_bitmap(bs, name); +- if (!src) { +- error_setg(errp, "Dirty bitmap '%s' not found", name); +- dst = NULL; +- goto out; +- } +- break; +- case QTYPE_QDICT: +- node = lst->value->u.external.node; +- name = lst->value->u.external.name; +- src = block_dirty_bitmap_lookup(node, name, NULL, errp); +- if (!src) { +- dst = NULL; +- goto out; +- } +- break; +- default: +- abort(); +- } +- +- bdrv_merge_dirty_bitmap(anon, src, NULL, &local_err); +- if (local_err) { +- error_propagate(errp, local_err); +- dst = NULL; +- goto out; +- } +- } +- +- /* Merge into dst; dst is unchanged on failure. */ +- bdrv_merge_dirty_bitmap(dst, anon, backup, errp); +- +- out: +- bdrv_release_dirty_bitmap(anon); +- return dst; +-} +- +-void qmp_block_dirty_bitmap_merge(const char *node, const char *target, +- BlockDirtyBitmapMergeSourceList *bitmaps, +- Error **errp) +-{ +- block_dirty_bitmap_merge(node, target, bitmaps, NULL, errp); +-} +- + BlockDirtyBitmapSha256 *qmp_x_debug_block_dirty_bitmap_sha256(const char *node, + const char *name, + Error **errp) +-- +1.8.3.1 + diff --git a/kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch b/kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch new file mode 100755 index 0000000000000000000000000000000000000000..399a06ab669cf505e2987bc03b8cf71d397772e6 --- /dev/null +++ b/kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch @@ -0,0 +1,62 @@ +From d56b53cd75c4146eae7a06d1cc30ab823a9bde93 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:41 +0000 +Subject: [PATCH 08/18] blockdev: fix coding style issues in + drive_backup_prepare +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-2-slp@redhat.com> +Patchwork-id: 93754 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 1/9] blockdev: fix coding style issues in drive_backup_prepare +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Fix a couple of minor coding style issues in drive_backup_prepare. + +Signed-off-by: Sergio Lopez +Reviewed-by: Max Reitz +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 471ded690e19689018535e3f48480507ed073e22) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 8e029e9..553e315 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -3620,7 +3620,7 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, + + if (!backup->has_format) { + backup->format = backup->mode == NEW_IMAGE_MODE_EXISTING ? +- NULL : (char*) bs->drv->format_name; ++ NULL : (char *) bs->drv->format_name; + } + + /* Early check to avoid creating target */ +@@ -3630,8 +3630,10 @@ static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, + + flags = bs->open_flags | BDRV_O_RDWR; + +- /* See if we have a backing HD we can use to create our new image +- * on top of. */ ++ /* ++ * See if we have a backing HD we can use to create our new image ++ * on top of. ++ */ + if (backup->sync == MIRROR_SYNC_MODE_TOP) { + source = backing_bs(bs); + if (!source) { +-- +1.8.3.1 + diff --git a/kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch b/kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch new file mode 100755 index 0000000000000000000000000000000000000000..a94ee7577cb8d02f196179de9ce1b56c9d23b8e8 --- /dev/null +++ b/kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch @@ -0,0 +1,204 @@ +From da4ee4c0d56200042cb86f8ccd2777009bd82df3 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:44 +0000 +Subject: [PATCH 11/18] blockdev: honor bdrv_try_set_aio_context() context + requirements + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-5-slp@redhat.com> +Patchwork-id: 93758 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 4/9] blockdev: honor bdrv_try_set_aio_context() context requirements +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +bdrv_try_set_aio_context() requires that the old context is held, and +the new context is not held. Fix all the occurrences where it's not +done this way. + +Suggested-by: Max Reitz +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 3ea67e08832775a28d0bd2795f01bc77e7ea1512) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 60 insertions(+), 8 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 152a0f7..1dacbc2 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1535,6 +1535,7 @@ static void external_snapshot_prepare(BlkActionState *common, + DO_UPCAST(ExternalSnapshotState, common, common); + TransactionAction *action = common->action; + AioContext *aio_context; ++ AioContext *old_context; + int ret; + + /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar +@@ -1675,7 +1676,16 @@ static void external_snapshot_prepare(BlkActionState *common, + goto out; + } + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ ++ old_context = bdrv_get_aio_context(state->new_bs); ++ aio_context_release(aio_context); ++ aio_context_acquire(old_context); ++ + ret = bdrv_try_set_aio_context(state->new_bs, aio_context, errp); ++ ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + if (ret < 0) { + goto out; + } +@@ -1775,11 +1785,13 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) + BlockDriverState *target_bs; + BlockDriverState *source = NULL; + AioContext *aio_context; ++ AioContext *old_context; + QDict *options; + Error *local_err = NULL; + int flags; + int64_t size; + bool set_backing_hd = false; ++ int ret; + + assert(common->action->type == TRANSACTION_ACTION_KIND_DRIVE_BACKUP); + backup = common->action->u.drive_backup.data; +@@ -1868,6 +1880,21 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) + goto out; + } + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ ++ old_context = bdrv_get_aio_context(target_bs); ++ aio_context_release(aio_context); ++ aio_context_acquire(old_context); ++ ++ ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); ++ if (ret < 0) { ++ bdrv_unref(target_bs); ++ aio_context_release(old_context); ++ return; ++ } ++ ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + if (set_backing_hd) { + bdrv_set_backing_hd(target_bs, source, &local_err); + if (local_err) { +@@ -1947,6 +1974,8 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + BlockDriverState *bs; + BlockDriverState *target_bs; + AioContext *aio_context; ++ AioContext *old_context; ++ int ret; + + assert(common->action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP); + backup = common->action->u.blockdev_backup.data; +@@ -1961,7 +1990,18 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + return; + } + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ + aio_context = bdrv_get_aio_context(bs); ++ old_context = bdrv_get_aio_context(target_bs); ++ aio_context_acquire(old_context); ++ ++ ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); ++ if (ret < 0) { ++ aio_context_release(old_context); ++ return; ++ } ++ ++ aio_context_release(old_context); + aio_context_acquire(aio_context); + state->bs = bs; + +@@ -3562,7 +3602,6 @@ static BlockJob *do_backup_common(BackupCommon *backup, + BlockJob *job = NULL; + BdrvDirtyBitmap *bmap = NULL; + int job_flags = JOB_DEFAULT; +- int ret; + + if (!backup->has_speed) { + backup->speed = 0; +@@ -3586,11 +3625,6 @@ static BlockJob *do_backup_common(BackupCommon *backup, + backup->compress = false; + } + +- ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); +- if (ret < 0) { +- return NULL; +- } +- + if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) || + (backup->sync == MIRROR_SYNC_MODE_INCREMENTAL)) { + /* done before desugaring 'incremental' to print the right message */ +@@ -3825,6 +3859,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + BlockDriverState *bs; + BlockDriverState *source, *target_bs; + AioContext *aio_context; ++ AioContext *old_context; + BlockMirrorBackingMode backing_mode; + Error *local_err = NULL; + QDict *options = NULL; +@@ -3937,12 +3972,22 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + (arg->mode == NEW_IMAGE_MODE_EXISTING || + !bdrv_has_zero_init(target_bs))); + ++ ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ ++ old_context = bdrv_get_aio_context(target_bs); ++ aio_context_release(aio_context); ++ aio_context_acquire(old_context); ++ + ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); + if (ret < 0) { + bdrv_unref(target_bs); +- goto out; ++ aio_context_release(old_context); ++ return; + } + ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + blockdev_mirror_common(arg->has_job_id ? arg->job_id : NULL, bs, target_bs, + arg->has_replaces, arg->replaces, arg->sync, + backing_mode, zero_target, +@@ -3984,6 +4029,7 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id, + BlockDriverState *bs; + BlockDriverState *target_bs; + AioContext *aio_context; ++ AioContext *old_context; + BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN; + Error *local_err = NULL; + bool zero_target; +@@ -4001,10 +4047,16 @@ void qmp_blockdev_mirror(bool has_job_id, const char *job_id, + + zero_target = (sync == MIRROR_SYNC_MODE_FULL); + ++ /* Honor bdrv_try_set_aio_context() context acquisition requirements. */ ++ old_context = bdrv_get_aio_context(target_bs); + aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); ++ aio_context_acquire(old_context); + + ret = bdrv_try_set_aio_context(target_bs, aio_context, errp); ++ ++ aio_context_release(old_context); ++ aio_context_acquire(aio_context); ++ + if (ret < 0) { + goto out; + } +-- +1.8.3.1 + diff --git a/kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch b/kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch new file mode 100755 index 0000000000000000000000000000000000000000..c4263848e3d55b108515bcfe6a405251da0044b6 --- /dev/null +++ b/kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch @@ -0,0 +1,144 @@ +From 959955217f745f1ee6cbea97314efe69f2d7dc08 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:43 +0000 +Subject: [PATCH 10/18] blockdev: unify qmp_blockdev_backup and blockdev-backup + transaction paths + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-4-slp@redhat.com> +Patchwork-id: 93756 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 3/9] blockdev: unify qmp_blockdev_backup and blockdev-backup transaction paths +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Issuing a blockdev-backup from qmp_blockdev_backup takes a slightly +different path than when it's issued from a transaction. In the code, +this is manifested as some redundancy between do_blockdev_backup() and +blockdev_backup_prepare(). + +This change unifies both paths, merging do_blockdev_backup() and +blockdev_backup_prepare(), and changing qmp_blockdev_backup() to +create a transaction instead of calling do_backup_common() direcly. + +As a side-effect, now qmp_blockdev_backup() is executed inside a +drained section, as it happens when creating a blockdev-backup +transaction. This change is visible from the user's perspective, as +the job gets paused and immediately resumed before starting the actual +work. + +Signed-off-by: Sergio Lopez +Reviewed-by: Max Reitz +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 5b7bfe515ecbd584b40ff6e41d2fd8b37c7d5139) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 60 +++++++++++++----------------------------------------------- + 1 file changed, 13 insertions(+), 47 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 5e85fc0..152a0f7 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1940,16 +1940,13 @@ typedef struct BlockdevBackupState { + BlockJob *job; + } BlockdevBackupState; + +-static BlockJob *do_blockdev_backup(BlockdevBackup *backup, JobTxn *txn, +- Error **errp); +- + static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + { + BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common); + BlockdevBackup *backup; +- BlockDriverState *bs, *target; ++ BlockDriverState *bs; ++ BlockDriverState *target_bs; + AioContext *aio_context; +- Error *local_err = NULL; + + assert(common->action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP); + backup = common->action->u.blockdev_backup.data; +@@ -1959,8 +1956,8 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + return; + } + +- target = bdrv_lookup_bs(backup->target, backup->target, errp); +- if (!target) { ++ target_bs = bdrv_lookup_bs(backup->target, backup->target, errp); ++ if (!target_bs) { + return; + } + +@@ -1971,13 +1968,10 @@ static void blockdev_backup_prepare(BlkActionState *common, Error **errp) + /* Paired with .clean() */ + bdrv_drained_begin(state->bs); + +- state->job = do_blockdev_backup(backup, common->block_job_txn, &local_err); +- if (local_err) { +- error_propagate(errp, local_err); +- goto out; +- } ++ state->job = do_backup_common(qapi_BlockdevBackup_base(backup), ++ bs, target_bs, aio_context, ++ common->block_job_txn, errp); + +-out: + aio_context_release(aio_context); + } + +@@ -3695,41 +3689,13 @@ XDbgBlockGraph *qmp_x_debug_query_block_graph(Error **errp) + return bdrv_get_xdbg_block_graph(errp); + } + +-BlockJob *do_blockdev_backup(BlockdevBackup *backup, JobTxn *txn, +- Error **errp) ++void qmp_blockdev_backup(BlockdevBackup *backup, Error **errp) + { +- BlockDriverState *bs; +- BlockDriverState *target_bs; +- AioContext *aio_context; +- BlockJob *job; +- +- bs = bdrv_lookup_bs(backup->device, backup->device, errp); +- if (!bs) { +- return NULL; +- } +- +- target_bs = bdrv_lookup_bs(backup->target, backup->target, errp); +- if (!target_bs) { +- return NULL; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- job = do_backup_common(qapi_BlockdevBackup_base(backup), +- bs, target_bs, aio_context, txn, errp); +- +- aio_context_release(aio_context); +- return job; +-} +- +-void qmp_blockdev_backup(BlockdevBackup *arg, Error **errp) +-{ +- BlockJob *job; +- job = do_blockdev_backup(arg, NULL, errp); +- if (job) { +- job_start(&job->job); +- } ++ TransactionAction action = { ++ .type = TRANSACTION_ACTION_KIND_BLOCKDEV_BACKUP, ++ .u.blockdev_backup.data = backup, ++ }; ++ blockdev_do_action(&action, errp); + } + + /* Parameter check and block job starting for drive mirroring. +-- +1.8.3.1 + diff --git a/kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch b/kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch new file mode 100755 index 0000000000000000000000000000000000000000..9ec19750e21186a341139edb86a46a3ea76dd393 --- /dev/null +++ b/kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch @@ -0,0 +1,419 @@ +From 4a03ab2a6cc4974d8d43240d1297b09160818af3 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 11:27:42 +0000 +Subject: [PATCH 09/18] blockdev: unify qmp_drive_backup and drive-backup + transaction paths + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-3-slp@redhat.com> +Patchwork-id: 93755 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 2/9] blockdev: unify qmp_drive_backup and drive-backup transaction paths +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Issuing a drive-backup from qmp_drive_backup takes a slightly +different path than when it's issued from a transaction. In the code, +this is manifested as some redundancy between do_drive_backup() and +drive_backup_prepare(). + +This change unifies both paths, merging do_drive_backup() and +drive_backup_prepare(), and changing qmp_drive_backup() to create a +transaction instead of calling do_backup_common() direcly. + +As a side-effect, now qmp_drive_backup() is executed inside a drained +section, as it happens when creating a drive-backup transaction. This +change is visible from the user's perspective, as the job gets paused +and immediately resumed before starting the actual work. + +Also fix tests 141, 185 and 219 to cope with the extra +JOB_STATUS_CHANGE lines. + +Signed-off-by: Sergio Lopez +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 2288ccfac96281c316db942d10e3f921c1373064) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 224 ++++++++++++++++++++------------------------- + tests/qemu-iotests/141.out | 2 + + tests/qemu-iotests/185.out | 2 + + tests/qemu-iotests/219 | 7 +- + tests/qemu-iotests/219.out | 8 ++ + 5 files changed, 117 insertions(+), 126 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 553e315..5e85fc0 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1761,39 +1761,128 @@ typedef struct DriveBackupState { + BlockJob *job; + } DriveBackupState; + +-static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, +- Error **errp); ++static BlockJob *do_backup_common(BackupCommon *backup, ++ BlockDriverState *bs, ++ BlockDriverState *target_bs, ++ AioContext *aio_context, ++ JobTxn *txn, Error **errp); + + static void drive_backup_prepare(BlkActionState *common, Error **errp) + { + DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common); +- BlockDriverState *bs; + DriveBackup *backup; ++ BlockDriverState *bs; ++ BlockDriverState *target_bs; ++ BlockDriverState *source = NULL; + AioContext *aio_context; ++ QDict *options; + Error *local_err = NULL; ++ int flags; ++ int64_t size; ++ bool set_backing_hd = false; + + assert(common->action->type == TRANSACTION_ACTION_KIND_DRIVE_BACKUP); + backup = common->action->u.drive_backup.data; + ++ if (!backup->has_mode) { ++ backup->mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; ++ } ++ + bs = bdrv_lookup_bs(backup->device, backup->device, errp); + if (!bs) { + return; + } + ++ if (!bs->drv) { ++ error_setg(errp, "Device has no medium"); ++ return; ++ } ++ + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + /* Paired with .clean() */ + bdrv_drained_begin(bs); + +- state->bs = bs; ++ if (!backup->has_format) { ++ backup->format = backup->mode == NEW_IMAGE_MODE_EXISTING ? ++ NULL : (char *) bs->drv->format_name; ++ } ++ ++ /* Early check to avoid creating target */ ++ if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { ++ goto out; ++ } ++ ++ flags = bs->open_flags | BDRV_O_RDWR; ++ ++ /* ++ * See if we have a backing HD we can use to create our new image ++ * on top of. ++ */ ++ if (backup->sync == MIRROR_SYNC_MODE_TOP) { ++ source = backing_bs(bs); ++ if (!source) { ++ backup->sync = MIRROR_SYNC_MODE_FULL; ++ } ++ } ++ if (backup->sync == MIRROR_SYNC_MODE_NONE) { ++ source = bs; ++ flags |= BDRV_O_NO_BACKING; ++ set_backing_hd = true; ++ } ++ ++ size = bdrv_getlength(bs); ++ if (size < 0) { ++ error_setg_errno(errp, -size, "bdrv_getlength failed"); ++ goto out; ++ } ++ ++ if (backup->mode != NEW_IMAGE_MODE_EXISTING) { ++ assert(backup->format); ++ if (source) { ++ bdrv_refresh_filename(source); ++ bdrv_img_create(backup->target, backup->format, source->filename, ++ source->drv->format_name, NULL, ++ size, flags, false, &local_err); ++ } else { ++ bdrv_img_create(backup->target, backup->format, NULL, NULL, NULL, ++ size, flags, false, &local_err); ++ } ++ } + +- state->job = do_drive_backup(backup, common->block_job_txn, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto out; + } + ++ options = qdict_new(); ++ qdict_put_str(options, "discard", "unmap"); ++ qdict_put_str(options, "detect-zeroes", "unmap"); ++ if (backup->format) { ++ qdict_put_str(options, "driver", backup->format); ++ } ++ ++ target_bs = bdrv_open(backup->target, NULL, options, flags, errp); ++ if (!target_bs) { ++ goto out; ++ } ++ ++ if (set_backing_hd) { ++ bdrv_set_backing_hd(target_bs, source, &local_err); ++ if (local_err) { ++ goto unref; ++ } ++ } ++ ++ state->bs = bs; ++ ++ state->job = do_backup_common(qapi_DriveBackup_base(backup), ++ bs, target_bs, aio_context, ++ common->block_job_txn, errp); ++ ++unref: ++ bdrv_unref(target_bs); + out: + aio_context_release(aio_context); + } +@@ -3587,126 +3676,13 @@ static BlockJob *do_backup_common(BackupCommon *backup, + return job; + } + +-static BlockJob *do_drive_backup(DriveBackup *backup, JobTxn *txn, +- Error **errp) +-{ +- BlockDriverState *bs; +- BlockDriverState *target_bs; +- BlockDriverState *source = NULL; +- BlockJob *job = NULL; +- AioContext *aio_context; +- QDict *options; +- Error *local_err = NULL; +- int flags; +- int64_t size; +- bool set_backing_hd = false; +- +- if (!backup->has_mode) { +- backup->mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; +- } +- +- bs = bdrv_lookup_bs(backup->device, backup->device, errp); +- if (!bs) { +- return NULL; +- } +- +- if (!bs->drv) { +- error_setg(errp, "Device has no medium"); +- return NULL; +- } +- +- aio_context = bdrv_get_aio_context(bs); +- aio_context_acquire(aio_context); +- +- if (!backup->has_format) { +- backup->format = backup->mode == NEW_IMAGE_MODE_EXISTING ? +- NULL : (char *) bs->drv->format_name; +- } +- +- /* Early check to avoid creating target */ +- if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { +- goto out; +- } +- +- flags = bs->open_flags | BDRV_O_RDWR; +- +- /* +- * See if we have a backing HD we can use to create our new image +- * on top of. +- */ +- if (backup->sync == MIRROR_SYNC_MODE_TOP) { +- source = backing_bs(bs); +- if (!source) { +- backup->sync = MIRROR_SYNC_MODE_FULL; +- } +- } +- if (backup->sync == MIRROR_SYNC_MODE_NONE) { +- source = bs; +- flags |= BDRV_O_NO_BACKING; +- set_backing_hd = true; +- } +- +- size = bdrv_getlength(bs); +- if (size < 0) { +- error_setg_errno(errp, -size, "bdrv_getlength failed"); +- goto out; +- } +- +- if (backup->mode != NEW_IMAGE_MODE_EXISTING) { +- assert(backup->format); +- if (source) { +- bdrv_refresh_filename(source); +- bdrv_img_create(backup->target, backup->format, source->filename, +- source->drv->format_name, NULL, +- size, flags, false, &local_err); +- } else { +- bdrv_img_create(backup->target, backup->format, NULL, NULL, NULL, +- size, flags, false, &local_err); +- } +- } +- +- if (local_err) { +- error_propagate(errp, local_err); +- goto out; +- } +- +- options = qdict_new(); +- qdict_put_str(options, "discard", "unmap"); +- qdict_put_str(options, "detect-zeroes", "unmap"); +- if (backup->format) { +- qdict_put_str(options, "driver", backup->format); +- } +- +- target_bs = bdrv_open(backup->target, NULL, options, flags, errp); +- if (!target_bs) { +- goto out; +- } +- +- if (set_backing_hd) { +- bdrv_set_backing_hd(target_bs, source, &local_err); +- if (local_err) { +- goto unref; +- } +- } +- +- job = do_backup_common(qapi_DriveBackup_base(backup), +- bs, target_bs, aio_context, txn, errp); +- +-unref: +- bdrv_unref(target_bs); +-out: +- aio_context_release(aio_context); +- return job; +-} +- +-void qmp_drive_backup(DriveBackup *arg, Error **errp) ++void qmp_drive_backup(DriveBackup *backup, Error **errp) + { +- +- BlockJob *job; +- job = do_drive_backup(arg, NULL, errp); +- if (job) { +- job_start(&job->job); +- } ++ TransactionAction action = { ++ .type = TRANSACTION_ACTION_KIND_DRIVE_BACKUP, ++ .u.drive_backup.data = backup, ++ }; ++ blockdev_do_action(&action, errp); + } + + BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) +diff --git a/tests/qemu-iotests/141.out b/tests/qemu-iotests/141.out +index 3645675..263b680 100644 +--- a/tests/qemu-iotests/141.out ++++ b/tests/qemu-iotests/141.out +@@ -13,6 +13,8 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/m. + Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "paused", "id": "job0"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}} + {'execute': 'blockdev-del', 'arguments': {'node-name': 'drv0'}} + {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: node is used as backing hd of 'NODE_NAME'"}} + {'execute': 'block-job-cancel', 'arguments': {'device': 'job0'}} +diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out +index 8379ac5..9a3b657 100644 +--- a/tests/qemu-iotests/185.out ++++ b/tests/qemu-iotests/185.out +@@ -65,6 +65,8 @@ Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 cluster_size=65536 l + Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 cluster_size=65536 lazy_refcounts=off refcount_bits=16 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "disk"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "disk"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "paused", "id": "disk"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "disk"}} + {"return": {}} + { 'execute': 'quit' } + {"return": {}} +diff --git a/tests/qemu-iotests/219 b/tests/qemu-iotests/219 +index e0c5166..655f54d 100755 +--- a/tests/qemu-iotests/219 ++++ b/tests/qemu-iotests/219 +@@ -63,7 +63,7 @@ def test_pause_resume(vm): + # logged immediately + iotests.log(vm.qmp('query-jobs')) + +-def test_job_lifecycle(vm, job, job_args, has_ready=False): ++def test_job_lifecycle(vm, job, job_args, has_ready=False, is_mirror=False): + global img_size + + iotests.log('') +@@ -135,6 +135,9 @@ def test_job_lifecycle(vm, job, job_args, has_ready=False): + iotests.log('Waiting for PENDING state...') + iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) + iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) ++ if is_mirror: ++ iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) ++ iotests.log(iotests.filter_qmp_event(vm.event_wait('JOB_STATUS_CHANGE'))) + + if not job_args.get('auto-finalize', True): + # PENDING state: +@@ -218,7 +221,7 @@ with iotests.FilePath('disk.img') as disk_path, \ + + for auto_finalize in [True, False]: + for auto_dismiss in [True, False]: +- test_job_lifecycle(vm, 'drive-backup', job_args={ ++ test_job_lifecycle(vm, 'drive-backup', is_mirror=True, job_args={ + 'device': 'drive0-node', + 'target': copy_path, + 'sync': 'full', +diff --git a/tests/qemu-iotests/219.out b/tests/qemu-iotests/219.out +index 8ebd3fe..0ea5d0b 100644 +--- a/tests/qemu-iotests/219.out ++++ b/tests/qemu-iotests/219.out +@@ -135,6 +135,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... ++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "concluded"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} +@@ -186,6 +188,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... ++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "concluded"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} +@@ -245,6 +249,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... ++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"return": [{"current-progress": 4194304, "id": "job0", "status": "pending", "total-progress": 4194304, "type": "backup"}]} +@@ -304,6 +310,8 @@ Pause/resume in RUNNING + {"return": {}} + + Waiting for PENDING state... ++{"data": {"id": "job0", "status": "paused"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"id": "job0", "status": "running"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "waiting"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"data": {"id": "job0", "status": "pending"}, "event": "JOB_STATUS_CHANGE", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + {"return": [{"current-progress": 4194304, "id": "job0", "status": "pending", "total-progress": 4194304, "type": "backup"}]} +-- +1.8.3.1 + diff --git a/kvm-bootp-check-bootp_input-buffer-size.patch b/kvm-bootp-check-bootp_input-buffer-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..3362cb0c525e9e96be84c22569201411cda3bc63 --- /dev/null +++ b/kvm-bootp-check-bootp_input-buffer-size.patch @@ -0,0 +1,52 @@ +From a66ab346bf74ebf3ed8fca0dc2e2febfe70069e8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:28 -0400 +Subject: [PATCH 07/14] bootp: check bootp_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-4-marcandre.lureau@redhat.com> +Patchwork-id: 101820 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 3/8] bootp: check bootp_input buffer size +Bugzilla: 1970819 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3592 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/44 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970819 + +(cherry picked from commit 2eca0838eee1da96204545e22cdaed860d9d7c6c) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/bootp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/slirp/src/bootp.c b/slirp/src/bootp.c +index 5754327138..5789187166 100644 +--- a/slirp/src/bootp.c ++++ b/slirp/src/bootp.c +@@ -366,9 +366,9 @@ udp_output(NULL, m, &saddr, &daddr, IPTOS_LOWDELAY); + + void bootp_input(struct mbuf *m) + { +- struct bootp_t *bp = mtod(m, struct bootp_t *); ++ struct bootp_t *bp = mtod_check(m, sizeof(struct bootp_t)); + +- if (bp->bp_op == BOOTP_REQUEST) { ++ if (bp && bp->bp_op == BOOTP_REQUEST) { + bootp_reply(m->slirp, bp, m_end(m)); + } + } +-- +2.27.0 + diff --git a/kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch b/kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch new file mode 100755 index 0000000000000000000000000000000000000000..bbf9b03ec7fe7c3d0c4bbcb55a80846179098d3f --- /dev/null +++ b/kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch @@ -0,0 +1,175 @@ +From 8198ae7c21a4d37f7e365058f973867c41d44d21 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:25 -0400 +Subject: [PATCH 06/14] bootp: limit vendor-specific area to input packet + memory buffer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-3-marcandre.lureau@redhat.com> +Patchwork-id: 101821 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/8] bootp: limit vendor-specific area to input packet memory buffer +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +sizeof(bootp_t) currently holds DHCP_OPT_LEN. Remove this optional field +from the structure, to help with the following patch checking for +minimal header size. Modify the bootp_reply() function to take the +buffer boundaries and avoiding potential buffer overflow. + +Related to CVE-2021-3592. + +https://gitlab.freedesktop.org/slirp/libslirp/-/issues/44 + +Signed-off-by: Marc-André Lureau + +(cherry picked from commit f13cad45b25d92760bb0ad67bec0300a4d7d5275) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/bootp.c | 26 +++++++++++++++----------- + slirp/src/bootp.h | 2 +- + slirp/src/mbuf.c | 5 +++++ + slirp/src/mbuf.h | 1 + + 4 files changed, 22 insertions(+), 12 deletions(-) + +diff --git a/slirp/src/bootp.c b/slirp/src/bootp.c +index 3f9ce2553e..5754327138 100644 +--- a/slirp/src/bootp.c ++++ b/slirp/src/bootp.c +@@ -92,21 +92,22 @@ found: + return bc; + } + +-static void dhcp_decode(const struct bootp_t *bp, int *pmsg_type, ++static void dhcp_decode(const struct bootp_t *bp, ++ const uint8_t *bp_end, ++ int *pmsg_type, + struct in_addr *preq_addr) + { +- const uint8_t *p, *p_end; ++ const uint8_t *p; + int len, tag; + + *pmsg_type = 0; + preq_addr->s_addr = htonl(0L); + + p = bp->bp_vend; +- p_end = p + DHCP_OPT_LEN; + if (memcmp(p, rfc1533_cookie, 4) != 0) + return; + p += 4; +- while (p < p_end) { ++ while (p < bp_end) { + tag = p[0]; + if (tag == RFC1533_PAD) { + p++; +@@ -114,10 +115,10 @@ static void dhcp_decode(const struct bootp_t *bp, int *pmsg_type, + break; + } else { + p++; +- if (p >= p_end) ++ if (p >= bp_end) + break; + len = *p++; +- if (p + len > p_end) { ++ if (p + len > bp_end) { + break; + } + DPRINTF("dhcp: tag=%d len=%d\n", tag, len); +@@ -144,7 +145,9 @@ static void dhcp_decode(const struct bootp_t *bp, int *pmsg_type, + } + } + +-static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) ++static void bootp_reply(Slirp *slirp, ++ const struct bootp_t *bp, ++ const uint8_t *bp_end) + { + BOOTPClient *bc = NULL; + struct mbuf *m; +@@ -157,7 +160,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) + uint8_t client_ethaddr[ETH_ALEN]; + + /* extract exact DHCP msg type */ +- dhcp_decode(bp, &dhcp_msg_type, &preq_addr); ++ dhcp_decode(bp, bp_end, &dhcp_msg_type, &preq_addr); + DPRINTF("bootp packet op=%d msgtype=%d", bp->bp_op, dhcp_msg_type); + if (preq_addr.s_addr != htonl(0L)) + DPRINTF(" req_addr=%08" PRIx32 "\n", ntohl(preq_addr.s_addr)); +@@ -179,9 +182,10 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) + return; + } + m->m_data += IF_MAXLINKHDR; ++ m_inc(m, sizeof(struct bootp_t) + DHCP_OPT_LEN); + rbp = (struct bootp_t *)m->m_data; + m->m_data += sizeof(struct udpiphdr); +- memset(rbp, 0, sizeof(struct bootp_t)); ++ memset(rbp, 0, sizeof(struct bootp_t) + DHCP_OPT_LEN); + + if (dhcp_msg_type == DHCPDISCOVER) { + if (preq_addr.s_addr != htonl(0L)) { +@@ -235,7 +239,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) + rbp->bp_siaddr = saddr.sin_addr; /* Server IP address */ + + q = rbp->bp_vend; +- end = (uint8_t *)&rbp[1]; ++ end = rbp->bp_vend + DHCP_OPT_LEN; + memcpy(q, rfc1533_cookie, 4); + q += 4; + +@@ -365,6 +369,6 @@ void bootp_input(struct mbuf *m) + struct bootp_t *bp = mtod(m, struct bootp_t *); + + if (bp->bp_op == BOOTP_REQUEST) { +- bootp_reply(m->slirp, bp); ++ bootp_reply(m->slirp, bp, m_end(m)); + } + } +diff --git a/slirp/src/bootp.h b/slirp/src/bootp.h +index 03ece9bf28..0d20a944a8 100644 +--- a/slirp/src/bootp.h ++++ b/slirp/src/bootp.h +@@ -114,7 +114,7 @@ struct bootp_t { + uint8_t bp_hwaddr[16]; + uint8_t bp_sname[64]; + uint8_t bp_file[128]; +- uint8_t bp_vend[DHCP_OPT_LEN]; ++ uint8_t bp_vend[]; + }; + + typedef struct { +diff --git a/slirp/src/mbuf.c b/slirp/src/mbuf.c +index 6d0653ed3d..7db07c088e 100644 +--- a/slirp/src/mbuf.c ++++ b/slirp/src/mbuf.c +@@ -233,3 +233,8 @@ void *mtod_check(struct mbuf *m, size_t len) + + return NULL; + } ++ ++void *m_end(struct mbuf *m) ++{ ++ return m->m_data + m->m_len; ++} +diff --git a/slirp/src/mbuf.h b/slirp/src/mbuf.h +index 2015e3232f..a9752a36e0 100644 +--- a/slirp/src/mbuf.h ++++ b/slirp/src/mbuf.h +@@ -119,6 +119,7 @@ void m_adj(struct mbuf *, int); + int m_copy(struct mbuf *, struct mbuf *, int, int); + struct mbuf *dtom(Slirp *, void *); + void *mtod_check(struct mbuf *, size_t len); ++void *m_end(struct mbuf *); + + static inline void ifs_init(struct mbuf *ifm) + { +-- +2.27.0 + diff --git a/kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch b/kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch new file mode 100755 index 0000000000000000000000000000000000000000..5d21bf86aafe87c58d53b2559334b94375a61597 --- /dev/null +++ b/kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch @@ -0,0 +1,137 @@ +From f756c1c4590a37c533ec0429644a7034ba35dada Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:38 +0100 +Subject: [PATCH 007/116] build: rename CONFIG_LIBCAP to CONFIG_LIBCAP_NG +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-4-dgilbert@redhat.com> +Patchwork-id: 93459 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 003/112] build: rename CONFIG_LIBCAP to CONFIG_LIBCAP_NG +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Paolo Bonzini + +Since we are actually testing for the newer capng library, rename the +symbol to match. + +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Paolo Bonzini +(cherry picked from commit a358bca24026a377e0804e137a4499e4e041918d) +Signed-off-by: Miroslav Rezanina +--- + configure | 2 +- + qemu-bridge-helper.c | 6 +++--- + scsi/qemu-pr-helper.c | 12 ++++++------ + 3 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/configure b/configure +index 16564f8..7831618 100755 +--- a/configure ++++ b/configure +@@ -6760,7 +6760,7 @@ if test "$l2tpv3" = "yes" ; then + echo "CONFIG_L2TPV3=y" >> $config_host_mak + fi + if test "$cap_ng" = "yes" ; then +- echo "CONFIG_LIBCAP=y" >> $config_host_mak ++ echo "CONFIG_LIBCAP_NG=y" >> $config_host_mak + fi + echo "CONFIG_AUDIO_DRIVERS=$audio_drv_list" >> $config_host_mak + for drv in $audio_drv_list; do +diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c +index 3d50ec0..88b2674 100644 +--- a/qemu-bridge-helper.c ++++ b/qemu-bridge-helper.c +@@ -43,7 +43,7 @@ + + #include "net/tap-linux.h" + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + #include + #endif + +@@ -207,7 +207,7 @@ static int send_fd(int c, int fd) + return sendmsg(c, &msg, 0); + } + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + static int drop_privileges(void) + { + /* clear all capabilities */ +@@ -246,7 +246,7 @@ int main(int argc, char **argv) + int access_allowed, access_denied; + int ret = EXIT_SUCCESS; + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + /* if we're run from an suid binary, immediately drop privileges preserving + * cap_net_admin */ + if (geteuid() == 0 && getuid() != geteuid()) { +diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c +index debb18f..0659cee 100644 +--- a/scsi/qemu-pr-helper.c ++++ b/scsi/qemu-pr-helper.c +@@ -24,7 +24,7 @@ + #include + #include + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + #include + #endif + #include +@@ -70,7 +70,7 @@ static int num_active_sockets = 1; + static int noisy; + static int verbose; + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + static int uid = -1; + static int gid = -1; + #endif +@@ -97,7 +97,7 @@ static void usage(const char *name) + " (default '%s')\n" + " -T, --trace [[enable=]][,events=][,file=]\n" + " specify tracing options\n" +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + " -u, --user=USER user to drop privileges to\n" + " -g, --group=GROUP group to drop privileges to\n" + #endif +@@ -827,7 +827,7 @@ static void close_server_socket(void) + num_active_sockets--; + } + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + static int drop_privileges(void) + { + /* clear all capabilities */ +@@ -920,7 +920,7 @@ int main(int argc, char **argv) + pidfile = g_strdup(optarg); + pidfile_specified = true; + break; +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + case 'u': { + unsigned long res; + struct passwd *userinfo = getpwnam(optarg); +@@ -1056,7 +1056,7 @@ int main(int argc, char **argv) + exit(EXIT_FAILURE); + } + +-#ifdef CONFIG_LIBCAP ++#ifdef CONFIG_LIBCAP_NG + if (drop_privileges() < 0) { + error_report("Failed to drop privileges: %s", strerror(errno)); + exit(EXIT_FAILURE); +-- +1.8.3.1 + diff --git a/kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch b/kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch new file mode 100755 index 0000000000000000000000000000000000000000..5b1b170ae3cc9ceccdf978399ef33a7b2a72891a --- /dev/null +++ b/kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch @@ -0,0 +1,2463 @@ +From fc2d0dfe60b14992a9b67e7a18394ba6365dc5ed Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 18 Mar 2020 18:10:40 +0000 +Subject: [PATCH 2/2] build-sys: do not make qemu-ga link with pixman +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200318181040.256425-1-marcandre.lureau@redhat.com> +Patchwork-id: 94381 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] build-sys: do not make qemu-ga link with pixman +Bugzilla: 1811670 +RH-Acked-by: Markus Armbruster +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange + +Since commit d52c454aadcdae74506f315ebf8b58bb79a05573 ("contrib: add +vhost-user-gpu"), qemu-ga is linking with pixman. + +This is because the Make-based build-system use a global namespace for +variables, and we rely on "main.o-libs" for different linking targets. + +Note: this kind of variable clashing is hard to fix or prevent +currently. meson should help, as declarations have a linear +dependency and doesn't rely so much on variables and clever tricks. + +Note2: we have a lot of main.c (or other duplicated names!) in +tree. Imho, it would be annoying and a bad workaroud to rename all +those to avoid conflicts like I did here. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1811670 + +Signed-off-by: Marc-André Lureau +Message-Id: <20200311160923.882474-1-marcandre.lureau@redhat.com> +Signed-off-by: Paolo Bonzini + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1811670 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=27330493 + +(cherry picked from commit 5b42bc5ce9ab4a3171819feea5042931817211fd) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + contrib/vhost-user-gpu/Makefile.objs | 6 +- + contrib/vhost-user-gpu/main.c | 1191 ------------------------------- + contrib/vhost-user-gpu/vhost-user-gpu.c | 1191 +++++++++++++++++++++++++++++++ + 3 files changed, 1194 insertions(+), 1194 deletions(-) + delete mode 100644 contrib/vhost-user-gpu/main.c + create mode 100644 contrib/vhost-user-gpu/vhost-user-gpu.c + +diff --git a/contrib/vhost-user-gpu/Makefile.objs b/contrib/vhost-user-gpu/Makefile.objs +index 6170c91..0929609 100644 +--- a/contrib/vhost-user-gpu/Makefile.objs ++++ b/contrib/vhost-user-gpu/Makefile.objs +@@ -1,7 +1,7 @@ +-vhost-user-gpu-obj-y = main.o virgl.o vugbm.o ++vhost-user-gpu-obj-y = vhost-user-gpu.o virgl.o vugbm.o + +-main.o-cflags := $(PIXMAN_CFLAGS) $(GBM_CFLAGS) +-main.o-libs := $(PIXMAN_LIBS) ++vhost-user-gpu.o-cflags := $(PIXMAN_CFLAGS) $(GBM_CFLAGS) ++vhost-user-gpu.o-libs := $(PIXMAN_LIBS) + + virgl.o-cflags := $(VIRGL_CFLAGS) $(GBM_CFLAGS) + virgl.o-libs := $(VIRGL_LIBS) +diff --git a/contrib/vhost-user-gpu/main.c b/contrib/vhost-user-gpu/main.c +deleted file mode 100644 +index b45d201..0000000 +--- a/contrib/vhost-user-gpu/main.c ++++ /dev/null +@@ -1,1191 +0,0 @@ +-/* +- * Virtio vhost-user GPU Device +- * +- * Copyright Red Hat, Inc. 2013-2018 +- * +- * Authors: +- * Dave Airlie +- * Gerd Hoffmann +- * Marc-André Lureau +- * +- * This work is licensed under the terms of the GNU GPL, version 2 or later. +- * See the COPYING file in the top-level directory. +- */ +-#include "qemu/osdep.h" +-#include "qemu/drm.h" +-#include "qapi/error.h" +-#include "qemu/sockets.h" +- +-#include +-#include +- +-#include "vugpu.h" +-#include "hw/virtio/virtio-gpu-bswap.h" +-#include "hw/virtio/virtio-gpu-pixman.h" +-#include "virgl.h" +-#include "vugbm.h" +- +-enum { +- VHOST_USER_GPU_MAX_QUEUES = 2, +-}; +- +-struct virtio_gpu_simple_resource { +- uint32_t resource_id; +- uint32_t width; +- uint32_t height; +- uint32_t format; +- struct iovec *iov; +- unsigned int iov_cnt; +- uint32_t scanout_bitmask; +- pixman_image_t *image; +- struct vugbm_buffer buffer; +- QTAILQ_ENTRY(virtio_gpu_simple_resource) next; +-}; +- +-static gboolean opt_print_caps; +-static int opt_fdnum = -1; +-static char *opt_socket_path; +-static char *opt_render_node; +-static gboolean opt_virgl; +- +-static void vg_handle_ctrl(VuDev *dev, int qidx); +- +-static const char * +-vg_cmd_to_string(int cmd) +-{ +-#define CMD(cmd) [cmd] = #cmd +- static const char *vg_cmd_str[] = { +- CMD(VIRTIO_GPU_UNDEFINED), +- +- /* 2d commands */ +- CMD(VIRTIO_GPU_CMD_GET_DISPLAY_INFO), +- CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_2D), +- CMD(VIRTIO_GPU_CMD_RESOURCE_UNREF), +- CMD(VIRTIO_GPU_CMD_SET_SCANOUT), +- CMD(VIRTIO_GPU_CMD_RESOURCE_FLUSH), +- CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D), +- CMD(VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING), +- CMD(VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING), +- CMD(VIRTIO_GPU_CMD_GET_CAPSET_INFO), +- CMD(VIRTIO_GPU_CMD_GET_CAPSET), +- +- /* 3d commands */ +- CMD(VIRTIO_GPU_CMD_CTX_CREATE), +- CMD(VIRTIO_GPU_CMD_CTX_DESTROY), +- CMD(VIRTIO_GPU_CMD_CTX_ATTACH_RESOURCE), +- CMD(VIRTIO_GPU_CMD_CTX_DETACH_RESOURCE), +- CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_3D), +- CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_3D), +- CMD(VIRTIO_GPU_CMD_TRANSFER_FROM_HOST_3D), +- CMD(VIRTIO_GPU_CMD_SUBMIT_3D), +- +- /* cursor commands */ +- CMD(VIRTIO_GPU_CMD_UPDATE_CURSOR), +- CMD(VIRTIO_GPU_CMD_MOVE_CURSOR), +- }; +-#undef REQ +- +- if (cmd >= 0 && cmd < G_N_ELEMENTS(vg_cmd_str)) { +- return vg_cmd_str[cmd]; +- } else { +- return "unknown"; +- } +-} +- +-static int +-vg_sock_fd_read(int sock, void *buf, ssize_t buflen) +-{ +- int ret; +- +- do { +- ret = read(sock, buf, buflen); +- } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); +- +- g_warn_if_fail(ret == buflen); +- return ret; +-} +- +-static void +-vg_sock_fd_close(VuGpu *g) +-{ +- if (g->sock_fd >= 0) { +- close(g->sock_fd); +- g->sock_fd = -1; +- } +-} +- +-static gboolean +-source_wait_cb(gint fd, GIOCondition condition, gpointer user_data) +-{ +- VuGpu *g = user_data; +- +- if (!vg_recv_msg(g, VHOST_USER_GPU_DMABUF_UPDATE, 0, NULL)) { +- return G_SOURCE_CONTINUE; +- } +- +- /* resume */ +- g->wait_ok = 0; +- vg_handle_ctrl(&g->dev.parent, 0); +- +- return G_SOURCE_REMOVE; +-} +- +-void +-vg_wait_ok(VuGpu *g) +-{ +- assert(g->wait_ok == 0); +- g->wait_ok = g_unix_fd_add(g->sock_fd, G_IO_IN | G_IO_HUP, +- source_wait_cb, g); +-} +- +-static int +-vg_sock_fd_write(int sock, const void *buf, ssize_t buflen, int fd) +-{ +- ssize_t ret; +- struct iovec iov = { +- .iov_base = (void *)buf, +- .iov_len = buflen, +- }; +- struct msghdr msg = { +- .msg_iov = &iov, +- .msg_iovlen = 1, +- }; +- union { +- struct cmsghdr cmsghdr; +- char control[CMSG_SPACE(sizeof(int))]; +- } cmsgu; +- struct cmsghdr *cmsg; +- +- if (fd != -1) { +- msg.msg_control = cmsgu.control; +- msg.msg_controllen = sizeof(cmsgu.control); +- +- cmsg = CMSG_FIRSTHDR(&msg); +- cmsg->cmsg_len = CMSG_LEN(sizeof(int)); +- cmsg->cmsg_level = SOL_SOCKET; +- cmsg->cmsg_type = SCM_RIGHTS; +- +- *((int *)CMSG_DATA(cmsg)) = fd; +- } +- +- do { +- ret = sendmsg(sock, &msg, 0); +- } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); +- +- g_warn_if_fail(ret == buflen); +- return ret; +-} +- +-void +-vg_send_msg(VuGpu *vg, const VhostUserGpuMsg *msg, int fd) +-{ +- if (vg_sock_fd_write(vg->sock_fd, msg, +- VHOST_USER_GPU_HDR_SIZE + msg->size, fd) < 0) { +- vg_sock_fd_close(vg); +- } +-} +- +-bool +-vg_recv_msg(VuGpu *g, uint32_t expect_req, uint32_t expect_size, +- gpointer payload) +-{ +- uint32_t req, flags, size; +- +- if (vg_sock_fd_read(g->sock_fd, &req, sizeof(req)) < 0 || +- vg_sock_fd_read(g->sock_fd, &flags, sizeof(flags)) < 0 || +- vg_sock_fd_read(g->sock_fd, &size, sizeof(size)) < 0) { +- goto err; +- } +- +- g_return_val_if_fail(req == expect_req, false); +- g_return_val_if_fail(flags & VHOST_USER_GPU_MSG_FLAG_REPLY, false); +- g_return_val_if_fail(size == expect_size, false); +- +- if (size && vg_sock_fd_read(g->sock_fd, payload, size) != size) { +- goto err; +- } +- +- return true; +- +-err: +- vg_sock_fd_close(g); +- return false; +-} +- +-static struct virtio_gpu_simple_resource * +-virtio_gpu_find_resource(VuGpu *g, uint32_t resource_id) +-{ +- struct virtio_gpu_simple_resource *res; +- +- QTAILQ_FOREACH(res, &g->reslist, next) { +- if (res->resource_id == resource_id) { +- return res; +- } +- } +- return NULL; +-} +- +-void +-vg_ctrl_response(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd, +- struct virtio_gpu_ctrl_hdr *resp, +- size_t resp_len) +-{ +- size_t s; +- +- if (cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_FENCE) { +- resp->flags |= VIRTIO_GPU_FLAG_FENCE; +- resp->fence_id = cmd->cmd_hdr.fence_id; +- resp->ctx_id = cmd->cmd_hdr.ctx_id; +- } +- virtio_gpu_ctrl_hdr_bswap(resp); +- s = iov_from_buf(cmd->elem.in_sg, cmd->elem.in_num, 0, resp, resp_len); +- if (s != resp_len) { +- g_critical("%s: response size incorrect %zu vs %zu", +- __func__, s, resp_len); +- } +- vu_queue_push(&g->dev.parent, cmd->vq, &cmd->elem, s); +- vu_queue_notify(&g->dev.parent, cmd->vq); +- cmd->finished = true; +-} +- +-void +-vg_ctrl_response_nodata(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd, +- enum virtio_gpu_ctrl_type type) +-{ +- struct virtio_gpu_ctrl_hdr resp = { +- .type = type, +- }; +- +- vg_ctrl_response(g, cmd, &resp, sizeof(resp)); +-} +- +-void +-vg_get_display_info(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_resp_display_info dpy_info = { {} }; +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_GET_DISPLAY_INFO, +- .size = 0, +- }; +- +- assert(vg->wait_ok == 0); +- +- vg_send_msg(vg, &msg, -1); +- if (!vg_recv_msg(vg, msg.request, sizeof(dpy_info), &dpy_info)) { +- return; +- } +- +- vg_ctrl_response(vg, cmd, &dpy_info.hdr, sizeof(dpy_info)); +-} +- +-static void +-vg_resource_create_2d(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- pixman_format_code_t pformat; +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_create_2d c2d; +- +- VUGPU_FILL_CMD(c2d); +- virtio_gpu_bswap_32(&c2d, sizeof(c2d)); +- +- if (c2d.resource_id == 0) { +- g_critical("%s: resource id 0 is not allowed", __func__); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- res = virtio_gpu_find_resource(g, c2d.resource_id); +- if (res) { +- g_critical("%s: resource already exists %d", __func__, c2d.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- res = g_new0(struct virtio_gpu_simple_resource, 1); +- res->width = c2d.width; +- res->height = c2d.height; +- res->format = c2d.format; +- res->resource_id = c2d.resource_id; +- +- pformat = virtio_gpu_get_pixman_format(c2d.format); +- if (!pformat) { +- g_critical("%s: host couldn't handle guest format %d", +- __func__, c2d.format); +- g_free(res); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- vugbm_buffer_create(&res->buffer, &g->gdev, c2d.width, c2d.height); +- res->image = pixman_image_create_bits(pformat, +- c2d.width, +- c2d.height, +- (uint32_t *)res->buffer.mmap, +- res->buffer.stride); +- if (!res->image) { +- g_critical("%s: resource creation failed %d %d %d", +- __func__, c2d.resource_id, c2d.width, c2d.height); +- g_free(res); +- cmd->error = VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY; +- return; +- } +- +- QTAILQ_INSERT_HEAD(&g->reslist, res, next); +-} +- +-static void +-vg_disable_scanout(VuGpu *g, int scanout_id) +-{ +- struct virtio_gpu_scanout *scanout = &g->scanout[scanout_id]; +- struct virtio_gpu_simple_resource *res; +- +- if (scanout->resource_id == 0) { +- return; +- } +- +- res = virtio_gpu_find_resource(g, scanout->resource_id); +- if (res) { +- res->scanout_bitmask &= ~(1 << scanout_id); +- } +- +- scanout->width = 0; +- scanout->height = 0; +- +- if (g->sock_fd >= 0) { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_SCANOUT, +- .size = sizeof(VhostUserGpuScanout), +- .payload.scanout.scanout_id = scanout_id, +- }; +- vg_send_msg(g, &msg, -1); +- } +-} +- +-static void +-vg_resource_destroy(VuGpu *g, +- struct virtio_gpu_simple_resource *res) +-{ +- int i; +- +- if (res->scanout_bitmask) { +- for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { +- if (res->scanout_bitmask & (1 << i)) { +- vg_disable_scanout(g, i); +- } +- } +- } +- +- vugbm_buffer_destroy(&res->buffer); +- pixman_image_unref(res->image); +- QTAILQ_REMOVE(&g->reslist, res, next); +- g_free(res); +-} +- +-static void +-vg_resource_unref(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_unref unref; +- +- VUGPU_FILL_CMD(unref); +- virtio_gpu_bswap_32(&unref, sizeof(unref)); +- +- res = virtio_gpu_find_resource(g, unref.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d", +- __func__, unref.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- vg_resource_destroy(g, res); +-} +- +-int +-vg_create_mapping_iov(VuGpu *g, +- struct virtio_gpu_resource_attach_backing *ab, +- struct virtio_gpu_ctrl_command *cmd, +- struct iovec **iov) +-{ +- struct virtio_gpu_mem_entry *ents; +- size_t esize, s; +- int i; +- +- if (ab->nr_entries > 16384) { +- g_critical("%s: nr_entries is too big (%d > 16384)", +- __func__, ab->nr_entries); +- return -1; +- } +- +- esize = sizeof(*ents) * ab->nr_entries; +- ents = g_malloc(esize); +- s = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, +- sizeof(*ab), ents, esize); +- if (s != esize) { +- g_critical("%s: command data size incorrect %zu vs %zu", +- __func__, s, esize); +- g_free(ents); +- return -1; +- } +- +- *iov = g_malloc0(sizeof(struct iovec) * ab->nr_entries); +- for (i = 0; i < ab->nr_entries; i++) { +- uint64_t len = ents[i].length; +- (*iov)[i].iov_len = ents[i].length; +- (*iov)[i].iov_base = vu_gpa_to_va(&g->dev.parent, &len, ents[i].addr); +- if (!(*iov)[i].iov_base || len != ents[i].length) { +- g_critical("%s: resource %d element %d", +- __func__, ab->resource_id, i); +- g_free(*iov); +- g_free(ents); +- *iov = NULL; +- return -1; +- } +- } +- g_free(ents); +- return 0; +-} +- +-static void +-vg_resource_attach_backing(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_attach_backing ab; +- int ret; +- +- VUGPU_FILL_CMD(ab); +- virtio_gpu_bswap_32(&ab, sizeof(ab)); +- +- res = virtio_gpu_find_resource(g, ab.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d", +- __func__, ab.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- ret = vg_create_mapping_iov(g, &ab, cmd, &res->iov); +- if (ret != 0) { +- cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; +- return; +- } +- +- res->iov_cnt = ab.nr_entries; +-} +- +-static void +-vg_resource_detach_backing(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_detach_backing detach; +- +- VUGPU_FILL_CMD(detach); +- virtio_gpu_bswap_32(&detach, sizeof(detach)); +- +- res = virtio_gpu_find_resource(g, detach.resource_id); +- if (!res || !res->iov) { +- g_critical("%s: illegal resource specified %d", +- __func__, detach.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- g_free(res->iov); +- res->iov = NULL; +- res->iov_cnt = 0; +-} +- +-static void +-vg_transfer_to_host_2d(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- int h; +- uint32_t src_offset, dst_offset, stride; +- int bpp; +- pixman_format_code_t format; +- struct virtio_gpu_transfer_to_host_2d t2d; +- +- VUGPU_FILL_CMD(t2d); +- virtio_gpu_t2d_bswap(&t2d); +- +- res = virtio_gpu_find_resource(g, t2d.resource_id); +- if (!res || !res->iov) { +- g_critical("%s: illegal resource specified %d", +- __func__, t2d.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- if (t2d.r.x > res->width || +- t2d.r.y > res->height || +- t2d.r.width > res->width || +- t2d.r.height > res->height || +- t2d.r.x + t2d.r.width > res->width || +- t2d.r.y + t2d.r.height > res->height) { +- g_critical("%s: transfer bounds outside resource" +- " bounds for resource %d: %d %d %d %d vs %d %d", +- __func__, t2d.resource_id, t2d.r.x, t2d.r.y, +- t2d.r.width, t2d.r.height, res->width, res->height); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- +- format = pixman_image_get_format(res->image); +- bpp = (PIXMAN_FORMAT_BPP(format) + 7) / 8; +- stride = pixman_image_get_stride(res->image); +- +- if (t2d.offset || t2d.r.x || t2d.r.y || +- t2d.r.width != pixman_image_get_width(res->image)) { +- void *img_data = pixman_image_get_data(res->image); +- for (h = 0; h < t2d.r.height; h++) { +- src_offset = t2d.offset + stride * h; +- dst_offset = (t2d.r.y + h) * stride + (t2d.r.x * bpp); +- +- iov_to_buf(res->iov, res->iov_cnt, src_offset, +- img_data +- + dst_offset, t2d.r.width * bpp); +- } +- } else { +- iov_to_buf(res->iov, res->iov_cnt, 0, +- pixman_image_get_data(res->image), +- pixman_image_get_stride(res->image) +- * pixman_image_get_height(res->image)); +- } +-} +- +-static void +-vg_set_scanout(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res, *ores; +- struct virtio_gpu_scanout *scanout; +- struct virtio_gpu_set_scanout ss; +- int fd; +- +- VUGPU_FILL_CMD(ss); +- virtio_gpu_bswap_32(&ss, sizeof(ss)); +- +- if (ss.scanout_id >= VIRTIO_GPU_MAX_SCANOUTS) { +- g_critical("%s: illegal scanout id specified %d", +- __func__, ss.scanout_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_SCANOUT_ID; +- return; +- } +- +- if (ss.resource_id == 0) { +- vg_disable_scanout(g, ss.scanout_id); +- return; +- } +- +- /* create a surface for this scanout */ +- res = virtio_gpu_find_resource(g, ss.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d", +- __func__, ss.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- if (ss.r.x > res->width || +- ss.r.y > res->height || +- ss.r.width > res->width || +- ss.r.height > res->height || +- ss.r.x + ss.r.width > res->width || +- ss.r.y + ss.r.height > res->height) { +- g_critical("%s: illegal scanout %d bounds for" +- " resource %d, (%d,%d)+%d,%d vs %d %d", +- __func__, ss.scanout_id, ss.resource_id, ss.r.x, ss.r.y, +- ss.r.width, ss.r.height, res->width, res->height); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- +- scanout = &g->scanout[ss.scanout_id]; +- +- ores = virtio_gpu_find_resource(g, scanout->resource_id); +- if (ores) { +- ores->scanout_bitmask &= ~(1 << ss.scanout_id); +- } +- +- res->scanout_bitmask |= (1 << ss.scanout_id); +- scanout->resource_id = ss.resource_id; +- scanout->x = ss.r.x; +- scanout->y = ss.r.y; +- scanout->width = ss.r.width; +- scanout->height = ss.r.height; +- +- struct vugbm_buffer *buffer = &res->buffer; +- +- if (vugbm_buffer_can_get_dmabuf_fd(buffer)) { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_DMABUF_SCANOUT, +- .size = sizeof(VhostUserGpuDMABUFScanout), +- .payload.dmabuf_scanout = (VhostUserGpuDMABUFScanout) { +- .scanout_id = ss.scanout_id, +- .x = ss.r.x, +- .y = ss.r.y, +- .width = ss.r.width, +- .height = ss.r.height, +- .fd_width = buffer->width, +- .fd_height = buffer->height, +- .fd_stride = buffer->stride, +- .fd_drm_fourcc = buffer->format +- } +- }; +- +- if (vugbm_buffer_get_dmabuf_fd(buffer, &fd)) { +- vg_send_msg(g, &msg, fd); +- close(fd); +- } +- } else { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_SCANOUT, +- .size = sizeof(VhostUserGpuScanout), +- .payload.scanout = (VhostUserGpuScanout) { +- .scanout_id = ss.scanout_id, +- .width = scanout->width, +- .height = scanout->height +- } +- }; +- vg_send_msg(g, &msg, -1); +- } +-} +- +-static void +-vg_resource_flush(VuGpu *g, +- struct virtio_gpu_ctrl_command *cmd) +-{ +- struct virtio_gpu_simple_resource *res; +- struct virtio_gpu_resource_flush rf; +- pixman_region16_t flush_region; +- int i; +- +- VUGPU_FILL_CMD(rf); +- virtio_gpu_bswap_32(&rf, sizeof(rf)); +- +- res = virtio_gpu_find_resource(g, rf.resource_id); +- if (!res) { +- g_critical("%s: illegal resource specified %d\n", +- __func__, rf.resource_id); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +- return; +- } +- +- if (rf.r.x > res->width || +- rf.r.y > res->height || +- rf.r.width > res->width || +- rf.r.height > res->height || +- rf.r.x + rf.r.width > res->width || +- rf.r.y + rf.r.height > res->height) { +- g_critical("%s: flush bounds outside resource" +- " bounds for resource %d: %d %d %d %d vs %d %d\n", +- __func__, rf.resource_id, rf.r.x, rf.r.y, +- rf.r.width, rf.r.height, res->width, res->height); +- cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; +- return; +- } +- +- pixman_region_init_rect(&flush_region, +- rf.r.x, rf.r.y, rf.r.width, rf.r.height); +- for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { +- struct virtio_gpu_scanout *scanout; +- pixman_region16_t region, finalregion; +- pixman_box16_t *extents; +- +- if (!(res->scanout_bitmask & (1 << i))) { +- continue; +- } +- scanout = &g->scanout[i]; +- +- pixman_region_init(&finalregion); +- pixman_region_init_rect(®ion, scanout->x, scanout->y, +- scanout->width, scanout->height); +- +- pixman_region_intersect(&finalregion, &flush_region, ®ion); +- +- extents = pixman_region_extents(&finalregion); +- size_t width = extents->x2 - extents->x1; +- size_t height = extents->y2 - extents->y1; +- +- if (vugbm_buffer_can_get_dmabuf_fd(&res->buffer)) { +- VhostUserGpuMsg vmsg = { +- .request = VHOST_USER_GPU_DMABUF_UPDATE, +- .size = sizeof(VhostUserGpuUpdate), +- .payload.update = (VhostUserGpuUpdate) { +- .scanout_id = i, +- .x = extents->x1, +- .y = extents->y1, +- .width = width, +- .height = height, +- } +- }; +- vg_send_msg(g, &vmsg, -1); +- vg_wait_ok(g); +- } else { +- size_t bpp = +- PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) / 8; +- size_t size = width * height * bpp; +- +- void *p = g_malloc(VHOST_USER_GPU_HDR_SIZE + +- sizeof(VhostUserGpuUpdate) + size); +- VhostUserGpuMsg *msg = p; +- msg->request = VHOST_USER_GPU_UPDATE; +- msg->size = sizeof(VhostUserGpuUpdate) + size; +- msg->payload.update = (VhostUserGpuUpdate) { +- .scanout_id = i, +- .x = extents->x1, +- .y = extents->y1, +- .width = width, +- .height = height, +- }; +- pixman_image_t *i = +- pixman_image_create_bits(pixman_image_get_format(res->image), +- msg->payload.update.width, +- msg->payload.update.height, +- p + offsetof(VhostUserGpuMsg, +- payload.update.data), +- width * bpp); +- pixman_image_composite(PIXMAN_OP_SRC, +- res->image, NULL, i, +- extents->x1, extents->y1, +- 0, 0, 0, 0, +- width, height); +- pixman_image_unref(i); +- vg_send_msg(g, msg, -1); +- g_free(msg); +- } +- pixman_region_fini(®ion); +- pixman_region_fini(&finalregion); +- } +- pixman_region_fini(&flush_region); +-} +- +-static void +-vg_process_cmd(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) +-{ +- switch (cmd->cmd_hdr.type) { +- case VIRTIO_GPU_CMD_GET_DISPLAY_INFO: +- vg_get_display_info(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_CREATE_2D: +- vg_resource_create_2d(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_UNREF: +- vg_resource_unref(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_FLUSH: +- vg_resource_flush(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D: +- vg_transfer_to_host_2d(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_SET_SCANOUT: +- vg_set_scanout(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING: +- vg_resource_attach_backing(vg, cmd); +- break; +- case VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING: +- vg_resource_detach_backing(vg, cmd); +- break; +- /* case VIRTIO_GPU_CMD_GET_EDID: */ +- /* break */ +- default: +- g_warning("TODO handle ctrl %x\n", cmd->cmd_hdr.type); +- cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; +- break; +- } +- if (!cmd->finished) { +- vg_ctrl_response_nodata(vg, cmd, cmd->error ? cmd->error : +- VIRTIO_GPU_RESP_OK_NODATA); +- } +-} +- +-static void +-vg_handle_ctrl(VuDev *dev, int qidx) +-{ +- VuGpu *vg = container_of(dev, VuGpu, dev.parent); +- VuVirtq *vq = vu_get_queue(dev, qidx); +- struct virtio_gpu_ctrl_command *cmd = NULL; +- size_t len; +- +- for (;;) { +- if (vg->wait_ok != 0) { +- return; +- } +- +- cmd = vu_queue_pop(dev, vq, sizeof(struct virtio_gpu_ctrl_command)); +- if (!cmd) { +- break; +- } +- cmd->vq = vq; +- cmd->error = 0; +- cmd->finished = false; +- +- len = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, +- 0, &cmd->cmd_hdr, sizeof(cmd->cmd_hdr)); +- if (len != sizeof(cmd->cmd_hdr)) { +- g_warning("%s: command size incorrect %zu vs %zu\n", +- __func__, len, sizeof(cmd->cmd_hdr)); +- } +- +- virtio_gpu_ctrl_hdr_bswap(&cmd->cmd_hdr); +- g_debug("%d %s\n", cmd->cmd_hdr.type, +- vg_cmd_to_string(cmd->cmd_hdr.type)); +- +- if (vg->virgl) { +- vg_virgl_process_cmd(vg, cmd); +- } else { +- vg_process_cmd(vg, cmd); +- } +- +- if (!cmd->finished) { +- QTAILQ_INSERT_TAIL(&vg->fenceq, cmd, next); +- vg->inflight++; +- } else { +- g_free(cmd); +- } +- } +-} +- +-static void +-update_cursor_data_simple(VuGpu *g, uint32_t resource_id, gpointer data) +-{ +- struct virtio_gpu_simple_resource *res; +- +- res = virtio_gpu_find_resource(g, resource_id); +- g_return_if_fail(res != NULL); +- g_return_if_fail(pixman_image_get_width(res->image) == 64); +- g_return_if_fail(pixman_image_get_height(res->image) == 64); +- g_return_if_fail( +- PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) == 32); +- +- memcpy(data, pixman_image_get_data(res->image), 64 * 64 * sizeof(uint32_t)); +-} +- +-static void +-vg_process_cursor_cmd(VuGpu *g, struct virtio_gpu_update_cursor *cursor) +-{ +- bool move = cursor->hdr.type != VIRTIO_GPU_CMD_MOVE_CURSOR; +- +- g_debug("%s move:%d\n", G_STRFUNC, move); +- +- if (move) { +- VhostUserGpuMsg msg = { +- .request = cursor->resource_id ? +- VHOST_USER_GPU_CURSOR_POS : VHOST_USER_GPU_CURSOR_POS_HIDE, +- .size = sizeof(VhostUserGpuCursorPos), +- .payload.cursor_pos = { +- .scanout_id = cursor->pos.scanout_id, +- .x = cursor->pos.x, +- .y = cursor->pos.y, +- } +- }; +- vg_send_msg(g, &msg, -1); +- } else { +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_CURSOR_UPDATE, +- .size = sizeof(VhostUserGpuCursorUpdate), +- .payload.cursor_update = { +- .pos = { +- .scanout_id = cursor->pos.scanout_id, +- .x = cursor->pos.x, +- .y = cursor->pos.y, +- }, +- .hot_x = cursor->hot_x, +- .hot_y = cursor->hot_y, +- } +- }; +- if (g->virgl) { +- vg_virgl_update_cursor_data(g, cursor->resource_id, +- msg.payload.cursor_update.data); +- } else { +- update_cursor_data_simple(g, cursor->resource_id, +- msg.payload.cursor_update.data); +- } +- vg_send_msg(g, &msg, -1); +- } +-} +- +-static void +-vg_handle_cursor(VuDev *dev, int qidx) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- VuVirtq *vq = vu_get_queue(dev, qidx); +- VuVirtqElement *elem; +- size_t len; +- struct virtio_gpu_update_cursor cursor; +- +- for (;;) { +- elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); +- if (!elem) { +- break; +- } +- g_debug("cursor out:%d in:%d\n", elem->out_num, elem->in_num); +- +- len = iov_to_buf(elem->out_sg, elem->out_num, +- 0, &cursor, sizeof(cursor)); +- if (len != sizeof(cursor)) { +- g_warning("%s: cursor size incorrect %zu vs %zu\n", +- __func__, len, sizeof(cursor)); +- } else { +- virtio_gpu_bswap_32(&cursor, sizeof(cursor)); +- vg_process_cursor_cmd(g, &cursor); +- } +- vu_queue_push(dev, vq, elem, 0); +- vu_queue_notify(dev, vq); +- g_free(elem); +- } +-} +- +-static void +-vg_panic(VuDev *dev, const char *msg) +-{ +- g_critical("%s\n", msg); +- exit(1); +-} +- +-static void +-vg_queue_set_started(VuDev *dev, int qidx, bool started) +-{ +- VuVirtq *vq = vu_get_queue(dev, qidx); +- +- g_debug("queue started %d:%d\n", qidx, started); +- +- switch (qidx) { +- case 0: +- vu_set_queue_handler(dev, vq, started ? vg_handle_ctrl : NULL); +- break; +- case 1: +- vu_set_queue_handler(dev, vq, started ? vg_handle_cursor : NULL); +- break; +- default: +- break; +- } +-} +- +-static void +-set_gpu_protocol_features(VuGpu *g) +-{ +- uint64_t u64; +- VhostUserGpuMsg msg = { +- .request = VHOST_USER_GPU_GET_PROTOCOL_FEATURES +- }; +- +- assert(g->wait_ok == 0); +- vg_send_msg(g, &msg, -1); +- if (!vg_recv_msg(g, msg.request, sizeof(u64), &u64)) { +- return; +- } +- +- msg = (VhostUserGpuMsg) { +- .request = VHOST_USER_GPU_SET_PROTOCOL_FEATURES, +- .size = sizeof(uint64_t), +- .payload.u64 = 0 +- }; +- vg_send_msg(g, &msg, -1); +-} +- +-static int +-vg_process_msg(VuDev *dev, VhostUserMsg *msg, int *do_reply) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- +- switch (msg->request) { +- case VHOST_USER_GPU_SET_SOCKET: { +- g_return_val_if_fail(msg->fd_num == 1, 1); +- g_return_val_if_fail(g->sock_fd == -1, 1); +- g->sock_fd = msg->fds[0]; +- set_gpu_protocol_features(g); +- return 1; +- } +- default: +- return 0; +- } +- +- return 0; +-} +- +-static uint64_t +-vg_get_features(VuDev *dev) +-{ +- uint64_t features = 0; +- +- if (opt_virgl) { +- features |= 1 << VIRTIO_GPU_F_VIRGL; +- } +- +- return features; +-} +- +-static void +-vg_set_features(VuDev *dev, uint64_t features) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- bool virgl = features & (1 << VIRTIO_GPU_F_VIRGL); +- +- if (virgl && !g->virgl_inited) { +- if (!vg_virgl_init(g)) { +- vg_panic(dev, "Failed to initialize virgl"); +- } +- g->virgl_inited = true; +- } +- +- g->virgl = virgl; +-} +- +-static int +-vg_get_config(VuDev *dev, uint8_t *config, uint32_t len) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- +- g_return_val_if_fail(len <= sizeof(struct virtio_gpu_config), -1); +- +- if (opt_virgl) { +- g->virtio_config.num_capsets = vg_virgl_get_num_capsets(); +- } +- +- memcpy(config, &g->virtio_config, len); +- +- return 0; +-} +- +-static int +-vg_set_config(VuDev *dev, const uint8_t *data, +- uint32_t offset, uint32_t size, +- uint32_t flags) +-{ +- VuGpu *g = container_of(dev, VuGpu, dev.parent); +- struct virtio_gpu_config *config = (struct virtio_gpu_config *)data; +- +- if (config->events_clear) { +- g->virtio_config.events_read &= ~config->events_clear; +- } +- +- return 0; +-} +- +-static const VuDevIface vuiface = { +- .set_features = vg_set_features, +- .get_features = vg_get_features, +- .queue_set_started = vg_queue_set_started, +- .process_msg = vg_process_msg, +- .get_config = vg_get_config, +- .set_config = vg_set_config, +-}; +- +-static void +-vg_destroy(VuGpu *g) +-{ +- struct virtio_gpu_simple_resource *res, *tmp; +- +- vug_deinit(&g->dev); +- +- vg_sock_fd_close(g); +- +- QTAILQ_FOREACH_SAFE(res, &g->reslist, next, tmp) { +- vg_resource_destroy(g, res); +- } +- +- vugbm_device_destroy(&g->gdev); +-} +- +-static GOptionEntry entries[] = { +- { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps, +- "Print capabilities", NULL }, +- { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum, +- "Use inherited fd socket", "FDNUM" }, +- { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path, +- "Use UNIX socket path", "PATH" }, +- { "render-node", 'r', 0, G_OPTION_ARG_FILENAME, &opt_render_node, +- "Specify DRM render node", "PATH" }, +- { "virgl", 'v', 0, G_OPTION_ARG_NONE, &opt_virgl, +- "Turn virgl rendering on", NULL }, +- { NULL, } +-}; +- +-int +-main(int argc, char *argv[]) +-{ +- GOptionContext *context; +- GError *error = NULL; +- GMainLoop *loop = NULL; +- int fd; +- VuGpu g = { .sock_fd = -1, .drm_rnode_fd = -1 }; +- +- QTAILQ_INIT(&g.reslist); +- QTAILQ_INIT(&g.fenceq); +- +- context = g_option_context_new("QEMU vhost-user-gpu"); +- g_option_context_add_main_entries(context, entries, NULL); +- if (!g_option_context_parse(context, &argc, &argv, &error)) { +- g_printerr("Option parsing failed: %s\n", error->message); +- exit(EXIT_FAILURE); +- } +- g_option_context_free(context); +- +- if (opt_print_caps) { +- g_print("{\n"); +- g_print(" \"type\": \"gpu\",\n"); +- g_print(" \"features\": [\n"); +- g_print(" \"render-node\",\n"); +- g_print(" \"virgl\"\n"); +- g_print(" ]\n"); +- g_print("}\n"); +- exit(EXIT_SUCCESS); +- } +- +- g.drm_rnode_fd = qemu_drm_rendernode_open(opt_render_node); +- if (opt_render_node && g.drm_rnode_fd == -1) { +- g_printerr("Failed to open DRM rendernode.\n"); +- exit(EXIT_FAILURE); +- } +- +- if (g.drm_rnode_fd >= 0) { +- if (!vugbm_device_init(&g.gdev, g.drm_rnode_fd)) { +- g_warning("Failed to init DRM device, using fallback path"); +- } +- } +- +- if ((!!opt_socket_path + (opt_fdnum != -1)) != 1) { +- g_printerr("Please specify either --fd or --socket-path\n"); +- exit(EXIT_FAILURE); +- } +- +- if (opt_socket_path) { +- int lsock = unix_listen(opt_socket_path, &error_fatal); +- if (lsock < 0) { +- g_printerr("Failed to listen on %s.\n", opt_socket_path); +- exit(EXIT_FAILURE); +- } +- fd = accept(lsock, NULL, NULL); +- close(lsock); +- } else { +- fd = opt_fdnum; +- } +- if (fd == -1) { +- g_printerr("Invalid vhost-user socket.\n"); +- exit(EXIT_FAILURE); +- } +- +- if (!vug_init(&g.dev, VHOST_USER_GPU_MAX_QUEUES, fd, vg_panic, &vuiface)) { +- g_printerr("Failed to initialize libvhost-user-glib.\n"); +- exit(EXIT_FAILURE); +- } +- +- loop = g_main_loop_new(NULL, FALSE); +- g_main_loop_run(loop); +- g_main_loop_unref(loop); +- +- vg_destroy(&g); +- if (g.drm_rnode_fd >= 0) { +- close(g.drm_rnode_fd); +- } +- +- return 0; +-} +diff --git a/contrib/vhost-user-gpu/vhost-user-gpu.c b/contrib/vhost-user-gpu/vhost-user-gpu.c +new file mode 100644 +index 0000000..b45d201 +--- /dev/null ++++ b/contrib/vhost-user-gpu/vhost-user-gpu.c +@@ -0,0 +1,1191 @@ ++/* ++ * Virtio vhost-user GPU Device ++ * ++ * Copyright Red Hat, Inc. 2013-2018 ++ * ++ * Authors: ++ * Dave Airlie ++ * Gerd Hoffmann ++ * Marc-André Lureau ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++#include "qemu/osdep.h" ++#include "qemu/drm.h" ++#include "qapi/error.h" ++#include "qemu/sockets.h" ++ ++#include ++#include ++ ++#include "vugpu.h" ++#include "hw/virtio/virtio-gpu-bswap.h" ++#include "hw/virtio/virtio-gpu-pixman.h" ++#include "virgl.h" ++#include "vugbm.h" ++ ++enum { ++ VHOST_USER_GPU_MAX_QUEUES = 2, ++}; ++ ++struct virtio_gpu_simple_resource { ++ uint32_t resource_id; ++ uint32_t width; ++ uint32_t height; ++ uint32_t format; ++ struct iovec *iov; ++ unsigned int iov_cnt; ++ uint32_t scanout_bitmask; ++ pixman_image_t *image; ++ struct vugbm_buffer buffer; ++ QTAILQ_ENTRY(virtio_gpu_simple_resource) next; ++}; ++ ++static gboolean opt_print_caps; ++static int opt_fdnum = -1; ++static char *opt_socket_path; ++static char *opt_render_node; ++static gboolean opt_virgl; ++ ++static void vg_handle_ctrl(VuDev *dev, int qidx); ++ ++static const char * ++vg_cmd_to_string(int cmd) ++{ ++#define CMD(cmd) [cmd] = #cmd ++ static const char *vg_cmd_str[] = { ++ CMD(VIRTIO_GPU_UNDEFINED), ++ ++ /* 2d commands */ ++ CMD(VIRTIO_GPU_CMD_GET_DISPLAY_INFO), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_2D), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_UNREF), ++ CMD(VIRTIO_GPU_CMD_SET_SCANOUT), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_FLUSH), ++ CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING), ++ CMD(VIRTIO_GPU_CMD_GET_CAPSET_INFO), ++ CMD(VIRTIO_GPU_CMD_GET_CAPSET), ++ ++ /* 3d commands */ ++ CMD(VIRTIO_GPU_CMD_CTX_CREATE), ++ CMD(VIRTIO_GPU_CMD_CTX_DESTROY), ++ CMD(VIRTIO_GPU_CMD_CTX_ATTACH_RESOURCE), ++ CMD(VIRTIO_GPU_CMD_CTX_DETACH_RESOURCE), ++ CMD(VIRTIO_GPU_CMD_RESOURCE_CREATE_3D), ++ CMD(VIRTIO_GPU_CMD_TRANSFER_TO_HOST_3D), ++ CMD(VIRTIO_GPU_CMD_TRANSFER_FROM_HOST_3D), ++ CMD(VIRTIO_GPU_CMD_SUBMIT_3D), ++ ++ /* cursor commands */ ++ CMD(VIRTIO_GPU_CMD_UPDATE_CURSOR), ++ CMD(VIRTIO_GPU_CMD_MOVE_CURSOR), ++ }; ++#undef REQ ++ ++ if (cmd >= 0 && cmd < G_N_ELEMENTS(vg_cmd_str)) { ++ return vg_cmd_str[cmd]; ++ } else { ++ return "unknown"; ++ } ++} ++ ++static int ++vg_sock_fd_read(int sock, void *buf, ssize_t buflen) ++{ ++ int ret; ++ ++ do { ++ ret = read(sock, buf, buflen); ++ } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); ++ ++ g_warn_if_fail(ret == buflen); ++ return ret; ++} ++ ++static void ++vg_sock_fd_close(VuGpu *g) ++{ ++ if (g->sock_fd >= 0) { ++ close(g->sock_fd); ++ g->sock_fd = -1; ++ } ++} ++ ++static gboolean ++source_wait_cb(gint fd, GIOCondition condition, gpointer user_data) ++{ ++ VuGpu *g = user_data; ++ ++ if (!vg_recv_msg(g, VHOST_USER_GPU_DMABUF_UPDATE, 0, NULL)) { ++ return G_SOURCE_CONTINUE; ++ } ++ ++ /* resume */ ++ g->wait_ok = 0; ++ vg_handle_ctrl(&g->dev.parent, 0); ++ ++ return G_SOURCE_REMOVE; ++} ++ ++void ++vg_wait_ok(VuGpu *g) ++{ ++ assert(g->wait_ok == 0); ++ g->wait_ok = g_unix_fd_add(g->sock_fd, G_IO_IN | G_IO_HUP, ++ source_wait_cb, g); ++} ++ ++static int ++vg_sock_fd_write(int sock, const void *buf, ssize_t buflen, int fd) ++{ ++ ssize_t ret; ++ struct iovec iov = { ++ .iov_base = (void *)buf, ++ .iov_len = buflen, ++ }; ++ struct msghdr msg = { ++ .msg_iov = &iov, ++ .msg_iovlen = 1, ++ }; ++ union { ++ struct cmsghdr cmsghdr; ++ char control[CMSG_SPACE(sizeof(int))]; ++ } cmsgu; ++ struct cmsghdr *cmsg; ++ ++ if (fd != -1) { ++ msg.msg_control = cmsgu.control; ++ msg.msg_controllen = sizeof(cmsgu.control); ++ ++ cmsg = CMSG_FIRSTHDR(&msg); ++ cmsg->cmsg_len = CMSG_LEN(sizeof(int)); ++ cmsg->cmsg_level = SOL_SOCKET; ++ cmsg->cmsg_type = SCM_RIGHTS; ++ ++ *((int *)CMSG_DATA(cmsg)) = fd; ++ } ++ ++ do { ++ ret = sendmsg(sock, &msg, 0); ++ } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); ++ ++ g_warn_if_fail(ret == buflen); ++ return ret; ++} ++ ++void ++vg_send_msg(VuGpu *vg, const VhostUserGpuMsg *msg, int fd) ++{ ++ if (vg_sock_fd_write(vg->sock_fd, msg, ++ VHOST_USER_GPU_HDR_SIZE + msg->size, fd) < 0) { ++ vg_sock_fd_close(vg); ++ } ++} ++ ++bool ++vg_recv_msg(VuGpu *g, uint32_t expect_req, uint32_t expect_size, ++ gpointer payload) ++{ ++ uint32_t req, flags, size; ++ ++ if (vg_sock_fd_read(g->sock_fd, &req, sizeof(req)) < 0 || ++ vg_sock_fd_read(g->sock_fd, &flags, sizeof(flags)) < 0 || ++ vg_sock_fd_read(g->sock_fd, &size, sizeof(size)) < 0) { ++ goto err; ++ } ++ ++ g_return_val_if_fail(req == expect_req, false); ++ g_return_val_if_fail(flags & VHOST_USER_GPU_MSG_FLAG_REPLY, false); ++ g_return_val_if_fail(size == expect_size, false); ++ ++ if (size && vg_sock_fd_read(g->sock_fd, payload, size) != size) { ++ goto err; ++ } ++ ++ return true; ++ ++err: ++ vg_sock_fd_close(g); ++ return false; ++} ++ ++static struct virtio_gpu_simple_resource * ++virtio_gpu_find_resource(VuGpu *g, uint32_t resource_id) ++{ ++ struct virtio_gpu_simple_resource *res; ++ ++ QTAILQ_FOREACH(res, &g->reslist, next) { ++ if (res->resource_id == resource_id) { ++ return res; ++ } ++ } ++ return NULL; ++} ++ ++void ++vg_ctrl_response(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd, ++ struct virtio_gpu_ctrl_hdr *resp, ++ size_t resp_len) ++{ ++ size_t s; ++ ++ if (cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_FENCE) { ++ resp->flags |= VIRTIO_GPU_FLAG_FENCE; ++ resp->fence_id = cmd->cmd_hdr.fence_id; ++ resp->ctx_id = cmd->cmd_hdr.ctx_id; ++ } ++ virtio_gpu_ctrl_hdr_bswap(resp); ++ s = iov_from_buf(cmd->elem.in_sg, cmd->elem.in_num, 0, resp, resp_len); ++ if (s != resp_len) { ++ g_critical("%s: response size incorrect %zu vs %zu", ++ __func__, s, resp_len); ++ } ++ vu_queue_push(&g->dev.parent, cmd->vq, &cmd->elem, s); ++ vu_queue_notify(&g->dev.parent, cmd->vq); ++ cmd->finished = true; ++} ++ ++void ++vg_ctrl_response_nodata(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd, ++ enum virtio_gpu_ctrl_type type) ++{ ++ struct virtio_gpu_ctrl_hdr resp = { ++ .type = type, ++ }; ++ ++ vg_ctrl_response(g, cmd, &resp, sizeof(resp)); ++} ++ ++void ++vg_get_display_info(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_resp_display_info dpy_info = { {} }; ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_GET_DISPLAY_INFO, ++ .size = 0, ++ }; ++ ++ assert(vg->wait_ok == 0); ++ ++ vg_send_msg(vg, &msg, -1); ++ if (!vg_recv_msg(vg, msg.request, sizeof(dpy_info), &dpy_info)) { ++ return; ++ } ++ ++ vg_ctrl_response(vg, cmd, &dpy_info.hdr, sizeof(dpy_info)); ++} ++ ++static void ++vg_resource_create_2d(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ pixman_format_code_t pformat; ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_create_2d c2d; ++ ++ VUGPU_FILL_CMD(c2d); ++ virtio_gpu_bswap_32(&c2d, sizeof(c2d)); ++ ++ if (c2d.resource_id == 0) { ++ g_critical("%s: resource id 0 is not allowed", __func__); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ res = virtio_gpu_find_resource(g, c2d.resource_id); ++ if (res) { ++ g_critical("%s: resource already exists %d", __func__, c2d.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ res = g_new0(struct virtio_gpu_simple_resource, 1); ++ res->width = c2d.width; ++ res->height = c2d.height; ++ res->format = c2d.format; ++ res->resource_id = c2d.resource_id; ++ ++ pformat = virtio_gpu_get_pixman_format(c2d.format); ++ if (!pformat) { ++ g_critical("%s: host couldn't handle guest format %d", ++ __func__, c2d.format); ++ g_free(res); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ vugbm_buffer_create(&res->buffer, &g->gdev, c2d.width, c2d.height); ++ res->image = pixman_image_create_bits(pformat, ++ c2d.width, ++ c2d.height, ++ (uint32_t *)res->buffer.mmap, ++ res->buffer.stride); ++ if (!res->image) { ++ g_critical("%s: resource creation failed %d %d %d", ++ __func__, c2d.resource_id, c2d.width, c2d.height); ++ g_free(res); ++ cmd->error = VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY; ++ return; ++ } ++ ++ QTAILQ_INSERT_HEAD(&g->reslist, res, next); ++} ++ ++static void ++vg_disable_scanout(VuGpu *g, int scanout_id) ++{ ++ struct virtio_gpu_scanout *scanout = &g->scanout[scanout_id]; ++ struct virtio_gpu_simple_resource *res; ++ ++ if (scanout->resource_id == 0) { ++ return; ++ } ++ ++ res = virtio_gpu_find_resource(g, scanout->resource_id); ++ if (res) { ++ res->scanout_bitmask &= ~(1 << scanout_id); ++ } ++ ++ scanout->width = 0; ++ scanout->height = 0; ++ ++ if (g->sock_fd >= 0) { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_SCANOUT, ++ .size = sizeof(VhostUserGpuScanout), ++ .payload.scanout.scanout_id = scanout_id, ++ }; ++ vg_send_msg(g, &msg, -1); ++ } ++} ++ ++static void ++vg_resource_destroy(VuGpu *g, ++ struct virtio_gpu_simple_resource *res) ++{ ++ int i; ++ ++ if (res->scanout_bitmask) { ++ for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { ++ if (res->scanout_bitmask & (1 << i)) { ++ vg_disable_scanout(g, i); ++ } ++ } ++ } ++ ++ vugbm_buffer_destroy(&res->buffer); ++ pixman_image_unref(res->image); ++ QTAILQ_REMOVE(&g->reslist, res, next); ++ g_free(res); ++} ++ ++static void ++vg_resource_unref(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_unref unref; ++ ++ VUGPU_FILL_CMD(unref); ++ virtio_gpu_bswap_32(&unref, sizeof(unref)); ++ ++ res = virtio_gpu_find_resource(g, unref.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, unref.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ vg_resource_destroy(g, res); ++} ++ ++int ++vg_create_mapping_iov(VuGpu *g, ++ struct virtio_gpu_resource_attach_backing *ab, ++ struct virtio_gpu_ctrl_command *cmd, ++ struct iovec **iov) ++{ ++ struct virtio_gpu_mem_entry *ents; ++ size_t esize, s; ++ int i; ++ ++ if (ab->nr_entries > 16384) { ++ g_critical("%s: nr_entries is too big (%d > 16384)", ++ __func__, ab->nr_entries); ++ return -1; ++ } ++ ++ esize = sizeof(*ents) * ab->nr_entries; ++ ents = g_malloc(esize); ++ s = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, ++ sizeof(*ab), ents, esize); ++ if (s != esize) { ++ g_critical("%s: command data size incorrect %zu vs %zu", ++ __func__, s, esize); ++ g_free(ents); ++ return -1; ++ } ++ ++ *iov = g_malloc0(sizeof(struct iovec) * ab->nr_entries); ++ for (i = 0; i < ab->nr_entries; i++) { ++ uint64_t len = ents[i].length; ++ (*iov)[i].iov_len = ents[i].length; ++ (*iov)[i].iov_base = vu_gpa_to_va(&g->dev.parent, &len, ents[i].addr); ++ if (!(*iov)[i].iov_base || len != ents[i].length) { ++ g_critical("%s: resource %d element %d", ++ __func__, ab->resource_id, i); ++ g_free(*iov); ++ g_free(ents); ++ *iov = NULL; ++ return -1; ++ } ++ } ++ g_free(ents); ++ return 0; ++} ++ ++static void ++vg_resource_attach_backing(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_attach_backing ab; ++ int ret; ++ ++ VUGPU_FILL_CMD(ab); ++ virtio_gpu_bswap_32(&ab, sizeof(ab)); ++ ++ res = virtio_gpu_find_resource(g, ab.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, ab.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ ret = vg_create_mapping_iov(g, &ab, cmd, &res->iov); ++ if (ret != 0) { ++ cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; ++ return; ++ } ++ ++ res->iov_cnt = ab.nr_entries; ++} ++ ++static void ++vg_resource_detach_backing(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_detach_backing detach; ++ ++ VUGPU_FILL_CMD(detach); ++ virtio_gpu_bswap_32(&detach, sizeof(detach)); ++ ++ res = virtio_gpu_find_resource(g, detach.resource_id); ++ if (!res || !res->iov) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, detach.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ g_free(res->iov); ++ res->iov = NULL; ++ res->iov_cnt = 0; ++} ++ ++static void ++vg_transfer_to_host_2d(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ int h; ++ uint32_t src_offset, dst_offset, stride; ++ int bpp; ++ pixman_format_code_t format; ++ struct virtio_gpu_transfer_to_host_2d t2d; ++ ++ VUGPU_FILL_CMD(t2d); ++ virtio_gpu_t2d_bswap(&t2d); ++ ++ res = virtio_gpu_find_resource(g, t2d.resource_id); ++ if (!res || !res->iov) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, t2d.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ if (t2d.r.x > res->width || ++ t2d.r.y > res->height || ++ t2d.r.width > res->width || ++ t2d.r.height > res->height || ++ t2d.r.x + t2d.r.width > res->width || ++ t2d.r.y + t2d.r.height > res->height) { ++ g_critical("%s: transfer bounds outside resource" ++ " bounds for resource %d: %d %d %d %d vs %d %d", ++ __func__, t2d.resource_id, t2d.r.x, t2d.r.y, ++ t2d.r.width, t2d.r.height, res->width, res->height); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ ++ format = pixman_image_get_format(res->image); ++ bpp = (PIXMAN_FORMAT_BPP(format) + 7) / 8; ++ stride = pixman_image_get_stride(res->image); ++ ++ if (t2d.offset || t2d.r.x || t2d.r.y || ++ t2d.r.width != pixman_image_get_width(res->image)) { ++ void *img_data = pixman_image_get_data(res->image); ++ for (h = 0; h < t2d.r.height; h++) { ++ src_offset = t2d.offset + stride * h; ++ dst_offset = (t2d.r.y + h) * stride + (t2d.r.x * bpp); ++ ++ iov_to_buf(res->iov, res->iov_cnt, src_offset, ++ img_data ++ + dst_offset, t2d.r.width * bpp); ++ } ++ } else { ++ iov_to_buf(res->iov, res->iov_cnt, 0, ++ pixman_image_get_data(res->image), ++ pixman_image_get_stride(res->image) ++ * pixman_image_get_height(res->image)); ++ } ++} ++ ++static void ++vg_set_scanout(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res, *ores; ++ struct virtio_gpu_scanout *scanout; ++ struct virtio_gpu_set_scanout ss; ++ int fd; ++ ++ VUGPU_FILL_CMD(ss); ++ virtio_gpu_bswap_32(&ss, sizeof(ss)); ++ ++ if (ss.scanout_id >= VIRTIO_GPU_MAX_SCANOUTS) { ++ g_critical("%s: illegal scanout id specified %d", ++ __func__, ss.scanout_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_SCANOUT_ID; ++ return; ++ } ++ ++ if (ss.resource_id == 0) { ++ vg_disable_scanout(g, ss.scanout_id); ++ return; ++ } ++ ++ /* create a surface for this scanout */ ++ res = virtio_gpu_find_resource(g, ss.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d", ++ __func__, ss.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ if (ss.r.x > res->width || ++ ss.r.y > res->height || ++ ss.r.width > res->width || ++ ss.r.height > res->height || ++ ss.r.x + ss.r.width > res->width || ++ ss.r.y + ss.r.height > res->height) { ++ g_critical("%s: illegal scanout %d bounds for" ++ " resource %d, (%d,%d)+%d,%d vs %d %d", ++ __func__, ss.scanout_id, ss.resource_id, ss.r.x, ss.r.y, ++ ss.r.width, ss.r.height, res->width, res->height); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ ++ scanout = &g->scanout[ss.scanout_id]; ++ ++ ores = virtio_gpu_find_resource(g, scanout->resource_id); ++ if (ores) { ++ ores->scanout_bitmask &= ~(1 << ss.scanout_id); ++ } ++ ++ res->scanout_bitmask |= (1 << ss.scanout_id); ++ scanout->resource_id = ss.resource_id; ++ scanout->x = ss.r.x; ++ scanout->y = ss.r.y; ++ scanout->width = ss.r.width; ++ scanout->height = ss.r.height; ++ ++ struct vugbm_buffer *buffer = &res->buffer; ++ ++ if (vugbm_buffer_can_get_dmabuf_fd(buffer)) { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_DMABUF_SCANOUT, ++ .size = sizeof(VhostUserGpuDMABUFScanout), ++ .payload.dmabuf_scanout = (VhostUserGpuDMABUFScanout) { ++ .scanout_id = ss.scanout_id, ++ .x = ss.r.x, ++ .y = ss.r.y, ++ .width = ss.r.width, ++ .height = ss.r.height, ++ .fd_width = buffer->width, ++ .fd_height = buffer->height, ++ .fd_stride = buffer->stride, ++ .fd_drm_fourcc = buffer->format ++ } ++ }; ++ ++ if (vugbm_buffer_get_dmabuf_fd(buffer, &fd)) { ++ vg_send_msg(g, &msg, fd); ++ close(fd); ++ } ++ } else { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_SCANOUT, ++ .size = sizeof(VhostUserGpuScanout), ++ .payload.scanout = (VhostUserGpuScanout) { ++ .scanout_id = ss.scanout_id, ++ .width = scanout->width, ++ .height = scanout->height ++ } ++ }; ++ vg_send_msg(g, &msg, -1); ++ } ++} ++ ++static void ++vg_resource_flush(VuGpu *g, ++ struct virtio_gpu_ctrl_command *cmd) ++{ ++ struct virtio_gpu_simple_resource *res; ++ struct virtio_gpu_resource_flush rf; ++ pixman_region16_t flush_region; ++ int i; ++ ++ VUGPU_FILL_CMD(rf); ++ virtio_gpu_bswap_32(&rf, sizeof(rf)); ++ ++ res = virtio_gpu_find_resource(g, rf.resource_id); ++ if (!res) { ++ g_critical("%s: illegal resource specified %d\n", ++ __func__, rf.resource_id); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; ++ return; ++ } ++ ++ if (rf.r.x > res->width || ++ rf.r.y > res->height || ++ rf.r.width > res->width || ++ rf.r.height > res->height || ++ rf.r.x + rf.r.width > res->width || ++ rf.r.y + rf.r.height > res->height) { ++ g_critical("%s: flush bounds outside resource" ++ " bounds for resource %d: %d %d %d %d vs %d %d\n", ++ __func__, rf.resource_id, rf.r.x, rf.r.y, ++ rf.r.width, rf.r.height, res->width, res->height); ++ cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER; ++ return; ++ } ++ ++ pixman_region_init_rect(&flush_region, ++ rf.r.x, rf.r.y, rf.r.width, rf.r.height); ++ for (i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) { ++ struct virtio_gpu_scanout *scanout; ++ pixman_region16_t region, finalregion; ++ pixman_box16_t *extents; ++ ++ if (!(res->scanout_bitmask & (1 << i))) { ++ continue; ++ } ++ scanout = &g->scanout[i]; ++ ++ pixman_region_init(&finalregion); ++ pixman_region_init_rect(®ion, scanout->x, scanout->y, ++ scanout->width, scanout->height); ++ ++ pixman_region_intersect(&finalregion, &flush_region, ®ion); ++ ++ extents = pixman_region_extents(&finalregion); ++ size_t width = extents->x2 - extents->x1; ++ size_t height = extents->y2 - extents->y1; ++ ++ if (vugbm_buffer_can_get_dmabuf_fd(&res->buffer)) { ++ VhostUserGpuMsg vmsg = { ++ .request = VHOST_USER_GPU_DMABUF_UPDATE, ++ .size = sizeof(VhostUserGpuUpdate), ++ .payload.update = (VhostUserGpuUpdate) { ++ .scanout_id = i, ++ .x = extents->x1, ++ .y = extents->y1, ++ .width = width, ++ .height = height, ++ } ++ }; ++ vg_send_msg(g, &vmsg, -1); ++ vg_wait_ok(g); ++ } else { ++ size_t bpp = ++ PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) / 8; ++ size_t size = width * height * bpp; ++ ++ void *p = g_malloc(VHOST_USER_GPU_HDR_SIZE + ++ sizeof(VhostUserGpuUpdate) + size); ++ VhostUserGpuMsg *msg = p; ++ msg->request = VHOST_USER_GPU_UPDATE; ++ msg->size = sizeof(VhostUserGpuUpdate) + size; ++ msg->payload.update = (VhostUserGpuUpdate) { ++ .scanout_id = i, ++ .x = extents->x1, ++ .y = extents->y1, ++ .width = width, ++ .height = height, ++ }; ++ pixman_image_t *i = ++ pixman_image_create_bits(pixman_image_get_format(res->image), ++ msg->payload.update.width, ++ msg->payload.update.height, ++ p + offsetof(VhostUserGpuMsg, ++ payload.update.data), ++ width * bpp); ++ pixman_image_composite(PIXMAN_OP_SRC, ++ res->image, NULL, i, ++ extents->x1, extents->y1, ++ 0, 0, 0, 0, ++ width, height); ++ pixman_image_unref(i); ++ vg_send_msg(g, msg, -1); ++ g_free(msg); ++ } ++ pixman_region_fini(®ion); ++ pixman_region_fini(&finalregion); ++ } ++ pixman_region_fini(&flush_region); ++} ++ ++static void ++vg_process_cmd(VuGpu *vg, struct virtio_gpu_ctrl_command *cmd) ++{ ++ switch (cmd->cmd_hdr.type) { ++ case VIRTIO_GPU_CMD_GET_DISPLAY_INFO: ++ vg_get_display_info(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_CREATE_2D: ++ vg_resource_create_2d(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_UNREF: ++ vg_resource_unref(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_FLUSH: ++ vg_resource_flush(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D: ++ vg_transfer_to_host_2d(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_SET_SCANOUT: ++ vg_set_scanout(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING: ++ vg_resource_attach_backing(vg, cmd); ++ break; ++ case VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING: ++ vg_resource_detach_backing(vg, cmd); ++ break; ++ /* case VIRTIO_GPU_CMD_GET_EDID: */ ++ /* break */ ++ default: ++ g_warning("TODO handle ctrl %x\n", cmd->cmd_hdr.type); ++ cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; ++ break; ++ } ++ if (!cmd->finished) { ++ vg_ctrl_response_nodata(vg, cmd, cmd->error ? cmd->error : ++ VIRTIO_GPU_RESP_OK_NODATA); ++ } ++} ++ ++static void ++vg_handle_ctrl(VuDev *dev, int qidx) ++{ ++ VuGpu *vg = container_of(dev, VuGpu, dev.parent); ++ VuVirtq *vq = vu_get_queue(dev, qidx); ++ struct virtio_gpu_ctrl_command *cmd = NULL; ++ size_t len; ++ ++ for (;;) { ++ if (vg->wait_ok != 0) { ++ return; ++ } ++ ++ cmd = vu_queue_pop(dev, vq, sizeof(struct virtio_gpu_ctrl_command)); ++ if (!cmd) { ++ break; ++ } ++ cmd->vq = vq; ++ cmd->error = 0; ++ cmd->finished = false; ++ ++ len = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, ++ 0, &cmd->cmd_hdr, sizeof(cmd->cmd_hdr)); ++ if (len != sizeof(cmd->cmd_hdr)) { ++ g_warning("%s: command size incorrect %zu vs %zu\n", ++ __func__, len, sizeof(cmd->cmd_hdr)); ++ } ++ ++ virtio_gpu_ctrl_hdr_bswap(&cmd->cmd_hdr); ++ g_debug("%d %s\n", cmd->cmd_hdr.type, ++ vg_cmd_to_string(cmd->cmd_hdr.type)); ++ ++ if (vg->virgl) { ++ vg_virgl_process_cmd(vg, cmd); ++ } else { ++ vg_process_cmd(vg, cmd); ++ } ++ ++ if (!cmd->finished) { ++ QTAILQ_INSERT_TAIL(&vg->fenceq, cmd, next); ++ vg->inflight++; ++ } else { ++ g_free(cmd); ++ } ++ } ++} ++ ++static void ++update_cursor_data_simple(VuGpu *g, uint32_t resource_id, gpointer data) ++{ ++ struct virtio_gpu_simple_resource *res; ++ ++ res = virtio_gpu_find_resource(g, resource_id); ++ g_return_if_fail(res != NULL); ++ g_return_if_fail(pixman_image_get_width(res->image) == 64); ++ g_return_if_fail(pixman_image_get_height(res->image) == 64); ++ g_return_if_fail( ++ PIXMAN_FORMAT_BPP(pixman_image_get_format(res->image)) == 32); ++ ++ memcpy(data, pixman_image_get_data(res->image), 64 * 64 * sizeof(uint32_t)); ++} ++ ++static void ++vg_process_cursor_cmd(VuGpu *g, struct virtio_gpu_update_cursor *cursor) ++{ ++ bool move = cursor->hdr.type != VIRTIO_GPU_CMD_MOVE_CURSOR; ++ ++ g_debug("%s move:%d\n", G_STRFUNC, move); ++ ++ if (move) { ++ VhostUserGpuMsg msg = { ++ .request = cursor->resource_id ? ++ VHOST_USER_GPU_CURSOR_POS : VHOST_USER_GPU_CURSOR_POS_HIDE, ++ .size = sizeof(VhostUserGpuCursorPos), ++ .payload.cursor_pos = { ++ .scanout_id = cursor->pos.scanout_id, ++ .x = cursor->pos.x, ++ .y = cursor->pos.y, ++ } ++ }; ++ vg_send_msg(g, &msg, -1); ++ } else { ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_CURSOR_UPDATE, ++ .size = sizeof(VhostUserGpuCursorUpdate), ++ .payload.cursor_update = { ++ .pos = { ++ .scanout_id = cursor->pos.scanout_id, ++ .x = cursor->pos.x, ++ .y = cursor->pos.y, ++ }, ++ .hot_x = cursor->hot_x, ++ .hot_y = cursor->hot_y, ++ } ++ }; ++ if (g->virgl) { ++ vg_virgl_update_cursor_data(g, cursor->resource_id, ++ msg.payload.cursor_update.data); ++ } else { ++ update_cursor_data_simple(g, cursor->resource_id, ++ msg.payload.cursor_update.data); ++ } ++ vg_send_msg(g, &msg, -1); ++ } ++} ++ ++static void ++vg_handle_cursor(VuDev *dev, int qidx) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ VuVirtq *vq = vu_get_queue(dev, qidx); ++ VuVirtqElement *elem; ++ size_t len; ++ struct virtio_gpu_update_cursor cursor; ++ ++ for (;;) { ++ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); ++ if (!elem) { ++ break; ++ } ++ g_debug("cursor out:%d in:%d\n", elem->out_num, elem->in_num); ++ ++ len = iov_to_buf(elem->out_sg, elem->out_num, ++ 0, &cursor, sizeof(cursor)); ++ if (len != sizeof(cursor)) { ++ g_warning("%s: cursor size incorrect %zu vs %zu\n", ++ __func__, len, sizeof(cursor)); ++ } else { ++ virtio_gpu_bswap_32(&cursor, sizeof(cursor)); ++ vg_process_cursor_cmd(g, &cursor); ++ } ++ vu_queue_push(dev, vq, elem, 0); ++ vu_queue_notify(dev, vq); ++ g_free(elem); ++ } ++} ++ ++static void ++vg_panic(VuDev *dev, const char *msg) ++{ ++ g_critical("%s\n", msg); ++ exit(1); ++} ++ ++static void ++vg_queue_set_started(VuDev *dev, int qidx, bool started) ++{ ++ VuVirtq *vq = vu_get_queue(dev, qidx); ++ ++ g_debug("queue started %d:%d\n", qidx, started); ++ ++ switch (qidx) { ++ case 0: ++ vu_set_queue_handler(dev, vq, started ? vg_handle_ctrl : NULL); ++ break; ++ case 1: ++ vu_set_queue_handler(dev, vq, started ? vg_handle_cursor : NULL); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++set_gpu_protocol_features(VuGpu *g) ++{ ++ uint64_t u64; ++ VhostUserGpuMsg msg = { ++ .request = VHOST_USER_GPU_GET_PROTOCOL_FEATURES ++ }; ++ ++ assert(g->wait_ok == 0); ++ vg_send_msg(g, &msg, -1); ++ if (!vg_recv_msg(g, msg.request, sizeof(u64), &u64)) { ++ return; ++ } ++ ++ msg = (VhostUserGpuMsg) { ++ .request = VHOST_USER_GPU_SET_PROTOCOL_FEATURES, ++ .size = sizeof(uint64_t), ++ .payload.u64 = 0 ++ }; ++ vg_send_msg(g, &msg, -1); ++} ++ ++static int ++vg_process_msg(VuDev *dev, VhostUserMsg *msg, int *do_reply) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ ++ switch (msg->request) { ++ case VHOST_USER_GPU_SET_SOCKET: { ++ g_return_val_if_fail(msg->fd_num == 1, 1); ++ g_return_val_if_fail(g->sock_fd == -1, 1); ++ g->sock_fd = msg->fds[0]; ++ set_gpu_protocol_features(g); ++ return 1; ++ } ++ default: ++ return 0; ++ } ++ ++ return 0; ++} ++ ++static uint64_t ++vg_get_features(VuDev *dev) ++{ ++ uint64_t features = 0; ++ ++ if (opt_virgl) { ++ features |= 1 << VIRTIO_GPU_F_VIRGL; ++ } ++ ++ return features; ++} ++ ++static void ++vg_set_features(VuDev *dev, uint64_t features) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ bool virgl = features & (1 << VIRTIO_GPU_F_VIRGL); ++ ++ if (virgl && !g->virgl_inited) { ++ if (!vg_virgl_init(g)) { ++ vg_panic(dev, "Failed to initialize virgl"); ++ } ++ g->virgl_inited = true; ++ } ++ ++ g->virgl = virgl; ++} ++ ++static int ++vg_get_config(VuDev *dev, uint8_t *config, uint32_t len) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ ++ g_return_val_if_fail(len <= sizeof(struct virtio_gpu_config), -1); ++ ++ if (opt_virgl) { ++ g->virtio_config.num_capsets = vg_virgl_get_num_capsets(); ++ } ++ ++ memcpy(config, &g->virtio_config, len); ++ ++ return 0; ++} ++ ++static int ++vg_set_config(VuDev *dev, const uint8_t *data, ++ uint32_t offset, uint32_t size, ++ uint32_t flags) ++{ ++ VuGpu *g = container_of(dev, VuGpu, dev.parent); ++ struct virtio_gpu_config *config = (struct virtio_gpu_config *)data; ++ ++ if (config->events_clear) { ++ g->virtio_config.events_read &= ~config->events_clear; ++ } ++ ++ return 0; ++} ++ ++static const VuDevIface vuiface = { ++ .set_features = vg_set_features, ++ .get_features = vg_get_features, ++ .queue_set_started = vg_queue_set_started, ++ .process_msg = vg_process_msg, ++ .get_config = vg_get_config, ++ .set_config = vg_set_config, ++}; ++ ++static void ++vg_destroy(VuGpu *g) ++{ ++ struct virtio_gpu_simple_resource *res, *tmp; ++ ++ vug_deinit(&g->dev); ++ ++ vg_sock_fd_close(g); ++ ++ QTAILQ_FOREACH_SAFE(res, &g->reslist, next, tmp) { ++ vg_resource_destroy(g, res); ++ } ++ ++ vugbm_device_destroy(&g->gdev); ++} ++ ++static GOptionEntry entries[] = { ++ { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps, ++ "Print capabilities", NULL }, ++ { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum, ++ "Use inherited fd socket", "FDNUM" }, ++ { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path, ++ "Use UNIX socket path", "PATH" }, ++ { "render-node", 'r', 0, G_OPTION_ARG_FILENAME, &opt_render_node, ++ "Specify DRM render node", "PATH" }, ++ { "virgl", 'v', 0, G_OPTION_ARG_NONE, &opt_virgl, ++ "Turn virgl rendering on", NULL }, ++ { NULL, } ++}; ++ ++int ++main(int argc, char *argv[]) ++{ ++ GOptionContext *context; ++ GError *error = NULL; ++ GMainLoop *loop = NULL; ++ int fd; ++ VuGpu g = { .sock_fd = -1, .drm_rnode_fd = -1 }; ++ ++ QTAILQ_INIT(&g.reslist); ++ QTAILQ_INIT(&g.fenceq); ++ ++ context = g_option_context_new("QEMU vhost-user-gpu"); ++ g_option_context_add_main_entries(context, entries, NULL); ++ if (!g_option_context_parse(context, &argc, &argv, &error)) { ++ g_printerr("Option parsing failed: %s\n", error->message); ++ exit(EXIT_FAILURE); ++ } ++ g_option_context_free(context); ++ ++ if (opt_print_caps) { ++ g_print("{\n"); ++ g_print(" \"type\": \"gpu\",\n"); ++ g_print(" \"features\": [\n"); ++ g_print(" \"render-node\",\n"); ++ g_print(" \"virgl\"\n"); ++ g_print(" ]\n"); ++ g_print("}\n"); ++ exit(EXIT_SUCCESS); ++ } ++ ++ g.drm_rnode_fd = qemu_drm_rendernode_open(opt_render_node); ++ if (opt_render_node && g.drm_rnode_fd == -1) { ++ g_printerr("Failed to open DRM rendernode.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (g.drm_rnode_fd >= 0) { ++ if (!vugbm_device_init(&g.gdev, g.drm_rnode_fd)) { ++ g_warning("Failed to init DRM device, using fallback path"); ++ } ++ } ++ ++ if ((!!opt_socket_path + (opt_fdnum != -1)) != 1) { ++ g_printerr("Please specify either --fd or --socket-path\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (opt_socket_path) { ++ int lsock = unix_listen(opt_socket_path, &error_fatal); ++ if (lsock < 0) { ++ g_printerr("Failed to listen on %s.\n", opt_socket_path); ++ exit(EXIT_FAILURE); ++ } ++ fd = accept(lsock, NULL, NULL); ++ close(lsock); ++ } else { ++ fd = opt_fdnum; ++ } ++ if (fd == -1) { ++ g_printerr("Invalid vhost-user socket.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (!vug_init(&g.dev, VHOST_USER_GPU_MAX_QUEUES, fd, vg_panic, &vuiface)) { ++ g_printerr("Failed to initialize libvhost-user-glib.\n"); ++ exit(EXIT_FAILURE); ++ } ++ ++ loop = g_main_loop_new(NULL, FALSE); ++ g_main_loop_run(loop); ++ g_main_loop_unref(loop); ++ ++ vg_destroy(&g); ++ if (g.drm_rnode_fd >= 0) { ++ close(g.drm_rnode_fd); ++ } ++ ++ return 0; ++} +-- +1.8.3.1 + diff --git a/kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch b/kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch new file mode 100755 index 0000000000000000000000000000000000000000..32d53773db9942ac1e83fb122739a60dd83a3f3e --- /dev/null +++ b/kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch @@ -0,0 +1,60 @@ +From 6f1ebcfdb92d12ef2caae0b63a3a380265cba1fa Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:46 -0400 +Subject: [PATCH 8/9] cadence_gem: switch to use qemu_receive_packet() for + loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-9-jmaloy@redhat.com> +Patchwork-id: 101793 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 8/9] cadence_gem: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Alexander Bulekov +Signed-off-by: Jason Wang + +(cherry picked from commit e73adfbeec9d4e008630c814759052ed945c3fed) +Conflict: upstream commit 24d62fd5028e ("net: cadence_gem: Move tx/rx +packet buffert to CadenceGEMState") is missing in this version, so +we stick to using the original stack variable tx_packet in the calls. + +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/cadence_gem.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c +index b8be73dc55..be7c91123b 100644 +--- a/hw/net/cadence_gem.c ++++ b/hw/net/cadence_gem.c +@@ -1225,8 +1225,8 @@ static void gem_transmit(CadenceGEMState *s) + /* Send the packet somewhere */ + if (s->phy_loop || (s->regs[GEM_NWCTRL] & + GEM_NWCTRL_LOCALLOOP)) { +- gem_receive(qemu_get_queue(s->nic), tx_packet, +- total_bytes); ++ qemu_receive_packet(qemu_get_queue(s->nic), tx_packet, ++ total_bytes); + } else { + qemu_send_packet(qemu_get_queue(s->nic), tx_packet, + total_bytes); +-- +2.27.0 + diff --git a/kvm-compat-disable-edid-for-virtio-gpu-ccw.patch b/kvm-compat-disable-edid-for-virtio-gpu-ccw.patch new file mode 100755 index 0000000000000000000000000000000000000000..e00053430e1b61df555a88980e686293560b42bc --- /dev/null +++ b/kvm-compat-disable-edid-for-virtio-gpu-ccw.patch @@ -0,0 +1,50 @@ +From 8f9f4d8d52ebb7878543ac0b84cc372477041e33 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Wed, 1 Apr 2020 16:13:50 -0400 +Subject: [PATCH 2/2] compat: disable 'edid' for virtio-gpu-ccw + +RH-Author: Cornelia Huck +Message-id: <20200401161350.20462-1-cohuck@redhat.com> +Patchwork-id: 94523 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2] compat: disable 'edid' for virtio-gpu-ccw +Bugzilla: 1816793 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Markus Armbruster +RH-Acked-by: Dr. David Alan Gilbert + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816793 +Branch: rhel-av-8.2.1 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27629804 +Upstream: downstream only +Tested: verified that for a virtio-gpu-ccw device 'edid' is false with + a s390-ccw-virtio-rhel7.6.0 machine and true with a + s390-ccw-virtio-rhel8.2.0 (s390x does not have the 8.0 or 8.1 + machine types) + +hw_compat_rhel_8_0 copied the original upstream version of +disabling 'edid' for virtio-gpu-pci only (not following later +changes). Switch it to virtio-gpu-device, following upstream +02501fc39381 ("compat: disable edid on correct virtio-gpu device"). + +Signed-off-by: Cornelia Huck +Signed-off-by: Jon Maloy +--- + hw/core/machine.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index e0e0eec8bf..5a025d1af2 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -72,7 +72,7 @@ GlobalProperty hw_compat_rhel_8_0[] = { + /* hw_compat_rhel_8_0 from hw_compat_4_0 */ + { "virtio-vga", "edid", "false" }, + /* hw_compat_rhel_8_0 from hw_compat_4_0 */ +- { "virtio-gpu-pci", "edid", "false" }, ++ { "virtio-gpu-device", "edid", "false" }, + /* hw_compat_rhel_8_0 from hw_compat_4_0 */ + { "virtio-device", "use-started", "false" }, + /* hw_compat_rhel_8_0 from hw_compat_3_1 - that was added in 4.1 */ +-- +2.18.2 + diff --git a/kvm-config-enable-VFIO_CCW.patch b/kvm-config-enable-VFIO_CCW.patch new file mode 100755 index 0000000000000000000000000000000000000000..44af9cfb6a83af0c611e11781ee44acf28520599 --- /dev/null +++ b/kvm-config-enable-VFIO_CCW.patch @@ -0,0 +1,39 @@ +From f3e80771c921560a58c30020781fa01a54be8eb0 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:43 -0400 +Subject: [PATCH 09/12] config: enable VFIO_CCW + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-10-cohuck@redhat.com> +Patchwork-id: 97699 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 9/9] config: enable VFIO_CCW +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +Enable vfio-ccw in RHEL builds. + +Upstream: n/a + +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + default-configs/s390x-rh-devices.mak | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/default-configs/s390x-rh-devices.mak b/default-configs/s390x-rh-devices.mak +index c3c73fe752..08a15f3e01 100644 +--- a/default-configs/s390x-rh-devices.mak ++++ b/default-configs/s390x-rh-devices.mak +@@ -9,6 +9,7 @@ CONFIG_SCSI=y + CONFIG_TERMINAL3270=y + CONFIG_VFIO=y + CONFIG_VFIO_AP=y ++CONFIG_VFIO_CCW=y + CONFIG_VFIO_PCI=y + CONFIG_VHOST_USER=y + CONFIG_VIRTIO_CCW=y +-- +2.27.0 + diff --git a/kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch b/kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch new file mode 100755 index 0000000000000000000000000000000000000000..4212f1c6efa4646e6419ddfcec0f418498deb5fc --- /dev/null +++ b/kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch @@ -0,0 +1,134 @@ +From 548de8acbf0137b6e49a14b63682badfff037d23 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:44 +0100 +Subject: [PATCH 073/116] contrib/libvhost-user: Protect slave fd with mutex +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-70-dgilbert@redhat.com> +Patchwork-id: 93523 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 069/112] contrib/libvhost-user: Protect slave fd with mutex +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +In future patches we'll be performing commands on the slave-fd driven +by commands on queues, since those queues will be driven by individual +threads we need to make sure they don't attempt to use the slave-fd +for multiple commands in parallel. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit c25c02b9e6a196be87a818f459c426556b24770d) +Signed-off-by: Miroslav Rezanina +--- + contrib/libvhost-user/libvhost-user.c | 24 ++++++++++++++++++++---- + contrib/libvhost-user/libvhost-user.h | 3 +++ + 2 files changed, 23 insertions(+), 4 deletions(-) + +diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c +index ec27b78..63e4106 100644 +--- a/contrib/libvhost-user/libvhost-user.c ++++ b/contrib/libvhost-user/libvhost-user.c +@@ -392,26 +392,37 @@ vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) + return vu_message_write(dev, conn_fd, vmsg); + } + ++/* ++ * Processes a reply on the slave channel. ++ * Entered with slave_mutex held and releases it before exit. ++ * Returns true on success. ++ */ + static bool + vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg) + { + VhostUserMsg msg_reply; ++ bool result = false; + + if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { +- return true; ++ result = true; ++ goto out; + } + + if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) { +- return false; ++ goto out; + } + + if (msg_reply.request != vmsg->request) { + DPRINT("Received unexpected msg type. Expected %d received %d", + vmsg->request, msg_reply.request); +- return false; ++ goto out; + } + +- return msg_reply.payload.u64 == 0; ++ result = msg_reply.payload.u64 == 0; ++ ++out: ++ pthread_mutex_unlock(&dev->slave_mutex); ++ return result; + } + + /* Kick the log_call_fd if required. */ +@@ -1105,10 +1116,13 @@ bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd, + return false; + } + ++ pthread_mutex_lock(&dev->slave_mutex); + if (!vu_message_write(dev, dev->slave_fd, &vmsg)) { ++ pthread_mutex_unlock(&dev->slave_mutex); + return false; + } + ++ /* Also unlocks the slave_mutex */ + return vu_process_message_reply(dev, &vmsg); + } + +@@ -1628,6 +1642,7 @@ vu_deinit(VuDev *dev) + close(dev->slave_fd); + dev->slave_fd = -1; + } ++ pthread_mutex_destroy(&dev->slave_mutex); + + if (dev->sock != -1) { + close(dev->sock); +@@ -1663,6 +1678,7 @@ vu_init(VuDev *dev, + dev->remove_watch = remove_watch; + dev->iface = iface; + dev->log_call_fd = -1; ++ pthread_mutex_init(&dev->slave_mutex, NULL); + dev->slave_fd = -1; + dev->max_queues = max_queues; + +diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h +index 46b6007..1844b6f 100644 +--- a/contrib/libvhost-user/libvhost-user.h ++++ b/contrib/libvhost-user/libvhost-user.h +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include "standard-headers/linux/virtio_ring.h" + + /* Based on qemu/hw/virtio/vhost-user.c */ +@@ -355,6 +356,8 @@ struct VuDev { + VuVirtq *vq; + VuDevInflightInfo inflight_info; + int log_call_fd; ++ /* Must be held while using slave_fd */ ++ pthread_mutex_t slave_mutex; + int slave_fd; + uint64_t log_size; + uint8_t *log_table; +-- +1.8.3.1 + diff --git a/kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch b/kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch new file mode 100755 index 0000000000000000000000000000000000000000..891b866b1fecba0e70ec074c7a4d4b42ac875c52 --- /dev/null +++ b/kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch @@ -0,0 +1,98 @@ +From 043decff5812c1f46ed44dd0f82099e3b8bb6a28 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Sun, 31 May 2020 16:40:35 +0100 +Subject: [PATCH 7/7] crypto.c: cleanup created file when + block_crypto_co_create_opts_luks fails + +RH-Author: Maxim Levitsky +Message-id: <20200531164035.34188-4-mlevitsk@redhat.com> +Patchwork-id: 97060 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 3/3] crypto.c: cleanup created file when block_crypto_co_create_opts_luks fails +Bugzilla: 1827630 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: John Snow +RH-Acked-by: Eric Blake + +From: Daniel Henrique Barboza + +When using a non-UTF8 secret to create a volume using qemu-img, the +following error happens: + +$ qemu-img create -f luks --object secret,id=vol_1_encrypt0,file=vol_resize_pool.vol_1.secret.qzVQrI -o key-secret=vol_1_encrypt0 /var/tmp/pool_target/vol_1 10240K + +Formatting '/var/tmp/pool_target/vol_1', fmt=luks size=10485760 key-secret=vol_1_encrypt0 +qemu-img: /var/tmp/pool_target/vol_1: Data from secret vol_1_encrypt0 is not valid UTF-8 + +However, the created file '/var/tmp/pool_target/vol_1' is left behind in the +file system after the failure. This behavior can be observed when creating +the volume using Libvirt, via 'virsh vol-create', and then getting "volume +target path already exist" errors when trying to re-create the volume. + +The volume file is created inside block_crypto_co_create_opts_luks(), in +block/crypto.c. If the bdrv_create_file() call is successful but any +succeeding step fails*, the existing 'fail' label does not take into +account the created file, leaving it behind. + +This patch changes block_crypto_co_create_opts_luks() to delete +'filename' in case of failure. A failure in this point means that +the volume is now truncated/corrupted, so even if 'filename' was an +existing volume before calling qemu-img, it is now unusable. Deleting +the file it is not much worse than leaving it in the filesystem in +this scenario, and we don't have to deal with checking the file +pre-existence in the code. + +* in our case, block_crypto_co_create_generic calls qcrypto_block_create, +which calls qcrypto_block_luks_create, and this function fails when +calling qcrypto_secret_lookup_as_utf8. + +Reported-by: Srikanth Aithal +Suggested-by: Kevin Wolf +Signed-off-by: Daniel Henrique Barboza +Message-Id: <20200130213907.2830642-4-danielhb413@gmail.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 1bba30da24e1124ceeb0693c81382a0d77e20ca5) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block/crypto.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/block/crypto.c b/block/crypto.c +index 970d463..5e3b15c 100644 +--- a/block/crypto.c ++++ b/block/crypto.c +@@ -30,6 +30,7 @@ + #include "qapi/error.h" + #include "qemu/module.h" + #include "qemu/option.h" ++#include "qemu/cutils.h" + #include "crypto.h" + + typedef struct BlockCrypto BlockCrypto; +@@ -597,6 +598,23 @@ static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv, + + ret = 0; + fail: ++ /* ++ * If an error occurred, delete 'filename'. Even if the file existed ++ * beforehand, it has been truncated and corrupted in the process. ++ */ ++ if (ret && bs) { ++ Error *local_delete_err = NULL; ++ int r_del = bdrv_co_delete_file(bs, &local_delete_err); ++ /* ++ * ENOTSUP will happen if the block driver doesn't support ++ * the 'bdrv_co_delete_file' interface. This is a predictable ++ * scenario and shouldn't be reported back to the user. ++ */ ++ if ((r_del < 0) && (r_del != -ENOTSUP)) { ++ error_report_err(local_delete_err); ++ } ++ } ++ + bdrv_unref(bs); + qapi_free_QCryptoBlockCreateOptions(create_opts); + qobject_unref(cryptoopts); +-- +1.8.3.1 + diff --git a/kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch b/kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch new file mode 100755 index 0000000000000000000000000000000000000000..a6177c63d43bb133dead0e84ccd586d728ec8cd3 --- /dev/null +++ b/kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch @@ -0,0 +1,56 @@ +From f01178897c8f5ff98692a22059dd65e35677eaa3 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Mon, 10 Feb 2020 17:33:58 +0000 +Subject: [PATCH 18/18] docs/arm-cpu-features: Make kvm-no-adjvtime comment + clearer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200210173358.16896-3-drjones@redhat.com> +Patchwork-id: 93772 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] docs/arm-cpu-features: Make kvm-no-adjvtime comment clearer +Bugzilla: 1801320 +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan +RH-Acked-by: Philippe Mathieu-Daudé + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1801320 + +Author: Philippe Mathieu-Daudé +Date: Fri, 07 Feb 2020 14:04:28 +0000 + + docs/arm-cpu-features: Make kvm-no-adjvtime comment clearer + + The bold text sounds like 'knock knock'. Only bolding the + second 'not' makes it easier to read. + + Fixes: dea101a1ae + Signed-off-by: Philippe Mathieu-Daudé + Reviewed-by: Andrew Jones + Message-id: 20200206225148.23923-1-philmd@redhat.com + Signed-off-by: Peter Maydell + +(cherry picked from commit fa3236a970b6ea5be3fa3ad258f1a75920ca1ebb) +Signed-off-by: Danilo C. L. de Paula +--- + docs/arm-cpu-features.rst | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/docs/arm-cpu-features.rst b/docs/arm-cpu-features.rst +index 45d1eb6..48d5054 100644 +--- a/docs/arm-cpu-features.rst ++++ b/docs/arm-cpu-features.rst +@@ -185,7 +185,7 @@ the list of KVM VCPU features and their descriptions. + + kvm-no-adjvtime By default kvm-no-adjvtime is disabled. This + means that by default the virtual time +- adjustment is enabled (vtime is *not not* ++ adjustment is enabled (vtime is not *not* + adjusted). + + When virtual time adjustment is enabled each +-- +1.8.3.1 + diff --git a/kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch b/kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch new file mode 100755 index 0000000000000000000000000000000000000000..77e99eb71ca2c2ad578c67704c9bd0959a9258dd --- /dev/null +++ b/kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch @@ -0,0 +1,53 @@ +From a6f0bef82cdd84844a06dac1e6d279d95824d827 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:41 -0400 +Subject: [PATCH 3/9] dp8393x: switch to use qemu_receive_packet() for loopback + packet +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-4-jmaloy@redhat.com> +Patchwork-id: 101789 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 3/9] dp8393x: switch to use qemu_receive_packet() for loopback packet +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé + +(cherry picked from commit 331d2ac9ea307c990dc86e6493e8f0c48d14bb33) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/dp8393x.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c +index 3d991af163..6d55b5de64 100644 +--- a/hw/net/dp8393x.c ++++ b/hw/net/dp8393x.c +@@ -482,7 +482,7 @@ static void dp8393x_do_transmit_packets(dp8393xState *s) + s->regs[SONIC_TCR] |= SONIC_TCR_CRSL; + if (nc->info->can_receive(nc)) { + s->loopback_packet = 1; +- nc->info->receive(nc, s->tx_buffer, tx_len); ++ qemu_receive_packet(nc, s->tx_buffer, tx_len); + } + } else { + /* Transmit packet */ +-- +2.27.0 + diff --git a/kvm-e1000-fail-early-for-evil-descriptor.patch b/kvm-e1000-fail-early-for-evil-descriptor.patch new file mode 100755 index 0000000000000000000000000000000000000000..e599b7c71e41c74014aadddd61d728f451db4cfd --- /dev/null +++ b/kvm-e1000-fail-early-for-evil-descriptor.patch @@ -0,0 +1,65 @@ +From 7bd3000cf22a91e6bc6afc1e7adbf0ae1b731104 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 22:45:17 -0400 +Subject: [PATCH 2/5] e1000: fail early for evil descriptor + +RH-Author: Jon Maloy +Message-id: <20210413224517.3841507-2-jmaloy@redhat.com> +Patchwork-id: 101473 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] e1000: fail early for evil descriptor +Bugzilla: 1930092 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Stefan Hajnoczi + +From: Jason Wang + +During procss_tx_desc(), driver can try to chain data descriptor with +legacy descriptor, when will lead underflow for the following +calculation in process_tx_desc() for bytes: + + if (tp->size + bytes > msh) + bytes = msh - tp->size; + +This will lead a infinite loop. So check and fail early if tp->size if +greater or equal to msh. + +Reported-by: Alexander Bulekov +Reported-by: Cheolwoo Myung +Reported-by: Ruhr-University Bochum +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Signed-off-by: Jason Wang + +(cherry picked from commit 3de46e6fc489c52c9431a8a832ad8170a7569bd8) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/e1000.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index fc73fdd6fa..fe56bccd52 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -671,6 +671,9 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) + msh = tp->tso_props.hdr_len + tp->tso_props.mss; + do { + bytes = split_size; ++ if (tp->size >= msh) { ++ goto eop; ++ } + if (tp->size + bytes > msh) + bytes = msh - tp->size; + +@@ -696,6 +699,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) + tp->size += split_size; + } + ++eop: + if (!(txd_lower & E1000_TXD_CMD_EOP)) + return; + if (!(tp->cptse && tp->size < tp->tso_props.hdr_len)) { +-- +2.27.0 + diff --git a/kvm-e1000-fix-tx-re-entrancy-problem.patch b/kvm-e1000-fix-tx-re-entrancy-problem.patch new file mode 100755 index 0000000000000000000000000000000000000000..55aae5e5ba7c2cdb319e4f1cff2cb58fb6d2616e --- /dev/null +++ b/kvm-e1000-fix-tx-re-entrancy-problem.patch @@ -0,0 +1,71 @@ +From fc0bca7bd2685b8f8e3c37f19ce74967870ef952 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 21 Oct 2021 12:10:47 -0400 +Subject: [PATCH 2/2] e1000: fix tx re-entrancy problem +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +RH-MergeRequest: 73: e1000: fix tx re-entrancy problem +RH-Commit: [1/1] 3088ea275ddcee1ba0d47f7cff195af3e256f15f (jmaloy/qemu-kvm) +RH-Bugzilla: 2025011 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier + +The fact that the MMIO handler is not re-entrant causes an infinite +loop under certain conditions: + +Guest write to TDT -> Loopback -> RX (DMA to TDT) -> TX + +We now eliminate the effect of this problem locally in e1000, by adding +a boolean in struct E1000State indicating when the TX side is busy. This +will cause any entering new call to return early instead of interfering +with the ongoing work, and eliminates any risk of looping. + +This is intended to address CVE-2021-20257. + +Signed-off-by: Jon Maloy +Signed-off-by: Jason Wang +(cherry picked from commit 25ddb946e6301f42cff3094ea1c25fb78813e7e9) +Signed-off-by: Jon Maloy +--- + hw/net/e1000.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index 8680b7d46b..1963a5b243 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -105,6 +105,7 @@ typedef struct E1000State_st { + e1000x_txd_props props; + e1000x_txd_props tso_props; + uint16_t tso_frames; ++ bool busy; + } tx; + + struct { +@@ -749,6 +750,11 @@ start_xmit(E1000State *s) + return; + } + ++ if (s->tx.busy) { ++ return; ++ } ++ s->tx.busy = true; ++ + while (s->mac_reg[TDH] != s->mac_reg[TDT]) { + base = tx_desc_base(s) + + sizeof(struct e1000_tx_desc) * s->mac_reg[TDH]; +@@ -775,6 +781,7 @@ start_xmit(E1000State *s) + break; + } + } ++ s->tx.busy = false; + set_ics(s, 0, cause); + } + +-- +2.27.0 + diff --git a/kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch b/kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch new file mode 100755 index 0000000000000000000000000000000000000000..05ff372efe1722a56a725364b2323090264a252f --- /dev/null +++ b/kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch @@ -0,0 +1,52 @@ +From 128b97f6049144af3c1a41ceb8e8583419edcd69 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:40 -0400 +Subject: [PATCH 2/9] e1000: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-3-jmaloy@redhat.com> +Patchwork-id: 101784 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 2/9] e1000: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 1caff0340f49c93d535c6558a5138d20d475315c) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/e1000.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/e1000.c b/hw/net/e1000.c +index fe56bccd52..8680b7d46b 100644 +--- a/hw/net/e1000.c ++++ b/hw/net/e1000.c +@@ -547,7 +547,7 @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) + + NetClientState *nc = qemu_get_queue(s->nic); + if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { +- nc->info->receive(nc, buf, size); ++ qemu_receive_packet(nc, buf, size); + } else { + qemu_send_packet(nc, buf, size); + } +-- +2.27.0 + diff --git a/kvm-enable-ramfb.patch b/kvm-enable-ramfb.patch new file mode 100755 index 0000000000000000000000000000000000000000..fa2fe114f3da8cf209b25aa313f94618c73ddd8d --- /dev/null +++ b/kvm-enable-ramfb.patch @@ -0,0 +1,72 @@ +From 441128e2f13a56d4949b70971edd2f6902772959 Mon Sep 17 00:00:00 2001 +From: Gerd Hoffmann +Date: Wed, 3 Jun 2020 15:15:56 +0100 +Subject: [PATCH 01/17] enable ramfb + +RH-Author: Gerd Hoffmann +Message-id: <20200603151556.1195-2-kraxel@redhat.com> +Patchwork-id: 97097 +O-Subject: [RHEL-AV-8.2.0.z qemu-kvm PATCH 1/1] enable ramfb +Bugzilla: 1841068 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Kevin Wolf +RH-Acked-by: Stefan Hajnoczi + +--- + hw/vfio/pci.c | 5 ----- + hw/display/Makefile.objs | 5 ++--- + 2 files changed, 2 insertions(+), 8 deletions(-) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/display/Makefile.objs | 5 ++--- + hw/vfio/pci.c | 5 ----- + 2 files changed, 2 insertions(+), 8 deletions(-) + +diff --git a/hw/display/Makefile.objs b/hw/display/Makefile.objs +index 3d0cda1..f2182e3 100644 +--- a/hw/display/Makefile.objs ++++ b/hw/display/Makefile.objs +@@ -1,9 +1,8 @@ + common-obj-$(CONFIG_DDC) += i2c-ddc.o + common-obj-$(CONFIG_EDID) += edid-generate.o edid-region.o + +-# Disabled for Red Hat Enterprise Linux +-#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o +-#common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o ++common-obj-$(CONFIG_FW_CFG_DMA) += ramfb.o ++common-obj-$(CONFIG_FW_CFG_DMA) += ramfb-standalone.o + + common-obj-$(CONFIG_ADS7846) += ads7846.o + common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index d717520..f191904 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3249,7 +3249,6 @@ static const TypeInfo vfio_pci_dev_info = { + }, + }; + +-#if 0 /* Disabled for Red Hat Enterprise Linux */ + static Property vfio_pci_dev_nohotplug_properties[] = { + DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), + DEFINE_PROP_END_OF_LIST(), +@@ -3269,15 +3268,11 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { + .instance_size = sizeof(VFIOPCIDevice), + .class_init = vfio_pci_nohotplug_dev_class_init, + }; +-#endif + + static void register_vfio_pci_dev_type(void) + { + type_register_static(&vfio_pci_dev_info); +- +-#if 0 /* Disabled for Red Hat Enterprise Linux */ + type_register_static(&vfio_pci_nohotplug_dev_info); +-#endif + } + + type_init(register_vfio_pci_dev_type) +-- +1.8.3.1 + diff --git a/kvm-error-Document-Error-API-usage-rules.patch b/kvm-error-Document-Error-API-usage-rules.patch new file mode 100755 index 0000000000000000000000000000000000000000..fb9f1b017ed471bcb9ebde151ef34ad348d83c1f --- /dev/null +++ b/kvm-error-Document-Error-API-usage-rules.patch @@ -0,0 +1,154 @@ +From b2ac3e491eb7f18a421e2b1132e527d484681767 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:09 -0500 +Subject: [PATCH 08/14] error: Document Error API usage rules +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-5-marcandre.lureau@redhat.com> +Patchwork-id: 100477 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 04/10] error: Document Error API usage rules +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Markus Armbruster + +This merely codifies existing practice, with one exception: the rule +advising against returning void, where existing practice is mixed. + +When the Error API was created, we adopted the (unwritten) rule to +return void when the function returns no useful value on success, +unlike GError, which recommends to return true on success and false on +error then. + +When a function returns a distinct error value, say false, a checked +call that passes the error up looks like + + if (!frobnicate(..., errp)) { + handle the error... + } + +When it returns void, we need + + Error *err = NULL; + + frobnicate(..., &err); + if (err) { + handle the error... + error_propagate(errp, err); + } + +Not only is this more verbose, it also creates an Error object even +when @errp is null, &error_abort or &error_fatal. + +People got tired of the additional boilerplate, and started to ignore +the unwritten rule. The result is confusion among developers about +the preferred usage. + +Make the rule advising against returning void official by putting it +in writing. This will hopefully reduce confusion. + +Update the examples accordingly. + +The remainder of this series will update a substantial amount of code +to honor the rule. + +Signed-off-by: Markus Armbruster +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Greg Kurz +Message-Id: <20200707160613.848843-4-armbru@redhat.com> + +(cherry picked from commit e3fe3988d7851cac30abffae06d2f555ff7bee62) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 52 +++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 46 insertions(+), 6 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 3351fe76368..08d48e74836 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -15,6 +15,33 @@ + /* + * Error reporting system loosely patterned after Glib's GError. + * ++ * = Rules = ++ * ++ * - Functions that use Error to report errors have an Error **errp ++ * parameter. It should be the last parameter, except for functions ++ * taking variable arguments. ++ * ++ * - You may pass NULL to not receive the error, &error_abort to abort ++ * on error, &error_fatal to exit(1) on error, or a pointer to a ++ * variable containing NULL to receive the error. ++ * ++ * - Separation of concerns: the function is responsible for detecting ++ * errors and failing cleanly; handling the error is its caller's ++ * job. Since the value of @errp is about handling the error, the ++ * function should not examine it. ++ * ++ * - On success, the function should not touch *errp. On failure, it ++ * should set a new error, e.g. with error_setg(errp, ...), or ++ * propagate an existing one, e.g. with error_propagate(errp, ...). ++ * ++ * - Whenever practical, also return a value that indicates success / ++ * failure. This can make the error checking more concise, and can ++ * avoid useless error object creation and destruction. Note that ++ * we still have many functions returning void. We recommend ++ * • bool-valued functions return true on success / false on failure, ++ * • pointer-valued functions return non-null / null pointer, and ++ * • integer-valued functions return non-negative / negative. ++ * + * = Creating errors = + * + * Create an error: +@@ -95,14 +122,13 @@ + * Create a new error and pass it to the caller: + * error_setg(errp, "situation normal, all fouled up"); + * +- * Call a function and receive an error from it: +- * Error *err = NULL; +- * foo(arg, &err); +- * if (err) { ++ * Call a function, receive an error from it, and pass it to the caller ++ * - when the function returns a value that indicates failure, say ++ * false: ++ * if (!foo(arg, errp)) { + * handle the error... + * } +- * +- * Receive an error and pass it on to the caller: ++ * - when it does not, say because it is a void function: + * Error *err = NULL; + * foo(arg, &err); + * if (err) { +@@ -120,6 +146,20 @@ + * foo(arg, errp); + * for readability. + * ++ * Receive an error, and handle it locally ++ * - when the function returns a value that indicates failure, say ++ * false: ++ * Error *err = NULL; ++ * if (!foo(arg, &err)) { ++ * handle the error... ++ * } ++ * - when it does not, say because it is a void function: ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * if (err) { ++ * handle the error... ++ * } ++ * + * Receive and accumulate multiple errors (first one wins): + * Error *err = NULL, *local_err = NULL; + * foo(arg, &err); +-- +2.27.0 + diff --git a/kvm-error-Fix-examples-in-error.h-s-big-comment.patch b/kvm-error-Fix-examples-in-error.h-s-big-comment.patch new file mode 100755 index 0000000000000000000000000000000000000000..ee14eb576cd603937a5ac0e207917b03607c5cf0 --- /dev/null +++ b/kvm-error-Fix-examples-in-error.h-s-big-comment.patch @@ -0,0 +1,85 @@ +From fe7dd779a9674dc54ffe296247ae6559f2b55b22 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:07 -0500 +Subject: [PATCH 06/14] error: Fix examples in error.h's big comment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-3-marcandre.lureau@redhat.com> +Patchwork-id: 100473 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 02/10] error: Fix examples in error.h's big comment +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Markus Armbruster + +Mark a bad example more clearly. Fix the error_propagate_prepend() +example. Add a missing declaration and a second error pileup example. + +Signed-off-by: Markus Armbruster +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Greg Kurz +Message-Id: <20200707160613.848843-2-armbru@redhat.com> + +(cherry picked from commit 47ff5ac81e8bb3096500de7b132051691d533d36) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 3f95141a01a..83c38f9a188 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -24,7 +24,7 @@ + * "charm, top, bottom.\n"); + * + * Do *not* contract this to +- * error_setg(&err, "invalid quark\n" ++ * error_setg(&err, "invalid quark\n" // WRONG! + * "Valid quarks are up, down, strange, charm, top, bottom."); + * + * Report an error to the current monitor if we have one, else stderr: +@@ -52,7 +52,8 @@ + * where Error **errp is a parameter, by convention the last one. + * + * Pass an existing error to the caller with the message modified: +- * error_propagate_prepend(errp, err); ++ * error_propagate_prepend(errp, err, ++ * "Could not frobnicate '%s': ", name); + * + * Avoid + * error_propagate(errp, err); +@@ -108,12 +109,23 @@ + * } + * + * Do *not* "optimize" this to ++ * Error *err = NULL; + * foo(arg, &err); + * bar(arg, &err); // WRONG! + * if (err) { + * handle the error... + * } + * because this may pass a non-null err to bar(). ++ * ++ * Likewise, do *not* ++ * Error *err = NULL; ++ * if (cond1) { ++ * error_setg(&err, ...); ++ * } ++ * if (cond2) { ++ * error_setg(&err, ...); // WRONG! ++ * } ++ * because this may pass a non-null err to error_setg(). + */ + + #ifndef ERROR_H +-- +2.27.0 + diff --git a/kvm-error-Improve-error.h-s-big-comment.patch b/kvm-error-Improve-error.h-s-big-comment.patch new file mode 100755 index 0000000000000000000000000000000000000000..0ad43678b84e2c41dd390afe22e61d389d2de0b2 --- /dev/null +++ b/kvm-error-Improve-error.h-s-big-comment.patch @@ -0,0 +1,146 @@ +From 439c11850165fd838e367aa6d4fff4af951a5bd9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:08 -0500 +Subject: [PATCH 07/14] error: Improve error.h's big comment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-4-marcandre.lureau@redhat.com> +Patchwork-id: 100474 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 03/10] error: Improve error.h's big comment +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Markus Armbruster + +Add headlines to the big comment. + +Explain examples for NULL, &error_abort and &error_fatal argument +better. + +Tweak rationale for error_propagate_prepend(). + +Signed-off-by: Markus Armbruster +Message-Id: <20200707160613.848843-3-armbru@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Greg Kurz + +(cherry picked from commit 9aac7d486cc792191c25c30851f501624b0c2751) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 51 +++++++++++++++++++++++++++++++------------- + 1 file changed, 36 insertions(+), 15 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 83c38f9a188..3351fe76368 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -15,6 +15,8 @@ + /* + * Error reporting system loosely patterned after Glib's GError. + * ++ * = Creating errors = ++ * + * Create an error: + * error_setg(&err, "situation normal, all fouled up"); + * +@@ -27,6 +29,8 @@ + * error_setg(&err, "invalid quark\n" // WRONG! + * "Valid quarks are up, down, strange, charm, top, bottom."); + * ++ * = Reporting and destroying errors = ++ * + * Report an error to the current monitor if we have one, else stderr: + * error_report_err(err); + * This frees the error object. +@@ -40,6 +44,30 @@ + * error_free(err); + * Note that this loses hints added with error_append_hint(). + * ++ * Call a function ignoring errors: ++ * foo(arg, NULL); ++ * This is more concise than ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * error_free(err); // don't do this ++ * ++ * Call a function aborting on errors: ++ * foo(arg, &error_abort); ++ * This is more concise and fails more nicely than ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * assert(!err); // don't do this ++ * ++ * Call a function treating errors as fatal: ++ * foo(arg, &error_fatal); ++ * This is more concise than ++ * Error *err = NULL; ++ * foo(arg, &err); ++ * if (err) { // don't do this ++ * error_report_err(err); ++ * exit(1); ++ * } ++ * + * Handle an error without reporting it (just for completeness): + * error_free(err); + * +@@ -47,6 +75,11 @@ + * reporting it (primarily useful in testsuites): + * error_free_or_abort(&err); + * ++ * = Passing errors around = ++ * ++ * Errors get passed to the caller through the conventional @errp ++ * parameter. ++ * + * Pass an existing error to the caller: + * error_propagate(errp, err); + * where Error **errp is a parameter, by convention the last one. +@@ -54,11 +87,10 @@ + * Pass an existing error to the caller with the message modified: + * error_propagate_prepend(errp, err, + * "Could not frobnicate '%s': ", name); +- * +- * Avoid +- * error_propagate(errp, err); ++ * This is more concise than ++ * error_propagate(errp, err); // don't do this + * error_prepend(errp, "Could not frobnicate '%s': ", name); +- * because this fails to prepend when @errp is &error_fatal. ++ * and works even when @errp is &error_fatal. + * + * Create a new error and pass it to the caller: + * error_setg(errp, "situation normal, all fouled up"); +@@ -70,15 +102,6 @@ + * handle the error... + * } + * +- * Call a function ignoring errors: +- * foo(arg, NULL); +- * +- * Call a function aborting on errors: +- * foo(arg, &error_abort); +- * +- * Call a function treating errors as fatal: +- * foo(arg, &error_fatal); +- * + * Receive an error and pass it on to the caller: + * Error *err = NULL; + * foo(arg, &err); +@@ -86,8 +109,6 @@ + * handle the error... + * error_propagate(errp, err); + * } +- * where Error **errp is a parameter, by convention the last one. +- * + * Do *not* "optimize" this to + * foo(arg, errp); + * if (*errp) { // WRONG! +-- +2.27.0 + diff --git a/kvm-error-New-macro-ERRP_GUARD.patch b/kvm-error-New-macro-ERRP_GUARD.patch new file mode 100755 index 0000000000000000000000000000000000000000..d67ad7cea34036a338efd400c06dd838c835c048 --- /dev/null +++ b/kvm-error-New-macro-ERRP_GUARD.patch @@ -0,0 +1,305 @@ +From 46c3298774b976cc6a1cd834751e644fb482b08e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:10 -0500 +Subject: [PATCH 09/14] error: New macro ERRP_GUARD() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-6-marcandre.lureau@redhat.com> +Patchwork-id: 100476 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 05/10] error: New macro ERRP_GUARD() +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Vladimir Sementsov-Ogievskiy + +Introduce a new ERRP_GUARD() macro, to be used at start of functions +with an errp OUT parameter. + +It has three goals: + +1. Fix issue with error_fatal and error_prepend/error_append_hint: the +user can't see this additional information, because exit() happens in +error_setg earlier than information is added. [Reported by Greg Kurz] + +2. Fix issue with error_abort and error_propagate: when we wrap +error_abort by local_err+error_propagate, the resulting coredump will +refer to error_propagate and not to the place where error happened. +(the macro itself doesn't fix the issue, but it allows us to [3.] drop +the local_err+error_propagate pattern, which will definitely fix the +issue) [Reported by Kevin Wolf] + +3. Drop local_err+error_propagate pattern, which is used to workaround +void functions with errp parameter, when caller wants to know resulting +status. (Note: actually these functions could be merely updated to +return int error code). + +To achieve these goals, later patches will add invocations +of this macro at the start of functions with either use +error_prepend/error_append_hint (solving 1) or which use +local_err+error_propagate to check errors, switching those +functions to use *errp instead (solving 2 and 3). + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Paul Durrant +Reviewed-by: Greg Kurz +Reviewed-by: Eric Blake +[Merge comments properly with recent commit "error: Document Error API +usage rules", and edit for clarity. Put ERRP_AUTO_PROPAGATE() before +its helpers, and touch up style. Tweak commit message.] +Signed-off-by: Markus Armbruster +Message-Id: <20200707165037.1026246-2-armbru@redhat.com> + +(cherry picked from commit ae7c80a7bd73685437bf6ba9d7c26098351f4166) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qapi/error.h | 158 +++++++++++++++++++++++++++++++++++++------ + 1 file changed, 139 insertions(+), 19 deletions(-) + +diff --git a/include/qapi/error.h b/include/qapi/error.h +index 08d48e74836..e658790acfc 100644 +--- a/include/qapi/error.h ++++ b/include/qapi/error.h +@@ -30,6 +30,10 @@ + * job. Since the value of @errp is about handling the error, the + * function should not examine it. + * ++ * - The function may pass @errp to functions it calls to pass on ++ * their errors to its caller. If it dereferences @errp to check ++ * for errors, it must use ERRP_GUARD(). ++ * + * - On success, the function should not touch *errp. On failure, it + * should set a new error, e.g. with error_setg(errp, ...), or + * propagate an existing one, e.g. with error_propagate(errp, ...). +@@ -45,15 +49,17 @@ + * = Creating errors = + * + * Create an error: +- * error_setg(&err, "situation normal, all fouled up"); ++ * error_setg(errp, "situation normal, all fouled up"); ++ * where @errp points to the location to receive the error. + * + * Create an error and add additional explanation: +- * error_setg(&err, "invalid quark"); +- * error_append_hint(&err, "Valid quarks are up, down, strange, " ++ * error_setg(errp, "invalid quark"); ++ * error_append_hint(errp, "Valid quarks are up, down, strange, " + * "charm, top, bottom.\n"); ++ * This may require use of ERRP_GUARD(); more on that below. + * + * Do *not* contract this to +- * error_setg(&err, "invalid quark\n" // WRONG! ++ * error_setg(errp, "invalid quark\n" // WRONG! + * "Valid quarks are up, down, strange, charm, top, bottom."); + * + * = Reporting and destroying errors = +@@ -107,18 +113,6 @@ + * Errors get passed to the caller through the conventional @errp + * parameter. + * +- * Pass an existing error to the caller: +- * error_propagate(errp, err); +- * where Error **errp is a parameter, by convention the last one. +- * +- * Pass an existing error to the caller with the message modified: +- * error_propagate_prepend(errp, err, +- * "Could not frobnicate '%s': ", name); +- * This is more concise than +- * error_propagate(errp, err); // don't do this +- * error_prepend(errp, "Could not frobnicate '%s': ", name); +- * and works even when @errp is &error_fatal. +- * + * Create a new error and pass it to the caller: + * error_setg(errp, "situation normal, all fouled up"); + * +@@ -129,18 +123,26 @@ + * handle the error... + * } + * - when it does not, say because it is a void function: ++ * ERRP_GUARD(); ++ * foo(arg, errp); ++ * if (*errp) { ++ * handle the error... ++ * } ++ * More on ERRP_GUARD() below. ++ * ++ * Code predating ERRP_GUARD() still exists, and looks like this: + * Error *err = NULL; + * foo(arg, &err); + * if (err) { + * handle the error... +- * error_propagate(errp, err); ++ * error_propagate(errp, err); // deprecated + * } +- * Do *not* "optimize" this to ++ * Avoid in new code. Do *not* "optimize" it to + * foo(arg, errp); + * if (*errp) { // WRONG! + * handle the error... + * } +- * because errp may be NULL! ++ * because errp may be NULL without the ERRP_GUARD() guard. + * + * But when all you do with the error is pass it on, please use + * foo(arg, errp); +@@ -160,6 +162,19 @@ + * handle the error... + * } + * ++ * Pass an existing error to the caller: ++ * error_propagate(errp, err); ++ * This is rarely needed. When @err is a local variable, use of ++ * ERRP_GUARD() commonly results in more readable code. ++ * ++ * Pass an existing error to the caller with the message modified: ++ * error_propagate_prepend(errp, err, ++ * "Could not frobnicate '%s': ", name); ++ * This is more concise than ++ * error_propagate(errp, err); // don't do this ++ * error_prepend(errp, "Could not frobnicate '%s': ", name); ++ * and works even when @errp is &error_fatal. ++ * + * Receive and accumulate multiple errors (first one wins): + * Error *err = NULL, *local_err = NULL; + * foo(arg, &err); +@@ -187,6 +202,69 @@ + * error_setg(&err, ...); // WRONG! + * } + * because this may pass a non-null err to error_setg(). ++ * ++ * = Why, when and how to use ERRP_GUARD() = ++ * ++ * Without ERRP_GUARD(), use of the @errp parameter is restricted: ++ * - It must not be dereferenced, because it may be null. ++ * - It should not be passed to error_prepend() or ++ * error_append_hint(), because that doesn't work with &error_fatal. ++ * ERRP_GUARD() lifts these restrictions. ++ * ++ * To use ERRP_GUARD(), add it right at the beginning of the function. ++ * @errp can then be used without worrying about the argument being ++ * NULL or &error_fatal. ++ * ++ * Using it when it's not needed is safe, but please avoid cluttering ++ * the source with useless code. ++ * ++ * = Converting to ERRP_GUARD() = ++ * ++ * To convert a function to use ERRP_GUARD(): ++ * ++ * 0. If the Error ** parameter is not named @errp, rename it to ++ * @errp. ++ * ++ * 1. Add an ERRP_GUARD() invocation, by convention right at the ++ * beginning of the function. This makes @errp safe to use. ++ * ++ * 2. Replace &err by errp, and err by *errp. Delete local variable ++ * @err. ++ * ++ * 3. Delete error_propagate(errp, *errp), replace ++ * error_propagate_prepend(errp, *errp, ...) by error_prepend(errp, ...) ++ * ++ * 4. Ensure @errp is valid at return: when you destroy *errp, set ++ * errp = NULL. ++ * ++ * Example: ++ * ++ * bool fn(..., Error **errp) ++ * { ++ * Error *err = NULL; ++ * ++ * foo(arg, &err); ++ * if (err) { ++ * handle the error... ++ * error_propagate(errp, err); ++ * return false; ++ * } ++ * ... ++ * } ++ * ++ * becomes ++ * ++ * bool fn(..., Error **errp) ++ * { ++ * ERRP_GUARD(); ++ * ++ * foo(arg, errp); ++ * if (*errp) { ++ * handle the error... ++ * return false; ++ * } ++ * ... ++ * } + */ + + #ifndef ERROR_H +@@ -287,6 +365,7 @@ void error_setg_win32_internal(Error **errp, + * the error object. + * Else, move the error object from @local_err to *@dst_errp. + * On return, @local_err is invalid. ++ * Please use ERRP_GUARD() instead when possible. + * Please don't error_propagate(&error_fatal, ...), use + * error_report_err() and exit(), because that's more obvious. + */ +@@ -298,6 +377,7 @@ void error_propagate(Error **dst_errp, Error *local_err); + * Behaves like + * error_prepend(&local_err, fmt, ...); + * error_propagate(dst_errp, local_err); ++ * Please use ERRP_GUARD() and error_prepend() instead when possible. + */ + void error_propagate_prepend(Error **dst_errp, Error *local_err, + const char *fmt, ...); +@@ -395,6 +475,46 @@ void error_set_internal(Error **errp, + ErrorClass err_class, const char *fmt, ...) + GCC_FMT_ATTR(6, 7); + ++/* ++ * Make @errp parameter easier to use regardless of argument value ++ * ++ * This macro is for use right at the beginning of a function that ++ * takes an Error **errp parameter to pass errors to its caller. The ++ * parameter must be named @errp. ++ * ++ * It must be used when the function dereferences @errp or passes ++ * @errp to error_prepend(), error_vprepend(), or error_append_hint(). ++ * It is safe to use even when it's not needed, but please avoid ++ * cluttering the source with useless code. ++ * ++ * If @errp is NULL or &error_fatal, rewrite it to point to a local ++ * Error variable, which will be automatically propagated to the ++ * original @errp on function exit. ++ * ++ * Note: &error_abort is not rewritten, because that would move the ++ * abort from the place where the error is created to the place where ++ * it's propagated. ++ */ ++#define ERRP_GUARD() \ ++ g_auto(ErrorPropagator) _auto_errp_prop = {.errp = errp}; \ ++ do { \ ++ if (!errp || errp == &error_fatal) { \ ++ errp = &_auto_errp_prop.local_err; \ ++ } \ ++ } while (0) ++ ++typedef struct ErrorPropagator { ++ Error *local_err; ++ Error **errp; ++} ErrorPropagator; ++ ++static inline void error_propagator_cleanup(ErrorPropagator *prop) ++{ ++ error_propagate(prop->errp, prop->local_err); ++} ++ ++G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(ErrorPropagator, error_propagator_cleanup); ++ + /* + * Special error destination to abort on error. + * See error_setg() and error_propagate() for details. +-- +2.27.0 + diff --git a/kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch b/kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch new file mode 100755 index 0000000000000000000000000000000000000000..5d44708dfc18162cf02cc2e630b246bfe4b1bf0f --- /dev/null +++ b/kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch @@ -0,0 +1,85 @@ +From 5770fe43fe1e15e6f53cfd3705605e8645b95a98 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 13 Mar 2020 17:17:08 +0000 +Subject: [PATCH 20/20] exec/rom_reset: Free rom data during inmigrate skip +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200313171708.242774-1-dgilbert@redhat.com> +Patchwork-id: 94292 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] exec/rom_reset: Free rom data during inmigrate skip +Bugzilla: 1809380 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Paolo Bonzini + +From: "Dr. David Alan Gilbert" + +bz: https://bugzilla.redhat.com/show_bug.cgi?id=1809380 +brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27249921 +branch: rhel-av-8.2.0 +upstream: Posted and with review-by, not merged yet + +Commit 355477f8c73e9 skips rom reset when we're an incoming migration +so as not to overwrite shared ram in the ignore-shared migration +optimisation. +However, it's got an unexpected side effect that because it skips +freeing the ROM data, when rom_reset gets called later on, after +migration (e.g. during a reboot), the ROM does get reset to the original +file contents. Because of seabios/x86's weird reboot process +this confuses a reboot into hanging after a migration. + +Fixes: 355477f8c73e9 ("migration: do not rom_reset() during incoming migration") +https://bugzilla.redhat.com/show_bug.cgi?id=1809380 + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/loader.c | 25 ++++++++++++++++--------- + 1 file changed, 16 insertions(+), 9 deletions(-) + +diff --git a/hw/core/loader.c b/hw/core/loader.c +index 5099f27..375b29b 100644 +--- a/hw/core/loader.c ++++ b/hw/core/loader.c +@@ -1118,19 +1118,26 @@ static void rom_reset(void *unused) + { + Rom *rom; + +- /* +- * We don't need to fill in the RAM with ROM data because we'll fill +- * the data in during the next incoming migration in all cases. Note +- * that some of those RAMs can actually be modified by the guest on ARM +- * so this is probably the only right thing to do here. +- */ +- if (runstate_check(RUN_STATE_INMIGRATE)) +- return; +- + QTAILQ_FOREACH(rom, &roms, next) { + if (rom->fw_file) { + continue; + } ++ /* ++ * We don't need to fill in the RAM with ROM data because we'll fill ++ * the data in during the next incoming migration in all cases. Note ++ * that some of those RAMs can actually be modified by the guest. ++ */ ++ if (runstate_check(RUN_STATE_INMIGRATE)) { ++ if (rom->data && rom->isrom) { ++ /* ++ * Free it so that a rom_reset after migration doesn't ++ * overwrite a potentially modified 'rom'. ++ */ ++ rom_free_data(rom); ++ } ++ continue; ++ } ++ + if (rom->data == NULL) { + continue; + } +-- +1.8.3.1 + diff --git a/kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch b/kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch new file mode 100755 index 0000000000000000000000000000000000000000..aa471087cd4ece79a0f7648b5e09e519c2003a00 --- /dev/null +++ b/kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch @@ -0,0 +1,96 @@ +From 4e553943c8fe4924d194884b4719c5459210c686 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Tue, 26 Jan 2021 17:21:03 -0500 +Subject: [PATCH 8/9] file-posix: Allow byte-aligned O_DIRECT with NFS + +RH-Author: Kevin Wolf +Message-id: <20210126172103.136060-3-kwolf@redhat.com> +Patchwork-id: 100785 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/2] file-posix: Allow byte-aligned O_DIRECT with NFS +Bugzilla: 1834281 +RH-Acked-by: Markus Armbruster +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +Since commit a6b257a08e3 ('file-posix: Handle undetectable alignment'), +we assume that if we open a file with O_DIRECT and alignment probing +returns 1, we just couldn't find out the real alignment requirement +because some filesystems make the requirement only for allocated blocks. +In this case, a safe default of 4k is used. + +This is too strict for NFS, which does actually allow byte-aligned +requests even with O_DIRECT. Because we can't distinguish both cases +with generic code, let's just look at the file system magic and disable +s->needs_alignment for NFS. This way, O_DIRECT can still be used on NFS +for images that are not aligned to 4k. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200716142601.111237-3-kwolf@redhat.com> +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 5edc85571e7b7269dce408735eba7507f18ac666) +Signed-off-by: Kevin Wolf +Signed-off-by: Jon Maloy +--- + block/file-posix.c | 26 +++++++++++++++++++++++++- + 1 file changed, 25 insertions(+), 1 deletion(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index adafbfa1be..2d834fbdf6 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -61,10 +61,12 @@ + #include + #include + #include ++#include + #include + #include + #include + #include ++#include + #include + #ifdef __s390__ + #include +@@ -298,6 +300,28 @@ static int probe_physical_blocksize(int fd, unsigned int *blk_size) + #endif + } + ++/* ++ * Returns true if no alignment restrictions are necessary even for files ++ * opened with O_DIRECT. ++ * ++ * raw_probe_alignment() probes the required alignment and assume that 1 means ++ * the probing failed, so it falls back to a safe default of 4k. This can be ++ * avoided if we know that byte alignment is okay for the file. ++ */ ++static bool dio_byte_aligned(int fd) ++{ ++#ifdef __linux__ ++ struct statfs buf; ++ int ret; ++ ++ ret = fstatfs(fd, &buf); ++ if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) { ++ return true; ++ } ++#endif ++ return false; ++} ++ + /* Check if read is allowed with given memory buffer and length. + * + * This function is used to check O_DIRECT memory buffer and request alignment. +@@ -602,7 +626,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, + + s->has_discard = true; + s->has_write_zeroes = true; +- if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { ++ if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) { + s->needs_alignment = true; + } + +-- +2.18.2 + diff --git a/kvm-file-posix-Drop-hdev_co_create_opts.patch b/kvm-file-posix-Drop-hdev_co_create_opts.patch new file mode 100755 index 0000000000000000000000000000000000000000..ea2edbdba48c1547069629e10f3cd106150c0e06 --- /dev/null +++ b/kvm-file-posix-Drop-hdev_co_create_opts.patch @@ -0,0 +1,131 @@ +From 3d3509c010129bd15eb1f5ec1a7b9eedcdbf23f6 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:44 +0000 +Subject: [PATCH 03/20] file-posix: Drop hdev_co_create_opts() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-4-mlevitsk@redhat.com> +Patchwork-id: 94225 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 3/6] file-posix: Drop hdev_co_create_opts() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +The generic fallback implementation effectively does the same. + +Reviewed-by: Maxim Levitsky +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-4-mreitz@redhat.com> +Signed-off-by: Max Reitz +(cherry picked from commit 87ca3b8fa615b278b33cabf9ed22b3f44b5214ba) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 67 ------------------------------------------------------ + 1 file changed, 67 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 1b805bd..fd29372 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -3418,67 +3418,6 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, + return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true); + } + +-static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts, +- Error **errp) +-{ +- int fd; +- int ret = 0; +- struct stat stat_buf; +- int64_t total_size = 0; +- bool has_prefix; +- +- /* This function is used by both protocol block drivers and therefore either +- * of these prefixes may be given. +- * The return value has to be stored somewhere, otherwise this is an error +- * due to -Werror=unused-value. */ +- has_prefix = +- strstart(filename, "host_device:", &filename) || +- strstart(filename, "host_cdrom:" , &filename); +- +- (void)has_prefix; +- +- ret = raw_normalize_devicepath(&filename, errp); +- if (ret < 0) { +- return ret; +- } +- +- /* Read out options */ +- total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +- BDRV_SECTOR_SIZE); +- +- fd = qemu_open(filename, O_WRONLY | O_BINARY); +- if (fd < 0) { +- ret = -errno; +- error_setg_errno(errp, -ret, "Could not open device"); +- return ret; +- } +- +- if (fstat(fd, &stat_buf) < 0) { +- ret = -errno; +- error_setg_errno(errp, -ret, "Could not stat device"); +- } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { +- error_setg(errp, +- "The given file is neither a block nor a character device"); +- ret = -ENODEV; +- } else if (lseek(fd, 0, SEEK_END) < total_size) { +- error_setg(errp, "Device is too small"); +- ret = -ENOSPC; +- } +- +- if (!ret && total_size) { +- uint8_t buf[BDRV_SECTOR_SIZE] = { 0 }; +- int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size); +- if (lseek(fd, 0, SEEK_SET) == -1) { +- ret = -errno; +- } else { +- ret = qemu_write_full(fd, buf, zero_size); +- ret = ret == zero_size ? 0 : -errno; +- } +- } +- qemu_close(fd); +- return ret; +-} +- + static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .protocol_name = "host_device", +@@ -3491,8 +3430,6 @@ static BlockDriver bdrv_host_device = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, +- .bdrv_co_create_opts = hdev_co_create_opts, +- .create_opts = &raw_create_opts, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, +@@ -3619,8 +3556,6 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, +- .bdrv_co_create_opts = hdev_co_create_opts, +- .create_opts = &raw_create_opts, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + +@@ -3753,8 +3688,6 @@ static BlockDriver bdrv_host_cdrom = { + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, +- .bdrv_co_create_opts = hdev_co_create_opts, +- .create_opts = &raw_create_opts, + .mutable_opts = mutable_opts, + + .bdrv_co_preadv = raw_co_preadv, +-- +1.8.3.1 + diff --git a/kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch b/kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch new file mode 100755 index 0000000000000000000000000000000000000000..ac7b85963a6869e82fb9a2e2fa0bc9571f0131da --- /dev/null +++ b/kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch @@ -0,0 +1,59 @@ +From 94d99b13b48e922861570f043490efc966b3b445 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 25 Jun 2021 17:41:04 -0400 +Subject: [PATCH 4/4] file-posix: Handle `EINVAL` fallocate return value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Kevin Wolf +Message-id: <20210625174104.44313-3-kwolf@redhat.com> +Patchwork-id: 101778 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] file-posix: Handle `EINVAL` fallocate return value +Bugzilla: 1970912 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Antoine Damhet + +The `detect-zeroes=unmap` option may issue unaligned +`FALLOC_FL_PUNCH_HOLE` requests, raw block devices can (and will) return +`EINVAL`, qemu should then write the zeroes to the blockdev instead of +issuing an `IO_ERROR`. + +The problem can be reprodced like this: + +$ qemu-io -c 'write -P 0 42 1234' --image-opts driver=host_device,filename=/dev/loop0,detect-zeroes=unmap +write failed: Invalid argument + +Signed-off-by: Antoine Damhet +Message-Id: <20200717135603.51180-1-antoine.damhet@blade-group.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit bae127d4dcf6158c5042e2eee9582430839a9967) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 837edcf027..6cd19e6c9a 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1632,7 +1632,11 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque) + #ifdef CONFIG_FALLOCATE_PUNCH_HOLE + int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); +- if (ret != -ENOTSUP) { ++ switch (ret) { ++ case -ENOTSUP: ++ case -EINVAL: ++ break; ++ default: + return ret; + } + #endif +-- +2.27.0 + diff --git a/kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch b/kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch new file mode 100755 index 0000000000000000000000000000000000000000..e8639f3fb9408f61ade274a0c0621c37690999ec --- /dev/null +++ b/kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch @@ -0,0 +1,466 @@ +From 7ee01b5ccb7fc660dafaf3fdb1578649d17fbddf Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 26 May 2021 09:05:52 -0400 +Subject: [PATCH 1/4] file-posix: Mitigate file fragmentation with extent size + hints + +RH-Author: Kevin Wolf +Message-id: <20210526090552.155820-2-kwolf@redhat.com> +Patchwork-id: 101638 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] file-posix: Mitigate file fragmentation with extent size hints +Bugzilla: 1877163 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +Especially when O_DIRECT is used with image files so that the page cache +indirection can't cause a merge of allocating requests, the file will +fragment on the file system layer, with a potentially very small +fragment size (this depends on the requests the guest sent). + +On Linux, fragmentation can be reduced by setting an extent size hint +when creating the file (at least on XFS, it can't be set any more after +the first extent has been allocated), basically giving raw files a +"cluster size" for allocation. + +This adds a create option to set the extent size hint, and changes the +default from not setting a hint to setting it to 1 MB. The main reason +why qcow2 defaults to smaller cluster sizes is that COW becomes more +expensive, which is not an issue with raw files, so we can choose a +larger size. The tradeoff here is only potentially wasted disk space. + +For qcow2 (or other image formats) over file-posix, the advantage should +even be greater because they grow sequentially without leaving holes, so +there won't be wasted space. Setting even larger extent size hints for +such images may make sense. This can be done with the new option, but +let's keep the default conservative for now. + +The effect is very visible with a test that intentionally creates a +badly fragmented file with qemu-img bench (the time difference while +creating the file is already remarkable) and then looks at the number of +extents and the time a simple "qemu-img map" takes. + +Without an extent size hint: + + $ ./qemu-img create -f raw -o extent_size_hint=0 ~/tmp/test.raw 10G + Formatting '/home/kwolf/tmp/test.raw', fmt=raw size=10737418240 extent_size_hint=0 + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 0 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 0, step size 8192) + Run completed in 25.848 seconds. + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 4096 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 4096, step size 8192) + Run completed in 19.616 seconds. + $ filefrag ~/tmp/test.raw + /home/kwolf/tmp/test.raw: 2000000 extents found + $ time ./qemu-img map ~/tmp/test.raw + Offset Length Mapped to File + 0 0x1e8480000 0 /home/kwolf/tmp/test.raw + + real 0m1,279s + user 0m0,043s + sys 0m1,226s + +With the new default extent size hint of 1 MB: + + $ ./qemu-img create -f raw -o extent_size_hint=1M ~/tmp/test.raw 10G + Formatting '/home/kwolf/tmp/test.raw', fmt=raw size=10737418240 extent_size_hint=1048576 + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 0 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 0, step size 8192) + Run completed in 11.833 seconds. + $ ./qemu-img bench -f raw -t none -n -w ~/tmp/test.raw -c 1000000 -S 8192 -o 4096 + Sending 1000000 write requests, 4096 bytes each, 64 in parallel (starting at offset 4096, step size 8192) + Run completed in 10.155 seconds. + $ filefrag ~/tmp/test.raw + /home/kwolf/tmp/test.raw: 178 extents found + $ time ./qemu-img map ~/tmp/test.raw + Offset Length Mapped to File + 0 0x1e8480000 0 /home/kwolf/tmp/test.raw + + real 0m0,061s + user 0m0,040s + sys 0m0,014s + +Signed-off-by: Kevin Wolf +Message-Id: <20200707142329.48303-1-kwolf@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit ffa244c84a1a30dff69ecc80b0137a2b6d428ecb) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 44 ++++++++++++++++++++++++++++++++ + include/block/block_int.h | 1 + + qapi/block-core.json | 11 +++++--- + tests/qemu-iotests/082.out | 16 ++++++++++++ + tests/qemu-iotests/106 | 7 +++-- + tests/qemu-iotests/175 | 6 ++--- + tests/qemu-iotests/243 | 6 ++--- + tests/qemu-iotests/common.filter | 1 + + 8 files changed, 80 insertions(+), 12 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 2d834fbdf6..62a463229f 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -30,6 +30,7 @@ + #include "block/block_int.h" + #include "qemu/module.h" + #include "qemu/option.h" ++#include "qemu/units.h" + #include "trace.h" + #include "block/thread-pool.h" + #include "qemu/iov.h" +@@ -2289,6 +2290,14 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) + if (!file_opts->has_preallocation) { + file_opts->preallocation = PREALLOC_MODE_OFF; + } ++ if (!file_opts->has_extent_size_hint) { ++ file_opts->extent_size_hint = 1 * MiB; ++ } ++ if (file_opts->extent_size_hint > UINT32_MAX) { ++ result = -EINVAL; ++ error_setg(errp, "Extent size hint is too large"); ++ goto out; ++ } + + /* Create file */ + fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644); +@@ -2346,6 +2355,27 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) + } + #endif + } ++#ifdef FS_IOC_FSSETXATTR ++ /* ++ * Try to set the extent size hint. Failure is not fatal, and a warning is ++ * only printed if the option was explicitly specified. ++ */ ++ { ++ struct fsxattr attr; ++ result = ioctl(fd, FS_IOC_FSGETXATTR, &attr); ++ if (result == 0) { ++ attr.fsx_xflags |= FS_XFLAG_EXTSIZE; ++ attr.fsx_extsize = file_opts->extent_size_hint; ++ result = ioctl(fd, FS_IOC_FSSETXATTR, &attr); ++ } ++ if (result < 0 && file_opts->has_extent_size_hint && ++ file_opts->extent_size_hint) ++ { ++ warn_report("Failed to set extent size hint: %s", ++ strerror(errno)); ++ } ++ } ++#endif + + /* Resize and potentially preallocate the file to the desired + * final size */ +@@ -2381,6 +2411,8 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + { + BlockdevCreateOptions options; + int64_t total_size = 0; ++ int64_t extent_size_hint = 0; ++ bool has_extent_size_hint = false; + bool nocow = false; + PreallocMode prealloc; + char *buf = NULL; +@@ -2392,6 +2424,11 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); ++ if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) { ++ has_extent_size_hint = true; ++ extent_size_hint = ++ qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1); ++ } + nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, +@@ -2411,6 +2448,8 @@ static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + .preallocation = prealloc, + .has_nocow = true, + .nocow = nocow, ++ .has_extent_size_hint = has_extent_size_hint, ++ .extent_size_hint = extent_size_hint, + }, + }; + return raw_co_create(&options, errp); +@@ -2902,6 +2941,11 @@ static QemuOptsList raw_create_opts = { + #endif + ", full)" + }, ++ { ++ .name = BLOCK_OPT_EXTENT_SIZE_HINT, ++ .type = QEMU_OPT_SIZE, ++ .help = "Extent size hint for the image file, 0 to disable" ++ }, + { /* end of list */ } + } + }; +diff --git a/include/block/block_int.h b/include/block/block_int.h +index 41f13ecbed..4b23da2eb0 100644 +--- a/include/block/block_int.h ++++ b/include/block/block_int.h +@@ -53,6 +53,7 @@ + #define BLOCK_OPT_ADAPTER_TYPE "adapter_type" + #define BLOCK_OPT_REDUNDANCY "redundancy" + #define BLOCK_OPT_NOCOW "nocow" ++#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint" + #define BLOCK_OPT_OBJECT_SIZE "object_size" + #define BLOCK_OPT_REFCOUNT_BITS "refcount_bits" + #define BLOCK_OPT_DATA_FILE "data_file" +diff --git a/qapi/block-core.json b/qapi/block-core.json +index 289320902d..c7aa919fa3 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -4272,14 +4272,17 @@ + # falloc (if defined CONFIG_POSIX_FALLOCATE), + # full (if defined CONFIG_POSIX)) + # @nocow Turn off copy-on-write (valid only on btrfs; default: off) ++# @extent-size-hint: Extent size hint to add to the image file; 0 for not ++# adding an extent size hint (default: 1 MB, since 5.1) + # + # Since: 2.12 + ## + { 'struct': 'BlockdevCreateOptionsFile', +- 'data': { 'filename': 'str', +- 'size': 'size', +- '*preallocation': 'PreallocMode', +- '*nocow': 'bool' } } ++ 'data': { 'filename': 'str', ++ 'size': 'size', ++ '*preallocation': 'PreallocMode', ++ '*nocow': 'bool', ++ '*extent-size-hint': 'size'} } + + ## + # @BlockdevCreateOptionsGluster: +diff --git a/tests/qemu-iotests/082.out b/tests/qemu-iotests/082.out +index 9d4ed4dc9d..7a87946fa2 100644 +--- a/tests/qemu-iotests/082.out ++++ b/tests/qemu-iotests/082.out +@@ -59,6 +59,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -82,6 +83,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -105,6 +107,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -128,6 +131,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -151,6 +155,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -174,6 +179,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -197,6 +203,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -220,6 +227,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -339,6 +347,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -362,6 +371,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -385,6 +395,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -408,6 +419,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -431,6 +443,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -454,6 +467,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -477,6 +491,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +@@ -500,6 +515,7 @@ Supported options: + encrypt.ivgen-hash-alg= - Name of IV generator hash algorithm + encrypt.key-secret= - ID of secret providing qcow AES key or LUKS passphrase + encryption= - Encrypt the image with format 'aes'. (Deprecated in favor of encrypt.format=aes) ++ extent_size_hint= - Extent size hint for the image file, 0 to disable + lazy_refcounts= - Postpone refcount updates + nocow= - Turn off copy-on-write (valid only on btrfs) + preallocation= - Preallocation mode (allowed values: off, metadata, falloc, full) +diff --git a/tests/qemu-iotests/106 b/tests/qemu-iotests/106 +index ac47eaa0f5..ee6f51d08b 100755 +--- a/tests/qemu-iotests/106 ++++ b/tests/qemu-iotests/106 +@@ -51,7 +51,10 @@ for create_mode in off falloc full; do + echo + echo "--- create_mode=$create_mode growth_mode=$growth_mode ---" + +- IMGOPTS="preallocation=$create_mode" _make_test_img ${CREATION_SIZE}K ++ # Our calculation below assumes kilobytes as unit for the actual size. ++ # Disable the extent size hint because it would give us a result in ++ # megabytes. ++ IMGOPTS="preallocation=$create_mode,extent_size_hint=0" _make_test_img ${CREATION_SIZE}K + $QEMU_IMG resize -f "$IMGFMT" --preallocation=$growth_mode "$TEST_IMG" +${GROWTH_SIZE}K + + expected_size=0 +@@ -98,7 +101,7 @@ for growth_mode in falloc full; do + # plain int. We should use the correct type for the result, and + # this tests we do. + +- _make_test_img 2G ++ _make_test_img -o "extent_size_hint=0" 2G + $QEMU_IMG resize -f "$IMGFMT" --preallocation=$growth_mode "$TEST_IMG" +${GROWTH_SIZE}K + + actual_size=$($QEMU_IMG info -f "$IMGFMT" "$TEST_IMG" | grep 'disk size') +diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175 +index 55db2803ed..8a8494aeb6 100755 +--- a/tests/qemu-iotests/175 ++++ b/tests/qemu-iotests/175 +@@ -89,20 +89,20 @@ min_blocks=$(stat -c '%b' "$TEST_DIR/empty") + + echo + echo "== creating image with default preallocation ==" +-_make_test_img $size | _filter_imgfmt ++_make_test_img -o extent_size_hint=0 $size | _filter_imgfmt + stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size + + for mode in off full falloc; do + echo + echo "== creating image with preallocation $mode ==" +- IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt ++ IMGOPTS="preallocation=$mode,extent_size_hint=0" _make_test_img $size | _filter_imgfmt + stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size + done + + for new_size in 4096 1048576; do + echo + echo "== resize empty image with block_resize ==" +- _make_test_img 0 | _filter_imgfmt ++ _make_test_img -o extent_size_hint=0 0 | _filter_imgfmt + _block_resize $TEST_IMG $new_size >/dev/null + stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $new_size + done +diff --git a/tests/qemu-iotests/243 b/tests/qemu-iotests/243 +index e563761307..104c7256c4 100755 +--- a/tests/qemu-iotests/243 ++++ b/tests/qemu-iotests/243 +@@ -47,7 +47,7 @@ for mode in off metadata falloc full; do + echo "=== preallocation=$mode ===" + echo + +- IMGOPTS="preallocation=$mode" _make_test_img 64M ++ IMGOPTS="preallocation=$mode,extent_size_hint=0" _make_test_img 64M + + printf "File size: " + du -b $TEST_IMG | cut -f1 +@@ -64,7 +64,7 @@ for mode in off metadata falloc full; do + echo "=== External data file: preallocation=$mode ===" + echo + +- IMGOPTS="data_file=$TEST_IMG.data,preallocation=$mode" _make_test_img 64M ++ IMGOPTS="data_file=$TEST_IMG.data,preallocation=$mode,extent_size_hint=0" _make_test_img 64M + + echo -n "qcow2 file size: " + du -b $TEST_IMG | cut -f1 +@@ -75,7 +75,7 @@ for mode in off metadata falloc full; do + echo -n "qcow2 disk usage: " + [ $(du -B1 $TEST_IMG | cut -f1) -lt 1048576 ] && echo "low" || echo "high" + echo -n "data disk usage: " +- [ $(du -B1 $TEST_IMG.data | cut -f1) -lt 1048576 ] && echo "low" || echo "high" ++ [ $(du -B1 $TEST_IMG.data | cut -f1) -lt 2097152 ] && echo "low" || echo "high" + + done + +diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter +index c8e8663665..f29c1d3238 100644 +--- a/tests/qemu-iotests/common.filter ++++ b/tests/qemu-iotests/common.filter +@@ -146,6 +146,7 @@ _filter_img_create() + -e "s# refcount_bits=[0-9]\\+##g" \ + -e "s# key-secret=[a-zA-Z0-9]\\+##g" \ + -e "s# iter-time=[0-9]\\+##g" \ ++ -e "s# extent_size_hint=[0-9]\\+##g" \ + -e "s# force_size=\\(on\\|off\\)##g" + } + +-- +2.27.0 + diff --git a/kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch b/kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch new file mode 100755 index 0000000000000000000000000000000000000000..efdf16bc409579d50f9ee410514d3398c43deeea --- /dev/null +++ b/kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch @@ -0,0 +1,48 @@ +From 55bfda3a0e077b822f57e8ed901f0cee848bc471 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:35 +0100 +Subject: [PATCH 07/17] file-posix: Support BDRV_REQ_ZERO_WRITE for truncate + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-7-kwolf@redhat.com> +Patchwork-id: 97452 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 06/11] file-posix: Support BDRV_REQ_ZERO_WRITE for truncate +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +For regular files, we always get BDRV_REQ_ZERO_WRITE behaviour from the +OS, so we can advertise the flag and just ignore it. + +Signed-off-by: Kevin Wolf +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Max Reitz +Message-Id: <20200424125448.63318-7-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 2f0c6e7a650de133eccd94e9bb6cf7b2070f07f1) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/file-posix.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 7551e8d..adafbfa 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -674,6 +674,10 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, + #endif + + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; ++ if (S_ISREG(st.st_mode)) { ++ /* When extending regular files, we get zeros from the OS */ ++ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; ++ } + ret = 0; + fail: + if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { +-- +1.8.3.1 + diff --git a/kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch b/kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch new file mode 100755 index 0000000000000000000000000000000000000000..551b2eb635d0bdc1615a7c46df05fb4c3ace5859 --- /dev/null +++ b/kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch @@ -0,0 +1,89 @@ +From 15331267d11713906361ddd767c3e04ae46d9a83 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:50 -0400 +Subject: [PATCH 01/14] glib-compat: add g_unix_get_passwd_entry_qemu() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-2-marcandre.lureau@redhat.com> +Patchwork-id: 101687 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/4] glib-compat: add g_unix_get_passwd_entry_qemu() +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Marc-André Lureau + +The glib function was introduced in 2.64. It's a safer version of +getpwnam, and also simpler to use than getpwnam_r. + +Currently, it's only use by the next patch in qemu-ga, which doesn't +(well well...) need the thread safety guarantees. Since the fallback +version is still unsafe, I would rather keep the _qemu postfix, to make +sure it's not being misused by mistake. When/if necessary, we can +implement a safer fallback and drop the _qemu suffix. + +Signed-off-by: Marc-André Lureau +Reviewed-by: Michal Privoznik +*fix checkpatch warnings about newlines before/after block comments +Signed-off-by: Michael Roth + +(cherry picked from commit 6d593ab451c490b0ca941c6a519894231634751e) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + include/glib-compat.h | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/include/glib-compat.h b/include/glib-compat.h +index 0b0ec76299..695a96f7ea 100644 +--- a/include/glib-compat.h ++++ b/include/glib-compat.h +@@ -30,6 +30,11 @@ + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" + + #include ++#if defined(G_OS_UNIX) ++#include ++#include ++#include ++#endif + + /* + * Note that because of the GLIB_VERSION_MAX_ALLOWED constant above, allowing +@@ -72,6 +77,29 @@ + gint g_poll_fixed(GPollFD *fds, guint nfds, gint timeout); + #endif + ++#if defined(G_OS_UNIX) ++/* ++ * Note: The fallback implementation is not MT-safe, and it returns a copy of ++ * the libc passwd (must be g_free() after use) but not the content. Because of ++ * these important differences the caller must be aware of, it's not #define for ++ * GLib API substitution. ++ */ ++static inline struct passwd * ++g_unix_get_passwd_entry_qemu(const gchar *user_name, GError **error) ++{ ++#if GLIB_CHECK_VERSION(2, 64, 0) ++ return g_unix_get_passwd_entry(user_name, error); ++#else ++ struct passwd *p = getpwnam(user_name); ++ if (!p) { ++ g_set_error_literal(error, G_UNIX_ERROR, 0, g_strerror(errno)); ++ return NULL; ++ } ++ return (struct passwd *)g_memdup(p, sizeof(*p)); ++#endif ++} ++#endif /* G_OS_UNIX */ ++ + #pragma GCC diagnostic pop + + #endif +-- +2.27.0 + diff --git a/kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch b/kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch new file mode 100755 index 0000000000000000000000000000000000000000..e34f576659360871a9981932eb980760d771c986 --- /dev/null +++ b/kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch @@ -0,0 +1,275 @@ +From a0816e4374759048cb24b9b3549a093a2ccb6240 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:50 +0100 +Subject: [PATCH 07/12] hmat acpi: Build Memory Proximity Domain Attributes + Structure(s) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-7-plai@redhat.com> +Patchwork-id: 96734 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 06/11] hmat acpi: Build Memory Proximity Domain Attributes Structure(s) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table +(HMAT). The specification references below link: +http://www.uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf + +It describes the memory attributes, such as memory side cache +attributes and bandwidth and latency details, related to the +Memory Proximity Domain. The software is +expected to use this information as hint for optimization. + +This structure describes Memory Proximity Domain Attributes by memory +subsystem and its associativity with processor proximity domain as well as +hint for memory usage. + +In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report +the platform's HMAT tables. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Daniel Black +Reviewed-by: Jonathan Cameron +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-5-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit e6f123c3b81241be33f1b763d0ff8b36d1ae9c1e) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/Kconfig | 7 ++-- + hw/acpi/Makefile.objs | 1 + + hw/acpi/hmat.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/acpi/hmat.h | 42 ++++++++++++++++++++++ + hw/i386/acpi-build.c | 5 +++ + 5 files changed, 152 insertions(+), 2 deletions(-) + create mode 100644 hw/acpi/hmat.c + create mode 100644 hw/acpi/hmat.h + +diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig +index 12e3f1e..54209c6 100644 +--- a/hw/acpi/Kconfig ++++ b/hw/acpi/Kconfig +@@ -7,6 +7,7 @@ config ACPI_X86 + select ACPI_NVDIMM + select ACPI_CPU_HOTPLUG + select ACPI_MEMORY_HOTPLUG ++ select ACPI_HMAT + + config ACPI_X86_ICH + bool +@@ -23,6 +24,10 @@ config ACPI_NVDIMM + bool + depends on ACPI + ++config ACPI_HMAT ++ bool ++ depends on ACPI ++ + config ACPI_PCI + bool + depends on ACPI && PCI +@@ -33,5 +38,3 @@ config ACPI_VMGENID + depends on PC + + config ACPI_HW_REDUCED +- bool +- depends on ACPI +diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs +index 655a9c1..517bd88 100644 +--- a/hw/acpi/Makefile.objs ++++ b/hw/acpi/Makefile.objs +@@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o + common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o + common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o + common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o ++common-obj-$(CONFIG_ACPI_HMAT) += hmat.o + common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o + + common-obj-y += acpi_interface.o +diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c +new file mode 100644 +index 0000000..9ff7930 +--- /dev/null ++++ b/hw/acpi/hmat.c +@@ -0,0 +1,99 @@ ++/* ++ * HMAT ACPI Implementation ++ * ++ * Copyright(C) 2019 Intel Corporation. ++ * ++ * Author: ++ * Liu jingqi ++ * Tao Xu ++ * ++ * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table ++ * (HMAT) ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, see ++ */ ++ ++#include "qemu/osdep.h" ++#include "sysemu/numa.h" ++#include "hw/acpi/hmat.h" ++ ++/* ++ * ACPI 6.3: ++ * 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145 ++ */ ++static void build_hmat_mpda(GArray *table_data, uint16_t flags, ++ uint32_t initiator, uint32_t mem_node) ++{ ++ ++ /* Memory Proximity Domain Attributes Structure */ ++ /* Type */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Length */ ++ build_append_int_noprefix(table_data, 40, 4); ++ /* Flags */ ++ build_append_int_noprefix(table_data, flags, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Proximity Domain for the Attached Initiator */ ++ build_append_int_noprefix(table_data, initiator, 4); ++ /* Proximity Domain for the Memory */ ++ build_append_int_noprefix(table_data, mem_node, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 4); ++ /* ++ * Reserved: ++ * Previously defined as the Start Address of the System Physical ++ * Address Range. Deprecated since ACPI Spec 6.3. ++ */ ++ build_append_int_noprefix(table_data, 0, 8); ++ /* ++ * Reserved: ++ * Previously defined as the Range Length of the region in bytes. ++ * Deprecated since ACPI Spec 6.3. ++ */ ++ build_append_int_noprefix(table_data, 0, 8); ++} ++ ++/* Build HMAT sub table structures */ ++static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) ++{ ++ uint16_t flags; ++ int i; ++ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ flags = 0; ++ ++ if (numa_state->nodes[i].initiator < MAX_NODES) { ++ flags |= HMAT_PROXIMITY_INITIATOR_VALID; ++ } ++ ++ build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); ++ } ++} ++ ++void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) ++{ ++ int hmat_start = table_data->len; ++ ++ /* reserve space for HMAT header */ ++ acpi_data_push(table_data, 40); ++ ++ hmat_build_table_structs(table_data, numa_state); ++ ++ build_header(linker, table_data, ++ (void *)(table_data->data + hmat_start), ++ "HMAT", table_data->len - hmat_start, 2, NULL, NULL); ++} +diff --git a/hw/acpi/hmat.h b/hw/acpi/hmat.h +new file mode 100644 +index 0000000..437dbc6 +--- /dev/null ++++ b/hw/acpi/hmat.h +@@ -0,0 +1,42 @@ ++/* ++ * HMAT ACPI Implementation Header ++ * ++ * Copyright(C) 2019 Intel Corporation. ++ * ++ * Author: ++ * Liu jingqi ++ * Tao Xu ++ * ++ * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table ++ * (HMAT) ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, see ++ */ ++ ++#ifndef HMAT_H ++#define HMAT_H ++ ++#include "hw/acpi/aml-build.h" ++ ++/* ++ * ACPI 6.3: 5.2.27.3 Memory Proximity Domain Attributes Structure, ++ * Table 5-145, Field "flag", Bit [0]: set to 1 to indicate that data in ++ * the Proximity Domain for the Attached Initiator field is valid. ++ * Other bits reserved. ++ */ ++#define HMAT_PROXIMITY_INITIATOR_VALID 0x1 ++ ++void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state); ++ ++#endif +diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c +index 6400189..b1f8c55 100644 +--- a/hw/i386/acpi-build.c ++++ b/hw/i386/acpi-build.c +@@ -67,6 +67,7 @@ + #include "hw/i386/intel_iommu.h" + + #include "hw/acpi/ipmi.h" ++#include "hw/acpi/hmat.h" + + /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and + * -M pc-i440fx-2.0. Even if the actual amount of AML generated grows +@@ -2837,6 +2838,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) + acpi_add_table(table_offsets, tables_blob); + build_slit(tables_blob, tables->linker, machine); + } ++ if (machine->numa_state->hmat_enabled) { ++ acpi_add_table(table_offsets, tables_blob); ++ build_hmat(tables_blob, tables->linker, machine->numa_state); ++ } + } + if (acpi_get_mcfg(&mcfg)) { + acpi_add_table(table_offsets, tables_blob); +-- +1.8.3.1 + diff --git a/kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch b/kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch new file mode 100755 index 0000000000000000000000000000000000000000..01ef4ce331398aea2fba52b838c2bb2d550bc0a2 --- /dev/null +++ b/kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch @@ -0,0 +1,137 @@ +From d00453667cb972dc2fe1242081d3b39313a6a925 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:52 +0100 +Subject: [PATCH 09/12] hmat acpi: Build Memory Side Cache Information + Structure(s) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-9-plai@redhat.com> +Patchwork-id: 96741 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 08/11] hmat acpi: Build Memory Side Cache Information Structure(s) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +This structure describes memory side cache information for memory +proximity domains if the memory side cache is present and the +physical device forms the memory side cache. +The software could use this information to effectively place +the data in memory to maximize the performance of the system +memory that use the memory side cache. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Daniel Black +Reviewed-by: Jonathan Cameron +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-7-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit a9c2b841af002db6e21e1297c9026b63fc22c875) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/hmat.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 68 insertions(+), 1 deletion(-) + +diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c +index 4635d45..7c24bb5 100644 +--- a/hw/acpi/hmat.c ++++ b/hw/acpi/hmat.c +@@ -143,14 +143,62 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + g_free(entry_list); + } + ++/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */ ++static void build_hmat_cache(GArray *table_data, uint8_t total_levels, ++ NumaHmatCacheOptions *hmat_cache) ++{ ++ /* ++ * Cache Attributes: Bits [3:0] – Total Cache Levels ++ * for this Memory Proximity Domain ++ */ ++ uint32_t cache_attr = total_levels; ++ ++ /* Bits [7:4] : Cache Level described in this structure */ ++ cache_attr |= (uint32_t) hmat_cache->level << 4; ++ ++ /* Bits [11:8] - Cache Associativity */ ++ cache_attr |= (uint32_t) hmat_cache->associativity << 8; ++ ++ /* Bits [15:12] - Write Policy */ ++ cache_attr |= (uint32_t) hmat_cache->policy << 12; ++ ++ /* Bits [31:16] - Cache Line size in bytes */ ++ cache_attr |= (uint32_t) hmat_cache->line << 16; ++ ++ /* Type */ ++ build_append_int_noprefix(table_data, 2, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Length */ ++ build_append_int_noprefix(table_data, 32, 4); ++ /* Proximity Domain for the Memory */ ++ build_append_int_noprefix(table_data, hmat_cache->node_id, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 4); ++ /* Memory Side Cache Size */ ++ build_append_int_noprefix(table_data, hmat_cache->size, 8); ++ /* Cache Attributes */ ++ build_append_int_noprefix(table_data, cache_attr, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* ++ * Number of SMBIOS handles (n) ++ * Linux kernel uses Memory Side Cache Information Structure ++ * without SMBIOS entries for now, so set Number of SMBIOS handles ++ * as 0. ++ */ ++ build_append_int_noprefix(table_data, 0, 2); ++} ++ + /* Build HMAT sub table structures */ + static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + { + uint16_t flags; + uint32_t num_initiator = 0; + uint32_t initiator_list[MAX_NODES]; +- int i, hierarchy, type; ++ int i, hierarchy, type, cache_level, total_levels; + HMAT_LB_Info *hmat_lb; ++ NumaHmatCacheOptions *hmat_cache; + + for (i = 0; i < numa_state->num_nodes; i++) { + flags = 0; +@@ -184,6 +232,25 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + } + } + } ++ ++ /* ++ * ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: ++ * Table 5-147 ++ */ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ total_levels = 0; ++ for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) { ++ if (numa_state->hmat_cache[i][cache_level]) { ++ total_levels++; ++ } ++ } ++ for (cache_level = 0; cache_level <= total_levels; cache_level++) { ++ hmat_cache = numa_state->hmat_cache[i][cache_level]; ++ if (hmat_cache) { ++ build_hmat_cache(table_data, total_levels, hmat_cache); ++ } ++ } ++ } + } + + void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) +-- +1.8.3.1 + diff --git a/kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch b/kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch new file mode 100755 index 0000000000000000000000000000000000000000..a7120d7073cc7812e2cc6f40a5956c89e7360db4 --- /dev/null +++ b/kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch @@ -0,0 +1,173 @@ +From f55b8b251c323856087baf2380d93fbf2da15db7 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:51 +0100 +Subject: [PATCH 08/12] hmat acpi: Build System Locality Latency and Bandwidth + Information Structure(s) + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-8-plai@redhat.com> +Patchwork-id: 96733 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 07/11] hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +This structure describes the memory access latency and bandwidth +information from various memory access initiator proximity domains. +The latency and bandwidth numbers represented in this structure +correspond to rated latency and bandwidth for the platform. +The software could use this information as hint for optimization. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-6-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 4586a2cb833f80b19c80ebe364a005ac2fa0974a) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/acpi/hmat.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 103 insertions(+), 1 deletion(-) + +diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c +index 9ff7930..4635d45 100644 +--- a/hw/acpi/hmat.c ++++ b/hw/acpi/hmat.c +@@ -25,6 +25,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/units.h" + #include "sysemu/numa.h" + #include "hw/acpi/hmat.h" + +@@ -67,11 +68,89 @@ static void build_hmat_mpda(GArray *table_data, uint16_t flags, + build_append_int_noprefix(table_data, 0, 8); + } + ++/* ++ * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information ++ * Structure: Table 5-146 ++ */ ++static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, ++ uint32_t num_initiator, uint32_t num_target, ++ uint32_t *initiator_list) ++{ ++ int i, index; ++ HMAT_LB_Data *lb_data; ++ uint16_t *entry_list; ++ uint32_t base; ++ /* Length in bytes for entire structure */ ++ uint32_t lb_length ++ = 32 /* Table length upto and including Entry Base Unit */ ++ + 4 * num_initiator /* Initiator Proximity Domain List */ ++ + 4 * num_target /* Target Proximity Domain List */ ++ + 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ ++ ++ /* Type */ ++ build_append_int_noprefix(table_data, 1, 2); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Length */ ++ build_append_int_noprefix(table_data, lb_length, 4); ++ /* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */ ++ assert(!(hmat_lb->hierarchy >> 4)); ++ build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); ++ /* Data Type */ ++ build_append_int_noprefix(table_data, hmat_lb->data_type, 1); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 2); ++ /* Number of Initiator Proximity Domains (s) */ ++ build_append_int_noprefix(table_data, num_initiator, 4); ++ /* Number of Target Proximity Domains (t) */ ++ build_append_int_noprefix(table_data, num_target, 4); ++ /* Reserved */ ++ build_append_int_noprefix(table_data, 0, 4); ++ ++ /* Entry Base Unit */ ++ if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { ++ /* Convert latency base from nanoseconds to picosecond */ ++ base = hmat_lb->base * 1000; ++ } else { ++ /* Convert bandwidth base from Byte to Megabyte */ ++ base = hmat_lb->base / MiB; ++ } ++ build_append_int_noprefix(table_data, base, 8); ++ ++ /* Initiator Proximity Domain List */ ++ for (i = 0; i < num_initiator; i++) { ++ build_append_int_noprefix(table_data, initiator_list[i], 4); ++ } ++ ++ /* Target Proximity Domain List */ ++ for (i = 0; i < num_target; i++) { ++ build_append_int_noprefix(table_data, i, 4); ++ } ++ ++ /* Latency or Bandwidth Entries */ ++ entry_list = g_malloc0(num_initiator * num_target * sizeof(uint16_t)); ++ for (i = 0; i < hmat_lb->list->len; i++) { ++ lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); ++ index = lb_data->initiator * num_target + lb_data->target; ++ ++ entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); ++ } ++ ++ for (i = 0; i < num_initiator * num_target; i++) { ++ build_append_int_noprefix(table_data, entry_list[i], 2); ++ } ++ ++ g_free(entry_list); ++} ++ + /* Build HMAT sub table structures */ + static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + { + uint16_t flags; +- int i; ++ uint32_t num_initiator = 0; ++ uint32_t initiator_list[MAX_NODES]; ++ int i, hierarchy, type; ++ HMAT_LB_Info *hmat_lb; + + for (i = 0; i < numa_state->num_nodes; i++) { + flags = 0; +@@ -82,6 +161,29 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) + + build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); + } ++ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ if (numa_state->nodes[i].has_cpu) { ++ initiator_list[num_initiator++] = i; ++ } ++ } ++ ++ /* ++ * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information ++ * Structure: Table 5-146 ++ */ ++ for (hierarchy = HMAT_LB_MEM_MEMORY; ++ hierarchy <= HMAT_LB_MEM_CACHE_3RD_LEVEL; hierarchy++) { ++ for (type = HMAT_LB_DATA_ACCESS_LATENCY; ++ type <= HMAT_LB_DATA_WRITE_BANDWIDTH; type++) { ++ hmat_lb = numa_state->hmat_lb[hierarchy][type]; ++ ++ if (hmat_lb && hmat_lb->list->len) { ++ build_hmat_lb(table_data, hmat_lb, num_initiator, ++ numa_state->num_nodes, initiator_list); ++ } ++ } ++ } + } + + void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) +-- +1.8.3.1 + diff --git a/kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch b/kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch new file mode 100755 index 0000000000000000000000000000000000000000..f01dec2dd226d8c12cdc6e527f0510036ba6a092 --- /dev/null +++ b/kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch @@ -0,0 +1,100 @@ +From cebc614e5ddd1f770c4d6dc26c066791f36e56df Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:02 +0000 +Subject: [PATCH 05/18] hmp: Allow using qdev ID for qemu-io command + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-5-kwolf@redhat.com> +Patchwork-id: 93750 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 4/6] hmp: Allow using qdev ID for qemu-io command +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +In order to issue requests on an existing BlockBackend with the +'qemu-io' HMP command, allow specifying the BlockBackend not only with a +BlockBackend name, but also with a qdev ID/QOM path for a device that +owns the (possibly anonymous) BlockBackend. + +Because qdev names could be conflicting with BlockBackend and node +names, introduce a -d option to explicitly address a device. If the +option is not given, a BlockBackend or a node is addressed. + +Signed-off-by: Kevin Wolf +(cherry picked from commit 89b6fc45614bb45dcd58f1590415afe5c2791abd) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + hmp-commands.hx | 8 +++++--- + monitor/hmp-cmds.c | 28 ++++++++++++++++++---------- + 2 files changed, 23 insertions(+), 13 deletions(-) + +diff --git a/hmp-commands.hx b/hmp-commands.hx +index cfcc044..dc23185 100644 +--- a/hmp-commands.hx ++++ b/hmp-commands.hx +@@ -1875,9 +1875,11 @@ ETEXI + + { + .name = "qemu-io", +- .args_type = "device:B,command:s", +- .params = "[device] \"[command]\"", +- .help = "run a qemu-io command on a block device", ++ .args_type = "qdev:-d,device:B,command:s", ++ .params = "[-d] [device] \"[command]\"", ++ .help = "run a qemu-io command on a block device\n\t\t\t" ++ "-d: [device] is a device ID rather than a " ++ "drive ID or node name", + .cmd = hmp_qemu_io, + }, + +diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c +index b2551c1..5f8941d 100644 +--- a/monitor/hmp-cmds.c ++++ b/monitor/hmp-cmds.c +@@ -2468,23 +2468,31 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) + { + BlockBackend *blk; + BlockBackend *local_blk = NULL; ++ bool qdev = qdict_get_try_bool(qdict, "qdev", false); + const char* device = qdict_get_str(qdict, "device"); + const char* command = qdict_get_str(qdict, "command"); + Error *err = NULL; + int ret; + +- blk = blk_by_name(device); +- if (!blk) { +- BlockDriverState *bs = bdrv_lookup_bs(NULL, device, &err); +- if (bs) { +- blk = local_blk = blk_new(bdrv_get_aio_context(bs), +- 0, BLK_PERM_ALL); +- ret = blk_insert_bs(blk, bs, &err); +- if (ret < 0) { ++ if (qdev) { ++ blk = blk_by_qdev_id(device, &err); ++ if (!blk) { ++ goto fail; ++ } ++ } else { ++ blk = blk_by_name(device); ++ if (!blk) { ++ BlockDriverState *bs = bdrv_lookup_bs(NULL, device, &err); ++ if (bs) { ++ blk = local_blk = blk_new(bdrv_get_aio_context(bs), ++ 0, BLK_PERM_ALL); ++ ret = blk_insert_bs(blk, bs, &err); ++ if (ret < 0) { ++ goto fail; ++ } ++ } else { + goto fail; + } +- } else { +- goto fail; + } + } + +-- +1.8.3.1 + diff --git a/kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch b/kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch new file mode 100755 index 0000000000000000000000000000000000000000..75788c50b18e591c687879ca3c7ff426ae7ebaf4 --- /dev/null +++ b/kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch @@ -0,0 +1,222 @@ +From 602f17920e422e2b8d3ce485e56066a97b74e723 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:29 -0500 +Subject: [PATCH 05/17] hw/arm/smmu: Introduce SMMUTLBEntry for PTW and IOTLB + value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-5-eperezma@redhat.com> +Patchwork-id: 100597 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 04/13] hw/arm/smmu: Introduce SMMUTLBEntry for PTW and IOTLB value +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Introduce a specialized SMMUTLBEntry to store the result of +the PTW and cache in the IOTLB. This structure extends the +generic IOMMUTLBEntry struct with the level of the entry and +the granule size. + +Those latter will be useful when implementing range invalidation. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-5-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit a7550158556b7fc2f2baaecf9092499c6687b160) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 32 +++++++++++++++++--------------- + hw/arm/smmuv3.c | 10 +++++----- + include/hw/arm/smmu-common.h | 12 +++++++++--- + 3 files changed, 31 insertions(+), 23 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 0b89c9fbbbc..06e9e38b007 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -64,11 +64,11 @@ SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova) + return key; + } + +-IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, +- hwaddr iova) ++SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, ++ hwaddr iova) + { + SMMUIOTLBKey key = smmu_get_iotlb_key(cfg->asid, iova); +- IOMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); ++ SMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); + + if (entry) { + cfg->iotlb_hits++; +@@ -86,7 +86,7 @@ IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, + return entry; + } + +-void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) + { + SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); + +@@ -94,9 +94,9 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) + smmu_iotlb_inv_all(bs); + } + +- *key = smmu_get_iotlb_key(cfg->asid, entry->iova); +- trace_smmu_iotlb_insert(cfg->asid, entry->iova); +- g_hash_table_insert(bs->iotlb, key, entry); ++ *key = smmu_get_iotlb_key(cfg->asid, new->entry.iova); ++ trace_smmu_iotlb_insert(cfg->asid, new->entry.iova); ++ g_hash_table_insert(bs->iotlb, key, new); + } + + inline void smmu_iotlb_inv_all(SMMUState *s) +@@ -217,7 +217,7 @@ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova) + * @cfg: translation config + * @iova: iova to translate + * @perm: access type +- * @tlbe: IOMMUTLBEntry (out) ++ * @tlbe: SMMUTLBEntry (out) + * @info: handle to an error info + * + * Return 0 on success, < 0 on error. In case of error, @info is filled +@@ -227,7 +227,7 @@ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova) + */ + static int smmu_ptw_64(SMMUTransCfg *cfg, + dma_addr_t iova, IOMMUAccessFlags perm, +- IOMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) ++ SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) + { + dma_addr_t baseaddr, indexmask; + int stage = cfg->stage; +@@ -247,8 +247,8 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + baseaddr = extract64(tt->ttb, 0, 48); + baseaddr &= ~indexmask; + +- tlbe->iova = iova; +- tlbe->addr_mask = (1 << granule_sz) - 1; ++ tlbe->entry.iova = iova; ++ tlbe->entry.addr_mask = (1 << granule_sz) - 1; + + while (level <= 3) { + uint64_t subpage_size = 1ULL << level_shift(level, granule_sz); +@@ -299,14 +299,16 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + goto error; + } + +- tlbe->translated_addr = gpa + (iova & mask); +- tlbe->perm = PTE_AP_TO_PERM(ap); ++ tlbe->entry.translated_addr = gpa + (iova & mask); ++ tlbe->entry.perm = PTE_AP_TO_PERM(ap); ++ tlbe->level = level; ++ tlbe->granule = granule_sz; + return 0; + } + info->type = SMMU_PTW_ERR_TRANSLATION; + + error: +- tlbe->perm = IOMMU_NONE; ++ tlbe->entry.perm = IOMMU_NONE; + return -EINVAL; + } + +@@ -322,7 +324,7 @@ error: + * return 0 on success + */ + inline int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, +- IOMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) ++ SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info) + { + if (!cfg->aa64) { + /* +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 34dea4df4da..ad8212779d3 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -614,7 +614,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + SMMUTranslationStatus status; + SMMUState *bs = ARM_SMMU(s); + uint64_t page_mask, aligned_addr; +- IOMMUTLBEntry *cached_entry = NULL; ++ SMMUTLBEntry *cached_entry = NULL; + SMMUTransTableInfo *tt; + SMMUTransCfg *cfg = NULL; + IOMMUTLBEntry entry = { +@@ -664,7 +664,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + + cached_entry = smmu_iotlb_lookup(bs, cfg, aligned_addr); + if (cached_entry) { +- if ((flag & IOMMU_WO) && !(cached_entry->perm & IOMMU_WO)) { ++ if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) { + status = SMMU_TRANS_ERROR; + if (event.record_trans_faults) { + event.type = SMMU_EVT_F_PERMISSION; +@@ -677,7 +677,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + goto epilogue; + } + +- cached_entry = g_new0(IOMMUTLBEntry, 1); ++ cached_entry = g_new0(SMMUTLBEntry, 1); + + if (smmu_ptw(cfg, aligned_addr, flag, cached_entry, &ptw_info)) { + g_free(cached_entry); +@@ -731,9 +731,9 @@ epilogue: + switch (status) { + case SMMU_TRANS_SUCCESS: + entry.perm = flag; +- entry.translated_addr = cached_entry->translated_addr + ++ entry.translated_addr = cached_entry->entry.translated_addr + + (addr & page_mask); +- entry.addr_mask = cached_entry->addr_mask; ++ entry.addr_mask = cached_entry->entry.addr_mask; + trace_smmuv3_translate_success(mr->parent_obj.name, sid, addr, + entry.translated_addr, entry.perm); + break; +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index bceba40885c..277923bdc0a 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -52,6 +52,12 @@ typedef struct SMMUTransTableInfo { + uint8_t granule_sz; /* granule page shift */ + } SMMUTransTableInfo; + ++typedef struct SMMUTLBEntry { ++ IOMMUTLBEntry entry; ++ uint8_t level; ++ uint8_t granule; ++} SMMUTLBEntry; ++ + /* + * Generic structure populated by derived SMMU devices + * after decoding the configuration information and used as +@@ -140,7 +146,7 @@ static inline uint16_t smmu_get_sid(SMMUDevice *sdev) + * pair, according to @cfg translation config + */ + int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, +- IOMMUTLBEntry *tlbe, SMMUPTWEventInfo *info); ++ SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info); + + /** + * select_tt - compute which translation table shall be used according to +@@ -153,8 +159,8 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + #define SMMU_IOTLB_MAX_SIZE 256 + +-IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); +-void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry); ++SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *entry); + SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch b/kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch new file mode 100755 index 0000000000000000000000000000000000000000..6500b41244957bad4b982b8dddda42db90effc7e --- /dev/null +++ b/kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch @@ -0,0 +1,166 @@ +From 7833c0bf8321cb39614ee889cf3e3a64511c0aa5 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:28 -0500 +Subject: [PATCH 04/17] hw/arm/smmu: Introduce smmu_get_iotlb_key() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-4-eperezma@redhat.com> +Patchwork-id: 100596 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 03/13] hw/arm/smmu: Introduce smmu_get_iotlb_key() +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Introduce the smmu_get_iotlb_key() helper and the +SMMU_IOTLB_ASID() macro. Also move smmu_get_iotlb_key and +smmu_iotlb_key_hash in the IOTLB related code section. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-4-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 60a61f1b31fc03080aadb63c9b1006f8b1972adb) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 66 ++++++++++++++++++++---------------- + hw/arm/smmu-internal.h | 1 + + include/hw/arm/smmu-common.h | 1 + + 3 files changed, 38 insertions(+), 30 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 8e01505dbee..0b89c9fbbbc 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -32,10 +32,42 @@ + + /* IOTLB Management */ + ++static guint smmu_iotlb_key_hash(gconstpointer v) ++{ ++ SMMUIOTLBKey *key = (SMMUIOTLBKey *)v; ++ uint32_t a, b, c; ++ ++ /* Jenkins hash */ ++ a = b = c = JHASH_INITVAL + sizeof(*key); ++ a += key->asid; ++ b += extract64(key->iova, 0, 32); ++ c += extract64(key->iova, 32, 32); ++ ++ __jhash_mix(a, b, c); ++ __jhash_final(a, b, c); ++ ++ return c; ++} ++ ++static gboolean smmu_iotlb_key_equal(gconstpointer v1, gconstpointer v2) ++{ ++ const SMMUIOTLBKey *k1 = v1; ++ const SMMUIOTLBKey *k2 = v2; ++ ++ return (k1->asid == k2->asid) && (k1->iova == k2->iova); ++} ++ ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova) ++{ ++ SMMUIOTLBKey key = {.asid = asid, .iova = iova}; ++ ++ return key; ++} ++ + IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, + hwaddr iova) + { +- SMMUIOTLBKey key = {.asid = cfg->asid, .iova = iova}; ++ SMMUIOTLBKey key = smmu_get_iotlb_key(cfg->asid, iova); + IOMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); + + if (entry) { +@@ -62,8 +94,7 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) + smmu_iotlb_inv_all(bs); + } + +- key->asid = cfg->asid; +- key->iova = entry->iova; ++ *key = smmu_get_iotlb_key(cfg->asid, entry->iova); + trace_smmu_iotlb_insert(cfg->asid, entry->iova); + g_hash_table_insert(bs->iotlb, key, entry); + } +@@ -80,12 +111,12 @@ static gboolean smmu_hash_remove_by_asid(gpointer key, gpointer value, + uint16_t asid = *(uint16_t *)user_data; + SMMUIOTLBKey *iotlb_key = (SMMUIOTLBKey *)key; + +- return iotlb_key->asid == asid; ++ return SMMU_IOTLB_ASID(*iotlb_key) == asid; + } + + inline void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova) + { +- SMMUIOTLBKey key = {.asid = asid, .iova = iova}; ++ SMMUIOTLBKey key = smmu_get_iotlb_key(asid, iova); + + trace_smmu_iotlb_inv_iova(asid, iova); + g_hash_table_remove(s->iotlb, &key); +@@ -382,31 +413,6 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) + return NULL; + } + +-static guint smmu_iotlb_key_hash(gconstpointer v) +-{ +- SMMUIOTLBKey *key = (SMMUIOTLBKey *)v; +- uint32_t a, b, c; +- +- /* Jenkins hash */ +- a = b = c = JHASH_INITVAL + sizeof(*key); +- a += key->asid; +- b += extract64(key->iova, 0, 32); +- c += extract64(key->iova, 32, 32); +- +- __jhash_mix(a, b, c); +- __jhash_final(a, b, c); +- +- return c; +-} +- +-static gboolean smmu_iotlb_key_equal(gconstpointer v1, gconstpointer v2) +-{ +- const SMMUIOTLBKey *k1 = v1; +- const SMMUIOTLBKey *k2 = v2; +- +- return (k1->asid == k2->asid) && (k1->iova == k2->iova); +-} +- + /* Unmap the whole notifier's range */ + static void smmu_unmap_notifier_range(IOMMUNotifier *n) + { +diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h +index 7794d6d3947..3104f768cd2 100644 +--- a/hw/arm/smmu-internal.h ++++ b/hw/arm/smmu-internal.h +@@ -96,4 +96,5 @@ uint64_t iova_level_offset(uint64_t iova, int inputsize, + MAKE_64BIT_MASK(0, gsz - 3); + } + ++#define SMMU_IOTLB_ASID(key) ((key).asid) + #endif +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index a28650c9350..bceba40885c 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -155,6 +155,7 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); + void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry); ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); + void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch b/kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch new file mode 100755 index 0000000000000000000000000000000000000000..ebe3d15142b93e08d98534f1d783601102eccaf9 --- /dev/null +++ b/kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch @@ -0,0 +1,181 @@ +From fbfa584e58a560f27081043ad8e90ee9022421c0 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:27 -0500 +Subject: [PATCH 03/17] hw/arm/smmu-common: Add IOTLB helpers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-3-eperezma@redhat.com> +Patchwork-id: 100595 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 02/13] hw/arm/smmu-common: Add IOTLB helpers +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Add two helpers: one to lookup for a given IOTLB entry and +one to insert a new entry. We also move the tracing there. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-3-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 6808bca939b8722d98165319ba42366ca80de907) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 36 ++++++++++++++++++++++++++++++++++++ + hw/arm/smmuv3.c | 26 ++------------------------ + hw/arm/trace-events | 5 +++-- + include/hw/arm/smmu-common.h | 2 ++ + 4 files changed, 43 insertions(+), 26 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index d2ba8b224ba..8e01505dbee 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -32,6 +32,42 @@ + + /* IOTLB Management */ + ++IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, ++ hwaddr iova) ++{ ++ SMMUIOTLBKey key = {.asid = cfg->asid, .iova = iova}; ++ IOMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); ++ ++ if (entry) { ++ cfg->iotlb_hits++; ++ trace_smmu_iotlb_lookup_hit(cfg->asid, iova, ++ cfg->iotlb_hits, cfg->iotlb_misses, ++ 100 * cfg->iotlb_hits / ++ (cfg->iotlb_hits + cfg->iotlb_misses)); ++ } else { ++ cfg->iotlb_misses++; ++ trace_smmu_iotlb_lookup_miss(cfg->asid, iova, ++ cfg->iotlb_hits, cfg->iotlb_misses, ++ 100 * cfg->iotlb_hits / ++ (cfg->iotlb_hits + cfg->iotlb_misses)); ++ } ++ return entry; ++} ++ ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry) ++{ ++ SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); ++ ++ if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { ++ smmu_iotlb_inv_all(bs); ++ } ++ ++ key->asid = cfg->asid; ++ key->iova = entry->iova; ++ trace_smmu_iotlb_insert(cfg->asid, entry->iova); ++ g_hash_table_insert(bs->iotlb, key, entry); ++} ++ + inline void smmu_iotlb_inv_all(SMMUState *s) + { + trace_smmu_iotlb_inv_all(); +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index e2fbb8357ea..34dea4df4da 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -624,7 +624,6 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + .addr_mask = ~(hwaddr)0, + .perm = IOMMU_NONE, + }; +- SMMUIOTLBKey key, *new_key; + + qemu_mutex_lock(&s->mutex); + +@@ -663,16 +662,8 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + page_mask = (1ULL << (tt->granule_sz)) - 1; + aligned_addr = addr & ~page_mask; + +- key.asid = cfg->asid; +- key.iova = aligned_addr; +- +- cached_entry = g_hash_table_lookup(bs->iotlb, &key); ++ cached_entry = smmu_iotlb_lookup(bs, cfg, aligned_addr); + if (cached_entry) { +- cfg->iotlb_hits++; +- trace_smmu_iotlb_cache_hit(cfg->asid, aligned_addr, +- cfg->iotlb_hits, cfg->iotlb_misses, +- 100 * cfg->iotlb_hits / +- (cfg->iotlb_hits + cfg->iotlb_misses)); + if ((flag & IOMMU_WO) && !(cached_entry->perm & IOMMU_WO)) { + status = SMMU_TRANS_ERROR; + if (event.record_trans_faults) { +@@ -686,16 +677,6 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + goto epilogue; + } + +- cfg->iotlb_misses++; +- trace_smmu_iotlb_cache_miss(cfg->asid, addr & ~page_mask, +- cfg->iotlb_hits, cfg->iotlb_misses, +- 100 * cfg->iotlb_hits / +- (cfg->iotlb_hits + cfg->iotlb_misses)); +- +- if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { +- smmu_iotlb_inv_all(bs); +- } +- + cached_entry = g_new0(IOMMUTLBEntry, 1); + + if (smmu_ptw(cfg, aligned_addr, flag, cached_entry, &ptw_info)) { +@@ -741,10 +722,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + } + status = SMMU_TRANS_ERROR; + } else { +- new_key = g_new0(SMMUIOTLBKey, 1); +- new_key->asid = cfg->asid; +- new_key->iova = aligned_addr; +- g_hash_table_insert(bs->iotlb, new_key, cached_entry); ++ smmu_iotlb_insert(bs, cfg, cached_entry); + status = SMMU_TRANS_SUCCESS; + } + +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 0acedcedc6f..b808a1bfc19 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -14,6 +14,9 @@ smmu_iotlb_inv_all(void) "IOTLB invalidate all" + smmu_iotlb_inv_asid(uint16_t asid) "IOTLB invalidate asid=%d" + smmu_iotlb_inv_iova(uint16_t asid, uint64_t addr) "IOTLB invalidate asid=%d addr=0x%"PRIx64 + smmu_inv_notifiers_mr(const char *name) "iommu mr=%s" ++smmu_iotlb_lookup_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" ++smmu_iotlb_lookup_miss(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache MISS asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" ++smmu_iotlb_insert(uint16_t asid, uint64_t addr) "IOTLB ++ asid=%d addr=0x%"PRIx64 + + # smmuv3.c + smmuv3_read_mmio(uint64_t addr, uint64_t val, unsigned size, uint32_t r) "addr: 0x%"PRIx64" val:0x%"PRIx64" size: 0x%x(%d)" +@@ -46,8 +49,6 @@ smmuv3_cmdq_tlbi_nh_va(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d a + smmuv3_cmdq_tlbi_nh_vaa(int vmid, uint64_t addr) "vmid =%d addr=0x%"PRIx64 + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" +-smmu_iotlb_cache_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" +-smmu_iotlb_cache_miss(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache MISS asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 1f37844e5c9..a28650c9350 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -153,6 +153,8 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + #define SMMU_IOTLB_MAX_SIZE 256 + ++IOMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); ++void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, IOMMUTLBEntry *entry); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); + void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch b/kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch new file mode 100755 index 0000000000000000000000000000000000000000..d973b136e8608af936996f5c2324242f089ee0dc --- /dev/null +++ b/kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch @@ -0,0 +1,124 @@ +From 79718d8c67c9c54fa86a77f66aa8784aca7651d5 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:26 -0500 +Subject: [PATCH 02/17] hw/arm/smmu-common: Factorize some code in + smmu_ptw_64() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-2-eperezma@redhat.com> +Patchwork-id: 100594 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 01/13] hw/arm/smmu-common: Factorize some code in smmu_ptw_64() +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Page and block PTE decoding can share some code. Let's +first handle table PTE and factorize some code shared by +page and block PTEs. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-2-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 1733837d7cdb207653a849a5f1fa78de878c6ac1) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 48 ++++++++++++++++---------------------------- + 1 file changed, 17 insertions(+), 31 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 245817d23e9..d2ba8b224ba 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -187,7 +187,7 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + uint64_t subpage_size = 1ULL << level_shift(level, granule_sz); + uint64_t mask = subpage_size - 1; + uint32_t offset = iova_level_offset(iova, inputsize, level, granule_sz); +- uint64_t pte; ++ uint64_t pte, gpa; + dma_addr_t pte_addr = baseaddr + offset * sizeof(pte); + uint8_t ap; + +@@ -200,56 +200,42 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + if (is_invalid_pte(pte) || is_reserved_pte(pte, level)) { + trace_smmu_ptw_invalid_pte(stage, level, baseaddr, + pte_addr, offset, pte); +- info->type = SMMU_PTW_ERR_TRANSLATION; +- goto error; ++ break; + } + +- if (is_page_pte(pte, level)) { +- uint64_t gpa = get_page_pte_address(pte, granule_sz); ++ if (is_table_pte(pte, level)) { ++ ap = PTE_APTABLE(pte); + +- ap = PTE_AP(pte); + if (is_permission_fault(ap, perm)) { + info->type = SMMU_PTW_ERR_PERMISSION; + goto error; + } +- +- tlbe->translated_addr = gpa + (iova & mask); +- tlbe->perm = PTE_AP_TO_PERM(ap); ++ baseaddr = get_table_pte_address(pte, granule_sz); ++ level++; ++ continue; ++ } else if (is_page_pte(pte, level)) { ++ gpa = get_page_pte_address(pte, granule_sz); + trace_smmu_ptw_page_pte(stage, level, iova, + baseaddr, pte_addr, pte, gpa); +- return 0; +- } +- if (is_block_pte(pte, level)) { ++ } else { + uint64_t block_size; +- hwaddr gpa = get_block_pte_address(pte, level, granule_sz, +- &block_size); +- +- ap = PTE_AP(pte); +- if (is_permission_fault(ap, perm)) { +- info->type = SMMU_PTW_ERR_PERMISSION; +- goto error; +- } + ++ gpa = get_block_pte_address(pte, level, granule_sz, ++ &block_size); + trace_smmu_ptw_block_pte(stage, level, baseaddr, + pte_addr, pte, iova, gpa, + block_size >> 20); +- +- tlbe->translated_addr = gpa + (iova & mask); +- tlbe->perm = PTE_AP_TO_PERM(ap); +- return 0; + } +- +- /* table pte */ +- ap = PTE_APTABLE(pte); +- ++ ap = PTE_AP(pte); + if (is_permission_fault(ap, perm)) { + info->type = SMMU_PTW_ERR_PERMISSION; + goto error; + } +- baseaddr = get_table_pte_address(pte, granule_sz); +- level++; +- } + ++ tlbe->translated_addr = gpa + (iova & mask); ++ tlbe->perm = PTE_AP_TO_PERM(ap); ++ return 0; ++ } + info->type = SMMU_PTW_ERR_TRANSLATION; + + error: +-- +2.27.0 + diff --git a/kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch b/kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch new file mode 100755 index 0000000000000000000000000000000000000000..e118225b9e291aff74944287a6b1467203c3c26a --- /dev/null +++ b/kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch @@ -0,0 +1,274 @@ +From 4770f43dab482e4585d3555933a473cf24e796db Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:30 -0500 +Subject: [PATCH 06/17] hw/arm/smmu-common: Manage IOTLB block entries +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-6-eperezma@redhat.com> +Patchwork-id: 100598 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 05/13] hw/arm/smmu-common: Manage IOTLB block entries +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +At the moment each entry in the IOTLB corresponds to a page sized +mapping (4K, 16K or 64K), even if the page belongs to a mapped +block. In case of block mapping this unefficiently consumes IOTLB +entries. + +Change the value of the entry so that it reflects the actual +mapping it belongs to (block or page start address and size). + +Also the level/tg of the entry is encoded in the key. In subsequent +patches we will enable range invalidation. This latter is able +to provide the level/tg of the entry. + +Encoding the level/tg directly in the key will allow to invalidate +using g_hash_table_remove() when num_pages equals to 1. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-6-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit 9e54dee71fcfaae69f87b8e1f51485a832266a39) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 67 ++++++++++++++++++++++++++---------- + hw/arm/smmu-internal.h | 7 ++++ + hw/arm/smmuv3.c | 6 ++-- + hw/arm/trace-events | 2 +- + include/hw/arm/smmu-common.h | 10 ++++-- + 5 files changed, 67 insertions(+), 25 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 06e9e38b007..8007edeaaa2 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -39,7 +39,7 @@ static guint smmu_iotlb_key_hash(gconstpointer v) + + /* Jenkins hash */ + a = b = c = JHASH_INITVAL + sizeof(*key); +- a += key->asid; ++ a += key->asid + key->level + key->tg; + b += extract64(key->iova, 0, 32); + c += extract64(key->iova, 32, 32); + +@@ -51,24 +51,41 @@ static guint smmu_iotlb_key_hash(gconstpointer v) + + static gboolean smmu_iotlb_key_equal(gconstpointer v1, gconstpointer v2) + { +- const SMMUIOTLBKey *k1 = v1; +- const SMMUIOTLBKey *k2 = v2; ++ SMMUIOTLBKey *k1 = (SMMUIOTLBKey *)v1, *k2 = (SMMUIOTLBKey *)v2; + +- return (k1->asid == k2->asid) && (k1->iova == k2->iova); ++ return (k1->asid == k2->asid) && (k1->iova == k2->iova) && ++ (k1->level == k2->level) && (k1->tg == k2->tg); + } + +-SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova) ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova, ++ uint8_t tg, uint8_t level) + { +- SMMUIOTLBKey key = {.asid = asid, .iova = iova}; ++ SMMUIOTLBKey key = {.asid = asid, .iova = iova, .tg = tg, .level = level}; + + return key; + } + + SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, +- hwaddr iova) ++ SMMUTransTableInfo *tt, hwaddr iova) + { +- SMMUIOTLBKey key = smmu_get_iotlb_key(cfg->asid, iova); +- SMMUTLBEntry *entry = g_hash_table_lookup(bs->iotlb, &key); ++ uint8_t tg = (tt->granule_sz - 10) / 2; ++ uint8_t inputsize = 64 - tt->tsz; ++ uint8_t stride = tt->granule_sz - 3; ++ uint8_t level = 4 - (inputsize - 4) / stride; ++ SMMUTLBEntry *entry = NULL; ++ ++ while (level <= 3) { ++ uint64_t subpage_size = 1ULL << level_shift(level, tt->granule_sz); ++ uint64_t mask = subpage_size - 1; ++ SMMUIOTLBKey key; ++ ++ key = smmu_get_iotlb_key(cfg->asid, iova & ~mask, tg, level); ++ entry = g_hash_table_lookup(bs->iotlb, &key); ++ if (entry) { ++ break; ++ } ++ level++; ++ } + + if (entry) { + cfg->iotlb_hits++; +@@ -89,13 +106,14 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, + void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) + { + SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); ++ uint8_t tg = (new->granule - 10) / 2; + + if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { + smmu_iotlb_inv_all(bs); + } + +- *key = smmu_get_iotlb_key(cfg->asid, new->entry.iova); +- trace_smmu_iotlb_insert(cfg->asid, new->entry.iova); ++ *key = smmu_get_iotlb_key(cfg->asid, new->entry.iova, tg, new->level); ++ trace_smmu_iotlb_insert(cfg->asid, new->entry.iova, tg, new->level); + g_hash_table_insert(bs->iotlb, key, new); + } + +@@ -114,12 +132,26 @@ static gboolean smmu_hash_remove_by_asid(gpointer key, gpointer value, + return SMMU_IOTLB_ASID(*iotlb_key) == asid; + } + +-inline void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova) ++static gboolean smmu_hash_remove_by_asid_iova(gpointer key, gpointer value, ++ gpointer user_data) + { +- SMMUIOTLBKey key = smmu_get_iotlb_key(asid, iova); ++ SMMUTLBEntry *iter = (SMMUTLBEntry *)value; ++ IOMMUTLBEntry *entry = &iter->entry; ++ SMMUIOTLBPageInvInfo *info = (SMMUIOTLBPageInvInfo *)user_data; ++ SMMUIOTLBKey iotlb_key = *(SMMUIOTLBKey *)key; ++ ++ if (info->asid >= 0 && info->asid != SMMU_IOTLB_ASID(iotlb_key)) { ++ return false; ++ } ++ return (info->iova & ~entry->addr_mask) == entry->iova; ++} ++ ++inline void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova) ++{ ++ SMMUIOTLBPageInvInfo info = {.asid = asid, .iova = iova}; + + trace_smmu_iotlb_inv_iova(asid, iova); +- g_hash_table_remove(s->iotlb, &key); ++ g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_asid_iova, &info); + } + + inline void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid) +@@ -247,9 +279,6 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + baseaddr = extract64(tt->ttb, 0, 48); + baseaddr &= ~indexmask; + +- tlbe->entry.iova = iova; +- tlbe->entry.addr_mask = (1 << granule_sz) - 1; +- + while (level <= 3) { + uint64_t subpage_size = 1ULL << level_shift(level, granule_sz); + uint64_t mask = subpage_size - 1; +@@ -299,7 +328,9 @@ static int smmu_ptw_64(SMMUTransCfg *cfg, + goto error; + } + +- tlbe->entry.translated_addr = gpa + (iova & mask); ++ tlbe->entry.translated_addr = gpa; ++ tlbe->entry.iova = iova & ~mask; ++ tlbe->entry.addr_mask = mask; + tlbe->entry.perm = PTE_AP_TO_PERM(ap); + tlbe->level = level; + tlbe->granule = granule_sz; +diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h +index 3104f768cd2..55147f29be4 100644 +--- a/hw/arm/smmu-internal.h ++++ b/hw/arm/smmu-internal.h +@@ -97,4 +97,11 @@ uint64_t iova_level_offset(uint64_t iova, int inputsize, + } + + #define SMMU_IOTLB_ASID(key) ((key).asid) ++ ++typedef struct SMMUIOTLBPageInvInfo { ++ int asid; ++ uint64_t iova; ++ uint64_t mask; ++} SMMUIOTLBPageInvInfo; ++ + #endif +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ad8212779d3..067c9480a03 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -662,7 +662,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion *mr, hwaddr addr, + page_mask = (1ULL << (tt->granule_sz)) - 1; + aligned_addr = addr & ~page_mask; + +- cached_entry = smmu_iotlb_lookup(bs, cfg, aligned_addr); ++ cached_entry = smmu_iotlb_lookup(bs, cfg, tt, aligned_addr); + if (cached_entry) { + if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) { + status = SMMU_TRANS_ERROR; +@@ -732,7 +732,7 @@ epilogue: + case SMMU_TRANS_SUCCESS: + entry.perm = flag; + entry.translated_addr = cached_entry->entry.translated_addr + +- (addr & page_mask); ++ (addr & cached_entry->entry.addr_mask); + entry.addr_mask = cached_entry->entry.addr_mask; + trace_smmuv3_translate_success(mr->parent_obj.name, sid, addr, + entry.translated_addr, entry.perm); +@@ -960,7 +960,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + + trace_smmuv3_cmdq_tlbi_nh_vaa(vmid, addr); + smmuv3_inv_notifiers_iova(bs, -1, addr); +- smmu_iotlb_inv_all(bs); ++ smmu_iotlb_inv_iova(bs, -1, addr); + break; + } + case SMMU_CMD_TLBI_NH_VA: +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index b808a1bfc19..f74d3e920f1 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -16,7 +16,7 @@ smmu_iotlb_inv_iova(uint16_t asid, uint64_t addr) "IOTLB invalidate asid=%d addr + smmu_inv_notifiers_mr(const char *name) "iommu mr=%s" + smmu_iotlb_lookup_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" + smmu_iotlb_lookup_miss(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache MISS asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" +-smmu_iotlb_insert(uint16_t asid, uint64_t addr) "IOTLB ++ asid=%d addr=0x%"PRIx64 ++smmu_iotlb_insert(uint16_t asid, uint64_t addr, uint8_t tg, uint8_t level) "IOTLB ++ asid=%d addr=0x%"PRIx64" tg=%d level=%d" + + # smmuv3.c + smmuv3_read_mmio(uint64_t addr, uint64_t val, unsigned size, uint32_t r) "addr: 0x%"PRIx64" val:0x%"PRIx64" size: 0x%x(%d)" +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 277923bdc0a..bbf3abc41fd 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -97,6 +97,8 @@ typedef struct SMMUPciBus { + typedef struct SMMUIOTLBKey { + uint64_t iova; + uint16_t asid; ++ uint8_t tg; ++ uint8_t level; + } SMMUIOTLBKey; + + typedef struct SMMUState { +@@ -159,12 +161,14 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); + + #define SMMU_IOTLB_MAX_SIZE 256 + +-SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, hwaddr iova); ++SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, ++ SMMUTransTableInfo *tt, hwaddr iova); + void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *entry); +-SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova); ++SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova, ++ uint8_t tg, uint8_t level); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); +-void smmu_iotlb_inv_iova(SMMUState *s, uint16_t asid, dma_addr_t iova); ++void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova); + + /* Unmap the range of all the notifiers registered to any IOMMU mr */ + void smmu_inv_notifiers_all(SMMUState *s); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch b/kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch new file mode 100755 index 0000000000000000000000000000000000000000..79e75d887a9fbed313cac2f61100553ffa054132 --- /dev/null +++ b/kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch @@ -0,0 +1,67 @@ +From 69d71311d3d70282dec3d1f19f9e4b90c7b7c6b9 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:33 -0500 +Subject: [PATCH 09/17] hw/arm/smmuv3: Fix potential integer overflow (CID + 1432363) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-9-eperezma@redhat.com> +Patchwork-id: 100601 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 08/13] hw/arm/smmuv3: Fix potential integer overflow (CID 1432363) +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Philippe Mathieu-Daudé + +Use the BIT_ULL() macro to ensure we use 64-bit arithmetic. +This fixes the following Coverity issue (OVERFLOW_BEFORE_WIDEN): + + CID 1432363 (#1 of 1): Unintentional integer overflow: + + overflow_before_widen: + Potentially overflowing expression 1 << scale with type int + (32 bits, signed) is evaluated using 32-bit arithmetic, and + then used in a context that expects an expression of type + hwaddr (64 bits, unsigned). + +Signed-off-by: Philippe Mathieu-Daudé +Acked-by: Eric Auger +Message-id: 20201030144617.1535064-1-philmd@redhat.com +Reviewed-by: Peter Maydell +Signed-off-by: Peter Maydell +(cherry picked from commit 744a790ec01a30033309e6a2155df4d61061e184) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmuv3.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index f4d5d9d8222..a418fab2aa6 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -17,6 +17,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/bitops.h" + #include "hw/irq.h" + #include "hw/sysbus.h" + #include "migration/vmstate.h" +@@ -847,7 +848,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + scale = CMD_SCALE(cmd); + num = CMD_NUM(cmd); + ttl = CMD_TTL(cmd); +- num_pages = (num + 1) * (1 << (scale)); ++ num_pages = (num + 1) * BIT_ULL(scale); + } + + if (type == SMMU_CMD_TLBI_NH_VA) { +-- +2.27.0 + diff --git a/kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch b/kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch new file mode 100755 index 0000000000000000000000000000000000000000..fd52e0ced3be30e7e0651cfafb54153182140c48 --- /dev/null +++ b/kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch @@ -0,0 +1,255 @@ +From 3f027ac56449e51a61e76c18b97fd341d302dc80 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:32 -0500 +Subject: [PATCH 08/17] hw/arm/smmuv3: Get prepared for range invalidation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-8-eperezma@redhat.com> +Patchwork-id: 100600 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 07/13] hw/arm/smmuv3: Get prepared for range invalidation +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Enhance the smmu_iotlb_inv_iova() helper with range invalidation. +This uses the new fields passed in the NH_VA and NH_VAA commands: +the size of the range, the level and the granule. + +As NH_VA and NH_VAA both use those fields, their decoding and +handling is factorized in a new smmuv3_s1_range_inval() helper. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-8-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit d52915616c059ed273caa2d496b58e5d215c5962) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 25 +++++++++++--- + hw/arm/smmuv3-internal.h | 4 +++ + hw/arm/smmuv3.c | 64 +++++++++++++++++++++++------------- + hw/arm/trace-events | 4 +-- + include/hw/arm/smmu-common.h | 3 +- + 5 files changed, 69 insertions(+), 31 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 8007edeaaa2..9780404f002 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -143,15 +143,30 @@ static gboolean smmu_hash_remove_by_asid_iova(gpointer key, gpointer value, + if (info->asid >= 0 && info->asid != SMMU_IOTLB_ASID(iotlb_key)) { + return false; + } +- return (info->iova & ~entry->addr_mask) == entry->iova; ++ return ((info->iova & ~entry->addr_mask) == entry->iova) || ++ ((entry->iova & ~info->mask) == info->iova); + } + +-inline void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova) ++inline void ++smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages, uint8_t ttl) + { +- SMMUIOTLBPageInvInfo info = {.asid = asid, .iova = iova}; ++ if (ttl && (num_pages == 1)) { ++ SMMUIOTLBKey key = smmu_get_iotlb_key(asid, iova, tg, ttl); + +- trace_smmu_iotlb_inv_iova(asid, iova); +- g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_asid_iova, &info); ++ g_hash_table_remove(s->iotlb, &key); ++ } else { ++ /* if tg is not set we use 4KB range invalidation */ ++ uint8_t granule = tg ? tg * 2 + 10 : 12; ++ ++ SMMUIOTLBPageInvInfo info = { ++ .asid = asid, .iova = iova, ++ .mask = (num_pages * 1 << granule) - 1}; ++ ++ g_hash_table_foreach_remove(s->iotlb, ++ smmu_hash_remove_by_asid_iova, ++ &info); ++ } + } + + inline void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid) +diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h +index d190181ef1b..a4ec2c591cd 100644 +--- a/hw/arm/smmuv3-internal.h ++++ b/hw/arm/smmuv3-internal.h +@@ -298,6 +298,8 @@ enum { /* Command completion notification */ + }; + + #define CMD_TYPE(x) extract32((x)->word[0], 0 , 8) ++#define CMD_NUM(x) extract32((x)->word[0], 12 , 5) ++#define CMD_SCALE(x) extract32((x)->word[0], 20 , 5) + #define CMD_SSEC(x) extract32((x)->word[0], 10, 1) + #define CMD_SSV(x) extract32((x)->word[0], 11, 1) + #define CMD_RESUME_AC(x) extract32((x)->word[0], 12, 1) +@@ -310,6 +312,8 @@ enum { /* Command completion notification */ + #define CMD_RESUME_STAG(x) extract32((x)->word[2], 0 , 16) + #define CMD_RESP(x) extract32((x)->word[2], 11, 2) + #define CMD_LEAF(x) extract32((x)->word[2], 0 , 1) ++#define CMD_TTL(x) extract32((x)->word[2], 8 , 2) ++#define CMD_TG(x) extract32((x)->word[2], 10, 2) + #define CMD_STE_RANGE(x) extract32((x)->word[2], 0 , 5) + #define CMD_ADDR(x) ({ \ + uint64_t high = (uint64_t)(x)->word[3]; \ +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ae2b769f891..f4d5d9d8222 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -773,42 +773,49 @@ epilogue: + * @n: notifier to be called + * @asid: address space ID or negative value if we don't care + * @iova: iova ++ * @tg: translation granule (if communicated through range invalidation) ++ * @num_pages: number of @granule sized pages (if tg != 0), otherwise 1 + */ + static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + IOMMUNotifier *n, +- int asid, +- dma_addr_t iova) ++ int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); +- SMMUEventInfo event = {.inval_ste_allowed = true}; +- SMMUTransTableInfo *tt; +- SMMUTransCfg *cfg; + IOMMUTLBEntry entry; ++ uint8_t granule = tg; + +- cfg = smmuv3_get_config(sdev, &event); +- if (!cfg) { +- return; +- } ++ if (!tg) { ++ SMMUEventInfo event = {.inval_ste_allowed = true}; ++ SMMUTransCfg *cfg = smmuv3_get_config(sdev, &event); ++ SMMUTransTableInfo *tt; + +- if (asid >= 0 && cfg->asid != asid) { +- return; +- } ++ if (!cfg) { ++ return; ++ } + +- tt = select_tt(cfg, iova); +- if (!tt) { +- return; ++ if (asid >= 0 && cfg->asid != asid) { ++ return; ++ } ++ ++ tt = select_tt(cfg, iova); ++ if (!tt) { ++ return; ++ } ++ granule = tt->granule_sz; + } + + entry.target_as = &address_space_memory; + entry.iova = iova; +- entry.addr_mask = (1 << tt->granule_sz) - 1; ++ entry.addr_mask = num_pages * (1 << granule) - 1; + entry.perm = IOMMU_NONE; + + memory_region_notify_one(n, &entry); + } + +-/* invalidate an asid/iova tuple in all mr's */ +-static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) ++/* invalidate an asid/iova range tuple in all mr's */ ++static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev; + +@@ -816,28 +823,39 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) + IOMMUMemoryRegion *mr = &sdev->iommu; + IOMMUNotifier *n; + +- trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, iova); ++ trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, iova, ++ tg, num_pages); + + IOMMU_NOTIFIER_FOREACH(n, mr) { +- smmuv3_notify_iova(mr, n, asid, iova); ++ smmuv3_notify_iova(mr, n, asid, iova, tg, num_pages); + } + } + } + + static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + { ++ uint8_t scale = 0, num = 0, ttl = 0; + dma_addr_t addr = CMD_ADDR(cmd); + uint8_t type = CMD_TYPE(cmd); + uint16_t vmid = CMD_VMID(cmd); + bool leaf = CMD_LEAF(cmd); ++ uint8_t tg = CMD_TG(cmd); ++ hwaddr num_pages = 1; + int asid = -1; + ++ if (tg) { ++ scale = CMD_SCALE(cmd); ++ num = CMD_NUM(cmd); ++ ttl = CMD_TTL(cmd); ++ num_pages = (num + 1) * (1 << (scale)); ++ } ++ + if (type == SMMU_CMD_TLBI_NH_VA) { + asid = CMD_ASID(cmd); + } +- trace_smmuv3_s1_range_inval(vmid, asid, addr, leaf); +- smmuv3_inv_notifiers_iova(s, asid, addr); +- smmu_iotlb_inv_iova(s, asid, addr); ++ trace_smmuv3_s1_range_inval(vmid, asid, addr, tg, num_pages, ttl, leaf); ++ smmuv3_inv_notifiers_iova(s, asid, addr, tg, num_pages); ++ smmu_iotlb_inv_iova(s, asid, addr, tg, num_pages, ttl); + } + + static int smmuv3_cmdq_consume(SMMUv3State *s) +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index c219fe9e828..3d905e0f7d0 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -45,11 +45,11 @@ smmuv3_cmdq_cfgi_ste_range(int start, int end) "start=0x%d - end=0x%d" + smmuv3_cmdq_cfgi_cd(uint32_t sid) "streamid = %d" + smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache HIT for sid %d (hits=%d, misses=%d, hit rate=%d)" + smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid %d (hits=%d, misses=%d, hit rate=%d)" +-smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d" ++smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, uint8_t tg, uint64_t num_pages, uint8_t ttl, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" ttl=%d leaf=%d" + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +-smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova) "iommu mr=%s asid=%d iova=0x%"PRIx64 ++smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 + +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index bbf3abc41fd..13489a1ac0d 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -168,7 +168,8 @@ SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint64_t iova, + uint8_t tg, uint8_t level); + void smmu_iotlb_inv_all(SMMUState *s); + void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid); +-void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova); ++void smmu_iotlb_inv_iova(SMMUState *s, int asid, dma_addr_t iova, ++ uint8_t tg, uint64_t num_pages, uint8_t ttl); + + /* Unmap the range of all the notifiers registered to any IOMMU mr */ + void smmu_inv_notifiers_all(SMMUState *s); +-- +2.27.0 + diff --git a/kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch b/kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch new file mode 100755 index 0000000000000000000000000000000000000000..e77c403a51d707bdecc24b334f8fdc983994417a --- /dev/null +++ b/kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch @@ -0,0 +1,115 @@ +From c4ae2dbb8ee406f0a015b35fb76b3d6d131900d6 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:31 -0500 +Subject: [PATCH 07/17] hw/arm/smmuv3: Introduce smmuv3_s1_range_inval() helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-7-eperezma@redhat.com> +Patchwork-id: 100599 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 06/13] hw/arm/smmuv3: Introduce smmuv3_s1_range_inval() helper +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +From: Eric Auger + +Let's introduce an helper for S1 IOVA range invalidation. +This will be used for NH_VA and NH_VAA commands. It decodes +the same fields, trace, calls the UNMAP notifiers and +invalidate the corresponding IOTLB entries. + +At the moment, we do not support 3.2 range invalidation yet. +So it reduces to a single IOVA invalidation. + +Note the leaf bit now is also decoded for the CMD_TLBI_NH_VAA +command. At the moment it is only used for tracing. + +Signed-off-by: Eric Auger +Reviewed-by: Peter Maydell +Message-id: 20200728150815.11446-7-eric.auger@redhat.com +Signed-off-by: Peter Maydell +(cherry picked from commit c0f9ef70377cfcbd0fa6559d5dc729a930d71b7c) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmuv3.c | 36 +++++++++++++++++------------------- + hw/arm/trace-events | 3 +-- + 2 files changed, 18 insertions(+), 21 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 067c9480a03..ae2b769f891 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -824,6 +824,22 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) + } + } + ++static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) ++{ ++ dma_addr_t addr = CMD_ADDR(cmd); ++ uint8_t type = CMD_TYPE(cmd); ++ uint16_t vmid = CMD_VMID(cmd); ++ bool leaf = CMD_LEAF(cmd); ++ int asid = -1; ++ ++ if (type == SMMU_CMD_TLBI_NH_VA) { ++ asid = CMD_ASID(cmd); ++ } ++ trace_smmuv3_s1_range_inval(vmid, asid, addr, leaf); ++ smmuv3_inv_notifiers_iova(s, asid, addr); ++ smmu_iotlb_inv_iova(s, asid, addr); ++} ++ + static int smmuv3_cmdq_consume(SMMUv3State *s) + { + SMMUState *bs = ARM_SMMU(s); +@@ -954,27 +970,9 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + smmu_iotlb_inv_all(bs); + break; + case SMMU_CMD_TLBI_NH_VAA: +- { +- dma_addr_t addr = CMD_ADDR(&cmd); +- uint16_t vmid = CMD_VMID(&cmd); +- +- trace_smmuv3_cmdq_tlbi_nh_vaa(vmid, addr); +- smmuv3_inv_notifiers_iova(bs, -1, addr); +- smmu_iotlb_inv_iova(bs, -1, addr); +- break; +- } + case SMMU_CMD_TLBI_NH_VA: +- { +- uint16_t asid = CMD_ASID(&cmd); +- uint16_t vmid = CMD_VMID(&cmd); +- dma_addr_t addr = CMD_ADDR(&cmd); +- bool leaf = CMD_LEAF(&cmd); +- +- trace_smmuv3_cmdq_tlbi_nh_va(vmid, asid, addr, leaf); +- smmuv3_inv_notifiers_iova(bs, asid, addr); +- smmu_iotlb_inv_iova(bs, asid, addr); ++ smmuv3_s1_range_inval(bs, &cmd); + break; +- } + case SMMU_CMD_TLBI_EL3_ALL: + case SMMU_CMD_TLBI_EL3_VA: + case SMMU_CMD_TLBI_EL2_ALL: +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index f74d3e920f1..c219fe9e828 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -45,8 +45,7 @@ smmuv3_cmdq_cfgi_ste_range(int start, int end) "start=0x%d - end=0x%d" + smmuv3_cmdq_cfgi_cd(uint32_t sid) "streamid = %d" + smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache HIT for sid %d (hits=%d, misses=%d, hit rate=%d)" + smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid %d (hits=%d, misses=%d, hit rate=%d)" +-smmuv3_cmdq_tlbi_nh_va(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d" +-smmuv3_cmdq_tlbi_nh_vaa(int vmid, uint64_t addr) "vmid =%d addr=0x%"PRIx64 ++smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d" + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d" +-- +2.27.0 + diff --git a/kvm-hw-ehci-check-return-value-of-usb_packet_map.patch b/kvm-hw-ehci-check-return-value-of-usb_packet_map.patch new file mode 100755 index 0000000000000000000000000000000000000000..3e3ed87389c7dde8d5b1627ba8c0c3c3c5ecfc1e --- /dev/null +++ b/kvm-hw-ehci-check-return-value-of-usb_packet_map.patch @@ -0,0 +1,61 @@ +From 6955223aa15ab6ea53322218ec03fb3dc2b776f8 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 14 Jan 2021 00:07:05 -0500 +Subject: [PATCH 16/17] hw: ehci: check return value of 'usb_packet_map' + +RH-Author: Jon Maloy +Message-id: <20210114000705.945169-2-jmaloy@redhat.com> +Patchwork-id: 100634 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] hw: ehci: check return value of 'usb_packet_map' +Bugzilla: 1898628 +RH-Acked-by: Gerd Hoffmann +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Li Qiang + +If 'usb_packet_map' fails, we should stop to process the usb +request. + +Signed-off-by: Li Qiang +Message-Id: <20200812161727.29412-1-liq3ea@163.com> +Signed-off-by: Gerd Hoffmann + +(cherry picked from commit 2fdb42d840400d58f2e706ecca82c142b97bcbd6) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/hcd-ehci.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c +index 56ab2f457f4..024b1ed6b67 100644 +--- a/hw/usb/hcd-ehci.c ++++ b/hw/usb/hcd-ehci.c +@@ -1374,7 +1374,10 @@ static int ehci_execute(EHCIPacket *p, const char *action) + spd = (p->pid == USB_TOKEN_IN && NLPTR_TBIT(p->qtd.altnext) == 0); + usb_packet_setup(&p->packet, p->pid, ep, 0, p->qtdaddr, spd, + (p->qtd.token & QTD_TOKEN_IOC) != 0); +- usb_packet_map(&p->packet, &p->sgl); ++ if (usb_packet_map(&p->packet, &p->sgl)) { ++ qemu_sglist_destroy(&p->sgl); ++ return -1; ++ } + p->async = EHCI_ASYNC_INITIALIZED; + } + +@@ -1453,7 +1456,10 @@ static int ehci_process_itd(EHCIState *ehci, + if (ep && ep->type == USB_ENDPOINT_XFER_ISOC) { + usb_packet_setup(&ehci->ipacket, pid, ep, 0, addr, false, + (itd->transact[i] & ITD_XACT_IOC) != 0); +- usb_packet_map(&ehci->ipacket, &ehci->isgl); ++ if (usb_packet_map(&ehci->ipacket, &ehci->isgl)) { ++ qemu_sglist_destroy(&ehci->isgl); ++ return -1; ++ } + usb_handle_packet(dev, &ehci->ipacket); + usb_packet_unmap(&ehci->ipacket, &ehci->isgl); + } else { +-- +2.27.0 + diff --git a/kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch b/kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch new file mode 100755 index 0000000000000000000000000000000000000000..650555cacb44991df270fb3482b4fc2740638bb7 --- /dev/null +++ b/kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch @@ -0,0 +1,80 @@ +From dad4f9beaa3fd1eec1e0dd46c3d5cd2f444c0f48 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 20:05:51 -0400 +Subject: [PATCH 1/7] hw/intc/arm_gic: Fix interrupt ID in GICD_SGIR register +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210413200551.3825495-2-jmaloy@redhat.com> +Patchwork-id: 101471 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] hw/intc/arm_gic: Fix interrupt ID in GICD_SGIR register +Bugzilla: 1925430 +RH-Acked-by: Andrew Jones +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Philippe Mathieu-Daudé + +Per the ARM Generic Interrupt Controller Architecture specification +(document "ARM IHI 0048B.b (ID072613)"), the SGIINTID field is 4 bit, +not 10: + + - 4.3 Distributor register descriptions + - 4.3.15 Software Generated Interrupt Register, GICD_SG + + - Table 4-21 GICD_SGIR bit assignments + + The Interrupt ID of the SGI to forward to the specified CPU + interfaces. The value of this field is the Interrupt ID, in + the range 0-15, for example a value of 0b0011 specifies + Interrupt ID 3. + +Correct the irq mask to fix an undefined behavior (which eventually +lead to a heap-buffer-overflow, see [Buglink]): + + $ echo 'writel 0x8000f00 0xff4affb0' | qemu-system-aarch64 -M virt,accel=qtest -qtest stdio + [I 1612088147.116987] OPENED + [R +0.278293] writel 0x8000f00 0xff4affb0 + ../hw/intc/arm_gic.c:1498:13: runtime error: index 944 out of bounds for type 'uint8_t [16][8]' + SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior ../hw/intc/arm_gic.c:1498:13 + +This fixes a security issue when running with KVM on Arm with +kernel-irqchip=off. (The default is kernel-irqchip=on, which is +unaffected, and which is also the correct choice for performance.) + +Cc: qemu-stable@nongnu.org +Fixes: CVE-2021-20221 +Fixes: 9ee6e8bb853 ("ARMv7 support.") +Buglink: https://bugs.launchpad.net/qemu/+bug/1913916 +Buglink: https://bugs.launchpad.net/qemu/+bug/1913917 +Reported-by: Alexander Bulekov +Signed-off-by: Philippe Mathieu-Daudé +Message-id: 20210131103401.217160-1-f4bug@amsat.org +Reviewed-by: Peter Maydell +Signed-off-by: Peter Maydell + +(cherry picked from commit edfe2eb4360cde4ed5d95bda7777edcb3510f76a) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/arm_gic.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c +index 1d7da7baa2..df355f4d11 100644 +--- a/hw/intc/arm_gic.c ++++ b/hw/intc/arm_gic.c +@@ -1455,7 +1455,7 @@ static void gic_dist_writel(void *opaque, hwaddr offset, + int target_cpu; + + cpu = gic_get_current_cpu(s); +- irq = value & 0x3ff; ++ irq = value & 0xf; + switch ((value >> 24) & 3) { + case 0: + mask = (value >> 16) & ALL_CPU_MASK; +-- +2.27.0 + diff --git a/kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch b/kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch new file mode 100755 index 0000000000000000000000000000000000000000..cf9f6ab2af944332832fe1e4b92449bb72dfae84 --- /dev/null +++ b/kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch @@ -0,0 +1,62 @@ +From d48034cc2b331313995c1d19060decc0e5ca1356 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 14 Jan 2021 01:35:41 -0500 +Subject: [PATCH 17/17] hw/net/e1000e: advance desc_offset in case of null + descriptor +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210114013541.956735-2-jmaloy@redhat.com> +Patchwork-id: 100638 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] hw/net/e1000e: advance desc_offset in case of null descriptor +Bugzilla: 1903070 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Thomas Huth + +From: Prasad J Pandit + +While receiving packets via e1000e_write_packet_to_guest() routine, +'desc_offset' is advanced only when RX descriptor is processed. And +RX descriptor is not processed if it has NULL buffer address. +This may lead to an infinite loop condition. Increament 'desc_offset' +to process next descriptor in the ring to avoid infinite loop. + +Reported-by: Cheol-woo Myung <330cjfdn@gmail.com> +Signed-off-by: Prasad J Pandit +Signed-off-by: Jason Wang + +(cherry picked from c2cb511634012344e3d0fe49a037a33b12d8a98a) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/e1000e_core.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c +index 9b76f82db5b..166054f2e3f 100644 +--- a/hw/net/e1000e_core.c ++++ b/hw/net/e1000e_core.c +@@ -1596,13 +1596,13 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt, + (const char *) &fcs_pad, e1000x_fcs_len(core->mac)); + } + } +- desc_offset += desc_size; +- if (desc_offset >= total_size) { +- is_last = true; +- } + } else { /* as per intel docs; skip descriptors with null buf addr */ + trace_e1000e_rx_null_descriptor(); + } ++ desc_offset += desc_size; ++ if (desc_offset >= total_size) { ++ is_last = true; ++ } + + e1000e_write_rx_descr(core, desc, is_last ? core->rx_pkt : NULL, + rss_info, do_ps ? ps_hdr_len : 0, &bastate.written); +-- +2.27.0 + diff --git a/kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch b/kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch new file mode 100755 index 0000000000000000000000000000000000000000..228bdff4b1a9428301d36eea26e8cd9f413fa852 --- /dev/null +++ b/kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch @@ -0,0 +1,56 @@ +From 94ca0eddc117b57da009dacb19740fc8ae00143a Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Mon, 28 Sep 2020 18:27:35 -0400 +Subject: [PATCH] hw/net/net_tx_pkt: fix assertion failure in + net_tx_pkt_add_raw_fragment() + +RH-Author: Jon Maloy +Message-id: <20200928182735.1008839-2-jmaloy@redhat.com> +Patchwork-id: 98497 +O-Subject: [RHEL-8.0.0 qemu-kvm PATCH 1/1] hw/net/net_tx_pkt: fix assertion failure in net_tx_pkt_add_raw_fragment() +Bugzilla: 1860994 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Xiao Wang +RH-Acked-by: Thomas Huth +RH-Acked-by: Stefan Hajnoczi + +From: Mauro Matteo Cascella + +An assertion failure issue was found in the code that processes network packets +while adding data fragments into the packet context. It could be abused by a +malicious guest to abort the QEMU process on the host. This patch replaces the +affected assert() with a conditional statement, returning false if the current +data fragment exceeds max_raw_frags. + +Reported-by: Alexander Bulekov +Reported-by: Ziming Zhang +Reviewed-by: Dmitry Fleytman +Signed-off-by: Mauro Matteo Cascella +Signed-off-by: Jason Wang + +(cherry picked from commit 035e69b063835a5fd23cacabd63690a3d84532a8) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/net_tx_pkt.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c +index 162f802dd77..54d4c3bbd02 100644 +--- a/hw/net/net_tx_pkt.c ++++ b/hw/net/net_tx_pkt.c +@@ -379,7 +379,10 @@ bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa, + hwaddr mapped_len = 0; + struct iovec *ventry; + assert(pkt); +- assert(pkt->max_raw_frags > pkt->raw_frags); ++ ++ if (pkt->raw_frags >= pkt->max_raw_frags) { ++ return false; ++ } + + if (!len) { + return true; +-- +2.27.0 + diff --git a/kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch b/kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch new file mode 100755 index 0000000000000000000000000000000000000000..2f4f6dd43a7e3df3716d39ee7ab61fe1bf2cb445 --- /dev/null +++ b/kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch @@ -0,0 +1,77 @@ +From fe8a9f211fba3588d60507b3d2f48c41d8ee3c79 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Mon, 4 May 2020 21:25:04 +0100 +Subject: [PATCH 1/9] hw/pci/pcie: Forbid hot-plug if it's disabled on the slot + +RH-Author: Julia Suvorova +Message-id: <20200504212505.15977-2-jusual@redhat.com> +Patchwork-id: 96257 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] hw/pci/pcie: Forbid hot-plug if it's disabled on the slot +Bugzilla: 1820531 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Peter Xu + +Raise an error when trying to hot-plug/unplug a device through QMP to a device +with disabled hot-plug capability. This makes the device behaviour more +consistent and provides an explanation of the failure in the case of +asynchronous unplug. + +Signed-off-by: Julia Suvorova +Message-Id: <20200427182440.92433-2-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Marcel Apfelbaum +(cherry picked from commit 0501e1aa1d32a6e02dd06a79bba97fbe9d557cb5) +Signed-off-by: Danilo C. L. de Paula +--- + hw/pci/pcie.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index 0eb3a2a..6b48d04 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -415,6 +415,7 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + { + PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); + uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; ++ uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); + PCIDevice *pci_dev = PCI_DEVICE(dev); + + /* Don't send event when device is enabled during qemu machine creation: +@@ -430,6 +431,13 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + return; + } + ++ /* Check if hot-plug is disabled on the slot */ ++ if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { ++ error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", ++ DEVICE(hotplug_pdev)->id); ++ return; ++ } ++ + /* To enable multifunction hot-plug, we just ensure the function + * 0 added last. When function 0 is added, we set the sltsta and + * inform OS via event notification. +@@ -470,6 +478,17 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + Error *local_err = NULL; + PCIDevice *pci_dev = PCI_DEVICE(dev); + PCIBus *bus = pci_get_bus(pci_dev); ++ PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); ++ uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; ++ uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); ++ ++ /* Check if hot-unplug is disabled on the slot */ ++ if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { ++ error_setg(errp, "Hot-unplug failed: " ++ "unsupported by the port device '%s'", ++ DEVICE(hotplug_pdev)->id); ++ return; ++ } + + pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, &local_err); + if (local_err) { +-- +1.8.3.1 + diff --git a/kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch b/kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch new file mode 100755 index 0000000000000000000000000000000000000000..0c44c77631b88bf05c6fc4b92a218e24d9cbe131 --- /dev/null +++ b/kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch @@ -0,0 +1,90 @@ +From 035f8aaabf2c31cd6206bff6da23a12fee69d1b7 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Tue, 16 Jun 2020 12:25:36 -0400 +Subject: [PATCH 1/3] hw/pci/pcie: Move hot plug capability check to pre_plug + callback + +RH-Author: Julia Suvorova +Message-id: <20200616122536.1027685-1-jusual@redhat.com> +Patchwork-id: 97548 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] hw/pci/pcie: Move hot plug capability check to pre_plug callback +Bugzilla: 1820531 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Auger Eric +RH-Acked-by: Sergio Lopez Pascual + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1820531 +BRANCH: rhel-av-8.2.1 +UPSTREAM: merged +BREW: 29422092 + +Check for hot plug capability earlier to avoid removing devices attached +during the initialization process. + +Run qemu with an unattached drive: + -drive file=$FILE,if=none,id=drive0 \ + -device pcie-root-port,id=rp0,slot=3,bus=pcie.0,hotplug=off +Hotplug a block device: + device_add virtio-blk-pci,id=blk0,drive=drive0,bus=rp0 +If hotplug fails on plug_cb, drive0 will be deleted. + +Fixes: 0501e1aa1d32a6 ("hw/pci/pcie: Forbid hot-plug if it's disabled on the slot") + +Acked-by: Igor Mammedov +Signed-off-by: Julia Suvorova +Message-Id: <20200604125947.881210-1-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 0dabc0f6544f2c0310546f6d6cf3b68979580a9c) +Signed-off-by: Eduardo Lima (Etrunko) +--- + hw/pci/pcie.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index abc99b6eff..1386dd228c 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -407,6 +407,17 @@ static void pcie_cap_slot_plug_common(PCIDevice *hotplug_dev, DeviceState *dev, + void pcie_cap_slot_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) + { ++ PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); ++ uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; ++ uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); ++ ++ /* Check if hot-plug is disabled on the slot */ ++ if (dev->hotplugged && (sltcap & PCI_EXP_SLTCAP_HPC) == 0) { ++ error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", ++ DEVICE(hotplug_pdev)->id); ++ return; ++ } ++ + pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, errp); + } + +@@ -415,7 +426,6 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + { + PCIDevice *hotplug_pdev = PCI_DEVICE(hotplug_dev); + uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; +- uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); + PCIDevice *pci_dev = PCI_DEVICE(dev); + + /* Don't send event when device is enabled during qemu machine creation: +@@ -431,13 +441,6 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + return; + } + +- /* Check if hot-plug is disabled on the slot */ +- if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { +- error_setg(errp, "Hot-plug failed: unsupported by the port device '%s'", +- DEVICE(hotplug_pdev)->id); +- return; +- } +- + /* To enable multifunction hot-plug, we just ensure the function + * 0 added last. When function 0 is added, we set the sltsta and + * inform OS via event notification. +-- +2.27.0 + diff --git a/kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch b/kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch new file mode 100755 index 0000000000000000000000000000000000000000..51a587fedc1d4cf8d2b99dc7103913461e7d8887 --- /dev/null +++ b/kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch @@ -0,0 +1,62 @@ +From f98a1fdad0aa53337925ac46b73a3e6ad36f6295 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Mon, 4 May 2020 21:25:05 +0100 +Subject: [PATCH 2/9] hw/pci/pcie: Replace PCI_DEVICE() casts with existing + variable + +RH-Author: Julia Suvorova +Message-id: <20200504212505.15977-3-jusual@redhat.com> +Patchwork-id: 96259 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] hw/pci/pcie: Replace PCI_DEVICE() casts with existing variable +Bugzilla: 1820531 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Peter Xu + +A little cleanup is possible because of hotplug_pdev introduction. + +Signed-off-by: Julia Suvorova +Message-Id: <20200427182440.92433-3-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Marcel Apfelbaum +(cherry picked from commit 6a1e073378353eb6ac0565e0dc649b3db76ed5dc) +Signed-off-by: Danilo C. L. de Paula +--- + hw/pci/pcie.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index 6b48d04..abc99b6 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -449,7 +449,7 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + pci_word_test_and_set_mask(exp_cap + PCI_EXP_LNKSTA, + PCI_EXP_LNKSTA_DLLLA); + } +- pcie_cap_slot_event(PCI_DEVICE(hotplug_dev), ++ pcie_cap_slot_event(hotplug_pdev, + PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + } + } +@@ -490,7 +490,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + return; + } + +- pcie_cap_slot_plug_common(PCI_DEVICE(hotplug_dev), dev, &local_err); ++ pcie_cap_slot_plug_common(hotplug_pdev, dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; +@@ -509,7 +509,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + return; + } + +- pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev)); ++ pcie_cap_slot_push_attention_button(hotplug_pdev); + } + + /* pci express slot for pci express root/downstream port +-- +1.8.3.1 + diff --git a/kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch b/kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch new file mode 100755 index 0000000000000000000000000000000000000000..9ea4c38f177493520f3e8ede7dc2b8c522beb608 --- /dev/null +++ b/kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch @@ -0,0 +1,61 @@ +From 60b05771b8afc08e0ca9956658d2c55cd1739652 Mon Sep 17 00:00:00 2001 +From: Mauro Matteo Cascella +Date: Thu, 4 Nov 2021 17:31:38 +0100 +Subject: [PATCH 1/2] hw/scsi/scsi-disk: MODE_PAGE_ALLS not allowed in MODE + SELECT commands +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +RH-MergeRequest: 70: hw/scsi/scsi-disk: MODE_PAGE_ALLS not allowed in MODE SELECT commands +RH-Commit: [1/1] bd3de8bdf48aa6c522612505d08c23dafb122a34 (jmaloy/qemu-kvm) +RH-Bugzilla: 2025605 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Laurent Vivier +RH-Acked-by: Philippe Mathieu-Daudé + +This avoids an off-by-one read of 'mode_sense_valid' buffer in +hw/scsi/scsi-disk.c:mode_sense_page(). + +Fixes: CVE-2021-3930 +Cc: qemu-stable@nongnu.org +Reported-by: Alexander Bulekov +Fixes: a8f4bbe2900 ("scsi-disk: store valid mode pages in a table") +Fixes: #546 +Reported-by: Qiuhao Li +Signed-off-by: Mauro Matteo Cascella +Signed-off-by: Paolo Bonzini +(cherry picked from commit b3af7fdf9cc537f8f0dd3e2423d83f5c99a457e8) +Signed-off-by: Jon Maloy +--- + hw/scsi/scsi-disk.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c +index 5cb5fd35bd..1d0ea72289 100644 +--- a/hw/scsi/scsi-disk.c ++++ b/hw/scsi/scsi-disk.c +@@ -1086,6 +1086,7 @@ static int mode_sense_page(SCSIDiskState *s, int page, uint8_t **p_outbuf, + uint8_t *p = *p_outbuf + 2; + int length; + ++ assert(page < ARRAY_SIZE(mode_sense_valid)); + if ((mode_sense_valid[page] & (1 << s->qdev.type)) == 0) { + return -1; + } +@@ -1427,6 +1428,11 @@ static int scsi_disk_check_mode_select(SCSIDiskState *s, int page, + return -1; + } + ++ /* MODE_PAGE_ALLS is only valid for MODE SENSE commands */ ++ if (page == MODE_PAGE_ALLS) { ++ return -1; ++ } ++ + p = mode_current; + memset(mode_current, 0, inlen + 2); + len = mode_sense_page(s, page, &p, 0); +-- +2.27.0 + diff --git a/kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch b/kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch new file mode 100755 index 0000000000000000000000000000000000000000..0f0f1261e21da95143290b1c5ff8901989a8b49a --- /dev/null +++ b/kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch @@ -0,0 +1,262 @@ +From e6c3fbfc82863180007569cf2a9132c28a47bf1f Mon Sep 17 00:00:00 2001 +From: "Daniel P. Berrange" +Date: Mon, 20 Jan 2020 16:13:08 +0000 +Subject: [PATCH 01/18] hw/smbios: set new default SMBIOS fields for Windows + driver support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Daniel P. Berrange +Message-id: <20200120161308.584989-2-berrange@redhat.com> +Patchwork-id: 93422 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] hw/smbios: set new default SMBIOS fields for Windows driver support +Bugzilla: 1782529 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Igor Mammedov +RH-Acked-by: Laszlo Ersek + +For Windows driver support, we have to follow this doc in order to +enable Windows to automatically determine the right drivers to install +for a given guest / host combination: + + https://docs.microsoft.com/en-us/windows-hardware/drivers/install/specifying-hardware-ids-for-a-computer + +Out of the choices available, it was decided that the Windows drivers +will be written to expect use of the scheme documented as "HardwareID-6" +against Windows 10. This uses SMBIOS System (Type 1) and Base Board +(Type 2) tables and will match on + + System Manufacturer = Red Hat + System SKU Number = 8.2.0 + Baseboard Manufacturer = Red Hat + Baseboard Product = RHEL-AV + +The new SMBIOS fields will be tied to machine type and only reported for +pc-q35-8.2.0 machine and later. + +The old SMBIOS fields, previously reported by all machines were: + + System Manufacturer: Red Hat + System Product Name: KVM + System Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + System Family: Red Hat Enterprise Linux + Baseboard Manufacturer: Red Hat + Baseboard Product Name: KVM + Baseboard Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + Chassis Manufacturer: Red Hat + Chassis Product Name: KVM + Chassis Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + Processor Manufacturer: Red Hat + Processor Product Name: KVM + Processor Version: RHEL-8.2.0 PC (Q35 + ICH9, 2009) + +This information will continue to be reported for all machines, except +where it conflicts with the requirement of the new SMBIOS data. IOW, +the "Baseboard Product Name" will change to "RHEL-AV" for pc-q35-8.2.0 +machine types and later. + +Management applications MUST NEVER override the 4 new SMBIOS fields that +are used for Windows driver matching, with differing values. Aside from +this, they are free to override any other field, including those from +the old SMBIOS field data. + +In particular if a management application wants to report its own +product name and version, it is recommended to use "System product" +and "System version" as identifying fields, as these avoid a clash with +the new SMBIOS fields used for Windows drivers. + +Note that until now the Baseboard (type 2) table has only been generated +by QEMU if explicitly asked for on the CLI. This patch makes it always +present for new machine types. + +Signed-off-by: Daniel P. Berrangé +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/virt.c | 2 +- + hw/i386/pc_piix.c | 2 ++ + hw/i386/pc_q35.c | 8 ++++++++ + hw/smbios/smbios.c | 45 +++++++++++++++++++++++++++++++++++++++++--- + include/hw/firmware/smbios.h | 5 ++++- + include/hw/i386/pc.h | 3 +++ + 6 files changed, 60 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index d30d38c..2dcf6e7 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -1423,7 +1423,7 @@ static void virt_build_smbios(VirtMachineState *vms) + + smbios_set_defaults("QEMU", product, + vmc->smbios_old_sys_ver ? "1.0" : mc->name, false, +- true, SMBIOS_ENTRY_POINT_30); ++ true, NULL, NULL, SMBIOS_ENTRY_POINT_30); + + smbios_get_tables(MACHINE(vms), NULL, 0, &smbios_tables, &smbios_tables_len, + &smbios_anchor, &smbios_anchor_len); +diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c +index bd7fdb9..2ac94d5 100644 +--- a/hw/i386/pc_piix.c ++++ b/hw/i386/pc_piix.c +@@ -177,6 +177,8 @@ static void pc_init1(MachineState *machine, + smbios_set_defaults("Red Hat", "KVM", + mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, ++ pcmc->smbios_stream_product, ++ pcmc->smbios_stream_version, + SMBIOS_ENTRY_POINT_21); + } + +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 7531d8e..e975643 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -200,6 +200,8 @@ static void pc_q35_init(MachineState *machine) + smbios_set_defaults("Red Hat", "KVM", + mc->desc, pcmc->smbios_legacy_mode, + pcmc->smbios_uuid_encoded, ++ pcmc->smbios_stream_product, ++ pcmc->smbios_stream_version, + SMBIOS_ENTRY_POINT_21); + } + +@@ -565,8 +567,11 @@ static void pc_q35_init_rhel820(MachineState *machine) + + static void pc_q35_machine_rhel820_options(MachineClass *m) + { ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_machine_rhel_options(m); + m->desc = "RHEL-8.2.0 PC (Q35 + ICH9, 2009)"; ++ pcmc->smbios_stream_product = "RHEL-AV"; ++ pcmc->smbios_stream_version = "8.2.0"; + } + + DEFINE_PC_MACHINE(q35_rhel820, "pc-q35-rhel8.2.0", pc_q35_init_rhel820, +@@ -579,9 +584,12 @@ static void pc_q35_init_rhel810(MachineState *machine) + + static void pc_q35_machine_rhel810_options(MachineClass *m) + { ++ PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_machine_rhel820_options(m); + m->desc = "RHEL-8.1.0 PC (Q35 + ICH9, 2009)"; + m->alias = NULL; ++ pcmc->smbios_stream_product = NULL; ++ pcmc->smbios_stream_version = NULL; + compat_props_add(m->compat_props, hw_compat_rhel_8_1, hw_compat_rhel_8_1_len); + compat_props_add(m->compat_props, pc_rhel_8_1_compat, pc_rhel_8_1_compat_len); + } +diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c +index e6e9355..d65c149 100644 +--- a/hw/smbios/smbios.c ++++ b/hw/smbios/smbios.c +@@ -57,6 +57,9 @@ static bool smbios_legacy = true; + static bool smbios_uuid_encoded = true; + /* end: legacy structures & constants for <= 2.0 machines */ + ++/* Set to true for modern Windows 10 HardwareID-6 compat */ ++static bool smbios_type2_required; ++ + + uint8_t *smbios_tables; + size_t smbios_tables_len; +@@ -532,7 +535,7 @@ static void smbios_build_type_1_table(void) + + static void smbios_build_type_2_table(void) + { +- SMBIOS_BUILD_TABLE_PRE(2, 0x200, false); /* optional */ ++ SMBIOS_BUILD_TABLE_PRE(2, 0x200, smbios_type2_required); + + SMBIOS_TABLE_SET_STR(2, manufacturer_str, type2.manufacturer); + SMBIOS_TABLE_SET_STR(2, product_str, type2.product); +@@ -753,7 +756,10 @@ void smbios_set_cpuid(uint32_t version, uint32_t features) + + void smbios_set_defaults(const char *manufacturer, const char *product, + const char *version, bool legacy_mode, +- bool uuid_encoded, SmbiosEntryPointType ep_type) ++ bool uuid_encoded, ++ const char *stream_product, ++ const char *stream_version, ++ SmbiosEntryPointType ep_type) + { + smbios_have_defaults = true; + smbios_legacy = legacy_mode; +@@ -774,12 +780,45 @@ void smbios_set_defaults(const char *manufacturer, const char *product, + g_free(smbios_entries); + } + ++ /* ++ * If @stream_product & @stream_version are non-NULL, then ++ * we're following rules for new Windows driver support. ++ * The data we have to report is defined in this doc: ++ * ++ * https://docs.microsoft.com/en-us/windows-hardware/drivers/install/specifying-hardware-ids-for-a-computer ++ * ++ * The Windows drivers are written to expect use of the ++ * scheme documented as "HardwareID-6" against Windows 10, ++ * which uses SMBIOS System (Type 1) and Base Board (Type 2) ++ * tables and will match on ++ * ++ * System Manufacturer = Red Hat (@manufacturer) ++ * System SKU Number = 8.2.0 (@stream_version) ++ * Baseboard Manufacturer = Red Hat (@manufacturer) ++ * Baseboard Product = RHEL-AV (@stream_product) ++ * ++ * NB, SKU must be changed with each RHEL-AV release ++ * ++ * Other fields can be freely used by applications using ++ * QEMU. For example apps can use the "System product" ++ * and "System version" to identify themselves. ++ * ++ * We get 'System Manufacturer' and 'Baseboard Manufacturer' ++ */ + SMBIOS_SET_DEFAULT(type1.manufacturer, manufacturer); + SMBIOS_SET_DEFAULT(type1.product, product); + SMBIOS_SET_DEFAULT(type1.version, version); + SMBIOS_SET_DEFAULT(type1.family, "Red Hat Enterprise Linux"); ++ if (stream_version != NULL) { ++ SMBIOS_SET_DEFAULT(type1.sku, stream_version); ++ } + SMBIOS_SET_DEFAULT(type2.manufacturer, manufacturer); +- SMBIOS_SET_DEFAULT(type2.product, product); ++ if (stream_product != NULL) { ++ SMBIOS_SET_DEFAULT(type2.product, stream_product); ++ smbios_type2_required = true; ++ } else { ++ SMBIOS_SET_DEFAULT(type2.product, product); ++ } + SMBIOS_SET_DEFAULT(type2.version, version); + SMBIOS_SET_DEFAULT(type3.manufacturer, manufacturer); + SMBIOS_SET_DEFAULT(type3.version, version); +diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h +index 02a0ced..67e38a1 100644 +--- a/include/hw/firmware/smbios.h ++++ b/include/hw/firmware/smbios.h +@@ -267,7 +267,10 @@ void smbios_entry_add(QemuOpts *opts, Error **errp); + void smbios_set_cpuid(uint32_t version, uint32_t features); + void smbios_set_defaults(const char *manufacturer, const char *product, + const char *version, bool legacy_mode, +- bool uuid_encoded, SmbiosEntryPointType ep_type); ++ bool uuid_encoded, ++ const char *stream_product, ++ const char *stream_version, ++ SmbiosEntryPointType ep_type); + uint8_t *smbios_get_table_legacy(MachineState *ms, size_t *length); + void smbios_get_tables(MachineState *ms, + const struct smbios_phys_mem_area *mem_array, +diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h +index 2e362c8..b9f29ba 100644 +--- a/include/hw/i386/pc.h ++++ b/include/hw/i386/pc.h +@@ -109,6 +109,9 @@ typedef struct PCMachineClass { + bool smbios_defaults; + bool smbios_legacy_mode; + bool smbios_uuid_encoded; ++ /* New fields needed for Windows HardwareID-6 matching */ ++ const char *smbios_stream_product; ++ const char *smbios_stream_version; + + /* RAM / address space compat: */ + bool gigabyte_align; +-- +1.8.3.1 + diff --git a/kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch b/kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch new file mode 100755 index 0000000000000000000000000000000000000000..b2cc438caca48c109a0977052a570e5e47ac2e6f --- /dev/null +++ b/kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch @@ -0,0 +1,199 @@ +From 1bee5a77b3f999d2933a440021737d0720b32269 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 29 Jul 2020 18:56:21 -0400 +Subject: [PATCH 1/4] i386: Add 2nd Generation AMD EPYC processors + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200729185621.152427-2-dgilbert@redhat.com> +Patchwork-id: 98078 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] i386: Add 2nd Generation AMD EPYC processors +Bugzilla: 1780385 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Maxim Levitsky + +From: "Moger, Babu" + +Adds the support for 2nd Gen AMD EPYC Processors. The model display +name will be EPYC-Rome. + +Adds the following new feature bits on top of the feature bits from the +first generation EPYC models. +perfctr-core : core performance counter extensions support. Enables the VM to + use extended performance counter support. It enables six + programmable counters instead of four counters. +clzero : instruction zeroes out the 64 byte cache line specified in RAX. +xsaveerptr : XSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES always save error + pointers and FXRSTOR, XRSTOR, XRSTORS always restore error + pointers. +wbnoinvd : Write back and do not invalidate cache +ibpb : Indirect Branch Prediction Barrier +amd-stibp : Single Thread Indirect Branch Predictor +clwb : Cache Line Write Back and Retain +xsaves : XSAVES, XRSTORS and IA32_XSS support +rdpid : Read Processor ID instruction support +umip : User-Mode Instruction Prevention support + +The Reference documents are available at +https://developer.amd.com/wp-content/resources/55803_0.54-PUB.pdf +https://www.amd.com/system/files/TechDocs/24594.pdf + +Depends on following kernel commits: +40bc47b08b6e ("kvm: x86: Enumerate support for CLZERO instruction") +504ce1954fba ("KVM: x86: Expose XSAVEERPTR to the guest") +6d61e3c32248 ("kvm: x86: Expose RDPID in KVM_GET_SUPPORTED_CPUID") +52297436199d ("kvm: svm: Update svm_xsaves_supported") + +Signed-off-by: Babu Moger +Message-Id: <157314966312.23828.17684821666338093910.stgit@naples-babu.amd.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 143c30d4d346831a09e59e9af45afdca0331e819) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 102 +++++++++++++++++++++++++++++++++++++++++++++- + target/i386/cpu.h | 2 + + 2 files changed, 103 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index a343de0c9d..ff39fc9905 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1133,7 +1133,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "clzero", NULL, "xsaveerptr", NULL, + NULL, NULL, NULL, NULL, + NULL, "wbnoinvd", NULL, NULL, +- "ibpb", NULL, NULL, NULL, ++ "ibpb", NULL, NULL, "amd-stibp", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "amd-ssbd", "virt-ssbd", "amd-no-ssb", NULL, +@@ -1803,6 +1803,56 @@ static CPUCaches epyc_cache_info = { + }, + }; + ++static CPUCaches epyc_rome_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 512 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 16 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 16384, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .complex_indexing = true, ++ }, ++}; ++ + /* The following VMX features are not supported by KVM and are left out in the + * CPU definitions: + * +@@ -4024,6 +4074,56 @@ static X86CPUDefinition builtin_x86_defs[] = { + .model_id = "Hygon Dhyana Processor", + .cache_info = &epyc_cache_info, + }, ++ { ++ .name = "EPYC-Rome", ++ .level = 0xd, ++ .vendor = CPUID_VENDOR_AMD, ++ .family = 23, ++ .model = 49, ++ .stepping = 0, ++ .features[FEAT_1_EDX] = ++ CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | ++ CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | ++ CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | ++ CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | ++ CPUID_VME | CPUID_FP87, ++ .features[FEAT_1_ECX] = ++ CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | ++ CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | ++ CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | ++ CPUID_EXT_CX16 | CPUID_EXT_FMA | CPUID_EXT_SSSE3 | ++ CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, ++ .features[FEAT_8000_0001_EDX] = ++ CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | ++ CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | ++ CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = ++ CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | ++ CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | ++ CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | ++ CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, ++ .features[FEAT_8000_0008_EBX] = ++ CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | ++ CPUID_8000_0008_EBX_WBNOINVD | CPUID_8000_0008_EBX_IBPB | ++ CPUID_8000_0008_EBX_STIBP, ++ .features[FEAT_7_0_EBX] = ++ CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | ++ CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_RDSEED | ++ CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | ++ CPUID_7_0_EBX_SHA_NI | CPUID_7_0_EBX_CLWB, ++ .features[FEAT_7_0_ECX] = ++ CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_RDPID, ++ .features[FEAT_XSAVE] = ++ CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | ++ CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, ++ .features[FEAT_6_EAX] = ++ CPUID_6_EAX_ARAT, ++ .features[FEAT_SVM] = ++ CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE, ++ .xlevel = 0x8000001E, ++ .model_id = "AMD EPYC-Rome Processor", ++ .cache_info = &epyc_rome_cache_info, ++ }, + }; + + /* KVM-specific features that are automatically added/removed +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 7bfbf2a5e5..f3da25cb8a 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -792,6 +792,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_8000_0008_EBX_WBNOINVD (1U << 9) + /* Indirect Branch Prediction Barrier */ + #define CPUID_8000_0008_EBX_IBPB (1U << 12) ++/* Single Thread Indirect Branch Predictors */ ++#define CPUID_8000_0008_EBX_STIBP (1U << 15) + + #define CPUID_XSAVE_XSAVEOPT (1U << 0) + #define CPUID_XSAVE_XSAVEC (1U << 1) +-- +2.27.0 + diff --git a/kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch b/kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch new file mode 100755 index 0000000000000000000000000000000000000000..823ff0ce4ac734111e3f6235f29f35600ce1ae0e --- /dev/null +++ b/kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch @@ -0,0 +1,46 @@ +From cdafcc1d68110ed172c09c9e6bba42ee15b5a6df Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:40 +0100 +Subject: [PATCH 13/17] i386: Add MSR feature bit for MDS-NO + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-2-plai@redhat.com> +Patchwork-id: 96609 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 1/4] i386: Add MSR feature bit for MDS-NO +Bugzilla: 1769912 +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Dr. David Alan Gilbert + +From: Cathy Zhang + +Define MSR_ARCH_CAP_MDS_NO in the IA32_ARCH_CAPABILITIES MSR to allow +CPU models to report the feature when host supports it. + +Signed-off-by: Cathy Zhang +Reviewed-by: Xiaoyao Li +Reviewed-by: Tao Xu +Message-Id: <1571729728-23284-2-git-send-email-cathy.zhang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 77b168d221191156c47fcd8d1c47329dfdb9439e) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 4441061..60304cc 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -839,6 +839,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define MSR_ARCH_CAP_RSBA (1U << 2) + #define MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY (1U << 3) + #define MSR_ARCH_CAP_SSB_NO (1U << 4) ++#define MSR_ARCH_CAP_MDS_NO (1U << 5) + + #define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5) + +-- +1.8.3.1 + diff --git a/kvm-i386-Add-macro-for-stibp.patch b/kvm-i386-Add-macro-for-stibp.patch new file mode 100755 index 0000000000000000000000000000000000000000..17dd149943b8559e024d708481e9a7cdbd15cda4 --- /dev/null +++ b/kvm-i386-Add-macro-for-stibp.patch @@ -0,0 +1,49 @@ +From 00f916987589f114f42ce20b138c00c47b9e4df7 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:41 +0100 +Subject: [PATCH 14/17] i386: Add macro for stibp + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-3-plai@redhat.com> +Patchwork-id: 96610 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 2/4] i386: Add macro for stibp +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Cathy Zhang + +stibp feature is already added through the following commit. +https://github.com/qemu/qemu/commit/0e8916582991b9fd0b94850a8444b8b80d0a0955 + +Add a macro for it to allow CPU models to report it when host supports. + +Signed-off-by: Cathy Zhang +Reviewed-by: Xiaoyao Li +Reviewed-by: Tao Xu +Message-Id: <1571729728-23284-3-git-send-email-cathy.zhang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 5af514d0cb314f43bc53f2aefb437f6451d64d0c) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 60304cc..e77d101 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -772,6 +772,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) + /* Speculation Control */ + #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) ++/* Single Thread Indirect Branch Predictors */ ++#define CPUID_7_0_EDX_STIBP (1U << 27) + /* Arch Capabilities */ + #define CPUID_7_0_EDX_ARCH_CAPABILITIES (1U << 29) + /* Core Capability */ +-- +1.8.3.1 + diff --git a/kvm-i386-Add-new-CPU-model-Cooperlake.patch b/kvm-i386-Add-new-CPU-model-Cooperlake.patch new file mode 100755 index 0000000000000000000000000000000000000000..289d1e36d20cff163b5b01e666d862f2d930e265 --- /dev/null +++ b/kvm-i386-Add-new-CPU-model-Cooperlake.patch @@ -0,0 +1,108 @@ +From cf62577aed781b2515ea97b9f42285c2f608a7bf Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:42 +0100 +Subject: [PATCH 16/17] i386: Add new CPU model Cooperlake + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-4-plai@redhat.com> +Patchwork-id: 96608 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 3/4] i386: Add new CPU model Cooperlake +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Cathy Zhang + +Cooper Lake is intel's successor to Cascade Lake, the new +CPU model inherits features from Cascadelake-Server, while +add one platform associated new feature: AVX512_BF16. Meanwhile, +add STIBP for speculative execution. + +Signed-off-by: Cathy Zhang +Reviewed-by: Xiaoyao Li +Reviewed-by: Tao Xu +Message-Id: <1571729728-23284-4-git-send-email-cathy.zhang@intel.com> +Reviewed-by: Bruce Rogers +Signed-off-by: Eduardo Habkost +(cherry picked from commit 22a866b6166db5caa4abaa6e656c2a431fa60726) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 60 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 0f0a2db..996a74f 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3161,6 +3161,66 @@ static X86CPUDefinition builtin_x86_defs[] = { + } + }, + { ++ .name = "Cooperlake", ++ .level = 0xd, ++ .vendor = CPUID_VENDOR_INTEL, ++ .family = 6, ++ .model = 85, ++ .stepping = 10, ++ .features[FEAT_1_EDX] = ++ CPUID_VME | CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | ++ CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | ++ CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | ++ CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | ++ CPUID_DE | CPUID_FP87, ++ .features[FEAT_1_ECX] = ++ CPUID_EXT_AVX | CPUID_EXT_XSAVE | CPUID_EXT_AES | ++ CPUID_EXT_POPCNT | CPUID_EXT_X2APIC | CPUID_EXT_SSE42 | ++ CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | ++ CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3 | ++ CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_FMA | CPUID_EXT_MOVBE | ++ CPUID_EXT_PCID | CPUID_EXT_F16C | CPUID_EXT_RDRAND, ++ .features[FEAT_8000_0001_EDX] = ++ CPUID_EXT2_LM | CPUID_EXT2_PDPE1GB | CPUID_EXT2_RDTSCP | ++ CPUID_EXT2_NX | CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = ++ CPUID_EXT3_ABM | CPUID_EXT3_LAHF_LM | CPUID_EXT3_3DNOWPREFETCH, ++ .features[FEAT_7_0_EBX] = ++ CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | ++ CPUID_7_0_EBX_HLE | CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | ++ CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | ++ CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | ++ CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLWB | ++ CPUID_7_0_EBX_AVX512F | CPUID_7_0_EBX_AVX512DQ | ++ CPUID_7_0_EBX_AVX512BW | CPUID_7_0_EBX_AVX512CD | ++ CPUID_7_0_EBX_AVX512VL | CPUID_7_0_EBX_CLFLUSHOPT, ++ .features[FEAT_7_0_ECX] = ++ CPUID_7_0_ECX_PKU | ++ CPUID_7_0_ECX_AVX512VNNI, ++ .features[FEAT_7_0_EDX] = ++ CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_STIBP | ++ CPUID_7_0_EDX_SPEC_CTRL_SSBD | CPUID_7_0_EDX_ARCH_CAPABILITIES, ++ .features[FEAT_ARCH_CAPABILITIES] = ++ MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | ++ MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO, ++ .features[FEAT_7_1_EAX] = ++ CPUID_7_1_EAX_AVX512_BF16, ++ /* ++ * Missing: XSAVES (not supported by some Linux versions, ++ * including v4.1 to v4.12). ++ * KVM doesn't yet expose any XSAVES state save component, ++ * and the only one defined in Skylake (processor tracing) ++ * probably will block migration anyway. ++ */ ++ .features[FEAT_XSAVE] = ++ CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | ++ CPUID_XSAVE_XGETBV1, ++ .features[FEAT_6_EAX] = ++ CPUID_6_EAX_ARAT, ++ .xlevel = 0x80000008, ++ .model_id = "Intel Xeon Processor (Cooperlake)", ++ }, ++ { + .name = "Icelake-Client", + .level = 0xd, + .vendor = CPUID_VENDOR_INTEL, +-- +1.8.3.1 + diff --git a/kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch b/kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch new file mode 100755 index 0000000000000000000000000000000000000000..5c335f84b9bd385867663485cc80b0429a6ef173 --- /dev/null +++ b/kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch @@ -0,0 +1,213 @@ +From 4daa8dca77edec191dfe0ae4a0a9fc70f8f63607 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:37 -0500 +Subject: [PATCH 4/4] i386: Add the support for AMD EPYC 3rd generation + processors + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-5-dgilbert@redhat.com> +Patchwork-id: 101202 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/4] i386: Add the support for AMD EPYC 3rd generation processors +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Babu Moger + +Adds the support for AMD 3rd generation processors. The model +display for the new processor will be EPYC-Milan. + +Adds the following new feature bits on top of the feature bits from +the first and second generation EPYC models. + +pcid : Process context identifiers support +ibrs : Indirect Branch Restricted Speculation +ssbd : Speculative Store Bypass Disable +erms : Enhanced REP MOVSB/STOSB support +fsrm : Fast Short REP MOVSB support +invpcid : Invalidate processor context ID +pku : Protection keys support +svme-addr-chk : SVM instructions address check for #GP handling + +Depends on the following kernel commits: +14c2bf81fcd2 ("KVM: SVM: Fix #GP handling for doubly-nested virtualization") +3b9c723ed7cf ("KVM: SVM: Add support for SVM instruction address check change") +4aa2691dcbd3 ("8ce1c461188799d863398dd2865d KVM: x86: Factor out x86 instruction emulation with decoding") +4407a797e941 ("KVM: SVM: Enable INVPCID feature on AMD") +9715092f8d7e ("KVM: X86: Move handling of INVPCID types to x86") +3f3393b3ce38 ("KVM: X86: Rename and move the function vmx_handle_memory_failure to x86.c") +830bd71f2c06 ("KVM: SVM: Remove set_cr_intercept, clr_cr_intercept and is_cr_intercept") +4c44e8d6c193 ("KVM: SVM: Add new intercept word in vmcb_control_area") +c62e2e94b9d4 ("KVM: SVM: Modify 64 bit intercept field to two 32 bit vectors") +9780d51dc2af ("KVM: SVM: Modify intercept_exceptions to generic intercepts") +30abaa88382c ("KVM: SVM: Change intercept_dr to generic intercepts") +03bfeeb988a9 ("KVM: SVM: Change intercept_cr to generic intercepts") +c45ad7229d13 ("KVM: SVM: Introduce vmcb_(set_intercept/clr_intercept/_is_intercept)") +a90c1ed9f11d ("(pcid) KVM: nSVM: Remove unused field") +fa44b82eb831 ("KVM: x86: Move MPK feature detection to common code") +38f3e775e9c2 ("x86/Kconfig: Update config and kernel doc for MPK feature on AMD") +37486135d3a7 ("KVM: x86: Fix pkru save/restore when guest CR4.PKE=0, move it to x86.c") + +Signed-off-by: Babu Moger +Message-Id: <161290460478.11352.8933244555799318236.stgit@bmoger-ubuntu> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 623972ceae091b31331ae4a1dc94fe5cbb891937) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 107 +++++++++++++++++++++++++++++++++++++++++++++- + target/i386/cpu.h | 4 ++ + 2 files changed, 110 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 7227c803c3..d5b0d4b7f0 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1133,7 +1133,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "clzero", NULL, "xsaveerptr", NULL, + NULL, NULL, NULL, NULL, + NULL, "wbnoinvd", NULL, NULL, +- "ibpb", NULL, NULL, "amd-stibp", ++ "ibpb", NULL, "ibrs", "amd-stibp", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "amd-ssbd", "virt-ssbd", "amd-no-ssb", NULL, +@@ -1853,6 +1853,56 @@ static CPUCaches epyc_rome_cache_info = { + }, + }; + ++static CPUCaches epyc_milan_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = 1, ++ .no_invd_sharing = true, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 512 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 32 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 32768, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .complex_indexing = true, ++ }, ++}; ++ + /* The following VMX features are not supported by KVM and are left out in the + * CPU definitions: + * +@@ -4124,6 +4174,61 @@ static X86CPUDefinition builtin_x86_defs[] = { + .model_id = "AMD EPYC-Rome Processor", + .cache_info = &epyc_rome_cache_info, + }, ++ { ++ .name = "EPYC-Milan", ++ .level = 0xd, ++ .vendor = CPUID_VENDOR_AMD, ++ .family = 25, ++ .model = 1, ++ .stepping = 1, ++ .features[FEAT_1_EDX] = ++ CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | ++ CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | ++ CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | ++ CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | ++ CPUID_VME | CPUID_FP87, ++ .features[FEAT_1_ECX] = ++ CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | ++ CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | ++ CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | ++ CPUID_EXT_CX16 | CPUID_EXT_FMA | CPUID_EXT_SSSE3 | ++ CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3 | ++ CPUID_EXT_PCID, ++ .features[FEAT_8000_0001_EDX] = ++ CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | ++ CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | ++ CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0001_ECX] = ++ CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | ++ CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | ++ CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | ++ CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, ++ .features[FEAT_8000_0008_EBX] = ++ CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | ++ CPUID_8000_0008_EBX_WBNOINVD | CPUID_8000_0008_EBX_IBPB | ++ CPUID_8000_0008_EBX_IBRS | CPUID_8000_0008_EBX_STIBP | ++ CPUID_8000_0008_EBX_AMD_SSBD, ++ .features[FEAT_7_0_EBX] = ++ CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | ++ CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_RDSEED | ++ CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | ++ CPUID_7_0_EBX_SHA_NI | CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_ERMS | ++ CPUID_7_0_EBX_INVPCID, ++ .features[FEAT_7_0_ECX] = ++ CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_PKU, ++ .features[FEAT_7_0_EDX] = ++ CPUID_7_0_EDX_FSRM, ++ .features[FEAT_XSAVE] = ++ CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | ++ CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, ++ .features[FEAT_6_EAX] = ++ CPUID_6_EAX_ARAT, ++ .features[FEAT_SVM] = ++ CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE | CPUID_SVM_SVME_ADDR_CHK, ++ .xlevel = 0x8000001E, ++ .model_id = "AMD EPYC-Milan Processor", ++ .cache_info = &epyc_milan_cache_info, ++ }, + }; + + /* KVM-specific features that are automatically added/removed +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index e1b67910c2..7a3aa40201 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -800,8 +800,12 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_8000_0008_EBX_WBNOINVD (1U << 9) + /* Indirect Branch Prediction Barrier */ + #define CPUID_8000_0008_EBX_IBPB (1U << 12) ++/* Indirect Branch Restricted Speculation */ ++#define CPUID_8000_0008_EBX_IBRS (1U << 14) + /* Single Thread Indirect Branch Predictors */ + #define CPUID_8000_0008_EBX_STIBP (1U << 15) ++/* Speculative Store Bypass Disable */ ++#define CPUID_8000_0008_EBX_AMD_SSBD (1U << 24) + + #define CPUID_XSAVE_XSAVEOPT (1U << 0) + #define CPUID_XSAVE_XSAVEC (1U << 1) +-- +2.27.0 + diff --git a/kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch b/kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch new file mode 100755 index 0000000000000000000000000000000000000000..17251bf2b3263967bbbefd91785095896b520564 --- /dev/null +++ b/kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch @@ -0,0 +1,82 @@ +From d3b9c1891a6d05308dd5ea119d2c32c8f98a25da Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Tue, 30 Jun 2020 23:40:15 -0400 +Subject: [PATCH 1/4] i386: Mask SVM features if nested SVM is disabled + +RH-Author: Eduardo Habkost +Message-id: <20200630234015.166253-2-ehabkost@redhat.com> +Patchwork-id: 97852 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] i386: Mask SVM features if nested SVM is disabled +Bugzilla: 1835390 +RH-Acked-by: Igor Mammedov +RH-Acked-by: Bandan Das +RH-Acked-by: Dr. David Alan Gilbert + +QEMU incorrectly validates FEAT_SVM feature flags against +GET_SUPPORTED_CPUID even if SVM features are being masked out by +cpu_x86_cpuid(). This can make QEMU print warnings on most AMD +CPU models, even when SVM nesting is disabled (which is the +default). + +This bug was never detected before because of a Linux KVM bug: +until Linux v5.6, KVM was not filtering out SVM features in +GET_SUPPORTED_CPUID when nested was disabled. This KVM bug was +fixed in Linux v5.7-rc1, on Linux commit a50718cc3f43 ("KVM: +nSVM: Expose SVM features to L1 iff nested is enabled"). + +Fix the problem by adding a CPUID_EXT3_SVM dependency to all +FEAT_SVM feature flags in the feature_dependencies table. + +Reported-by: Yanan Fu +Signed-off-by: Eduardo Habkost +Message-Id: <20200623230116.277409-1-ehabkost@redhat.com> +[Fix testcase. - Paolo] +Signed-off-by: Paolo Bonzini +(cherry picked from commit 730319aef0fcb94f11a4a2d32656437fdde7efdd) +Signed-off-by: Eduardo Habkost +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 4 ++++ + tests/test-x86-cpuid-compat.c | 4 ++-- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 7d7b016bb7..a343de0c9d 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1477,6 +1477,10 @@ static FeatureDep feature_dependencies[] = { + .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_VMFUNC }, + .to = { FEAT_VMX_VMFUNC, ~0ull }, + }, ++ { ++ .from = { FEAT_8000_0001_ECX, CPUID_EXT3_SVM }, ++ .to = { FEAT_SVM, ~0ull }, ++ }, + }; + + typedef struct X86RegisterInfo32 { +diff --git a/tests/test-x86-cpuid-compat.c b/tests/test-x86-cpuid-compat.c +index e7c075ed98..983aa0719a 100644 +--- a/tests/test-x86-cpuid-compat.c ++++ b/tests/test-x86-cpuid-compat.c +@@ -256,7 +256,7 @@ int main(int argc, char **argv) + "-cpu 486,+invtsc", "xlevel", 0x80000007); + /* CPUID[8000_000A].EDX: */ + add_cpuid_test("x86/cpuid/auto-xlevel/486/npt", +- "-cpu 486,+npt", "xlevel", 0x8000000A); ++ "-cpu 486,+svm,+npt", "xlevel", 0x8000000A); + /* CPUID[C000_0001].EDX: */ + add_cpuid_test("x86/cpuid/auto-xlevel2/phenom/xstore", + "-cpu phenom,+xstore", "xlevel2", 0xC0000001); +@@ -349,7 +349,7 @@ int main(int argc, char **argv) + "-machine pc-i440fx-2.4 -cpu SandyBridge,", + "xlevel", 0x80000008); + add_cpuid_test("x86/cpuid/xlevel-compat/pc-i440fx-2.4/npt-on", +- "-machine pc-i440fx-2.4 -cpu SandyBridge,+npt", ++ "-machine pc-i440fx-2.4 -cpu SandyBridge,+svm,+npt", + "xlevel", 0x80000008); + #endif + +-- +2.27.0 + diff --git a/kvm-i386-Remove-cpu64-rhel6-CPU-model.patch b/kvm-i386-Remove-cpu64-rhel6-CPU-model.patch new file mode 100755 index 0000000000000000000000000000000000000000..5d62acea2cdca259cc5269e42be160bceac7fe2f --- /dev/null +++ b/kvm-i386-Remove-cpu64-rhel6-CPU-model.patch @@ -0,0 +1,77 @@ +From 4543a3c19816bd07f27eb900f20ae609df03703c Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Mon, 23 Dec 2019 21:10:31 +0000 +Subject: [PATCH 1/2] i386: Remove cpu64-rhel6 CPU model + +RH-Author: Eduardo Habkost +Message-id: <20191223211031.26503-1-ehabkost@redhat.com> +Patchwork-id: 93213 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] i386: Remove cpu64-rhel6 CPU model +Bugzilla: 1741345 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Laszlo Ersek + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1741345 +BRANCH: rhel-av-8.2.0 +Upstream: not applicable +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=25525975 + +We don't provide rhel6 machine types anymore, so we don't need to +provide compatibility with RHEl6. cpu64-rhel6 was documented as +deprecated and scheduled for removal in 8.2, so now it's time to +remove it. + +Signed-off-by: Eduardo Habkost +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 26 +------------------------- + 1 file changed, 1 insertion(+), 25 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 790db77..6dce6f2 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1829,12 +1829,7 @@ static CPUCaches epyc_cache_info = { + + static X86CPUDefinition builtin_x86_defs[] = { + { +- /* qemu64 is the default CPU model for all *-rhel7.* machine-types. +- * The default on RHEL-6 was cpu64-rhel6. +- * libvirt assumes that qemu64 is the default for _all_ machine-types, +- * so we should try to keep qemu64 and cpu64-rhel6 as similar as +- * possible. +- */ ++ /* qemu64 is the default CPU model for all machine-types */ + .name = "qemu64", + .level = 0xd, + .vendor = CPUID_VENDOR_AMD, +@@ -2135,25 +2130,6 @@ static X86CPUDefinition builtin_x86_defs[] = { + .model_id = "Intel(R) Atom(TM) CPU N270 @ 1.60GHz", + }, + { +- .name = "cpu64-rhel6", +- .level = 4, +- .vendor = CPUID_VENDOR_AMD, +- .family = 6, +- .model = 13, +- .stepping = 3, +- .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | +- CPUID_MMX | CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | +- CPUID_MCA | CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | +- CPUID_CX8 | CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | +- CPUID_PSE | CPUID_DE | CPUID_FP87, +- .features[FEAT_1_ECX] = CPUID_EXT_CX16 | CPUID_EXT_SSE3, +- .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_NX | CPUID_EXT2_SYSCALL, +- .features[FEAT_8000_0001_ECX] = CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | +- CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, +- .xlevel = 0x8000000A, +- .model_id = "QEMU Virtual CPU version (cpu64-rhel6)", +- }, +- { + .name = "Conroe", + .level = 10, + .vendor = CPUID_VENDOR_INTEL, +-- +1.8.3.1 + diff --git a/kvm-i386-Resolve-CPU-models-to-v1-by-default.patch b/kvm-i386-Resolve-CPU-models-to-v1-by-default.patch new file mode 100755 index 0000000000000000000000000000000000000000..1027341a0c4a94f2bcdafda727d10e852901c1c0 --- /dev/null +++ b/kvm-i386-Resolve-CPU-models-to-v1-by-default.patch @@ -0,0 +1,95 @@ +From ccda4494b0ea4b81b6b0c3e539a0bcf7e673c68c Mon Sep 17 00:00:00 2001 +From: Eduardo Habkost +Date: Thu, 5 Dec 2019 21:56:50 +0000 +Subject: [PATCH 01/18] i386: Resolve CPU models to v1 by default + +RH-Author: Eduardo Habkost +Message-id: <20191205225650.772600-2-ehabkost@redhat.com> +Patchwork-id: 92907 +O-Subject: [RHEL-AV-8.1.1 qemu-kvm PATCH 1/1] i386: Resolve CPU models to v1 by default +Bugzilla: 1787291 1779078 1779078 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Igor Mammedov +RH-Acked-by: Paolo Bonzini + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1779078 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=25187823 +Upstream: submitted, Message-Id: <20191205223339.764534-1-ehabkost@redhat.com> + +When using `query-cpu-definitions` using `-machine none`, +QEMU is resolving all CPU models to their latest versions. The +actual CPU model version being used by another machine type (e.g. +`pc-q35-4.0`) might be different. + +In theory, this was OK because the correct CPU model +version is returned when using the correct `-machine` argument. + +Except that in practice, this breaks libvirt expectations: +libvirt always use `-machine none` when checking if a CPU model +is runnable, because runnability is not expected to be affected +when the machine type is changed. + +For example, when running on a Haswell host without TSX, +Haswell-v4 is runnable, but Haswell-v1 is not. On those hosts, +`query-cpu-definitions` says Haswell is runnable if using +`-machine none`, but Haswell is actually not runnable using any +of the `pc-*` machine types (because they resolve Haswell to +Haswell-v1). In other words, we're breaking the "runnability +guarantee" we promised to not break for a few releases (see +qemu-deprecated.texi). + +To address this issue, change the default CPU model version to v1 +on all machine types, so we make `query-cpu-definitions` output +when using `-machine none` match the results when using `pc-*`. +This will change in the future (the plan is to always return the +latest CPU model version if using `-machine none`), but only +after giving libvirt the opportunity to adapt. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1779078 +Signed-off-by: Eduardo Habkost +Signed-off-by: Danilo C. L. de Paula +--- + qemu-deprecated.texi | 7 +++++++ + target/i386/cpu.c | 8 +++++++- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/qemu-deprecated.texi b/qemu-deprecated.texi +index 4b4b742..534ebe9 100644 +--- a/qemu-deprecated.texi ++++ b/qemu-deprecated.texi +@@ -374,6 +374,13 @@ guarantees must resolve the CPU model aliases using te + ``alias-of'' field returned by the ``query-cpu-definitions'' QMP + command. + ++While those guarantees are kept, the return value of ++``query-cpu-definitions'' will have existing CPU model aliases ++point to a version that doesn't break runnability guarantees ++(specifically, version 1 of those CPU models). In future QEMU ++versions, aliases will point to newer CPU model versions ++depending on the machine type, so management software must ++resolve CPU model aliases before starting a virtual machine. + + @node Recently removed features + @appendix Recently removed features +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 6dce6f2..863192c 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3926,7 +3926,13 @@ static PropValue tcg_default_props[] = { + }; + + +-X86CPUVersion default_cpu_version = CPU_VERSION_LATEST; ++/* ++ * We resolve CPU model aliases using -v1 when using "-machine ++ * none", but this is just for compatibility while libvirt isn't ++ * adapted to resolve CPU model versions before creating VMs. ++ * See "Runnability guarantee of CPU models" at * qemu-deprecated.texi. ++ */ ++X86CPUVersion default_cpu_version = 1; + + void x86_cpu_set_default_version(X86CPUVersion version) + { +-- +1.8.3.1 + diff --git a/kvm-ide-atapi-check-logical-block-address-and-read-size-.patch b/kvm-ide-atapi-check-logical-block-address-and-read-size-.patch new file mode 100755 index 0000000000000000000000000000000000000000..706bd8b421357a7d13488ffa3739eafbef9d717a --- /dev/null +++ b/kvm-ide-atapi-check-logical-block-address-and-read-size-.patch @@ -0,0 +1,120 @@ +From 0453588f95294ed5ce912cb8b810a322bf9d91e0 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 25 Feb 2021 19:43:02 -0500 +Subject: [PATCH] ide: atapi: check logical block address and read size + (CVE-2020-29443) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210225194302.3137699-2-jmaloy@redhat.com> +Patchwork-id: 101208 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 1/1] ide: atapi: check logical block address and read size (CVE-2020-29443) +Bugzilla: 1917451 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Danilo de Paula +RH-Acked-by: Paolo Bonzini + +From: Prasad J Pandit + +While processing ATAPI cmd_read/cmd_read_cd commands, +Logical Block Address (LBA) maybe invalid OR closer to the last block, +leading to an OOB access issues. Add range check to avoid it. + +Fixes: CVE-2020-29443 +Reported-by: Wenxiang Qian +Suggested-by: Paolo Bonzini +Reviewed-by: Paolo Bonzini +Signed-off-by: Prasad J Pandit +Message-Id: <20210118115130.457044-1-ppandit@redhat.com> +Signed-off-by: Paolo Bonzini + +(cherry picked from commit b8d7f1bc59276fec85e4d09f1567613a3e14d31e) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/ide/atapi.c | 30 ++++++++++++++++++++++++------ + 1 file changed, 24 insertions(+), 6 deletions(-) + +diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c +index 17a9d635d8..d064935c8d 100644 +--- a/hw/ide/atapi.c ++++ b/hw/ide/atapi.c +@@ -320,6 +320,8 @@ static void ide_atapi_cmd_reply(IDEState *s, int size, int max_size) + static void ide_atapi_cmd_read_pio(IDEState *s, int lba, int nb_sectors, + int sector_size) + { ++ assert(0 <= lba && lba < (s->nb_sectors >> 2)); ++ + s->lba = lba; + s->packet_transfer_size = nb_sectors * sector_size; + s->elementary_transfer_size = 0; +@@ -418,6 +420,8 @@ eot: + static void ide_atapi_cmd_read_dma(IDEState *s, int lba, int nb_sectors, + int sector_size) + { ++ assert(0 <= lba && lba < (s->nb_sectors >> 2)); ++ + s->lba = lba; + s->packet_transfer_size = nb_sectors * sector_size; + s->io_buffer_size = 0; +@@ -971,35 +975,49 @@ static void cmd_prevent_allow_medium_removal(IDEState *s, uint8_t* buf) + + static void cmd_read(IDEState *s, uint8_t* buf) + { +- int nb_sectors, lba; ++ unsigned int nb_sectors, lba; ++ ++ /* Total logical sectors of ATAPI_SECTOR_SIZE(=2048) bytes */ ++ uint64_t total_sectors = s->nb_sectors >> 2; + + if (buf[0] == GPCMD_READ_10) { + nb_sectors = lduw_be_p(buf + 7); + } else { + nb_sectors = ldl_be_p(buf + 6); + } +- +- lba = ldl_be_p(buf + 2); + if (nb_sectors == 0) { + ide_atapi_cmd_ok(s); + return; + } + ++ lba = ldl_be_p(buf + 2); ++ if (lba >= total_sectors || lba + nb_sectors - 1 >= total_sectors) { ++ ide_atapi_cmd_error(s, ILLEGAL_REQUEST, ASC_LOGICAL_BLOCK_OOR); ++ return; ++ } ++ + ide_atapi_cmd_read(s, lba, nb_sectors, 2048); + } + + static void cmd_read_cd(IDEState *s, uint8_t* buf) + { +- int nb_sectors, lba, transfer_request; ++ unsigned int nb_sectors, lba, transfer_request; + +- nb_sectors = (buf[6] << 16) | (buf[7] << 8) | buf[8]; +- lba = ldl_be_p(buf + 2); ++ /* Total logical sectors of ATAPI_SECTOR_SIZE(=2048) bytes */ ++ uint64_t total_sectors = s->nb_sectors >> 2; + ++ nb_sectors = (buf[6] << 16) | (buf[7] << 8) | buf[8]; + if (nb_sectors == 0) { + ide_atapi_cmd_ok(s); + return; + } + ++ lba = ldl_be_p(buf + 2); ++ if (lba >= total_sectors || lba + nb_sectors - 1 >= total_sectors) { ++ ide_atapi_cmd_error(s, ILLEGAL_REQUEST, ASC_LOGICAL_BLOCK_OOR); ++ return; ++ } ++ + transfer_request = buf[9] & 0xf8; + if (transfer_request == 0x00) { + /* nothing */ +-- +2.27.0 + diff --git a/kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch b/kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch new file mode 100755 index 0000000000000000000000000000000000000000..db89a06865d0bbfccb7f665a626f88a12b078209 --- /dev/null +++ b/kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch @@ -0,0 +1,58 @@ +From d8f84a8086dbe339a9f97dbcd10abd6379525068 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:37 -0500 +Subject: [PATCH 13/17] intel_iommu: Skip page walking on device iotlb + invalidations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-13-eperezma@redhat.com> +Patchwork-id: 100605 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 12/13] intel_iommu: Skip page walking on device iotlb invalidations +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +Although they didn't reach the notifier because of the filtering in +memory_region_notify_iommu_one, the vt-d was still splitting huge +memory invalidations in chunks. Skipping it. + +This improves performance in case of netperf with vhost-net: +* TCP_STREAM: From 1923.6Mbit/s to 2175.13Mbit/s (13%) +* TCP_RR: From 8464.73 trans/s to 8932.703333 trans/s (5.5%) +* UDP_RR: From 8562.08 trans/s to 9005.62/s (5.1%) +* UDP_STREAM: No change observed (insignificant 0.1% improvement) + +Signed-off-by: Eugenio Pérez +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-5-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit f7701e2c7983b680790af47117577b285b6a1aed) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/i386/intel_iommu.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 3640bc2ed15..2b270f06645 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -1421,6 +1421,10 @@ static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) + VTDContextEntry ce; + IOMMUNotifier *n; + ++ if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) { ++ return 0; ++ } ++ + ret = vtd_dev_to_context_entry(vtd_as->iommu_state, + pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); +-- +2.27.0 + diff --git a/kvm-introduce-kvm_kernel_irqchip_-functions.patch b/kvm-introduce-kvm_kernel_irqchip_-functions.patch new file mode 100755 index 0000000000000000000000000000000000000000..b171749d4da8cc0d26a793c612e6fb9c6caea358 --- /dev/null +++ b/kvm-introduce-kvm_kernel_irqchip_-functions.patch @@ -0,0 +1,281 @@ +From 3899672db472c1ca530badd49d17726a1057f8af Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 5 Jun 2020 07:41:10 -0400 +Subject: [PATCH 40/42] kvm: introduce kvm_kernel_irqchip_* functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200605074111.2185-3-thuth@redhat.com> +Patchwork-id: 97369 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/3] kvm: introduce kvm_kernel_irqchip_* functions +Bugzilla: 1756946 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Paolo Bonzini + +The KVMState struct is opaque, so provide accessors for the fields +that will be moved from current_machine to the accelerator. For now +they just forward to the machine object, but this will change. + +Signed-off-by: Paolo Bonzini +(cherry picked from commit 4376c40dedb22530738eeb104a603e94ed03f719) + +Conflicts: + accel/kvm/kvm-all.c + (contextual conflict due to missing other commits in downstream) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + accel/kvm/kvm-all.c | 23 +++++++++++++++++++---- + hw/ppc/e500.c | 5 ++--- + hw/ppc/spapr_irq.c | 16 ++++------------ + include/sysemu/kvm.h | 7 +++++-- + target/arm/kvm.c | 8 ++++---- + target/i386/kvm.c | 4 ++-- + target/mips/kvm.c | 2 +- + target/ppc/kvm.c | 2 +- + target/s390x/kvm.c | 2 +- + 9 files changed, 39 insertions(+), 30 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 5007bdad96..b0250209f5 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -1772,7 +1772,7 @@ void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) + g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); + } + +-static void kvm_irqchip_create(MachineState *machine, KVMState *s) ++static void kvm_irqchip_create(KVMState *s) + { + int ret; + +@@ -1790,9 +1790,9 @@ static void kvm_irqchip_create(MachineState *machine, KVMState *s) + + /* First probe and see if there's a arch-specific hook to create the + * in-kernel irqchip for us */ +- ret = kvm_arch_irqchip_create(machine, s); ++ ret = kvm_arch_irqchip_create(s); + if (ret == 0) { +- if (machine_kernel_irqchip_split(machine)) { ++ if (kvm_kernel_irqchip_split()) { + perror("Split IRQ chip mode not supported."); + exit(1); + } else { +@@ -2076,7 +2076,7 @@ static int kvm_init(MachineState *ms) + } + + if (machine_kernel_irqchip_allowed(ms)) { +- kvm_irqchip_create(ms, s); ++ kvm_irqchip_create(s); + } + + if (kvm_eventfds_allowed) { +@@ -2966,6 +2966,21 @@ static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, + return false; + } + ++bool kvm_kernel_irqchip_allowed(void) ++{ ++ return machine_kernel_irqchip_allowed(current_machine); ++} ++ ++bool kvm_kernel_irqchip_required(void) ++{ ++ return machine_kernel_irqchip_required(current_machine); ++} ++ ++bool kvm_kernel_irqchip_split(void) ++{ ++ return machine_kernel_irqchip_split(current_machine); ++} ++ + static void kvm_accel_class_init(ObjectClass *oc, void *data) + { + AccelClass *ac = ACCEL_CLASS(oc); +diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c +index 91cd4c26f9..12b6a5b2a8 100644 +--- a/hw/ppc/e500.c ++++ b/hw/ppc/e500.c +@@ -793,7 +793,6 @@ static DeviceState *ppce500_init_mpic(PPCE500MachineState *pms, + MemoryRegion *ccsr, + IrqLines *irqs) + { +- MachineState *machine = MACHINE(pms); + const PPCE500MachineClass *pmc = PPCE500_MACHINE_GET_CLASS(pms); + DeviceState *dev = NULL; + SysBusDevice *s; +@@ -801,10 +800,10 @@ static DeviceState *ppce500_init_mpic(PPCE500MachineState *pms, + if (kvm_enabled()) { + Error *err = NULL; + +- if (machine_kernel_irqchip_allowed(machine)) { ++ if (kvm_kernel_irqchip_allowed()) { + dev = ppce500_init_mpic_kvm(pmc, irqs, &err); + } +- if (machine_kernel_irqchip_required(machine) && !dev) { ++ if (kvm_kernel_irqchip_required() && !dev) { + error_reportf_err(err, + "kernel_irqchip requested but unavailable: "); + exit(1); +diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c +index 9da423658a..f388d07bf9 100644 +--- a/hw/ppc/spapr_irq.c ++++ b/hw/ppc/spapr_irq.c +@@ -75,12 +75,11 @@ int spapr_irq_init_kvm(SpaprInterruptControllerInitKvm fn, + uint32_t nr_servers, + Error **errp) + { +- MachineState *machine = MACHINE(qdev_get_machine()); + Error *local_err = NULL; + +- if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { ++ if (kvm_enabled() && kvm_kernel_irqchip_allowed()) { + if (fn(intc, nr_servers, &local_err) < 0) { +- if (machine_kernel_irqchip_required(machine)) { ++ if (kvm_kernel_irqchip_required()) { + error_prepend(&local_err, + "kernel_irqchip requested but unavailable: "); + error_propagate(errp, local_err); +@@ -185,7 +184,7 @@ static int spapr_irq_check(SpaprMachineState *spapr, Error **errp) + */ + if (kvm_enabled() && + spapr->irq == &spapr_irq_dual && +- machine_kernel_irqchip_required(machine) && ++ kvm_kernel_irqchip_required() && + xics_kvm_has_broken_disconnect(spapr)) { + error_setg(errp, "KVM is too old to support ic-mode=dual,kernel-irqchip=on"); + return -1; +@@ -288,20 +287,13 @@ uint32_t spapr_irq_nr_msis(SpaprMachineState *spapr) + + void spapr_irq_init(SpaprMachineState *spapr, Error **errp) + { +- MachineState *machine = MACHINE(spapr); + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + +- if (machine_kernel_irqchip_split(machine)) { ++ if (kvm_enabled() && kvm_kernel_irqchip_split()) { + error_setg(errp, "kernel_irqchip split mode not supported on pseries"); + return; + } + +- if (!kvm_enabled() && machine_kernel_irqchip_required(machine)) { +- error_setg(errp, +- "kernel_irqchip requested but only available with KVM"); +- return; +- } +- + if (spapr_irq_check(spapr, errp) < 0) { + return; + } +diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h +index 9fe233b9bf..aaf2a502e8 100644 +--- a/include/sysemu/kvm.h ++++ b/include/sysemu/kvm.h +@@ -519,10 +519,13 @@ void kvm_pc_gsi_handler(void *opaque, int n, int level); + void kvm_pc_setup_irq_routing(bool pci_enabled); + void kvm_init_irq_routing(KVMState *s); + ++bool kvm_kernel_irqchip_allowed(void); ++bool kvm_kernel_irqchip_required(void); ++bool kvm_kernel_irqchip_split(void); ++ + /** + * kvm_arch_irqchip_create: + * @KVMState: The KVMState pointer +- * @MachineState: The MachineState pointer + * + * Allow architectures to create an in-kernel irq chip themselves. + * +@@ -530,7 +533,7 @@ void kvm_init_irq_routing(KVMState *s); + * 0: irq chip was not created + * > 0: irq chip was created + */ +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s); ++int kvm_arch_irqchip_create(KVMState *s); + + /** + * kvm_set_one_reg - set a register value in KVM via KVM_SET_ONE_REG ioctl +diff --git a/target/arm/kvm.c b/target/arm/kvm.c +index 4be9497402..418bcedc3e 100644 +--- a/target/arm/kvm.c ++++ b/target/arm/kvm.c +@@ -861,11 +861,11 @@ void kvm_arch_init_irq_routing(KVMState *s) + { + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { +- if (machine_kernel_irqchip_split(ms)) { +- perror("-machine kernel_irqchip=split is not supported on ARM."); +- exit(1); ++ if (kvm_kernel_irqchip_split()) { ++ perror("-machine kernel_irqchip=split is not supported on ARM."); ++ exit(1); + } + + /* If we can create the VGIC using the newer device control API, we +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index fcc8f7d1f3..f5c17e0028 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -4532,10 +4532,10 @@ void kvm_arch_init_irq_routing(KVMState *s) + } + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + int ret; +- if (machine_kernel_irqchip_split(ms)) { ++ if (kvm_kernel_irqchip_split()) { + ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24); + if (ret) { + error_report("Could not enable split irqchip mode: %s", +diff --git a/target/mips/kvm.c b/target/mips/kvm.c +index 578bc14625..de3e26ef1f 100644 +--- a/target/mips/kvm.c ++++ b/target/mips/kvm.c +@@ -57,7 +57,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + return 0; + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + return 0; + } +diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c +index c77f9848ec..461dc6dae1 100644 +--- a/target/ppc/kvm.c ++++ b/target/ppc/kvm.c +@@ -152,7 +152,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + return 0; + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + return 0; + } +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 84d7cadd09..c589ef9034 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -386,7 +386,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + return 0; + } + +-int kvm_arch_irqchip_create(MachineState *ms, KVMState *s) ++int kvm_arch_irqchip_create(KVMState *s) + { + return 0; + } +-- +2.27.0 + diff --git a/kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch b/kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch new file mode 100755 index 0000000000000000000000000000000000000000..a50bff93a58590ca5cabfa8393e7c82c47ecc85f --- /dev/null +++ b/kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch @@ -0,0 +1,241 @@ +From a4a984e67e276e643b8a51f39ca426d0967754a0 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 13 Jul 2020 14:24:51 -0400 +Subject: [PATCH 4/4] iotests/026: Move v3-exclusive test to new file +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-5-mreitz@redhat.com> +Patchwork-id: 97956 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 4/4] iotests/026: Move v3-exclusive test to new file +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +data_file does not work with v2, and we probably want 026 to keep +working for v2 images. Thus, open a new file for v3-exclusive error +path test cases. + +Fixes: 81311255f217859413c94f2cd9cebf2684bbda94 + (“iotests/026: Test EIO on allocation in a data-file”) +Signed-off-by: Max Reitz +Message-Id: <20200311140707.1243218-1-mreitz@redhat.com> +Reviewed-by: John Snow +Tested-by: John Snow +Signed-off-by: Max Reitz +(cherry picked from commit c264e5d2f9f5d73977eac8e5d084f727b3d07ea9) + +Conflicts: + tests/qemu-iotests/group + - As per usual. + +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/026 | 31 ----------- + tests/qemu-iotests/026.out | 6 -- + tests/qemu-iotests/026.out.nocache | 6 -- + tests/qemu-iotests/289 | 89 ++++++++++++++++++++++++++++++ + tests/qemu-iotests/289.out | 8 +++ + tests/qemu-iotests/group | 1 + + 6 files changed, 98 insertions(+), 43 deletions(-) + create mode 100755 tests/qemu-iotests/289 + create mode 100644 tests/qemu-iotests/289.out + +diff --git a/tests/qemu-iotests/026 b/tests/qemu-iotests/026 +index c1c96a41d9..3afd708863 100755 +--- a/tests/qemu-iotests/026 ++++ b/tests/qemu-iotests/026 +@@ -237,37 +237,6 @@ $QEMU_IO -c "write 0 $CLUSTER_SIZE" "$BLKDBG_TEST_IMG" | _filter_qemu_io + + _check_test_img + +-echo +-echo === Avoid freeing external data clusters on failure === +-echo +- +-# Similar test as the last one, except we test what happens when there +-# is an error when writing to an external data file instead of when +-# writing to a preallocated zero cluster +-_make_test_img -o "data_file=$TEST_IMG.data_file" $CLUSTER_SIZE +- +-# Put blkdebug above the data-file, and a raw node on top of that so +-# that blkdebug will see a write_aio event and emit an error +-$QEMU_IO -c "write 0 $CLUSTER_SIZE" \ +- "json:{ +- 'driver': 'qcow2', +- 'file': { 'driver': 'file', 'filename': '$TEST_IMG' }, +- 'data-file': { +- 'driver': 'raw', +- 'file': { +- 'driver': 'blkdebug', +- 'config': '$TEST_DIR/blkdebug.conf', +- 'image': { +- 'driver': 'file', +- 'filename': '$TEST_IMG.data_file' +- } +- } +- } +- }" \ +- | _filter_qemu_io +- +-_check_test_img +- + # success, all done + echo "*** done" + rm -f $seq.full +diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out +index c1b3b58482..83989996ff 100644 +--- a/tests/qemu-iotests/026.out ++++ b/tests/qemu-iotests/026.out +@@ -653,10 +653,4 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors were found on the image. +- +-=== Avoid freeing external data clusters on failure === +- +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file +-write failed: Input/output error +-No errors were found on the image. + *** done +diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache +index 8d5001648a..9359d26d7e 100644 +--- a/tests/qemu-iotests/026.out.nocache ++++ b/tests/qemu-iotests/026.out.nocache +@@ -661,10 +661,4 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors were found on the image. +- +-=== Avoid freeing external data clusters on failure === +- +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file +-write failed: Input/output error +-No errors were found on the image. + *** done +diff --git a/tests/qemu-iotests/289 b/tests/qemu-iotests/289 +new file mode 100755 +index 0000000000..1c11d4030e +--- /dev/null ++++ b/tests/qemu-iotests/289 +@@ -0,0 +1,89 @@ ++#!/usr/bin/env bash ++# ++# qcow2 v3-exclusive error path testing ++# (026 tests paths common to v2 and v3) ++# ++# Copyright (C) 2020 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++seq=$(basename $0) ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++ rm "$TEST_DIR/blkdebug.conf" ++ rm -f "$TEST_IMG.data_file" ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. ./common.filter ++. ./common.pattern ++ ++_supported_fmt qcow2 ++_supported_proto file ++# This is a v3-exclusive test; ++# As for data_file, error paths often very much depend on whether ++# there is an external data file or not; so we create one exactly when ++# we want to test it ++_unsupported_imgopts 'compat=0.10' data_file ++ ++echo ++echo === Avoid freeing external data clusters on failure === ++echo ++ ++cat > "$TEST_DIR/blkdebug.conf" < +Date: Mon, 13 Jul 2020 14:24:50 -0400 +Subject: [PATCH 3/4] iotests/026: Test EIO on allocation in a data-file + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-4-mreitz@redhat.com> +Patchwork-id: 97955 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 3/4] iotests/026: Test EIO on allocation in a data-file +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +Test what happens when writing data to an external data file, where the +write requires an L2 entry to be allocated, but the data write fails. + +Signed-off-by: Max Reitz +Message-Id: <20200225143130.111267-4-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 81311255f217859413c94f2cd9cebf2684bbda94) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/026 | 32 ++++++++++++++++++++++++++++++ + tests/qemu-iotests/026.out | 6 ++++++ + tests/qemu-iotests/026.out.nocache | 6 ++++++ + 3 files changed, 44 insertions(+) + +diff --git a/tests/qemu-iotests/026 b/tests/qemu-iotests/026 +index d89729697f..c1c96a41d9 100755 +--- a/tests/qemu-iotests/026 ++++ b/tests/qemu-iotests/026 +@@ -30,6 +30,7 @@ _cleanup() + { + _cleanup_test_img + rm "$TEST_DIR/blkdebug.conf" ++ rm -f "$TEST_IMG.data_file" + } + trap "_cleanup; exit \$status" 0 1 2 3 15 + +@@ -236,6 +237,37 @@ $QEMU_IO -c "write 0 $CLUSTER_SIZE" "$BLKDBG_TEST_IMG" | _filter_qemu_io + + _check_test_img + ++echo ++echo === Avoid freeing external data clusters on failure === ++echo ++ ++# Similar test as the last one, except we test what happens when there ++# is an error when writing to an external data file instead of when ++# writing to a preallocated zero cluster ++_make_test_img -o "data_file=$TEST_IMG.data_file" $CLUSTER_SIZE ++ ++# Put blkdebug above the data-file, and a raw node on top of that so ++# that blkdebug will see a write_aio event and emit an error ++$QEMU_IO -c "write 0 $CLUSTER_SIZE" \ ++ "json:{ ++ 'driver': 'qcow2', ++ 'file': { 'driver': 'file', 'filename': '$TEST_IMG' }, ++ 'data-file': { ++ 'driver': 'raw', ++ 'file': { ++ 'driver': 'blkdebug', ++ 'config': '$TEST_DIR/blkdebug.conf', ++ 'image': { ++ 'driver': 'file', ++ 'filename': '$TEST_IMG.data_file' ++ } ++ } ++ } ++ }" \ ++ | _filter_qemu_io ++ ++_check_test_img ++ + # success, all done + echo "*** done" + rm -f $seq.full +diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out +index 83989996ff..c1b3b58482 100644 +--- a/tests/qemu-iotests/026.out ++++ b/tests/qemu-iotests/026.out +@@ -653,4 +653,10 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors were found on the image. ++ ++=== Avoid freeing external data clusters on failure === ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file ++write failed: Input/output error ++No errors were found on the image. + *** done +diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache +index 9359d26d7e..8d5001648a 100644 +--- a/tests/qemu-iotests/026.out.nocache ++++ b/tests/qemu-iotests/026.out.nocache +@@ -661,4 +661,10 @@ wrote 1024/1024 bytes at offset 0 + 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + write failed: Input/output error + No errors were found on the image. ++ ++=== Avoid freeing external data clusters on failure === ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1024 data_file=TEST_DIR/t.IMGFMT.data_file ++write failed: Input/output error ++No errors were found on the image. + *** done +-- +2.27.0 + diff --git a/kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch b/kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch new file mode 100755 index 0000000000000000000000000000000000000000..36d609c5f776b81069d04fccaa05c2c3d1254142 --- /dev/null +++ b/kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch @@ -0,0 +1,102 @@ +From b1035096f2d46e2146704d1db9581c6d2131d1f4 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 13 Jul 2020 14:24:49 -0400 +Subject: [PATCH 2/4] iotests/026: Test EIO on preallocated zero cluster + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-3-mreitz@redhat.com> +Patchwork-id: 97953 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/4] iotests/026: Test EIO on preallocated zero cluster +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +Test what happens when writing data to a preallocated zero cluster, but +the data write fails. + +Signed-off-by: Max Reitz +Message-Id: <20200225143130.111267-3-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 31ab00f3747c00fdbb9027cea644b40dd1405480) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/026 | 21 +++++++++++++++++++++ + tests/qemu-iotests/026.out | 10 ++++++++++ + tests/qemu-iotests/026.out.nocache | 10 ++++++++++ + 3 files changed, 41 insertions(+) + +diff --git a/tests/qemu-iotests/026 b/tests/qemu-iotests/026 +index 3430029ed6..d89729697f 100755 +--- a/tests/qemu-iotests/026 ++++ b/tests/qemu-iotests/026 +@@ -215,6 +215,27 @@ _make_test_img 64M + $QEMU_IO -c "write 0 1M" -c "write 0 1M" "$BLKDBG_TEST_IMG" | _filter_qemu_io + _check_test_img + ++echo ++echo === Avoid freeing preallocated zero clusters on failure === ++echo ++ ++cat > "$TEST_DIR/blkdebug.conf" < +Date: Wed, 3 Jun 2020 16:03:17 +0100 +Subject: [PATCH 18/26] iotests/055: refactor compressed backup to vmdk + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-4-kwolf@redhat.com> +Patchwork-id: 97104 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 03/11] iotests/055: refactor compressed backup to vmdk +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: Vladimir Sementsov-Ogievskiy + +Instead of looping in each test, let's better refactor vmdk target case +as a subclass. + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200430124713.3067-6-vsementsov@virtuozzo.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 8e8372944e5e097e98844b4db10f867689065e16) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/055 | 70 ++++++++++++++++++++++++---------------------- + tests/qemu-iotests/055.out | 4 +-- + 2 files changed, 39 insertions(+), 35 deletions(-) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index eb50c9f..8666601 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -450,10 +450,9 @@ class TestSingleTransaction(iotests.QMPTestCase): + self.assert_no_active_block_jobs() + + +-class TestDriveCompression(iotests.QMPTestCase): ++class TestCompressedToQcow2(iotests.QMPTestCase): + image_len = 64 * 1024 * 1024 # MB +- fmt_supports_compression = [{'type': 'qcow2', 'args': ()}, +- {'type': 'vmdk', 'args': ('-o', 'subformat=streamOptimized')}] ++ target_fmt = {'type': 'qcow2', 'args': ()} + + def tearDown(self): + self.vm.shutdown() +@@ -463,19 +462,20 @@ class TestDriveCompression(iotests.QMPTestCase): + except OSError: + pass + +- def do_prepare_drives(self, fmt, args, attach_target): ++ def do_prepare_drives(self, attach_target): + self.vm = iotests.VM().add_drive('blkdebug::' + test_img) + +- qemu_img('create', '-f', fmt, blockdev_target_img, +- str(TestDriveCompression.image_len), *args) ++ qemu_img('create', '-f', self.target_fmt['type'], blockdev_target_img, ++ str(self.image_len), *self.target_fmt['args']) + if attach_target: + self.vm.add_drive(blockdev_target_img, +- img_format=fmt, interface="none") ++ img_format=self.target_fmt['type'], ++ interface="none") + + self.vm.launch() + +- def do_test_compress_complete(self, cmd, format, attach_target, **args): +- self.do_prepare_drives(format['type'], format['args'], attach_target) ++ def do_test_compress_complete(self, cmd, attach_target, **args): ++ self.do_prepare_drives(attach_target) + + self.assert_no_active_block_jobs() + +@@ -486,21 +486,21 @@ class TestDriveCompression(iotests.QMPTestCase): + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, blockdev_target_img, +- iotests.imgfmt, format['type']), ++ iotests.imgfmt, ++ self.target_fmt['type']), + 'target image does not match source after backup') + + def test_complete_compress_drive_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_complete('drive-backup', format, False, +- target=blockdev_target_img, mode='existing') ++ self.do_test_compress_complete('drive-backup', False, ++ target=blockdev_target_img, ++ mode='existing') + + def test_complete_compress_blockdev_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_complete('blockdev-backup', format, True, +- target='drive1') ++ self.do_test_compress_complete('blockdev-backup', ++ True, target='drive1') + +- def do_test_compress_cancel(self, cmd, format, attach_target, **args): +- self.do_prepare_drives(format['type'], format['args'], attach_target) ++ def do_test_compress_cancel(self, cmd, attach_target, **args): ++ self.do_prepare_drives(attach_target) + + self.assert_no_active_block_jobs() + +@@ -514,17 +514,16 @@ class TestDriveCompression(iotests.QMPTestCase): + self.vm.shutdown() + + def test_compress_cancel_drive_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_cancel('drive-backup', format, False, +- target=blockdev_target_img, mode='existing') ++ self.do_test_compress_cancel('drive-backup', False, ++ target=blockdev_target_img, ++ mode='existing') + + def test_compress_cancel_blockdev_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_cancel('blockdev-backup', format, True, +- target='drive1') ++ self.do_test_compress_cancel('blockdev-backup', True, ++ target='drive1') + +- def do_test_compress_pause(self, cmd, format, attach_target, **args): +- self.do_prepare_drives(format['type'], format['args'], attach_target) ++ def do_test_compress_pause(self, cmd, attach_target, **args): ++ self.do_prepare_drives(attach_target) + + self.assert_no_active_block_jobs() + +@@ -550,18 +549,23 @@ class TestDriveCompression(iotests.QMPTestCase): + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, blockdev_target_img, +- iotests.imgfmt, format['type']), ++ iotests.imgfmt, ++ self.target_fmt['type']), + 'target image does not match source after backup') + + def test_compress_pause_drive_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_pause('drive-backup', format, False, +- target=blockdev_target_img, mode='existing') ++ self.do_test_compress_pause('drive-backup', False, ++ target=blockdev_target_img, ++ mode='existing') + + def test_compress_pause_blockdev_backup(self): +- for format in TestDriveCompression.fmt_supports_compression: +- self.do_test_compress_pause('blockdev-backup', format, True, +- target='drive1') ++ self.do_test_compress_pause('blockdev-backup', True, ++ target='drive1') ++ ++ ++class TestCompressedToVmdk(TestCompressedToQcow2): ++ target_fmt = {'type': 'vmdk', 'args': ('-o', 'subformat=streamOptimized')} ++ + + if __name__ == '__main__': + iotests.main(supported_fmts=['raw', 'qcow2'], +diff --git a/tests/qemu-iotests/055.out b/tests/qemu-iotests/055.out +index 5ce2f9a..5c26d15 100644 +--- a/tests/qemu-iotests/055.out ++++ b/tests/qemu-iotests/055.out +@@ -1,5 +1,5 @@ +-.............................. ++.................................... + ---------------------------------------------------------------------- +-Ran 30 tests ++Ran 36 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch b/kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch new file mode 100755 index 0000000000000000000000000000000000000000..260d511dc864821742c27f0962f49618e7bbaf02 --- /dev/null +++ b/kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch @@ -0,0 +1,45 @@ +From 9a0ca4797cbd029dab9209d88f8c81b78ded8fd0 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:18 +0100 +Subject: [PATCH 19/26] iotests/055: skip vmdk target tests if vmdk is not + whitelisted + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-5-kwolf@redhat.com> +Patchwork-id: 97101 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 04/11] iotests/055: skip vmdk target tests if vmdk is not whitelisted +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: Vladimir Sementsov-Ogievskiy + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200430124713.3067-7-vsementsov@virtuozzo.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 761cd2e791eae38c3d08ea5f83309ce58bb85ff7) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/055 | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index 8666601..c9cdc06 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -566,6 +566,10 @@ class TestCompressedToQcow2(iotests.QMPTestCase): + class TestCompressedToVmdk(TestCompressedToQcow2): + target_fmt = {'type': 'vmdk', 'args': ('-o', 'subformat=streamOptimized')} + ++ @iotests.skip_if_unsupported(['vmdk']) ++ def setUp(self): ++ pass ++ + + if __name__ == '__main__': + iotests.main(supported_fmts=['raw', 'qcow2'], +-- +1.8.3.1 + diff --git a/kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch b/kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..c71bcbae66e386e5b9983ec8bb81c8ce094018d9 --- /dev/null +++ b/kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch @@ -0,0 +1,387 @@ +From 2202321b549dda551190d919a5a1cbee0fab8c90 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:22 +0100 +Subject: [PATCH 23/26] iotests/109: Don't mirror with mismatched size + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-9-kwolf@redhat.com> +Patchwork-id: 97105 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 08/11] iotests/109: Don't mirror with mismatched size +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +This patch makes the raw image the same size as the file in a different +format that is mirrored as raw to it to avoid errors when mirror starts +to enforce that source and target are the same size. + +We check only that the first 512 bytes are zeroed (instead of 64k) +because some image formats create image files that are smaller than 64k, +so trying to read 64k would result in I/O errors. Apart from this, 512 +is more appropriate anyway because the raw format driver protects +specifically the first 512 bytes. + +Signed-off-by: Kevin Wolf +Message-Id: <20200511135825.219437-2-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit ffa41a62d0b0e6d91f2071328befa046d56993e1) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/109 | 10 +++--- + tests/qemu-iotests/109.out | 74 +++++++++++++++++----------------------- + tests/qemu-iotests/common.filter | 5 +++ + 3 files changed, 41 insertions(+), 48 deletions(-) + +diff --git a/tests/qemu-iotests/109 b/tests/qemu-iotests/109 +index 9897ceb..190c35e 100755 +--- a/tests/qemu-iotests/109 ++++ b/tests/qemu-iotests/109 +@@ -76,14 +76,14 @@ for fmt in qcow qcow2 qed vdi vmdk vpc; do + echo "=== Writing a $fmt header into raw ===" + echo + +- _make_test_img 64M + TEST_IMG="$TEST_IMG.src" IMGFMT=$fmt _make_test_img 64M ++ _make_test_img $(du -b "$TEST_IMG.src" | cut -f1) | _filter_img_create_size + + # This first test should fail: The image format was probed, we may not + # write an image header at the start of the image + run_qemu "$TEST_IMG" "$TEST_IMG.src" "" "BLOCK_JOB_ERROR" | + _filter_block_job_len +- $QEMU_IO -c 'read -P 0 0 64k' "$TEST_IMG" | _filter_qemu_io ++ $QEMU_IO -c 'read -P 0 0 512' "$TEST_IMG" | _filter_qemu_io + + + # When raw was explicitly specified, the same must succeed +@@ -102,12 +102,12 @@ for sample_img in empty.bochs iotest-dirtylog-10G-4M.vhdx parallels-v1 \ + + # Can't use _use_sample_img because that isn't designed to be used multiple + # times and it overwrites $TEST_IMG (both breaks cleanup) +- _make_test_img 64M + bzcat "$SAMPLE_IMG_DIR/$sample_img.bz2" > "$TEST_IMG.src" ++ _make_test_img $(du -b "$TEST_IMG.src" | cut -f1) | _filter_img_create_size + + run_qemu "$TEST_IMG" "$TEST_IMG.src" "" "BLOCK_JOB_ERROR" | + _filter_block_job_offset | _filter_block_job_len +- $QEMU_IO -c 'read -P 0 0 64k' "$TEST_IMG" | _filter_qemu_io ++ $QEMU_IO -c 'read -P 0 0 512' "$TEST_IMG" | _filter_qemu_io + + run_qemu "$TEST_IMG" "$TEST_IMG.src" "'format': 'raw'," "BLOCK_JOB_READY" + $QEMU_IMG compare -f raw -F raw "$TEST_IMG" "$TEST_IMG.src" +@@ -118,8 +118,8 @@ echo "=== Write legitimate MBR into raw ===" + echo + + for sample_img in grub_mbr.raw; do +- _make_test_img 64M + bzcat "$SAMPLE_IMG_DIR/$sample_img.bz2" > "$TEST_IMG.src" ++ _make_test_img $(du -b "$TEST_IMG.src" | cut -f1) | _filter_img_create_size + + run_qemu "$TEST_IMG" "$TEST_IMG.src" "" "BLOCK_JOB_READY" + $QEMU_IMG compare -f raw -F raw "$TEST_IMG" "$TEST_IMG.src" +diff --git a/tests/qemu-iotests/109.out b/tests/qemu-iotests/109.out +index 884f65f..ad739df 100644 +--- a/tests/qemu-iotests/109.out ++++ b/tests/qemu-iotests/109.out +@@ -2,8 +2,8 @@ QA output created by 109 + + === Writing a qcow header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -23,8 +23,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -43,13 +43,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Writing a qcow2 header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -69,8 +68,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -89,13 +88,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 197120, "offset": 197120, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Writing a qed header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -115,8 +113,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -135,13 +133,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 327680, "offset": 327680, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Writing a vdi header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -161,8 +158,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -181,13 +178,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Writing a vmdk header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -207,8 +203,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -227,13 +223,12 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Writing a vpc header into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 + Formatting 'TEST_DIR/t.raw.src', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -253,8 +248,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -273,12 +268,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Copying sample image empty.bochs into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -298,8 +292,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -318,12 +312,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Copying sample image iotest-dirtylog-10G-4M.vhdx into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -343,8 +336,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -363,12 +356,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 31457280, "offset": 31457280, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Copying sample image parallels-v1 into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -388,8 +380,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -408,12 +400,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 327680, "offset": 327680, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Copying sample image simple-pattern.cloop into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -433,8 +424,8 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"execute":"quit"} + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-read 65536/65536 bytes at offset 0 +-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'format': 'IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -453,12 +444,11 @@ read 65536/65536 bytes at offset 0 + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 2048, "offset": 2048, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + + === Write legitimate MBR into raw === + +-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=SIZE + { 'execute': 'qmp_capabilities' } + {"return": {}} + {'execute':'drive-mirror', 'arguments':{ 'device': 'src', 'target': 'TEST_DIR/t.IMGFMT', 'mode': 'existing', 'sync': 'full'}} +@@ -480,7 +470,6 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + { 'execute': 'qmp_capabilities' } + {"return": {}} +@@ -500,6 +489,5 @@ Images are identical. + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "src"}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "src"}} +-Warning: Image size mismatch! + Images are identical. + *** done +diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter +index 5367dee..c8e8663 100644 +--- a/tests/qemu-iotests/common.filter ++++ b/tests/qemu-iotests/common.filter +@@ -149,6 +149,11 @@ _filter_img_create() + -e "s# force_size=\\(on\\|off\\)##g" + } + ++_filter_img_create_size() ++{ ++ $SED -e "s# size=[0-9]\\+# size=SIZE#g" ++} ++ + _filter_img_info() + { + if [[ "$1" == "--format-specific" ]]; then +-- +1.8.3.1 + diff --git a/kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch b/kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch new file mode 100755 index 0000000000000000000000000000000000000000..ef8807ca93c3d59db903bdb1ae996bfbb6c0fe18 --- /dev/null +++ b/kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch @@ -0,0 +1,120 @@ +From 104c8f6210bf573cf39c2a14cdb0b081baaaa3f0 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:23 +0100 +Subject: [PATCH 24/26] iotests/229: Use blkdebug to inject an error + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-10-kwolf@redhat.com> +Patchwork-id: 97108 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 09/11] iotests/229: Use blkdebug to inject an error +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +229 relies on the mirror running into an I/O error when the target is +smaller than the source. After changing mirror to catch this condition +while starting the job, this test case won't get a job that is paused +for an I/O error any more. Use blkdebug instead to inject an error. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200511135825.219437-3-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit d89ac3cf305b28c024a76805a84d75c0ee1e786f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/229 | 18 +++++++++++++----- + tests/qemu-iotests/229.out | 6 +++--- + 2 files changed, 16 insertions(+), 8 deletions(-) + +diff --git a/tests/qemu-iotests/229 b/tests/qemu-iotests/229 +index e18a464..511fbc0 100755 +--- a/tests/qemu-iotests/229 ++++ b/tests/qemu-iotests/229 +@@ -32,6 +32,7 @@ _cleanup() + _cleanup_qemu + _cleanup_test_img + rm -f "$TEST_IMG" "$DEST_IMG" ++ rm -f "$TEST_DIR/blkdebug.conf" + } + trap "_cleanup; exit \$status" 0 1 2 3 15 + +@@ -48,11 +49,10 @@ _supported_os Linux + + DEST_IMG="$TEST_DIR/d.$IMGFMT" + TEST_IMG="$TEST_DIR/b.$IMGFMT" ++BLKDEBUG_CONF="$TEST_DIR/blkdebug.conf" + + _make_test_img 2M +- +-# destination for mirror will be too small, causing error +-TEST_IMG=$DEST_IMG _make_test_img 1M ++TEST_IMG=$DEST_IMG _make_test_img 2M + + $QEMU_IO -c 'write 0 2M' "$TEST_IMG" | _filter_qemu_io + +@@ -66,11 +66,18 @@ echo + echo '=== Starting drive-mirror, causing error & stop ===' + echo + ++cat > "$BLKDEBUG_CONF" < +Date: Fri, 13 Mar 2020 12:34:38 +0000 +Subject: [PATCH 18/20] iotests: Add iothread cases to 155 + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-13-kwolf@redhat.com> +Patchwork-id: 94289 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 12/13] iotests: Add iothread cases to 155 +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +This patch adds test cases for attaching the backing chain to a mirror +job target right before finalising the job, where the image is in a +non-mainloop AioContext (i.e. the backing chain needs to be moved to the +AioContext of the mirror target). + +This requires switching the test case from virtio-blk to virtio-scsi +because virtio-blk only actually starts using the iothreads when the +guest driver initialises the device (which never happens in a test case +without a guest OS). virtio-scsi always keeps its block nodes in the +AioContext of the the requested iothread without guest interaction. + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-7-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit 6a5f6403a11307794ec79d277a065c137cfc12b2) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/155 | 32 +++++++++++++++++++++++--------- + tests/qemu-iotests/155.out | 4 ++-- + 2 files changed, 25 insertions(+), 11 deletions(-) + +diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 +index 3053e50..b552d1f 100755 +--- a/tests/qemu-iotests/155 ++++ b/tests/qemu-iotests/155 +@@ -49,11 +49,14 @@ target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt) + # chain opened right away. If False, blockdev-add + # opens it without a backing file and job completion + # is supposed to open the backing chain. ++# use_iothread: If True, an iothread is configured for the virtio-blk device ++# that uses the image being mirrored + + class BaseClass(iotests.QMPTestCase): + target_blockdev_backing = None + target_real_backing = None + target_open_with_backing = True ++ use_iothread = False + + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, back0_img, '1440K') +@@ -69,7 +72,16 @@ class BaseClass(iotests.QMPTestCase): + 'file': {'driver': 'file', + 'filename': source_img}} + self.vm.add_blockdev(self.vm.qmp_to_opts(blockdev)) +- self.vm.add_device('virtio-blk,id=qdev0,drive=source') ++ ++ if self.use_iothread: ++ self.vm.add_object('iothread,id=iothread0') ++ iothread = ",iothread=iothread0" ++ else: ++ iothread = "" ++ ++ self.vm.add_device('virtio-scsi%s' % iothread) ++ self.vm.add_device('scsi-hd,id=qdev0,drive=source') ++ + self.vm.launch() + + self.assertIntactSourceBackingChain() +@@ -182,24 +194,21 @@ class MirrorBaseClass(BaseClass): + def testFull(self): + self.runMirror('full') + +- node = self.findBlockNode('target', +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode('target', 'qdev0') + self.assertCorrectBackingImage(node, None) + self.assertIntactSourceBackingChain() + + def testTop(self): + self.runMirror('top') + +- node = self.findBlockNode('target', +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode('target', 'qdev0') + self.assertCorrectBackingImage(node, back2_img) + self.assertIntactSourceBackingChain() + + def testNone(self): + self.runMirror('none') + +- node = self.findBlockNode('target', +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode('target', 'qdev0') + self.assertCorrectBackingImage(node, source_img) + self.assertIntactSourceBackingChain() + +@@ -252,6 +261,9 @@ class TestBlockdevMirrorReopen(MirrorBaseClass): + backing="backing") + self.assert_qmp(result, 'return', {}) + ++class TestBlockdevMirrorReopenIothread(TestBlockdevMirrorReopen): ++ use_iothread = True ++ + # Attach the backing chain only during completion, with blockdev-snapshot + class TestBlockdevMirrorSnapshot(MirrorBaseClass): + cmd = 'blockdev-mirror' +@@ -268,6 +280,9 @@ class TestBlockdevMirrorSnapshot(MirrorBaseClass): + overlay="target") + self.assert_qmp(result, 'return', {}) + ++class TestBlockdevMirrorSnapshotIothread(TestBlockdevMirrorSnapshot): ++ use_iothread = True ++ + class TestCommit(BaseClass): + existing = False + +@@ -283,8 +298,7 @@ class TestCommit(BaseClass): + + self.vm.event_wait('BLOCK_JOB_COMPLETED') + +- node = self.findBlockNode(None, +- '/machine/peripheral/qdev0/virtio-backend') ++ node = self.findBlockNode(None, 'qdev0') + self.assert_qmp(node, 'image' + '/backing-image' * 0 + '/filename', + back1_img) + self.assert_qmp(node, 'image' + '/backing-image' * 1 + '/filename', +diff --git a/tests/qemu-iotests/155.out b/tests/qemu-iotests/155.out +index 4fd1c2d..ed714d5 100644 +--- a/tests/qemu-iotests/155.out ++++ b/tests/qemu-iotests/155.out +@@ -1,5 +1,5 @@ +-......................... ++............................... + ---------------------------------------------------------------------- +-Ran 25 tests ++Ran 31 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch b/kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch new file mode 100755 index 0000000000000000000000000000000000000000..6bdf1308144439a280cbde63917a3741cc2a6f45 --- /dev/null +++ b/kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch @@ -0,0 +1,236 @@ +From adda561394bb07c13ef3f2712b36704790530891 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:15 +0100 +Subject: [PATCH 16/26] iotests: Add more "skip_if_unsupported" statements to + the python tests + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-2-kwolf@redhat.com> +Patchwork-id: 97099 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 01/11] iotests: Add more "skip_if_unsupported" statements to the python tests +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: Thomas Huth + +The python code already contains a possibility to skip tests if the +corresponding driver is not available in the qemu binary - use it +in more spots to avoid that the tests are failing if the driver has +been disabled. + +While we're at it, we can now also remove some of the old checks that +were using iotests.supports_quorum() - and which were apparently not +working as expected since the tests aborted instead of being skipped +when "quorum" was missing in the QEMU binary. + +Signed-off-by: Thomas Huth +Signed-off-by: Kevin Wolf +(cherry picked from commit 9442bebe6e67a5d038bbf2572b79e7b59d202a23) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/030 | 4 +--- + tests/qemu-iotests/040 | 2 ++ + tests/qemu-iotests/041 | 39 +++------------------------------------ + tests/qemu-iotests/245 | 2 ++ + 4 files changed, 8 insertions(+), 39 deletions(-) + +diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030 +index f3766f2..bddbb30 100755 +--- a/tests/qemu-iotests/030 ++++ b/tests/qemu-iotests/030 +@@ -530,6 +530,7 @@ class TestQuorum(iotests.QMPTestCase): + children = [] + backing = [] + ++ @iotests.skip_if_unsupported(['quorum']) + def setUp(self): + opts = ['driver=quorum', 'vote-threshold=2'] + +@@ -560,9 +561,6 @@ class TestQuorum(iotests.QMPTestCase): + os.remove(img) + + def test_stream_quorum(self): +- if not iotests.supports_quorum(): +- return +- + self.assertNotEqual(qemu_io('-f', iotests.imgfmt, '-rU', '-c', 'map', self.children[0]), + qemu_io('-f', iotests.imgfmt, '-rU', '-c', 'map', self.backing[0]), + 'image file map matches backing file before streaming') +diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 +index 762ad1e..74f62c3 100755 +--- a/tests/qemu-iotests/040 ++++ b/tests/qemu-iotests/040 +@@ -106,6 +106,7 @@ class TestSingleDrive(ImageCommitTestCase): + self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xab 0 524288', backing_img).find("verification failed")) + self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed")) + ++ @iotests.skip_if_unsupported(['throttle']) + def test_commit_with_filter_and_quit(self): + result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg') + self.assert_qmp(result, 'return', {}) +@@ -125,6 +126,7 @@ class TestSingleDrive(ImageCommitTestCase): + self.has_quit = True + + # Same as above, but this time we add the filter after starting the job ++ @iotests.skip_if_unsupported(['throttle']) + def test_commit_plus_filter_and_quit(self): + result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg') + self.assert_qmp(result, 'return', {}) +diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 +index 8568426..a543b15 100755 +--- a/tests/qemu-iotests/041 ++++ b/tests/qemu-iotests/041 +@@ -871,6 +871,7 @@ class TestRepairQuorum(iotests.QMPTestCase): + image_len = 1 * 1024 * 1024 # MB + IMAGES = [ quorum_img1, quorum_img2, quorum_img3 ] + ++ @iotests.skip_if_unsupported(['quorum']) + def setUp(self): + self.vm = iotests.VM() + +@@ -891,9 +892,8 @@ class TestRepairQuorum(iotests.QMPTestCase): + #assemble the quorum block device from the individual files + args = { "driver": "quorum", "node-name": "quorum0", + "vote-threshold": 2, "children": [ "img0", "img1", "img2" ] } +- if iotests.supports_quorum(): +- result = self.vm.qmp("blockdev-add", **args) +- self.assert_qmp(result, 'return', {}) ++ result = self.vm.qmp("blockdev-add", **args) ++ self.assert_qmp(result, 'return', {}) + + + def tearDown(self): +@@ -906,9 +906,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + pass + + def test_complete(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -925,9 +922,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + 'target image does not match source after mirroring') + + def test_cancel(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -942,9 +936,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.vm.shutdown() + + def test_cancel_after_ready(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -961,9 +952,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + 'target image does not match source after mirroring') + + def test_pause(self): +- if not iotests.supports_quorum(): +- return +- + self.assert_no_active_block_jobs() + + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', +@@ -989,9 +977,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + 'target image does not match source after mirroring') + + def test_medium_not_found(self): +- if not iotests.supports_quorum(): +- return +- + if iotests.qemu_default_machine != 'pc': + return + +@@ -1003,9 +988,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_image_not_found(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', + sync='full', node_name='repair0', replaces='img1', + mode='existing', target=quorum_repair_img, +@@ -1013,9 +995,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_device_not_found(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', + device='nonexistent', sync='full', + node_name='repair0', +@@ -1024,9 +1003,6 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_wrong_sync_mode(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', device='quorum0', job_id='job0', + node_name='repair0', + replaces='img1', +@@ -1034,27 +1010,18 @@ class TestRepairQuorum(iotests.QMPTestCase): + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_no_node_name(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', + sync='full', replaces='img1', + target=quorum_repair_img, format=iotests.imgfmt) + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_nonexistent_replaces(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0', + sync='full', node_name='repair0', replaces='img77', + target=quorum_repair_img, format=iotests.imgfmt) + self.assert_qmp(result, 'error/class', 'GenericError') + + def test_after_a_quorum_snapshot(self): +- if not iotests.supports_quorum(): +- return +- + result = self.vm.qmp('blockdev-snapshot-sync', node_name='img1', + snapshot_file=quorum_snapshot_file, + snapshot_node_name="snap1"); +diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 +index 919131d..ed972f9 100644 +--- a/tests/qemu-iotests/245 ++++ b/tests/qemu-iotests/245 +@@ -478,6 +478,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): + # This test verifies that we can't change the children of a block + # device during a reopen operation in a way that would create + # cycles in the node graph ++ @iotests.skip_if_unsupported(['blkverify']) + def test_graph_cycles(self): + opts = [] + +@@ -534,6 +535,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): + self.assert_qmp(result, 'return', {}) + + # Misc reopen tests with different block drivers ++ @iotests.skip_if_unsupported(['quorum', 'throttle']) + def test_misc_drivers(self): + #################### + ###### quorum ###### +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-qemu_io_log.patch b/kvm-iotests-Add-qemu_io_log.patch new file mode 100755 index 0000000000000000000000000000000000000000..a65bc5a961809e121b8fa5042c7656ae9130e5af --- /dev/null +++ b/kvm-iotests-Add-qemu_io_log.patch @@ -0,0 +1,48 @@ +From 2be333e847c01397bb6a92b2e4c60e904957675d Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:37 +0100 +Subject: [PATCH 09/17] iotests: Add qemu_io_log() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-9-kwolf@redhat.com> +Patchwork-id: 97451 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 08/11] iotests: Add qemu_io_log() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Add a function that runs qemu-io and logs the output with the +appropriate filters applied. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit a96f0350e3d95c98f2bff1863d14493af5c1d360) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index be20d56..7a9c779 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -162,6 +162,11 @@ def qemu_io(*args): + sys.stderr.write('qemu-io received signal %i: %s\n' % (-exitcode, ' '.join(args))) + return subp.communicate()[0] + ++def qemu_io_log(*args): ++ result = qemu_io(*args) ++ log(result, filters=[filter_testfiles, filter_qemu_io]) ++ return result ++ + def qemu_io_silent(*args): + '''Run qemu-io and return the exit code, suppressing stdout''' + args = qemu_io_args + list(args) +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch b/kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch new file mode 100755 index 0000000000000000000000000000000000000000..6144043de5ba75e7e6bad7f4ad01f3e8e437bc74 --- /dev/null +++ b/kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch @@ -0,0 +1,253 @@ +From eccae2f252513d2965ef919022c3ed068da275bd Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:20 +0100 +Subject: [PATCH 15/26] iotests: Add test 291 to for qemu-img bitmap coverage + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-13-eblake@redhat.com> +Patchwork-id: 97079 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 12/12] iotests: Add test 291 to for qemu-img bitmap coverage +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Add a new test covering the 'qemu-img bitmap' subcommand, as well as +'qemu-img convert --bitmaps', both added in recent patches. + +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200521192137.1120211-6-eblake@redhat.com> +(cherry picked from commit cf2d1203dcfc2bf964453d83a2302231ce77f2dc) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + tests/qemu-iotests/group - context: other tests not backported + tests/qemu-iotests/291.out - zstd compression not backported +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/291 | 112 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/291.out | 78 +++++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 191 insertions(+) + create mode 100755 tests/qemu-iotests/291 + create mode 100644 tests/qemu-iotests/291.out + +diff --git a/tests/qemu-iotests/291 b/tests/qemu-iotests/291 +new file mode 100755 +index 0000000..3ca83b9 +--- /dev/null ++++ b/tests/qemu-iotests/291 +@@ -0,0 +1,112 @@ ++#!/usr/bin/env bash ++# ++# Test qemu-img bitmap handling ++# ++# Copyright (C) 2018-2020 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++seq="$(basename $0)" ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++ nbd_server_stop ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. ./common.filter ++. ./common.nbd ++ ++_supported_fmt qcow2 ++_supported_proto file ++_supported_os Linux ++_require_command QEMU_NBD ++ ++echo ++echo "=== Initial image setup ===" ++echo ++ ++# Create backing image with one bitmap ++TEST_IMG="$TEST_IMG.base" _make_test_img 10M ++$QEMU_IMG bitmap --add -f $IMGFMT "$TEST_IMG.base" b0 ++$QEMU_IO -c 'w 3M 1M' -f $IMGFMT "$TEST_IMG.base" | _filter_qemu_io ++ ++# Create initial image and populate two bitmaps: one active, one inactive. ++ORIG_IMG=$TEST_IMG ++TEST_IMG=$TEST_IMG.orig ++_make_test_img -b "$ORIG_IMG.base" -F $IMGFMT 10M ++$QEMU_IO -c 'w 0 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++$QEMU_IMG bitmap --add -g 512k -f $IMGFMT "$TEST_IMG" b1 ++$QEMU_IMG bitmap --add --disable -f $IMGFMT "$TEST_IMG" b2 ++$QEMU_IO -c 'w 3M 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++$QEMU_IMG bitmap --clear -f $IMGFMT "$TEST_IMG" b1 ++$QEMU_IO -c 'w 1M 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++$QEMU_IMG bitmap --disable -f $IMGFMT "$TEST_IMG" b1 ++$QEMU_IMG bitmap --enable -f $IMGFMT "$TEST_IMG" b2 ++$QEMU_IO -c 'w 2M 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io ++ ++echo ++echo "=== Bitmap preservation not possible to non-qcow2 ===" ++echo ++ ++TEST_IMG=$ORIG_IMG ++$QEMU_IMG convert --bitmaps -O raw "$TEST_IMG.orig" "$TEST_IMG" && ++ echo "unexpected success" ++ ++echo ++echo "=== Convert with bitmap preservation ===" ++echo ++ ++# Only bitmaps from the active layer are copied ++$QEMU_IMG convert --bitmaps -O qcow2 "$TEST_IMG.orig" "$TEST_IMG" ++$QEMU_IMG info "$TEST_IMG" | _filter_img_info --format-specific ++# But we can also merge in bitmaps from other layers. This test is a bit ++# contrived to cover more code paths, in reality, you could merge directly ++# into b0 without going through tmp ++$QEMU_IMG bitmap --add --disable -f $IMGFMT "$TEST_IMG" b0 ++$QEMU_IMG bitmap --add --merge b0 -b "$TEST_IMG.base" -F $IMGFMT \ ++ -f $IMGFMT "$TEST_IMG" tmp ++$QEMU_IMG bitmap --merge tmp -f $IMGFMT "$TEST_IMG" b0 ++$QEMU_IMG bitmap --remove --image-opts \ ++ driver=$IMGFMT,file.driver=file,file.filename="$TEST_IMG" tmp ++$QEMU_IMG info "$TEST_IMG" | _filter_img_info --format-specific ++ ++echo ++echo "=== Check bitmap contents ===" ++echo ++ ++# x-dirty-bitmap is a hack for reading bitmaps; it abuses block status to ++# report "data":false for portions of the bitmap which are set ++IMG="driver=nbd,server.type=unix,server.path=$nbd_unix_socket" ++nbd_server_start_unix_socket -r -f qcow2 -B b0 "$TEST_IMG" ++$QEMU_IMG map --output=json --image-opts \ ++ "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b0" | _filter_qemu_img_map ++nbd_server_start_unix_socket -r -f qcow2 -B b1 "$TEST_IMG" ++$QEMU_IMG map --output=json --image-opts \ ++ "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b1" | _filter_qemu_img_map ++nbd_server_start_unix_socket -r -f qcow2 -B b2 "$TEST_IMG" ++$QEMU_IMG map --output=json --image-opts \ ++ "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b2" | _filter_qemu_img_map ++ ++# success, all done ++echo '*** done' ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/291.out b/tests/qemu-iotests/291.out +new file mode 100644 +index 0000000..14e5cfc +--- /dev/null ++++ b/tests/qemu-iotests/291.out +@@ -0,0 +1,78 @@ ++QA output created by 291 ++ ++=== Initial image setup === ++ ++Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=10485760 ++wrote 1048576/1048576 bytes at offset 3145728 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++Formatting 'TEST_DIR/t.IMGFMT.orig', fmt=IMGFMT size=10485760 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT ++wrote 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 1048576/1048576 bytes at offset 3145728 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 1048576/1048576 bytes at offset 2097152 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Bitmap preservation not possible to non-qcow2 === ++ ++qemu-img: Format driver 'raw' does not support bitmaps ++ ++=== Convert with bitmap preservation === ++ ++image: TEST_DIR/t.IMGFMT ++file format: IMGFMT ++virtual size: 10 MiB (10485760 bytes) ++disk size: 4.39 MiB ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ bitmaps: ++ [0]: ++ flags: ++ name: b1 ++ granularity: 524288 ++ [1]: ++ flags: ++ [0]: auto ++ name: b2 ++ granularity: 65536 ++ refcount bits: 16 ++ corrupt: false ++image: TEST_DIR/t.IMGFMT ++file format: IMGFMT ++virtual size: 10 MiB (10485760 bytes) ++disk size: 4.48 MiB ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ bitmaps: ++ [0]: ++ flags: ++ name: b1 ++ granularity: 524288 ++ [1]: ++ flags: ++ [0]: auto ++ name: b2 ++ granularity: 65536 ++ [2]: ++ flags: ++ name: b0 ++ granularity: 65536 ++ refcount bits: 16 ++ corrupt: false ++ ++=== Check bitmap contents === ++ ++[{ "start": 0, "length": 3145728, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 3145728, "length": 1048576, "depth": 0, "zero": false, "data": false}, ++{ "start": 4194304, "length": 6291456, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] ++[{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 1048576, "length": 1048576, "depth": 0, "zero": false, "data": false}, ++{ "start": 2097152, "length": 8388608, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] ++[{ "start": 0, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, ++{ "start": 2097152, "length": 1048576, "depth": 0, "zero": false, "data": false}, ++{ "start": 3145728, "length": 7340032, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] ++*** done +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 9c565cf..033b54d 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -290,3 +290,4 @@ + 280 rw migration quick + 281 rw quick + 284 rw ++291 rw quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Add-test-for-image-creation-fallback.patch b/kvm-iotests-Add-test-for-image-creation-fallback.patch new file mode 100755 index 0000000000000000000000000000000000000000..a8ea8f7017f3f14439aeee9821a36165046f93d9 --- /dev/null +++ b/kvm-iotests-Add-test-for-image-creation-fallback.patch @@ -0,0 +1,138 @@ +From 55f3a02574da226299d99bd74d12dd91b0f228dc Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:46 +0000 +Subject: [PATCH 05/20] iotests: Add test for image creation fallback + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-6-mlevitsk@redhat.com> +Patchwork-id: 94228 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 5/6] iotests: Add test for image creation fallback +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-6-mreitz@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Maxim Levitsky +[mreitz: Added a note that NBD does not support resizing, which is why + the second case is expected to fail] +Signed-off-by: Max Reitz +(cherry picked from commit 4dddeac115c5a2c5f74731fda0afd031a0b45490) +Signed-off-by: Maxim Levitsky + +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/259 | 62 ++++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/259.out | 14 +++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 77 insertions(+) + create mode 100755 tests/qemu-iotests/259 + create mode 100644 tests/qemu-iotests/259.out + +diff --git a/tests/qemu-iotests/259 b/tests/qemu-iotests/259 +new file mode 100755 +index 0000000..62e29af +--- /dev/null ++++ b/tests/qemu-iotests/259 +@@ -0,0 +1,62 @@ ++#!/usr/bin/env bash ++# ++# Test generic image creation fallback (by using NBD) ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++# creator ++owner=mreitz@redhat.com ++ ++seq=$(basename $0) ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++. ./common.rc ++. ./common.filter ++ ++_supported_fmt raw ++_supported_proto nbd ++_supported_os Linux ++ ++ ++_make_test_img 64M ++ ++echo ++echo '--- Testing creation ---' ++ ++$QEMU_IMG create -f qcow2 "$TEST_IMG" 64M | _filter_img_create ++$QEMU_IMG info "$TEST_IMG" | _filter_img_info ++ ++echo ++echo '--- Testing creation for which the node would need to grow ---' ++ ++# NBD does not support resizing, so this will fail ++$QEMU_IMG create -f qcow2 -o preallocation=metadata "$TEST_IMG" 64M 2>&1 \ ++ | _filter_img_create ++ ++# success, all done ++echo "*** done" ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/259.out b/tests/qemu-iotests/259.out +new file mode 100644 +index 0000000..ffed19c +--- /dev/null ++++ b/tests/qemu-iotests/259.out +@@ -0,0 +1,14 @@ ++QA output created by 259 ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 ++ ++--- Testing creation --- ++Formatting 'TEST_DIR/t.IMGFMT', fmt=qcow2 size=67108864 ++image: TEST_DIR/t.IMGFMT ++file format: qcow2 ++virtual size: 64 MiB (67108864 bytes) ++disk size: unavailable ++ ++--- Testing creation for which the node would need to grow --- ++qemu-img: TEST_DIR/t.IMGFMT: Could not resize image: Image format driver does not support resize ++Formatting 'TEST_DIR/t.IMGFMT', fmt=qcow2 size=67108864 preallocation=metadata ++*** done +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index c0e8197..e47cbfc 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -273,6 +273,7 @@ + 256 rw quick + 257 rw + 258 rw quick ++259 rw auto quick + 260 rw quick + 261 rw + 262 rw quick migration +-- +1.8.3.1 + diff --git a/kvm-iotests-Backup-with-different-source-target-size.patch b/kvm-iotests-Backup-with-different-source-target-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..40084137db1dc885ce3cb5ccc2485efcd2bb1396 --- /dev/null +++ b/kvm-iotests-Backup-with-different-source-target-size.patch @@ -0,0 +1,105 @@ +From 456c5e79c32e3f2f9319a7d1452fe523aded7835 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:21 +0100 +Subject: [PATCH 22/26] iotests: Backup with different source/target size + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-8-kwolf@redhat.com> +Patchwork-id: 97106 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 07/11] iotests: Backup with different source/target size +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +This tests that the backup job catches situations where the target node +has a different size than the source node. It must also forbid resize +operations when the job is already running. + +Signed-off-by: Kevin Wolf +Message-Id: <20200430142755.315494-5-kwolf@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 0a82a9273062d05764e3df3637b3aa95ad8291c6) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/055 | 42 ++++++++++++++++++++++++++++++++++++++++-- + tests/qemu-iotests/055.out | 4 ++-- + 2 files changed, 42 insertions(+), 4 deletions(-) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index c9cdc06..1c70389 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -48,8 +48,10 @@ class TestSingleDrive(iotests.QMPTestCase): + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, blockdev_target_img, str(image_len)) + +- self.vm = iotests.VM().add_drive('blkdebug::' + test_img) +- self.vm.add_drive(blockdev_target_img, interface="none") ++ self.vm = iotests.VM() ++ self.vm.add_drive('blkdebug::' + test_img, 'node-name=source') ++ self.vm.add_drive(blockdev_target_img, 'node-name=target', ++ interface="none") + if iotests.qemu_default_machine == 'pc': + self.vm.add_drive(None, 'media=cdrom', 'ide') + self.vm.launch() +@@ -112,6 +114,42 @@ class TestSingleDrive(iotests.QMPTestCase): + def test_pause_blockdev_backup(self): + self.do_test_pause('blockdev-backup', 'drive1', blockdev_target_img) + ++ def do_test_resize_blockdev_backup(self, device, node): ++ def pre_finalize(): ++ result = self.vm.qmp('block_resize', device=device, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp('block_resize', node_name=node, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp('blockdev-backup', job_id='job0', device='drive0', ++ target='drive1', sync='full', auto_finalize=False, ++ auto_dismiss=False) ++ self.assert_qmp(result, 'return', {}) ++ ++ self.vm.run_job('job0', auto_finalize=False, pre_finalize=pre_finalize, ++ use_log=False) ++ ++ def test_source_resize_blockdev_backup(self): ++ self.do_test_resize_blockdev_backup('drive0', 'source') ++ ++ def test_target_resize_blockdev_backup(self): ++ self.do_test_resize_blockdev_backup('drive1', 'target') ++ ++ def do_test_target_size(self, size): ++ result = self.vm.qmp('block_resize', device='drive1', size=size) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.qmp('blockdev-backup', job_id='job0', device='drive0', ++ target='drive1', sync='full') ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ def test_small_target(self): ++ self.do_test_target_size(image_len // 2) ++ ++ def test_large_target(self): ++ self.do_test_target_size(image_len * 2) ++ + def test_medium_not_found(self): + if iotests.qemu_default_machine != 'pc': + return +diff --git a/tests/qemu-iotests/055.out b/tests/qemu-iotests/055.out +index 5c26d15..0a5e958 100644 +--- a/tests/qemu-iotests/055.out ++++ b/tests/qemu-iotests/055.out +@@ -1,5 +1,5 @@ +-.................................... ++........................................ + ---------------------------------------------------------------------- +-Ran 36 tests ++Ran 40 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Create-VM.blockdev_create.patch b/kvm-iotests-Create-VM.blockdev_create.patch new file mode 100755 index 0000000000000000000000000000000000000000..805b31ac97f8310c75ecc9fabce7a71b4ffa5a42 --- /dev/null +++ b/kvm-iotests-Create-VM.blockdev_create.patch @@ -0,0 +1,59 @@ +From 05fedde1374abb180cd2b51457385d8128aa7fe4 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:00 +0000 +Subject: [PATCH 03/18] iotests: Create VM.blockdev_create() + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-3-kwolf@redhat.com> +Patchwork-id: 93748 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 2/6] iotests: Create VM.blockdev_create() +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +We have several almost identical copies of a blockdev_create() function +in different test cases. Time to create one unified function in +iotests.py. + +To keep the diff managable, this patch only creates the function and +follow-up patches will convert the individual test cases. + +Signed-off-by: Kevin Wolf +(cherry picked from commit e9dbd1cae86f7cb6f8e470e1485aeb0c6e23ae64) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 3cff671..5741efb 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -638,6 +638,22 @@ class VM(qtest.QEMUQtestMachine): + elif status == 'null': + return error + ++ # Returns None on success, and an error string on failure ++ def blockdev_create(self, options, job_id='job0', filters=None): ++ if filters is None: ++ filters = [filter_qmp_testfiles] ++ result = self.qmp_log('blockdev-create', filters=filters, ++ job_id=job_id, options=options) ++ ++ if 'return' in result: ++ assert result['return'] == {} ++ job_result = self.run_job(job_id) ++ else: ++ job_result = result['error'] ++ ++ log("") ++ return job_result ++ + def enable_migration_events(self, name): + log('Enabling migration QMP events on %s...' % name) + log(self.qmp('migrate-set-capabilities', capabilities=[ +-- +1.8.3.1 + diff --git a/kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch b/kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch new file mode 100755 index 0000000000000000000000000000000000000000..60c08ec9308d30f1585d2e8ec85a8fa82182d4fa --- /dev/null +++ b/kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch @@ -0,0 +1,52 @@ +From 8dc8a17d4e98aae41db01cbc073e69de44291b63 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:38 +0100 +Subject: [PATCH 10/17] iotests: Filter testfiles out in filter_img_info() + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-10-kwolf@redhat.com> +Patchwork-id: 97455 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 09/11] iotests: Filter testfiles out in filter_img_info() +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +We want to keep TEST_IMG for the full path of the main test image, but +filter_testfiles() must be called for other test images before replacing +other things like the image format because the test directory path could +contain the format as a substring. + +Insert a filter_testfiles() call between both. + +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200424125448.63318-9-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit fd586ce8bee50d98773436214dc9e644ddda54aa) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 7a9c779..cd5df36 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -335,8 +335,9 @@ def filter_img_info(output, filename): + for line in output.split('\n'): + if 'disk size' in line or 'actual-size' in line: + continue +- line = line.replace(filename, 'TEST_IMG') \ +- .replace(imgfmt, 'IMGFMT') ++ line = line.replace(filename, 'TEST_IMG') ++ line = filter_testfiles(line) ++ line = line.replace(imgfmt, 'IMGFMT') + line = re.sub('iters: [0-9]+', 'iters: XXX', line) + line = re.sub('uuid: [-a-f0-9]+', 'uuid: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX', line) + line = re.sub('cid: [0-9]+', 'cid: XXXXXXXXXX', line) +-- +1.8.3.1 + diff --git a/kvm-iotests-Fix-run_job-with-use_log-False.patch b/kvm-iotests-Fix-run_job-with-use_log-False.patch new file mode 100755 index 0000000000000000000000000000000000000000..b105fc2701b7472d4c0f568951dfd3944ffbb7a6 --- /dev/null +++ b/kvm-iotests-Fix-run_job-with-use_log-False.patch @@ -0,0 +1,47 @@ +From bb7b968a02c97564596b73d8d080cd745d96ed6b Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:35 +0000 +Subject: [PATCH 15/20] iotests: Fix run_job() with use_log=False + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-10-kwolf@redhat.com> +Patchwork-id: 94284 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 09/13] iotests: Fix run_job() with use_log=False +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +The 'job-complete' QMP command should be run with qmp() rather than +qmp_log() if use_log=False is passed. + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-4-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit b31b532122ec6f68d17168449c034d2197bf96ec) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 0c55f7b..46f880c 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -618,7 +618,10 @@ class VM(qtest.QEMUQtestMachine): + if use_log: + log('Job failed: %s' % (j['error'])) + elif status == 'ready': +- self.qmp_log('job-complete', id=job) ++ if use_log: ++ self.qmp_log('job-complete', id=job) ++ else: ++ self.qmp('job-complete', id=job) + elif status == 'pending' and not auto_finalize: + if pre_finalize: + pre_finalize() +-- +1.8.3.1 + diff --git a/kvm-iotests-Fix-test-178.patch b/kvm-iotests-Fix-test-178.patch new file mode 100755 index 0000000000000000000000000000000000000000..5e54daaf8e32d94cdb1e8640b2f708aace766cae --- /dev/null +++ b/kvm-iotests-Fix-test-178.patch @@ -0,0 +1,59 @@ +From a04d324e41a40a6893bc94109994afc017f17192 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:16 +0100 +Subject: [PATCH 11/26] iotests: Fix test 178 + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-9-eblake@redhat.com> +Patchwork-id: 97075 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 08/12] iotests: Fix test 178 +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +A recent change to qemu-img changed expected error message output, but +178 takes long enough to execute that it does not get run by 'make +check' or './check -g quick'. + +Fixes: 43d589b074 +Signed-off-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200521192137.1120211-2-eblake@redhat.com> +(cherry picked from commit ca01b7a641527052e3e8961845b40b81706ce5f9) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/178.out.qcow2 | 2 +- + tests/qemu-iotests/178.out.raw | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2 +index 9e7d8c4..345eab3 100644 +--- a/tests/qemu-iotests/178.out.qcow2 ++++ b/tests/qemu-iotests/178.out.qcow2 +@@ -13,7 +13,7 @@ qemu-img: Invalid option list: , + qemu-img: Invalid parameter 'snapshot.foo' + qemu-img: Failed in parsing snapshot param 'snapshot.foo' + qemu-img: --output must be used with human or json as argument. +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. + qemu-img: Unknown file format 'foo' + + == Size calculation for a new file (human) == +diff --git a/tests/qemu-iotests/178.out.raw b/tests/qemu-iotests/178.out.raw +index 6478365..15da915 100644 +--- a/tests/qemu-iotests/178.out.raw ++++ b/tests/qemu-iotests/178.out.raw +@@ -13,7 +13,7 @@ qemu-img: Invalid option list: , + qemu-img: Invalid parameter 'snapshot.foo' + qemu-img: Failed in parsing snapshot param 'snapshot.foo' + qemu-img: --output must be used with human or json as argument. +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. + qemu-img: Unknown file format 'foo' + + == Size calculation for a new file (human) == +-- +1.8.3.1 + diff --git a/kvm-iotests-Let-_make_test_img-parse-its-parameters.patch b/kvm-iotests-Let-_make_test_img-parse-its-parameters.patch new file mode 100755 index 0000000000000000000000000000000000000000..d24f5e7635d827d228b244c3d57f0ddf4a46a390 --- /dev/null +++ b/kvm-iotests-Let-_make_test_img-parse-its-parameters.patch @@ -0,0 +1,91 @@ +From 3c96dbd74fb67e2ae1a116b2771290b192041707 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:10 +0100 +Subject: [PATCH 05/26] iotests: Let _make_test_img parse its parameters + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-3-eblake@redhat.com> +Patchwork-id: 97070 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 02/12] iotests: Let _make_test_img parse its parameters +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +From: Max Reitz + +This will allow us to add more options than just -b. + +Signed-off-by: Max Reitz +Reviewed-by: Maxim Levitsky +Message-id: 20191107163708.833192-9-mreitz@redhat.com +Signed-off-by: Max Reitz +(cherry picked from commit eea871d047701b563cfd66c1566b9ff6d163882b) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/common.rc | 28 ++++++++++++++++++++-------- + 1 file changed, 20 insertions(+), 8 deletions(-) + +diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc +index 0cc8acc..99fef4d 100644 +--- a/tests/qemu-iotests/common.rc ++++ b/tests/qemu-iotests/common.rc +@@ -302,12 +302,12 @@ _make_test_img() + # extra qemu-img options can be added by tests + # at least one argument (the image size) needs to be added + local extra_img_options="" +- local image_size=$* + local optstr="" + local img_name="" + local use_backing=0 + local backing_file="" + local object_options="" ++ local misc_params=() + + if [ -n "$TEST_IMG_FILE" ]; then + img_name=$TEST_IMG_FILE +@@ -323,11 +323,23 @@ _make_test_img() + optstr=$(_optstr_add "$optstr" "key-secret=keysec0") + fi + +- if [ "$1" = "-b" ]; then +- use_backing=1 +- backing_file=$2 +- image_size=$3 +- fi ++ for param; do ++ if [ "$use_backing" = "1" -a -z "$backing_file" ]; then ++ backing_file=$param ++ continue ++ fi ++ ++ case "$param" in ++ -b) ++ use_backing=1 ++ ;; ++ ++ *) ++ misc_params=("${misc_params[@]}" "$param") ++ ;; ++ esac ++ done ++ + if [ \( "$IMGFMT" = "qcow2" -o "$IMGFMT" = "qed" \) -a -n "$CLUSTER_SIZE" ]; then + optstr=$(_optstr_add "$optstr" "cluster_size=$CLUSTER_SIZE") + fi +@@ -343,9 +355,9 @@ _make_test_img() + # XXX(hch): have global image options? + ( + if [ $use_backing = 1 ]; then +- $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options -b "$backing_file" "$img_name" $image_size 2>&1 ++ $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options -b "$backing_file" "$img_name" "${misc_params[@]}" 2>&1 + else +- $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options "$img_name" $image_size 2>&1 ++ $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options "$img_name" "${misc_params[@]}" 2>&1 + fi + ) | _filter_img_create + +-- +1.8.3.1 + diff --git a/kvm-iotests-Mirror-with-different-source-target-size.patch b/kvm-iotests-Mirror-with-different-source-target-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..7757632679941eb2ec2bf0124e7ef658f835b642 --- /dev/null +++ b/kvm-iotests-Mirror-with-different-source-target-size.patch @@ -0,0 +1,110 @@ +From aff543186ff316d66b2c7acb434c6c17bdb8da78 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:25 +0100 +Subject: [PATCH 26/26] iotests: Mirror with different source/target size + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-12-kwolf@redhat.com> +Patchwork-id: 97109 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 11/11] iotests: Mirror with different source/target size +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +This tests that the mirror job catches situations where the target node +has a different size than the source node. It must also forbid resize +operations when the job is already running. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200511135825.219437-5-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit 16cea4ee1c8e5a69a058e76f426b2e17974d8d7d) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/041 | 45 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/041.out | 4 ++-- + 2 files changed, 47 insertions(+), 2 deletions(-) + +diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 +index a543b15..20fb68a 100755 +--- a/tests/qemu-iotests/041 ++++ b/tests/qemu-iotests/041 +@@ -240,6 +240,49 @@ class TestSingleBlockdev(TestSingleDrive): + target=self.qmp_target) + self.assert_qmp(result, 'error/class', 'GenericError') + ++ def do_test_resize(self, device, node): ++ def pre_finalize(): ++ if device: ++ result = self.vm.qmp('block_resize', device=device, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp('block_resize', node_name=node, size=65536) ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ result = self.vm.qmp(self.qmp_cmd, job_id='job0', device='drive0', ++ sync='full', target=self.qmp_target, ++ auto_finalize=False, auto_dismiss=False) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.run_job('job0', auto_finalize=False, ++ pre_finalize=pre_finalize, use_log=False) ++ self.assertEqual(result, None) ++ ++ def test_source_resize(self): ++ self.do_test_resize('drive0', 'top') ++ ++ def test_target_resize(self): ++ self.do_test_resize(None, self.qmp_target) ++ ++ def do_test_target_size(self, size): ++ result = self.vm.qmp('block_resize', node_name=self.qmp_target, ++ size=size) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.qmp(self.qmp_cmd, job_id='job0', ++ device='drive0', sync='full', auto_dismiss=False, ++ target=self.qmp_target) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm.run_job('job0', use_log=False) ++ self.assertEqual(result, 'Source and target image have different sizes') ++ ++ def test_small_target(self): ++ self.do_test_target_size(self.image_len // 2) ++ ++ def test_large_target(self): ++ self.do_test_target_size(self.image_len * 2) ++ + test_large_cluster = None + test_image_not_found = None + test_small_buffer2 = None +@@ -251,6 +294,8 @@ class TestSingleDriveZeroLength(TestSingleDrive): + + class TestSingleBlockdevZeroLength(TestSingleBlockdev): + image_len = 0 ++ test_small_target = None ++ test_large_target = None + + class TestSingleDriveUnalignedLength(TestSingleDrive): + image_len = 1025 * 1024 +diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out +index 2c448b4..3ea6aa4 100644 +--- a/tests/qemu-iotests/041.out ++++ b/tests/qemu-iotests/041.out +@@ -1,5 +1,5 @@ +-.......................................................................................... ++.................................................................................................... + ---------------------------------------------------------------------- +-Ran 90 tests ++Ran 100 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch b/kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch new file mode 100755 index 0000000000000000000000000000000000000000..17e4a41aa98943ebe4a97c07931b598294234a7e --- /dev/null +++ b/kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch @@ -0,0 +1,122 @@ +From 7e23b64dc20b64ca6fa887cd06cc5e52374f6268 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:30 +0000 +Subject: [PATCH 10/20] iotests: Refactor blockdev-reopen test for iothreads + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-5-kwolf@redhat.com> +Patchwork-id: 94281 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 04/13] iotests: Refactor blockdev-reopen test for iothreads +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +We'll want to test more than one successful case in the future, so +prepare the test for that by a refactoring that runs each scenario in a +separate VM. + +test_iothreads_switch_{backing,overlay} currently produce errors, but +these are cases that should actually work, by switching either the +backing file node or the overlay node to the AioContext of the other +node. + +Signed-off-by: Kevin Wolf +Tested-by: Peter Krempa +Message-Id: <20200306141413.30705-2-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 97518e11c3d902a32386d33797044f6b79bccc6f) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/245 | 47 ++++++++++++++++++++++++++++++++++++---------- + tests/qemu-iotests/245.out | 4 ++-- + 2 files changed, 39 insertions(+), 12 deletions(-) + +diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 +index e66a23c..f69c2fa 100644 +--- a/tests/qemu-iotests/245 ++++ b/tests/qemu-iotests/245 +@@ -968,8 +968,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): + self.assertEqual(self.get_node('hd1'), None) + self.assert_qmp(self.get_node('hd2'), 'ro', True) + +- # We don't allow setting a backing file that uses a different AioContext +- def test_iothreads(self): ++ def run_test_iothreads(self, iothread_a, iothread_b, errmsg = None): + opts = hd_opts(0) + result = self.vm.qmp('blockdev-add', conv_keys = False, **opts) + self.assert_qmp(result, 'return', {}) +@@ -984,20 +983,48 @@ class TestBlockdevReopen(iotests.QMPTestCase): + result = self.vm.qmp('object-add', qom_type='iothread', id='iothread1') + self.assert_qmp(result, 'return', {}) + +- result = self.vm.qmp('x-blockdev-set-iothread', node_name='hd0', iothread='iothread0') ++ result = self.vm.qmp('device_add', driver='virtio-scsi', id='scsi0', ++ iothread=iothread_a) + self.assert_qmp(result, 'return', {}) + +- self.reopen(opts, {'backing': 'hd2'}, "Cannot use a new backing file with a different AioContext") +- +- result = self.vm.qmp('x-blockdev-set-iothread', node_name='hd2', iothread='iothread1') ++ result = self.vm.qmp('device_add', driver='virtio-scsi', id='scsi1', ++ iothread=iothread_b) + self.assert_qmp(result, 'return', {}) + +- self.reopen(opts, {'backing': 'hd2'}, "Cannot use a new backing file with a different AioContext") ++ if iothread_a: ++ result = self.vm.qmp('device_add', driver='scsi-hd', drive='hd0', ++ share_rw=True, bus="scsi0.0") ++ self.assert_qmp(result, 'return', {}) + +- result = self.vm.qmp('x-blockdev-set-iothread', node_name='hd2', iothread='iothread0') +- self.assert_qmp(result, 'return', {}) ++ if iothread_b: ++ result = self.vm.qmp('device_add', driver='scsi-hd', drive='hd2', ++ share_rw=True, bus="scsi1.0") ++ self.assert_qmp(result, 'return', {}) + +- self.reopen(opts, {'backing': 'hd2'}) ++ # Attaching the backing file may or may not work ++ self.reopen(opts, {'backing': 'hd2'}, errmsg) ++ ++ # But removing the backing file should always work ++ self.reopen(opts, {'backing': None}) ++ ++ self.vm.shutdown() ++ ++ # We don't allow setting a backing file that uses a different AioContext if ++ # neither of them can switch to the other AioContext ++ def test_iothreads_error(self): ++ self.run_test_iothreads('iothread0', 'iothread1', ++ "Cannot use a new backing file with a different AioContext") ++ ++ def test_iothreads_compatible_users(self): ++ self.run_test_iothreads('iothread0', 'iothread0') ++ ++ def test_iothreads_switch_backing(self): ++ self.run_test_iothreads('iothread0', None, ++ "Cannot use a new backing file with a different AioContext") ++ ++ def test_iothreads_switch_overlay(self): ++ self.run_test_iothreads(None, 'iothread0', ++ "Cannot use a new backing file with a different AioContext") + + if __name__ == '__main__': + iotests.main(supported_fmts=["qcow2"], +diff --git a/tests/qemu-iotests/245.out b/tests/qemu-iotests/245.out +index a19de52..682b933 100644 +--- a/tests/qemu-iotests/245.out ++++ b/tests/qemu-iotests/245.out +@@ -1,6 +1,6 @@ +-.................. ++..................... + ---------------------------------------------------------------------- +-Ran 18 tests ++Ran 21 tests + + OK + {"execute": "job-finalize", "arguments": {"id": "commit0"}} +-- +1.8.3.1 + diff --git a/kvm-iotests-Support-job-complete-in-run_job.patch b/kvm-iotests-Support-job-complete-in-run_job.patch new file mode 100755 index 0000000000000000000000000000000000000000..08971a078565ce2a29f4ec3b96da5d6df309c912 --- /dev/null +++ b/kvm-iotests-Support-job-complete-in-run_job.patch @@ -0,0 +1,46 @@ +From a3778aef0be61dead835af39073a62bbf72c8e20 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:23:59 +0000 +Subject: [PATCH 02/18] iotests: Support job-complete in run_job() + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-2-kwolf@redhat.com> +Patchwork-id: 93746 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 1/6] iotests: Support job-complete in run_job() +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Automatically complete jobs that have a 'ready' state and need an +explicit job-complete. Without this, run_job() would hang for such +jobs. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Reviewed-by: Alberto Garcia +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 4688c4e32ec76004676470f11734478799673d6d) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/iotests.py | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index df07089..3cff671 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -617,6 +617,8 @@ class VM(qtest.QEMUQtestMachine): + error = j['error'] + if use_log: + log('Job failed: %s' % (j['error'])) ++ elif status == 'ready': ++ self.qmp_log('job-complete', id=job) + elif status == 'pending' and not auto_finalize: + if pre_finalize: + pre_finalize() +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-committing-to-short-backing-file.patch b/kvm-iotests-Test-committing-to-short-backing-file.patch new file mode 100755 index 0000000000000000000000000000000000000000..fbbaac67a49d5bc47783ccaf04a252dd231f8f01 --- /dev/null +++ b/kvm-iotests-Test-committing-to-short-backing-file.patch @@ -0,0 +1,480 @@ +From e2a1b3fd32be8bb730656a6f22eb4f543b120c9d Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:39 +0100 +Subject: [PATCH 11/17] iotests: Test committing to short backing file + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-11-kwolf@redhat.com> +Patchwork-id: 97453 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 10/11] iotests: Test committing to short backing file +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +Signed-off-by: Kevin Wolf +Message-Id: <20200424125448.63318-10-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit bf03dede475e29a16f9188ea85a4d77cd3dcf2b7) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/274 | 155 ++++++++++++++++++++++++++ + tests/qemu-iotests/274.out | 268 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 424 insertions(+) + create mode 100755 tests/qemu-iotests/274 + create mode 100644 tests/qemu-iotests/274.out + +diff --git a/tests/qemu-iotests/274 b/tests/qemu-iotests/274 +new file mode 100755 +index 0000000..e951f72 +--- /dev/null ++++ b/tests/qemu-iotests/274 +@@ -0,0 +1,155 @@ ++#!/usr/bin/env python3 ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++# Creator/Owner: Kevin Wolf ++# ++# Some tests for short backing files and short overlays ++ ++import iotests ++ ++iotests.verify_image_format(supported_fmts=['qcow2']) ++iotests.verify_platform(['linux']) ++ ++size_short = 1 * 1024 * 1024 ++size_long = 2 * 1024 * 1024 ++size_diff = size_long - size_short ++ ++def create_chain() -> None: ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, base, ++ str(size_long)) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, mid, ++ str(size_short)) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', mid, top, ++ str(size_long)) ++ ++ iotests.qemu_io_log('-c', 'write -P 1 0 %d' % size_long, base) ++ ++def create_vm() -> iotests.VM: ++ vm = iotests.VM() ++ vm.add_blockdev('file,filename=%s,node-name=base-file' % base) ++ vm.add_blockdev('%s,file=base-file,node-name=base' % iotests.imgfmt) ++ vm.add_blockdev('file,filename=%s,node-name=mid-file' % mid) ++ vm.add_blockdev('%s,file=mid-file,node-name=mid,backing=base' ++ % iotests.imgfmt) ++ vm.add_drive(top, 'backing=mid,node-name=top') ++ return vm ++ ++with iotests.FilePath('base') as base, \ ++ iotests.FilePath('mid') as mid, \ ++ iotests.FilePath('top') as top: ++ ++ iotests.log('== Commit tests ==') ++ ++ create_chain() ++ ++ iotests.log('=== Check visible data ===') ++ ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, top) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), top) ++ ++ iotests.log('=== Checking allocation status ===') ++ ++ iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short, ++ '-c', 'alloc %d %d' % (size_short, size_diff), ++ base) ++ ++ iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short, ++ '-c', 'alloc %d %d' % (size_short, size_diff), ++ mid) ++ ++ iotests.qemu_io_log('-c', 'alloc 0 %d' % size_short, ++ '-c', 'alloc %d %d' % (size_short, size_diff), ++ top) ++ ++ iotests.log('=== Checking map ===') ++ ++ iotests.qemu_img_log('map', '--output=json', base) ++ iotests.qemu_img_log('map', '--output=human', base) ++ iotests.qemu_img_log('map', '--output=json', mid) ++ iotests.qemu_img_log('map', '--output=human', mid) ++ iotests.qemu_img_log('map', '--output=json', top) ++ iotests.qemu_img_log('map', '--output=human', top) ++ ++ iotests.log('=== Testing qemu-img commit (top -> mid) ===') ++ ++ iotests.qemu_img_log('commit', top) ++ iotests.img_info_log(mid) ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid) ++ ++ iotests.log('=== Testing HMP commit (top -> mid) ===') ++ ++ create_chain() ++ with create_vm() as vm: ++ vm.launch() ++ vm.qmp_log('human-monitor-command', command_line='commit drive0') ++ ++ iotests.img_info_log(mid) ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid) ++ ++ iotests.log('=== Testing QMP active commit (top -> mid) ===') ++ ++ create_chain() ++ with create_vm() as vm: ++ vm.launch() ++ vm.qmp_log('block-commit', device='top', base_node='mid', ++ job_id='job0', auto_dismiss=False) ++ vm.run_job('job0', wait=5) ++ ++ iotests.img_info_log(mid) ++ iotests.qemu_io_log('-c', 'read -P 1 0 %d' % size_short, mid) ++ iotests.qemu_io_log('-c', 'read -P 0 %d %d' % (size_short, size_diff), mid) ++ ++ ++ iotests.log('== Resize tests ==') ++ ++ # Use different sizes for different allocation modes: ++ # ++ # We want to have at least one test where 32 bit truncation in the size of ++ # the overlapping area becomes visible. This is covered by the ++ # prealloc='off' case (1G to 6G is an overlap of 5G). ++ # ++ # However, we can only do this for modes that don't preallocate data ++ # because otherwise we might run out of space on the test host. ++ # ++ # We also want to test some unaligned combinations. ++ for (prealloc, base_size, top_size_old, top_size_new, off) in [ ++ ('off', '6G', '1G', '8G', '5G'), ++ ('metadata', '32G', '30G', '33G', '31G'), ++ ('falloc', '10M', '5M', '15M', '9M'), ++ ('full', '16M', '8M', '12M', '11M'), ++ ('off', '384k', '253k', '512k', '253k'), ++ ('off', '400k', '256k', '512k', '336k'), ++ ('off', '512k', '256k', '500k', '436k')]: ++ ++ iotests.log('=== preallocation=%s ===' % prealloc) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, base, base_size) ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, '-b', base, top, ++ top_size_old) ++ iotests.qemu_io_log('-c', 'write -P 1 %s 64k' % off, base) ++ ++ # After this, top_size_old to base_size should be allocated/zeroed. ++ # ++ # In theory, leaving base_size to top_size_new unallocated would be ++ # correct, but in practice, if we zero out anything, we zero out ++ # everything up to top_size_new. ++ iotests.qemu_img_log('resize', '-f', iotests.imgfmt, ++ '--preallocation', prealloc, top, top_size_new) ++ iotests.qemu_io_log('-c', 'read -P 0 %s 64k' % off, top) ++ iotests.qemu_io_log('-c', 'map', top) ++ iotests.qemu_img_log('map', '--output=json', top) +diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out +new file mode 100644 +index 0000000..1a796fd +--- /dev/null ++++ b/tests/qemu-iotests/274.out +@@ -0,0 +1,268 @@ ++== Commit tests == ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 2097152/2097152 bytes at offset 0 ++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Check visible data === ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Checking allocation status === ++1048576/1048576 bytes allocated at offset 0 bytes ++1048576/1048576 bytes allocated at offset 1 MiB ++ ++0/1048576 bytes allocated at offset 0 bytes ++0/0 bytes allocated at offset 1 MiB ++ ++0/1048576 bytes allocated at offset 0 bytes ++0/1048576 bytes allocated at offset 1 MiB ++ ++=== Checking map === ++[{ "start": 0, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": 327680}] ++ ++Offset Length Mapped to File ++0 0x200000 0x50000 TEST_DIR/PID-base ++ ++[{ "start": 0, "length": 1048576, "depth": 1, "zero": false, "data": true, "offset": 327680}] ++ ++Offset Length Mapped to File ++0 0x100000 0x50000 TEST_DIR/PID-base ++ ++[{ "start": 0, "length": 1048576, "depth": 2, "zero": false, "data": true, "offset": 327680}, ++{ "start": 1048576, "length": 1048576, "depth": 0, "zero": true, "data": false}] ++ ++Offset Length Mapped to File ++0 0x100000 0x50000 TEST_DIR/PID-base ++ ++=== Testing qemu-img commit (top -> mid) === ++Image committed. ++ ++image: TEST_IMG ++file format: IMGFMT ++virtual size: 2 MiB (2097152 bytes) ++cluster_size: 65536 ++backing file: TEST_DIR/PID-base ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ refcount bits: 16 ++ corrupt: false ++ ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Testing HMP commit (top -> mid) === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 2097152/2097152 bytes at offset 0 ++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++{"execute": "human-monitor-command", "arguments": {"command-line": "commit drive0"}} ++{"return": ""} ++image: TEST_IMG ++file format: IMGFMT ++virtual size: 2 MiB (2097152 bytes) ++cluster_size: 65536 ++backing file: TEST_DIR/PID-base ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ refcount bits: 16 ++ corrupt: false ++ ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++=== Testing QMP active commit (top -> mid) === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=2097152 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-mid', fmt=qcow2 size=1048576 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=2097152 backing_file=TEST_DIR/PID-mid cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 2097152/2097152 bytes at offset 0 ++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++{"execute": "block-commit", "arguments": {"auto-dismiss": false, "base-node": "mid", "device": "top", "job-id": "job0"}} ++{"return": {}} ++{"execute": "job-complete", "arguments": {"id": "job0"}} ++{"return": {}} ++{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} ++image: TEST_IMG ++file format: IMGFMT ++virtual size: 2 MiB (2097152 bytes) ++cluster_size: 65536 ++backing file: TEST_DIR/PID-base ++Format specific information: ++ compat: 1.1 ++ lazy refcounts: false ++ refcount bits: 16 ++ corrupt: false ++ ++read 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++read 1048576/1048576 bytes at offset 1048576 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== Resize tests == ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=6442450944 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=1073741824 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 5368709120 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 5368709120 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++1 GiB (0x40000000) bytes not allocated at offset 0 bytes (0x0) ++7 GiB (0x1c0000000) bytes allocated at offset 1 GiB (0x40000000) ++ ++[{ "start": 0, "length": 1073741824, "depth": 1, "zero": true, "data": false}, ++{ "start": 1073741824, "length": 7516192768, "depth": 0, "zero": true, "data": false}] ++ ++=== preallocation=metadata === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=34359738368 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=32212254720 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 33285996544 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 33285996544 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++30 GiB (0x780000000) bytes not allocated at offset 0 bytes (0x0) ++3 GiB (0xc0000000) bytes allocated at offset 30 GiB (0x780000000) ++ ++[{ "start": 0, "length": 32212254720, "depth": 1, "zero": true, "data": false}, ++{ "start": 32212254720, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 327680}, ++{ "start": 32749125632, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 537264128}, ++{ "start": 33285996544, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1074200576}, ++{ "start": 33822867456, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 1611137024}, ++{ "start": 34359738368, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2148139008}, ++{ "start": 34896609280, "length": 536870912, "depth": 0, "zero": true, "data": false, "offset": 2685075456}] ++ ++=== preallocation=falloc === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=10485760 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=5242880 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 9437184 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 9437184 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++5 MiB (0x500000) bytes not allocated at offset 0 bytes (0x0) ++10 MiB (0xa00000) bytes allocated at offset 5 MiB (0x500000) ++ ++[{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false}, ++{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++ ++=== preallocation=full === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=8388608 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 11534336 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 11534336 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++8 MiB (0x800000) bytes not allocated at offset 0 bytes (0x0) ++4 MiB (0x400000) bytes allocated at offset 8 MiB (0x800000) ++ ++[{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false}, ++{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++ ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=259072 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 259072 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 259072 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++192 KiB (0x30000) bytes not allocated at offset 0 bytes (0x0) ++320 KiB (0x50000) bytes allocated at offset 192 KiB (0x30000) ++ ++[{ "start": 0, "length": 196608, "depth": 1, "zero": true, "data": false}, ++{ "start": 196608, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, ++{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}] ++ ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=409600 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 344064 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 344064 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0) ++256 KiB (0x40000) bytes allocated at offset 256 KiB (0x40000) ++ ++[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false}, ++{ "start": 262144, "length": 262144, "depth": 0, "zero": true, "data": false}] ++ ++=== preallocation=off === ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=524288 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++Formatting 'TEST_DIR/PID-top', fmt=qcow2 size=262144 backing_file=TEST_DIR/PID-base cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++wrote 65536/65536 bytes at offset 446464 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++Image resized. ++ ++read 65536/65536 bytes at offset 446464 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++256 KiB (0x40000) bytes not allocated at offset 0 bytes (0x0) ++244 KiB (0x3d000) bytes allocated at offset 256 KiB (0x40000) ++ ++[{ "start": 0, "length": 262144, "depth": 1, "zero": true, "data": false}, ++{ "start": 262144, "length": 249856, "depth": 0, "zero": true, "data": false}] ++ +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 033b54d..cddae00 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -286,6 +286,7 @@ + 270 rw backing quick + 272 rw + 273 backing quick ++274 rw backing + 277 rw quick + 280 rw migration quick + 281 rw quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-external-snapshot-with-VM-state.patch b/kvm-iotests-Test-external-snapshot-with-VM-state.patch new file mode 100755 index 0000000000000000000000000000000000000000..6fcb2f68a6de5943f84e5fc944291feff8965e9e --- /dev/null +++ b/kvm-iotests-Test-external-snapshot-with-VM-state.patch @@ -0,0 +1,189 @@ +From 38b0cff9703fc740c30f5874973ac1be88f94d9f Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:03 +0000 +Subject: [PATCH 06/18] iotests: Test external snapshot with VM state + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-6-kwolf@redhat.com> +Patchwork-id: 93752 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 5/6] iotests: Test external snapshot with VM state +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +This tests creating an external snapshot with VM state (which results in +an active overlay over an inactive backing file, which is also the root +node of an inactive BlockBackend), re-activating the images and +performing some operations to test that the re-activation worked as +intended. + +Signed-off-by: Kevin Wolf +(cherry picked from commit f62f08ab7a9d902da70078992248ec5c98f652ad) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/280 | 83 ++++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/280.out | 50 ++++++++++++++++++++++++++++ + tests/qemu-iotests/group | 1 + + 3 files changed, 134 insertions(+) + create mode 100755 tests/qemu-iotests/280 + create mode 100644 tests/qemu-iotests/280.out + +diff --git a/tests/qemu-iotests/280 b/tests/qemu-iotests/280 +new file mode 100755 +index 0000000..0b1fa8e +--- /dev/null ++++ b/tests/qemu-iotests/280 +@@ -0,0 +1,83 @@ ++#!/usr/bin/env python ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++# Creator/Owner: Kevin Wolf ++# ++# Test migration to file for taking an external snapshot with VM state. ++ ++import iotests ++import os ++ ++iotests.verify_image_format(supported_fmts=['qcow2']) ++iotests.verify_protocol(supported=['file']) ++iotests.verify_platform(['linux']) ++ ++with iotests.FilePath('base') as base_path , \ ++ iotests.FilePath('top') as top_path, \ ++ iotests.VM() as vm: ++ ++ iotests.qemu_img_log('create', '-f', iotests.imgfmt, base_path, '64M') ++ ++ iotests.log('=== Launch VM ===') ++ vm.add_object('iothread,id=iothread0') ++ vm.add_blockdev('file,filename=%s,node-name=base-file' % (base_path)) ++ vm.add_blockdev('%s,file=base-file,node-name=base-fmt' % (iotests.imgfmt)) ++ vm.add_device('virtio-blk,drive=base-fmt,iothread=iothread0,id=vda') ++ vm.launch() ++ ++ vm.enable_migration_events('VM') ++ ++ iotests.log('\n=== Migrate to file ===') ++ vm.qmp_log('migrate', uri='exec:cat > /dev/null') ++ ++ with iotests.Timeout(3, 'Migration does not complete'): ++ vm.wait_migration() ++ ++ iotests.log('\nVM is now stopped:') ++ iotests.log(vm.qmp('query-migrate')['return']['status']) ++ vm.qmp_log('query-status') ++ ++ iotests.log('\n=== Create a snapshot of the disk image ===') ++ vm.blockdev_create({ ++ 'driver': 'file', ++ 'filename': top_path, ++ 'size': 0, ++ }) ++ vm.qmp_log('blockdev-add', node_name='top-file', ++ driver='file', filename=top_path, ++ filters=[iotests.filter_qmp_testfiles]) ++ ++ vm.blockdev_create({ ++ 'driver': iotests.imgfmt, ++ 'file': 'top-file', ++ 'size': 1024 * 1024, ++ }) ++ vm.qmp_log('blockdev-add', node_name='top-fmt', ++ driver=iotests.imgfmt, file='top-file') ++ ++ vm.qmp_log('blockdev-snapshot', node='base-fmt', overlay='top-fmt') ++ ++ iotests.log('\n=== Resume the VM and simulate a write request ===') ++ vm.qmp_log('cont') ++ iotests.log(vm.hmp_qemu_io('-d vda/virtio-backend', 'write 4k 4k')) ++ ++ iotests.log('\n=== Commit it to the backing file ===') ++ result = vm.qmp_log('block-commit', job_id='job0', auto_dismiss=False, ++ device='top-fmt', top_node='top-fmt', ++ filters=[iotests.filter_qmp_testfiles]) ++ if 'return' in result: ++ vm.run_job('job0') +diff --git a/tests/qemu-iotests/280.out b/tests/qemu-iotests/280.out +new file mode 100644 +index 0000000..5d382fa +--- /dev/null ++++ b/tests/qemu-iotests/280.out +@@ -0,0 +1,50 @@ ++Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=67108864 cluster_size=65536 lazy_refcounts=off refcount_bits=16 ++ ++=== Launch VM === ++Enabling migration QMP events on VM... ++{"return": {}} ++ ++=== Migrate to file === ++{"execute": "migrate", "arguments": {"uri": "exec:cat > /dev/null"}} ++{"return": {}} ++{"data": {"status": "setup"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"status": "active"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"status": "completed"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++ ++VM is now stopped: ++completed ++{"execute": "query-status", "arguments": {}} ++{"return": {"running": false, "singlestep": false, "status": "postmigrate"}} ++ ++=== Create a snapshot of the disk image === ++{"execute": "blockdev-create", "arguments": {"job-id": "job0", "options": {"driver": "file", "filename": "TEST_DIR/PID-top", "size": 0}}} ++{"return": {}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} ++ ++{"execute": "blockdev-add", "arguments": {"driver": "file", "filename": "TEST_DIR/PID-top", "node-name": "top-file"}} ++{"return": {}} ++{"execute": "blockdev-create", "arguments": {"job-id": "job0", "options": {"driver": "qcow2", "file": "top-file", "size": 1048576}}} ++{"return": {}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} ++ ++{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": "top-file", "node-name": "top-fmt"}} ++{"return": {}} ++{"execute": "blockdev-snapshot", "arguments": {"node": "base-fmt", "overlay": "top-fmt"}} ++{"return": {}} ++ ++=== Resume the VM and simulate a write request === ++{"execute": "cont", "arguments": {}} ++{"return": {}} ++{"return": ""} ++ ++=== Commit it to the backing file === ++{"execute": "block-commit", "arguments": {"auto-dismiss": false, "device": "top-fmt", "job-id": "job0", "top-node": "top-fmt"}} ++{"return": {}} ++{"execute": "job-complete", "arguments": {"id": "job0"}} ++{"return": {}} ++{"data": {"device": "job0", "len": 65536, "offset": 65536, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"device": "job0", "len": 65536, "offset": 65536, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"execute": "job-dismiss", "arguments": {"id": "job0"}} ++{"return": {}} +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 06cc734..01301cd 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -286,3 +286,4 @@ + 272 rw + 273 backing quick + 277 rw quick ++280 rw migration quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch b/kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch new file mode 100755 index 0000000000000000000000000000000000000000..b09439bf67f3b95ef2cf91d889bb72c401fe88a9 --- /dev/null +++ b/kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch @@ -0,0 +1,322 @@ +From 6b9a6ba9ed753ad7aa714b35de938ebeeb4fa6cb Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Fri, 7 Feb 2020 10:27:49 +0000 +Subject: [PATCH 16/18] iotests: Test handling of AioContexts with some + blockdev actions + +RH-Author: Sergio Lopez Pascual +Message-id: <20200207112749.25073-10-slp@redhat.com> +Patchwork-id: 93762 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 9/9] iotests: Test handling of AioContexts with some blockdev actions +Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Includes the following tests: + + - Adding a dirty bitmap. + * RHBZ: 1782175 + + - Starting a drive-mirror to an NBD-backed target. + * RHBZ: 1746217, 1773517 + + - Aborting an external snapshot transaction. + * RHBZ: 1779036 + + - Aborting a blockdev backup transaction. + * RHBZ: 1782111 + +For each one of them, a VM with a number of disks running in an +IOThread AioContext is used. + +Signed-off-by: Sergio Lopez +Signed-off-by: Kevin Wolf +(cherry picked from commit 9b8c59e7610b9c5315ef093d801843dbe8debfac) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/281 | 247 +++++++++++++++++++++++++++++++++++++++++++++ + tests/qemu-iotests/281.out | 5 + + tests/qemu-iotests/group | 1 + + 3 files changed, 253 insertions(+) + create mode 100755 tests/qemu-iotests/281 + create mode 100644 tests/qemu-iotests/281.out + +diff --git a/tests/qemu-iotests/281 b/tests/qemu-iotests/281 +new file mode 100755 +index 0000000..269d583 +--- /dev/null ++++ b/tests/qemu-iotests/281 +@@ -0,0 +1,247 @@ ++#!/usr/bin/env python ++# ++# Test cases for blockdev + IOThread interactions ++# ++# Copyright (C) 2019 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++import os ++import iotests ++from iotests import qemu_img ++ ++image_len = 64 * 1024 * 1024 ++ ++# Test for RHBZ#1782175 ++class TestDirtyBitmapIOThread(iotests.QMPTestCase): ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ images = { 'drive0': drive0_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm = iotests.VM() ++ self.vm.add_object('iothread,id=iothread0') ++ ++ for name in self.images: ++ self.vm.add_blockdev('driver=file,filename=%s,node-name=file_%s' ++ % (self.images[name], name)) ++ self.vm.add_blockdev('driver=qcow2,file=file_%s,node-name=%s' ++ % (name, name)) ++ ++ self.vm.launch() ++ self.vm.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ force=True) ++ ++ def tearDown(self): ++ self.vm.shutdown() ++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_add_dirty_bitmap(self): ++ result = self.vm.qmp( ++ 'block-dirty-bitmap-add', ++ node='drive0', ++ name='bitmap1', ++ persistent=True, ++ ) ++ ++ self.assert_qmp(result, 'return', {}) ++ ++ ++# Test for RHBZ#1746217 & RHBZ#1773517 ++class TestNBDMirrorIOThread(iotests.QMPTestCase): ++ nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock') ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ mirror_img = os.path.join(iotests.test_dir, 'mirror.img') ++ images = { 'drive0': drive0_img, 'mirror': mirror_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm_src = iotests.VM(path_suffix='src') ++ self.vm_src.add_object('iothread,id=iothread0') ++ self.vm_src.add_blockdev('driver=file,filename=%s,node-name=file0' ++ % (self.drive0_img)) ++ self.vm_src.add_blockdev('driver=qcow2,file=file0,node-name=drive0') ++ self.vm_src.launch() ++ self.vm_src.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ force=True) ++ ++ self.vm_tgt = iotests.VM(path_suffix='tgt') ++ self.vm_tgt.add_object('iothread,id=iothread0') ++ self.vm_tgt.add_blockdev('driver=file,filename=%s,node-name=file0' ++ % (self.mirror_img)) ++ self.vm_tgt.add_blockdev('driver=qcow2,file=file0,node-name=drive0') ++ self.vm_tgt.launch() ++ self.vm_tgt.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ force=True) ++ ++ def tearDown(self): ++ self.vm_src.shutdown() ++ self.vm_tgt.shutdown() ++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_nbd_mirror(self): ++ result = self.vm_tgt.qmp( ++ 'nbd-server-start', ++ addr={ ++ 'type': 'unix', ++ 'data': { 'path': self.nbd_sock } ++ } ++ ) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm_tgt.qmp( ++ 'nbd-server-add', ++ device='drive0', ++ writable=True ++ ) ++ self.assert_qmp(result, 'return', {}) ++ ++ result = self.vm_src.qmp( ++ 'drive-mirror', ++ device='drive0', ++ target='nbd+unix:///drive0?socket=' + self.nbd_sock, ++ sync='full', ++ mode='existing', ++ speed=64*1024*1024, ++ job_id='j1' ++ ) ++ self.assert_qmp(result, 'return', {}) ++ ++ self.vm_src.event_wait(name="BLOCK_JOB_READY") ++ ++ ++# Test for RHBZ#1779036 ++class TestExternalSnapshotAbort(iotests.QMPTestCase): ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ snapshot_img = os.path.join(iotests.test_dir, 'snapshot.img') ++ images = { 'drive0': drive0_img, 'snapshot': snapshot_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm = iotests.VM() ++ self.vm.add_object('iothread,id=iothread0') ++ self.vm.add_blockdev('driver=file,filename=%s,node-name=file0' ++ % (self.drive0_img)) ++ self.vm.add_blockdev('driver=qcow2,file=file0,node-name=drive0') ++ self.vm.launch() ++ self.vm.qmp('x-blockdev-set-iothread', ++ node_name='drive0', iothread='iothread0', ++ force=True) ++ ++ def tearDown(self): ++ self.vm.shutdown() ++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_external_snapshot_abort(self): ++ # Use a two actions transaction with a bogus values on the second ++ # one to trigger an abort of the transaction. ++ result = self.vm.qmp('transaction', actions=[ ++ { ++ 'type': 'blockdev-snapshot-sync', ++ 'data': { 'node-name': 'drive0', ++ 'snapshot-file': self.snapshot_img, ++ 'snapshot-node-name': 'snap1', ++ 'mode': 'absolute-paths', ++ 'format': 'qcow2' } ++ }, ++ { ++ 'type': 'blockdev-snapshot-sync', ++ 'data': { 'node-name': 'drive0', ++ 'snapshot-file': '/fakesnapshot', ++ 'snapshot-node-name': 'snap2', ++ 'mode': 'absolute-paths', ++ 'format': 'qcow2' } ++ }, ++ ]) ++ ++ # Crashes on failure, we expect this error. ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++ ++# Test for RHBZ#1782111 ++class TestBlockdevBackupAbort(iotests.QMPTestCase): ++ drive0_img = os.path.join(iotests.test_dir, 'drive0.img') ++ drive1_img = os.path.join(iotests.test_dir, 'drive1.img') ++ snap0_img = os.path.join(iotests.test_dir, 'snap0.img') ++ snap1_img = os.path.join(iotests.test_dir, 'snap1.img') ++ images = { 'drive0': drive0_img, ++ 'drive1': drive1_img, ++ 'snap0': snap0_img, ++ 'snap1': snap1_img } ++ ++ def setUp(self): ++ for name in self.images: ++ qemu_img('create', '-f', iotests.imgfmt, ++ self.images[name], str(image_len)) ++ ++ self.vm = iotests.VM() ++ self.vm.add_object('iothread,id=iothread0') ++ self.vm.add_device('virtio-scsi,iothread=iothread0') ++ ++ for name in self.images: ++ self.vm.add_blockdev('driver=file,filename=%s,node-name=file_%s' ++ % (self.images[name], name)) ++ self.vm.add_blockdev('driver=qcow2,file=file_%s,node-name=%s' ++ % (name, name)) ++ ++ self.vm.add_device('scsi-hd,drive=drive0') ++ self.vm.add_device('scsi-hd,drive=drive1') ++ self.vm.launch() ++ ++ def tearDown(self): ++ self.vm.shutdown() ++ for name in self.images: ++ os.remove(self.images[name]) ++ ++ def test_blockdev_backup_abort(self): ++ # Use a two actions transaction with a bogus values on the second ++ # one to trigger an abort of the transaction. ++ result = self.vm.qmp('transaction', actions=[ ++ { ++ 'type': 'blockdev-backup', ++ 'data': { 'device': 'drive0', ++ 'target': 'snap0', ++ 'sync': 'full', ++ 'job-id': 'j1' } ++ }, ++ { ++ 'type': 'blockdev-backup', ++ 'data': { 'device': 'drive1', ++ 'target': 'snap1', ++ 'sync': 'full' } ++ }, ++ ]) ++ ++ # Hangs on failure, we expect this error. ++ self.assert_qmp(result, 'error/class', 'GenericError') ++ ++if __name__ == '__main__': ++ iotests.main(supported_fmts=['qcow2'], ++ supported_protocols=['file']) +diff --git a/tests/qemu-iotests/281.out b/tests/qemu-iotests/281.out +new file mode 100644 +index 0000000..89968f3 +--- /dev/null ++++ b/tests/qemu-iotests/281.out +@@ -0,0 +1,5 @@ ++.... ++---------------------------------------------------------------------- ++Ran 4 tests ++ ++OK +diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group +index 01301cd..c0e8197 100644 +--- a/tests/qemu-iotests/group ++++ b/tests/qemu-iotests/group +@@ -287,3 +287,4 @@ + 273 backing quick + 277 rw quick + 280 rw migration quick ++281 rw quick +-- +1.8.3.1 + diff --git a/kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch b/kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch new file mode 100755 index 0000000000000000000000000000000000000000..58ef198f84e1a1b85c28550837eb5bfdf2a5da68 --- /dev/null +++ b/kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch @@ -0,0 +1,162 @@ +From 239f7bdeef48a3c0b07098617371b9955dc55348 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:36 +0000 +Subject: [PATCH 16/20] iotests: Test mirror with temporarily disabled target + backing file + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-11-kwolf@redhat.com> +Patchwork-id: 94288 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 10/13] iotests: Test mirror with temporarily disabled target backing file +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +The newly tested scenario is a common live storage migration scenario: +The target node is opened without a backing file so that the active +layer is mirrored while its backing chain can be copied in the +background. + +The backing chain should be attached to the mirror target node when +finalising the job, just before switching the users of the source node +to the new copy (at which point the mirror job still has a reference to +the node). drive-mirror did this automatically, but with blockdev-mirror +this is the job of the QMP client. + +This patch adds test cases for two ways to achieve the desired result, +using either x-blockdev-reopen or blockdev-snapshot. + +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-5-kwolf@redhat.com> +Reviewed-by: Peter Krempa +Signed-off-by: Kevin Wolf +(cherry picked from commit 8bdee9f10eac2aefdcc5095feef756354c87bdec) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/155 | 56 +++++++++++++++++++++++++++++++++++++++++----- + tests/qemu-iotests/155.out | 4 ++-- + 2 files changed, 53 insertions(+), 7 deletions(-) + +diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 +index d7ef257..3053e50 100755 +--- a/tests/qemu-iotests/155 ++++ b/tests/qemu-iotests/155 +@@ -45,10 +45,15 @@ target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt) + # image during runtime, only makes sense if + # target_blockdev_backing is not None + # (None: same as target_backing) ++# target_open_with_backing: If True, the target image is added with its backing ++# chain opened right away. If False, blockdev-add ++# opens it without a backing file and job completion ++# is supposed to open the backing chain. + + class BaseClass(iotests.QMPTestCase): + target_blockdev_backing = None + target_real_backing = None ++ target_open_with_backing = True + + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, back0_img, '1440K') +@@ -80,9 +85,13 @@ class BaseClass(iotests.QMPTestCase): + options = { 'node-name': 'target', + 'driver': iotests.imgfmt, + 'file': { 'driver': 'file', ++ 'node-name': 'target-file', + 'filename': target_img } } +- if self.target_blockdev_backing: +- options['backing'] = self.target_blockdev_backing ++ ++ if not self.target_open_with_backing: ++ options['backing'] = None ++ elif self.target_blockdev_backing: ++ options['backing'] = self.target_blockdev_backing + + result = self.vm.qmp('blockdev-add', **options) + self.assert_qmp(result, 'return', {}) +@@ -147,10 +156,14 @@ class BaseClass(iotests.QMPTestCase): + # cmd: Mirroring command to execute, either drive-mirror or blockdev-mirror + + class MirrorBaseClass(BaseClass): ++ def openBacking(self): ++ pass ++ + def runMirror(self, sync): + if self.cmd == 'blockdev-mirror': + result = self.vm.qmp(self.cmd, job_id='mirror-job', device='source', +- sync=sync, target='target') ++ sync=sync, target='target', ++ auto_finalize=False) + else: + if self.existing: + mode = 'existing' +@@ -159,11 +172,12 @@ class MirrorBaseClass(BaseClass): + result = self.vm.qmp(self.cmd, job_id='mirror-job', device='source', + sync=sync, target=target_img, + format=iotests.imgfmt, mode=mode, +- node_name='target') ++ node_name='target', auto_finalize=False) + + self.assert_qmp(result, 'return', {}) + +- self.complete_and_wait('mirror-job') ++ self.vm.run_job('mirror-job', use_log=False, auto_finalize=False, ++ pre_finalize=self.openBacking, auto_dismiss=True) + + def testFull(self): + self.runMirror('full') +@@ -221,6 +235,38 @@ class TestBlockdevMirrorForcedBacking(MirrorBaseClass): + target_blockdev_backing = { 'driver': 'null-co' } + target_real_backing = 'null-co://' + ++# Attach the backing chain only during completion, with blockdev-reopen ++class TestBlockdevMirrorReopen(MirrorBaseClass): ++ cmd = 'blockdev-mirror' ++ existing = True ++ target_backing = 'null-co://' ++ target_open_with_backing = False ++ ++ def openBacking(self): ++ if not self.target_open_with_backing: ++ result = self.vm.qmp('blockdev-add', node_name="backing", ++ driver="null-co") ++ self.assert_qmp(result, 'return', {}) ++ result = self.vm.qmp('x-blockdev-reopen', node_name="target", ++ driver=iotests.imgfmt, file="target-file", ++ backing="backing") ++ self.assert_qmp(result, 'return', {}) ++ ++# Attach the backing chain only during completion, with blockdev-snapshot ++class TestBlockdevMirrorSnapshot(MirrorBaseClass): ++ cmd = 'blockdev-mirror' ++ existing = True ++ target_backing = 'null-co://' ++ target_open_with_backing = False ++ ++ def openBacking(self): ++ if not self.target_open_with_backing: ++ result = self.vm.qmp('blockdev-add', node_name="backing", ++ driver="null-co") ++ self.assert_qmp(result, 'return', {}) ++ result = self.vm.qmp('blockdev-snapshot', node="backing", ++ overlay="target") ++ self.assert_qmp(result, 'return', {}) + + class TestCommit(BaseClass): + existing = False +diff --git a/tests/qemu-iotests/155.out b/tests/qemu-iotests/155.out +index 4176bb9..4fd1c2d 100644 +--- a/tests/qemu-iotests/155.out ++++ b/tests/qemu-iotests/155.out +@@ -1,5 +1,5 @@ +-................... ++......................... + ---------------------------------------------------------------------- +-Ran 19 tests ++Ran 25 tests + + OK +-- +1.8.3.1 + diff --git a/kvm-iotests-Use-complete_and_wait-in-155.patch b/kvm-iotests-Use-complete_and_wait-in-155.patch new file mode 100755 index 0000000000000000000000000000000000000000..38b41be98ebbd7bcd5f67572c919ddcce8d2f796 --- /dev/null +++ b/kvm-iotests-Use-complete_and_wait-in-155.patch @@ -0,0 +1,50 @@ +From 872fbd32d06bda4aba3a7e67a95f76f62e475dbe Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:27 +0000 +Subject: [PATCH 07/20] iotests: Use complete_and_wait() in 155 + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-2-kwolf@redhat.com> +Patchwork-id: 94279 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 01/13] iotests: Use complete_and_wait() in 155 +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Max Reitz + +This way, we get to see errors during the completion phase. + +Signed-off-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200218103454.296704-14-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 6644d0e6192b36cdf2902c9774e1afb8ab2e7223) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/155 | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 +index e194859..d7ef257 100755 +--- a/tests/qemu-iotests/155 ++++ b/tests/qemu-iotests/155 +@@ -163,12 +163,7 @@ class MirrorBaseClass(BaseClass): + + self.assert_qmp(result, 'return', {}) + +- self.vm.event_wait('BLOCK_JOB_READY') +- +- result = self.vm.qmp('block-job-complete', device='mirror-job') +- self.assert_qmp(result, 'return', {}) +- +- self.vm.event_wait('BLOCK_JOB_COMPLETED') ++ self.complete_and_wait('mirror-job') + + def testFull(self): + self.runMirror('full') +-- +1.8.3.1 + diff --git a/kvm-iotests-don-t-use-format-for-drive_add.patch b/kvm-iotests-don-t-use-format-for-drive_add.patch new file mode 100755 index 0000000000000000000000000000000000000000..f95e17a9a651a6bdb62386ecfe8118b478e2ace2 --- /dev/null +++ b/kvm-iotests-don-t-use-format-for-drive_add.patch @@ -0,0 +1,81 @@ +From 127360c2fa0fefa18ff828bfec3985e04791d665 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:16 +0100 +Subject: [PATCH 17/26] iotests: don't use 'format' for drive_add +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-3-kwolf@redhat.com> +Patchwork-id: 97102 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 02/11] iotests: don't use 'format' for drive_add +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +From: John Snow + +It shadows (with a different type) the built-in format. +Use something else. + +Signed-off-by: John Snow +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Max Reitz +Message-Id: <20200331000014.11581-3-jsnow@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Max Reitz +(cherry picked from commit 1d3d4b630c6ea8b19420c097f0c448b6ded95072) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/055 | 3 ++- + tests/qemu-iotests/iotests.py | 6 +++--- + 2 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055 +index c732a11..eb50c9f 100755 +--- a/tests/qemu-iotests/055 ++++ b/tests/qemu-iotests/055 +@@ -469,7 +469,8 @@ class TestDriveCompression(iotests.QMPTestCase): + qemu_img('create', '-f', fmt, blockdev_target_img, + str(TestDriveCompression.image_len), *args) + if attach_target: +- self.vm.add_drive(blockdev_target_img, format=fmt, interface="none") ++ self.vm.add_drive(blockdev_target_img, ++ img_format=fmt, interface="none") + + self.vm.launch() + +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 46f880c..be20d56 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -481,20 +481,20 @@ class VM(qtest.QEMUQtestMachine): + self._args.append(opts) + return self + +- def add_drive(self, path, opts='', interface='virtio', format=imgfmt): ++ def add_drive(self, path, opts='', interface='virtio', img_format=imgfmt): + '''Add a virtio-blk drive to the VM''' + options = ['if=%s' % interface, + 'id=drive%d' % self._num_drives] + + if path is not None: + options.append('file=%s' % path) +- options.append('format=%s' % format) ++ options.append('format=%s' % img_format) + options.append('cache=%s' % cachemode) + + if opts: + options.append(opts) + +- if format == 'luks' and 'key-secret' not in opts: ++ if img_format == 'luks' and 'key-secret' not in opts: + # default luks support + if luks_default_secret_object not in self._args: + self.add_object(luks_default_secret_object) +-- +1.8.3.1 + diff --git a/kvm-iotests.py-Let-wait_migration-wait-even-more.patch b/kvm-iotests.py-Let-wait_migration-wait-even-more.patch new file mode 100755 index 0000000000000000000000000000000000000000..cda803759946039cc52d8f99d72bf748bd7d6ba6 --- /dev/null +++ b/kvm-iotests.py-Let-wait_migration-wait-even-more.patch @@ -0,0 +1,123 @@ +From d6df1426ae65b3a0d50bdbb1f8a7246386dd6ebf Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 7 Feb 2020 11:24:04 +0000 +Subject: [PATCH 07/18] iotests.py: Let wait_migration wait even more + +RH-Author: Kevin Wolf +Message-id: <20200207112404.25198-7-kwolf@redhat.com> +Patchwork-id: 93751 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 6/6] iotests.py: Let wait_migration wait even more +Bugzilla: 1781637 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +From: Max Reitz + +The "migration completed" event may be sent (on the source, to be +specific) before the migration is actually completed, so the VM runstate +will still be "finish-migrate" instead of "postmigrate". So ask the +users of VM.wait_migration() to specify the final runstate they desire +and then poll the VM until it has reached that state. (This should be +over very quickly, so busy polling is fine.) + +Without this patch, I see intermittent failures in the new iotest 280 +under high system load. I have not yet seen such failures with other +iotests that use VM.wait_migration() and query-status afterwards, but +maybe they just occur even more rarely, or it is because they also wait +on the destination VM to be running. + +Signed-off-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 8da7969bd7014f6de037d8ae132b40721944b186) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + tests/qemu-iotests/234 | 8 ++++---- + tests/qemu-iotests/262 | 4 ++-- + tests/qemu-iotests/280 | 2 +- + tests/qemu-iotests/iotests.py | 6 +++++- + 4 files changed, 12 insertions(+), 8 deletions(-) + +diff --git a/tests/qemu-iotests/234 b/tests/qemu-iotests/234 +index 34c818c..59a7f94 100755 +--- a/tests/qemu-iotests/234 ++++ b/tests/qemu-iotests/234 +@@ -69,9 +69,9 @@ with iotests.FilePath('img') as img_path, \ + iotests.log(vm_a.qmp('migrate', uri='exec:cat >%s' % (fifo_a))) + with iotests.Timeout(3, 'Migration does not complete'): + # Wait for the source first (which includes setup=setup) +- vm_a.wait_migration() ++ vm_a.wait_migration('postmigrate') + # Wait for the destination second (which does not) +- vm_b.wait_migration() ++ vm_b.wait_migration('running') + + iotests.log(vm_a.qmp('query-migrate')['return']['status']) + iotests.log(vm_b.qmp('query-migrate')['return']['status']) +@@ -98,9 +98,9 @@ with iotests.FilePath('img') as img_path, \ + iotests.log(vm_b.qmp('migrate', uri='exec:cat >%s' % (fifo_b))) + with iotests.Timeout(3, 'Migration does not complete'): + # Wait for the source first (which includes setup=setup) +- vm_b.wait_migration() ++ vm_b.wait_migration('postmigrate') + # Wait for the destination second (which does not) +- vm_a.wait_migration() ++ vm_a.wait_migration('running') + + iotests.log(vm_a.qmp('query-migrate')['return']['status']) + iotests.log(vm_b.qmp('query-migrate')['return']['status']) +diff --git a/tests/qemu-iotests/262 b/tests/qemu-iotests/262 +index 0963daa..bbcb526 100755 +--- a/tests/qemu-iotests/262 ++++ b/tests/qemu-iotests/262 +@@ -71,9 +71,9 @@ with iotests.FilePath('img') as img_path, \ + iotests.log(vm_a.qmp('migrate', uri='exec:cat >%s' % (fifo))) + with iotests.Timeout(3, 'Migration does not complete'): + # Wait for the source first (which includes setup=setup) +- vm_a.wait_migration() ++ vm_a.wait_migration('postmigrate') + # Wait for the destination second (which does not) +- vm_b.wait_migration() ++ vm_b.wait_migration('running') + + iotests.log(vm_a.qmp('query-migrate')['return']['status']) + iotests.log(vm_b.qmp('query-migrate')['return']['status']) +diff --git a/tests/qemu-iotests/280 b/tests/qemu-iotests/280 +index 0b1fa8e..85e9114 100755 +--- a/tests/qemu-iotests/280 ++++ b/tests/qemu-iotests/280 +@@ -45,7 +45,7 @@ with iotests.FilePath('base') as base_path , \ + vm.qmp_log('migrate', uri='exec:cat > /dev/null') + + with iotests.Timeout(3, 'Migration does not complete'): +- vm.wait_migration() ++ vm.wait_migration('postmigrate') + + iotests.log('\nVM is now stopped:') + iotests.log(vm.qmp('query-migrate')['return']['status']) +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index 5741efb..0c55f7b 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -663,12 +663,16 @@ class VM(qtest.QEMUQtestMachine): + } + ])) + +- def wait_migration(self): ++ def wait_migration(self, expect_runstate): + while True: + event = self.event_wait('MIGRATION') + log(event, filters=[filter_qmp_event]) + if event['data']['status'] == 'completed': + break ++ # The event may occur in finish-migrate, so wait for the expected ++ # post-migration runstate ++ while self.qmp('query-status')['return']['status'] != expect_runstate: ++ pass + + def node_info(self, node_name): + nodes = self.qmp('query-named-block-nodes') +-- +1.8.3.1 + diff --git a/kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch b/kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch new file mode 100755 index 0000000000000000000000000000000000000000..2ee9dcdd17154194d4669fc3c4048d1bcbc18e53 --- /dev/null +++ b/kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch @@ -0,0 +1,79 @@ +From 1c508d56d154caf5fbf53e7dabafd707236cb16b Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Wed, 29 Jan 2020 13:45:18 +0000 +Subject: [PATCH 06/15] iscsi: Cap block count from GET LBA STATUS + (CVE-2020-1711) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200129134518.1293-2-jmaloy@redhat.com> +Patchwork-id: 93571 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] iscsi: Cap block count from GET LBA STATUS (CVE-2020-1711) +Bugzilla: 1794503 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf +RH-Acked-by: Philippe Mathieu-Daudé + +From: Felipe Franciosi + +When querying an iSCSI server for the provisioning status of blocks (via +GET LBA STATUS), Qemu only validates that the response descriptor zero's +LBA matches the one requested. Given the SCSI spec allows servers to +respond with the status of blocks beyond the end of the LUN, Qemu may +have its heap corrupted by clearing/setting too many bits at the end of +its allocmap for the LUN. + +A malicious guest in control of the iSCSI server could carefully program +Qemu's heap (by selectively setting the bitmap) and then smash it. + +This limits the number of bits that iscsi_co_block_status() will try to +update in the allocmap so it can't overflow the bitmap. + +Fixes: CVE-2020-1711 +Cc: qemu-stable@nongnu.org +Signed-off-by: Felipe Franciosi +Signed-off-by: Peter Turschmid +Signed-off-by: Raphael Norwitz +Signed-off-by: Kevin Wolf +(cherry picked from commit 693fd2acdf14dd86c0bf852610f1c2cca80a74dc) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + block/iscsi.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/block/iscsi.c b/block/iscsi.c +index 2aea7e3..cbd5729 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -701,7 +701,7 @@ static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs, + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; +- uint64_t lba; ++ uint64_t lba, max_bytes; + int ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); +@@ -721,6 +721,7 @@ static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs, + } + + lba = offset / iscsilun->block_size; ++ max_bytes = (iscsilun->num_blocks - lba) * iscsilun->block_size; + + qemu_mutex_lock(&iscsilun->mutex); + retry: +@@ -764,7 +765,7 @@ retry: + goto out_unlock; + } + +- *pnum = (int64_t) lbasd->num_blocks * iscsilun->block_size; ++ *pnum = MIN((int64_t) lbasd->num_blocks * iscsilun->block_size, max_bytes); + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { +-- +1.8.3.1 + diff --git a/kvm-iscsi-Drop-iscsi_co_create_opts.patch b/kvm-iscsi-Drop-iscsi_co_create_opts.patch new file mode 100755 index 0000000000000000000000000000000000000000..a6d0baf2523894c36eaab2c7428a80913ae201da --- /dev/null +++ b/kvm-iscsi-Drop-iscsi_co_create_opts.patch @@ -0,0 +1,113 @@ +From 58b7d33e1bc17b89103ceaa39f5722a69b35d810 Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Wed, 11 Mar 2020 10:51:45 +0000 +Subject: [PATCH 04/20] iscsi: Drop iscsi_co_create_opts() + +RH-Author: Maxim Levitsky +Message-id: <20200311105147.13208-5-mlevitsk@redhat.com> +Patchwork-id: 94226 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 4/6] iscsi: Drop iscsi_co_create_opts() +Bugzilla: 1640894 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: John Snow +RH-Acked-by: Max Reitz + +From: Max Reitz + +The generic fallback implementation effectively does the same. + +Reviewed-by: Maxim Levitsky +Signed-off-by: Max Reitz +Message-Id: <20200122164532.178040-5-mreitz@redhat.com> +Signed-off-by: Max Reitz +(cherry picked from commit 80f0900905b555f00d644894c786b6d66ac2e00e) +Signed-off-by: Maxim Levitsky +Signed-off-by: Danilo C. L. de Paula +--- + block/iscsi.c | 56 -------------------------------------------------------- + 1 file changed, 56 deletions(-) + +diff --git a/block/iscsi.c b/block/iscsi.c +index cbd5729..b45da65 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -2164,58 +2164,6 @@ static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset, + return 0; + } + +-static int coroutine_fn iscsi_co_create_opts(const char *filename, QemuOpts *opts, +- Error **errp) +-{ +- int ret = 0; +- int64_t total_size = 0; +- BlockDriverState *bs; +- IscsiLun *iscsilun = NULL; +- QDict *bs_options; +- Error *local_err = NULL; +- +- bs = bdrv_new(); +- +- /* Read out options */ +- total_size = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +- BDRV_SECTOR_SIZE); +- bs->opaque = g_new0(struct IscsiLun, 1); +- iscsilun = bs->opaque; +- +- bs_options = qdict_new(); +- iscsi_parse_filename(filename, bs_options, &local_err); +- if (local_err) { +- error_propagate(errp, local_err); +- ret = -EINVAL; +- } else { +- ret = iscsi_open(bs, bs_options, 0, NULL); +- } +- qobject_unref(bs_options); +- +- if (ret != 0) { +- goto out; +- } +- iscsi_detach_aio_context(bs); +- if (iscsilun->type != TYPE_DISK) { +- ret = -ENODEV; +- goto out; +- } +- if (bs->total_sectors < total_size) { +- ret = -ENOSPC; +- goto out; +- } +- +- ret = 0; +-out: +- if (iscsilun->iscsi != NULL) { +- iscsi_destroy_context(iscsilun->iscsi); +- } +- g_free(bs->opaque); +- bs->opaque = NULL; +- bdrv_unref(bs); +- return ret; +-} +- + static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) + { + IscsiLun *iscsilun = bs->opaque; +@@ -2486,8 +2434,6 @@ static BlockDriver bdrv_iscsi = { + .bdrv_parse_filename = iscsi_parse_filename, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, +- .bdrv_co_create_opts = iscsi_co_create_opts, +- .create_opts = &iscsi_create_opts, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +@@ -2525,8 +2471,6 @@ static BlockDriver bdrv_iser = { + .bdrv_parse_filename = iscsi_parse_filename, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, +- .bdrv_co_create_opts = iscsi_co_create_opts, +- .create_opts = &iscsi_create_opts, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + .bdrv_reopen_commit = iscsi_reopen_commit, + .bdrv_co_invalidate_cache = iscsi_co_invalidate_cache, +-- +1.8.3.1 + diff --git a/kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch b/kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch new file mode 100755 index 0000000000000000000000000000000000000000..e38428b1801d7e06b597ca157eb7c5a2d0c1fe4c --- /dev/null +++ b/kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch @@ -0,0 +1,213 @@ +From 3f16b8a33bd7503cbe857fbeb45fff7301b6bb5f Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:12 +0100 +Subject: [PATCH 1/6] job: take each job's lock individually in job_txn_apply + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-2-kwolf@redhat.com> +Patchwork-id: 94597 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/6] job: take each job's lock individually in job_txn_apply +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +From: Stefan Reiter + +All callers of job_txn_apply hold a single job's lock, but different +jobs within a transaction can have different contexts, thus we need to +lock each one individually before applying the callback function. + +Similar to job_completed_txn_abort this also requires releasing the +caller's context before and reacquiring it after to avoid recursive +locks which might break AIO_WAIT_WHILE in the callback. This is safe, since +existing code would already have to take this into account, lest +job_completed_txn_abort might have broken. + +This also brings to light a different issue: When a callback function in +job_txn_apply moves it's job to a different AIO context, callers will +try to release the wrong lock (now that we re-acquire the lock +correctly, previously it would just continue with the old lock, leaving +the job unlocked for the rest of the return path). Fix this by not caching +the job's context. + +This is only necessary for qmp_block_job_finalize, qmp_job_finalize and +job_exit, since everyone else calls through job_exit. + +One test needed adapting, since it calls job_finalize directly, so it +manually needs to acquire the correct context. + +Signed-off-by: Stefan Reiter +Message-Id: <20200407115651.69472-2-s.reiter@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit b660a84bbb0eb1a76b505648d31d5e82594fb75e) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + blockdev.c | 9 +++++++++ + job-qmp.c | 9 +++++++++ + job.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- + tests/test-blockjob.c | 2 ++ + 4 files changed, 60 insertions(+), 10 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index c8d4b51..86eb115 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -4215,7 +4215,16 @@ void qmp_block_job_finalize(const char *id, Error **errp) + } + + trace_qmp_block_job_finalize(job); ++ job_ref(&job->job); + job_finalize(&job->job, errp); ++ ++ /* ++ * Job's context might have changed via job_finalize (and job_txn_apply ++ * automatically acquires the new one), so make sure we release the correct ++ * one. ++ */ ++ aio_context = blk_get_aio_context(job->blk); ++ job_unref(&job->job); + aio_context_release(aio_context); + } + +diff --git a/job-qmp.c b/job-qmp.c +index fbfed25..a201220 100644 +--- a/job-qmp.c ++++ b/job-qmp.c +@@ -114,7 +114,16 @@ void qmp_job_finalize(const char *id, Error **errp) + } + + trace_qmp_job_finalize(job); ++ job_ref(job); + job_finalize(job, errp); ++ ++ /* ++ * Job's context might have changed via job_finalize (and job_txn_apply ++ * automatically acquires the new one), so make sure we release the correct ++ * one. ++ */ ++ aio_context = job->aio_context; ++ job_unref(job); + aio_context_release(aio_context); + } + +diff --git a/job.c b/job.c +index 04409b4..48fc4ad 100644 +--- a/job.c ++++ b/job.c +@@ -136,17 +136,38 @@ static void job_txn_del_job(Job *job) + } + } + +-static int job_txn_apply(JobTxn *txn, int fn(Job *)) ++static int job_txn_apply(Job *job, int fn(Job *)) + { +- Job *job, *next; ++ AioContext *inner_ctx; ++ Job *other_job, *next; ++ JobTxn *txn = job->txn; + int rc = 0; + +- QLIST_FOREACH_SAFE(job, &txn->jobs, txn_list, next) { +- rc = fn(job); ++ /* ++ * Similar to job_completed_txn_abort, we take each job's lock before ++ * applying fn, but since we assume that outer_ctx is held by the caller, ++ * we need to release it here to avoid holding the lock twice - which would ++ * break AIO_WAIT_WHILE from within fn. ++ */ ++ job_ref(job); ++ aio_context_release(job->aio_context); ++ ++ QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) { ++ inner_ctx = other_job->aio_context; ++ aio_context_acquire(inner_ctx); ++ rc = fn(other_job); ++ aio_context_release(inner_ctx); + if (rc) { + break; + } + } ++ ++ /* ++ * Note that job->aio_context might have been changed by calling fn, so we ++ * can't use a local variable to cache it. ++ */ ++ aio_context_acquire(job->aio_context); ++ job_unref(job); + return rc; + } + +@@ -774,11 +795,11 @@ static void job_do_finalize(Job *job) + assert(job && job->txn); + + /* prepare the transaction to complete */ +- rc = job_txn_apply(job->txn, job_prepare); ++ rc = job_txn_apply(job, job_prepare); + if (rc) { + job_completed_txn_abort(job); + } else { +- job_txn_apply(job->txn, job_finalize_single); ++ job_txn_apply(job, job_finalize_single); + } + } + +@@ -824,10 +845,10 @@ static void job_completed_txn_success(Job *job) + assert(other_job->ret == 0); + } + +- job_txn_apply(txn, job_transition_to_pending); ++ job_txn_apply(job, job_transition_to_pending); + + /* If no jobs need manual finalization, automatically do so */ +- if (job_txn_apply(txn, job_needs_finalize) == 0) { ++ if (job_txn_apply(job, job_needs_finalize) == 0) { + job_do_finalize(job); + } + } +@@ -849,9 +870,10 @@ static void job_completed(Job *job) + static void job_exit(void *opaque) + { + Job *job = (Job *)opaque; +- AioContext *ctx = job->aio_context; ++ AioContext *ctx; + +- aio_context_acquire(ctx); ++ job_ref(job); ++ aio_context_acquire(job->aio_context); + + /* This is a lie, we're not quiescent, but still doing the completion + * callbacks. However, completion callbacks tend to involve operations that +@@ -862,6 +884,14 @@ static void job_exit(void *opaque) + + job_completed(job); + ++ /* ++ * Note that calling job_completed can move the job to a different ++ * aio_context, so we cannot cache from above. job_txn_apply takes care of ++ * acquiring the new lock, and we ref/unref to avoid job_completed freeing ++ * the job underneath us. ++ */ ++ ctx = job->aio_context; ++ job_unref(job); + aio_context_release(ctx); + } + +diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c +index 7844c9f..6d857fd 100644 +--- a/tests/test-blockjob.c ++++ b/tests/test-blockjob.c +@@ -368,7 +368,9 @@ static void test_cancel_concluded(void) + aio_poll(qemu_get_aio_context(), true); + assert(job->status == JOB_STATUS_PENDING); + ++ aio_context_acquire(job->aio_context); + job_finalize(job, &error_abort); ++ aio_context_release(job->aio_context); + assert(job->status == JOB_STATUS_CONCLUDED); + + cancel_common(s); +-- +1.8.3.1 + diff --git a/kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch b/kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch new file mode 100755 index 0000000000000000000000000000000000000000..902af6c917b480533457acbf89f64b98f93cf0bb --- /dev/null +++ b/kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch @@ -0,0 +1,53 @@ +From e2cafb929acb74377754cb688419575b139b922a Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:47 -0400 +Subject: [PATCH 9/9] lan9118: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-10-jmaloy@redhat.com> +Patchwork-id: 101790 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 9/9] lan9118: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 37cee01784ff0df13e5209517e1b3594a5e792d1) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/lan9118.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/lan9118.c b/hw/net/lan9118.c +index ed551f2178..7bb4633f0f 100644 +--- a/hw/net/lan9118.c ++++ b/hw/net/lan9118.c +@@ -667,7 +667,7 @@ static void do_tx_packet(lan9118_state *s) + /* FIXME: Honor TX disable, and allow queueing of packets. */ + if (s->phy_control & 0x4000) { + /* This assumes the receive routine doesn't touch the VLANClient. */ +- lan9118_receive(qemu_get_queue(s->nic), s->txp->data, s->txp->len); ++ qemu_receive_packet(qemu_get_queue(s->nic), s->txp->data, s->txp->len); + } else { + qemu_send_packet(qemu_get_queue(s->nic), s->txp->data, s->txp->len); + } +-- +2.27.0 + diff --git a/kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch b/kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch new file mode 100755 index 0000000000000000000000000000000000000000..71a2eacb60a3e05128609d07778f535e52d792e6 --- /dev/null +++ b/kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch @@ -0,0 +1,47 @@ +From 2687e0348e3e4d377b4f5356e46948dc2b371b6d Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:02 -0400 +Subject: [PATCH 3/7] libqos: pci-pc: use 32-bit write for EJ register +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-3-jmaloy@redhat.com> +Patchwork-id: 101484 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 2/6] libqos: pci-pc: use 32-bit write for EJ register +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Paolo Bonzini + +The memory region ops have min_access_size == 4 so obey it. + +Tested-by: Thomas Huth +Signed-off-by: Paolo Bonzini + +(cherry picked from commit 4b7c06837ae0b1ff56473202a42e7e386f53d6db) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + tests/libqos/pci-pc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/libqos/pci-pc.c b/tests/libqos/pci-pc.c +index 0bc591d1da..3bb2eb3ba8 100644 +--- a/tests/libqos/pci-pc.c ++++ b/tests/libqos/pci-pc.c +@@ -186,7 +186,7 @@ void qpci_unplug_acpi_device_test(QTestState *qts, const char *id, uint8_t slot) + g_assert(!qdict_haskey(response, "error")); + qobject_unref(response); + +- qtest_outb(qts, ACPI_PCIHP_ADDR + PCI_EJ_BASE, 1 << slot); ++ qtest_outl(qts, ACPI_PCIHP_ADDR + PCI_EJ_BASE, 1 << slot); + + qtest_qmp_eventwait(qts, "DEVICE_DELETED"); + } +-- +2.27.0 + diff --git a/kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch b/kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch new file mode 100755 index 0000000000000000000000000000000000000000..424a60c016f1f70f08284906beda10402e444daa --- /dev/null +++ b/kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch @@ -0,0 +1,48 @@ +From 6320b4e76965b1cf64da4307f4d313fe6b2aa971 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:01 -0400 +Subject: [PATCH 2/7] libqos: usb-hcd-ehci: use 32-bit write for config + register +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-2-jmaloy@redhat.com> +Patchwork-id: 101478 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 1/6] libqos: usb-hcd-ehci: use 32-bit write for config register +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Paolo Bonzini + +The memory region ops have min_access_size == 4 so obey it. + +Tested-by: Thomas Huth +Signed-off-by: Paolo Bonzini + +(cherry picked from commit 89ed83d8b23c11d250c290593cad3ca839d5b053) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + tests/usb-hcd-ehci-test.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/usb-hcd-ehci-test.c b/tests/usb-hcd-ehci-test.c +index 5251d539e9..c51e8bb223 100644 +--- a/tests/usb-hcd-ehci-test.c ++++ b/tests/usb-hcd-ehci-test.c +@@ -96,7 +96,7 @@ static void pci_ehci_port_1(void) + static void pci_ehci_config(void) + { + /* hands over all ports from companion uhci to ehci */ +- qpci_io_writew(ehci1.dev, ehci1.bar, 0x60, 1); ++ qpci_io_writel(ehci1.dev, ehci1.bar, 0x60, 1); + } + + static void pci_uhci_port_2(void) +-- +2.27.0 + diff --git a/kvm-libvhost-user-Fix-some-memtable-remap-cases.patch b/kvm-libvhost-user-Fix-some-memtable-remap-cases.patch new file mode 100755 index 0000000000000000000000000000000000000000..e362efef4c49f95962f3d2138bcc121be09dd299 --- /dev/null +++ b/kvm-libvhost-user-Fix-some-memtable-remap-cases.patch @@ -0,0 +1,117 @@ +From ee360b70f179cf540faebe7e55b34e323e2bb179 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:09 +0100 +Subject: [PATCH 098/116] libvhost-user: Fix some memtable remap cases +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-95-dgilbert@redhat.com> +Patchwork-id: 93548 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 094/112] libvhost-user: Fix some memtable remap cases +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +If a new setmemtable command comes in once the vhost threads are +running, it will remap the guests address space and the threads +will now be looking in the wrong place. + +Fortunately we're running this command under lock, so we can +update the queue mappings so that threads will look in the new-right +place. + +Note: This doesn't fix things that the threads might be doing +without a lock (e.g. a readv/writev!) That's for another time. + +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 49e9ec749d4db62ae51f76354143cee183912a1d) +Signed-off-by: Miroslav Rezanina +--- + contrib/libvhost-user/libvhost-user.c | 33 +++++++++++++++++++++++++-------- + contrib/libvhost-user/libvhost-user.h | 3 +++ + 2 files changed, 28 insertions(+), 8 deletions(-) + +diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c +index 63e4106..b89bf18 100644 +--- a/contrib/libvhost-user/libvhost-user.c ++++ b/contrib/libvhost-user/libvhost-user.c +@@ -565,6 +565,21 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) + } + + static bool ++map_ring(VuDev *dev, VuVirtq *vq) ++{ ++ vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); ++ vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); ++ vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); ++ ++ DPRINT("Setting virtq addresses:\n"); ++ DPRINT(" vring_desc at %p\n", vq->vring.desc); ++ DPRINT(" vring_used at %p\n", vq->vring.used); ++ DPRINT(" vring_avail at %p\n", vq->vring.avail); ++ ++ return !(vq->vring.desc && vq->vring.used && vq->vring.avail); ++} ++ ++static bool + vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) + { + int i; +@@ -767,6 +782,14 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) + close(vmsg->fds[i]); + } + ++ for (i = 0; i < dev->max_queues; i++) { ++ if (dev->vq[i].vring.desc) { ++ if (map_ring(dev, &dev->vq[i])) { ++ vu_panic(dev, "remaping queue %d during setmemtable", i); ++ } ++ } ++ } ++ + return false; + } + +@@ -853,18 +876,12 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) + DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", vra->avail_user_addr); + DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", vra->log_guest_addr); + ++ vq->vra = *vra; + vq->vring.flags = vra->flags; +- vq->vring.desc = qva_to_va(dev, vra->desc_user_addr); +- vq->vring.used = qva_to_va(dev, vra->used_user_addr); +- vq->vring.avail = qva_to_va(dev, vra->avail_user_addr); + vq->vring.log_guest_addr = vra->log_guest_addr; + +- DPRINT("Setting virtq addresses:\n"); +- DPRINT(" vring_desc at %p\n", vq->vring.desc); +- DPRINT(" vring_used at %p\n", vq->vring.used); +- DPRINT(" vring_avail at %p\n", vq->vring.avail); + +- if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) { ++ if (map_ring(dev, vq)) { + vu_panic(dev, "Invalid vring_addr message"); + return false; + } +diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h +index 1844b6f..5cb7708 100644 +--- a/contrib/libvhost-user/libvhost-user.h ++++ b/contrib/libvhost-user/libvhost-user.h +@@ -327,6 +327,9 @@ typedef struct VuVirtq { + int err_fd; + unsigned int enable; + bool started; ++ ++ /* Guest addresses of our ring */ ++ struct vhost_vring_addr vra; + } VuVirtq; + + enum VuWatchCondtion { +-- +1.8.3.1 + diff --git a/kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch b/kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch new file mode 100755 index 0000000000000000000000000000000000000000..0e55df4afa4de5c542cd8d1a1f74ac547129e2c7 --- /dev/null +++ b/kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch @@ -0,0 +1,290 @@ +From cadb72854b44f53c07ea60d7a6149ccac5928a82 Mon Sep 17 00:00:00 2001 +From: Claudio Imbrenda +Date: Tue, 27 Oct 2020 12:02:15 -0400 +Subject: [PATCH 02/18] libvhost-user: handle endianness as mandated by the + spec + +RH-Author: Claudio Imbrenda +Message-id: <20201027120217.2997314-2-cimbrend@redhat.com> +Patchwork-id: 98723 +O-Subject: [RHEL8.4 qemu-kvm PATCH 1/3] libvhost-user: handle endianness as mandated by the spec +Bugzilla: 1857733 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Cornelia Huck + +From: Marc Hartmayer + +Since virtio existed even before it got standardized, the virtio +standard defines the following types of virtio devices: + + + legacy device (pre-virtio 1.0) + + non-legacy or VIRTIO 1.0 device + + transitional device (which can act both as legacy and non-legacy) + +Virtio 1.0 defines the fields of the virtqueues as little endian, +while legacy uses guest's native endian [1]. Currently libvhost-user +does not handle virtio endianness at all, i.e. it works only if the +native endianness matches with whatever is actually needed. That means +things break spectacularly on big-endian targets. Let us handle virtio +endianness for non-legacy as required by the virtio specification [1] +and fence legacy virtio, as there is no safe way to figure out the +needed endianness conversions for all cases. The fencing of legacy +virtio devices is done in `vu_set_features_exec`. + +[1] https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-210003 + +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Marc Hartmayer +Message-id: 20200901150019.29229-3-mhartmay@linux.ibm.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 2ffc54708087c6e524297957be2fc5d543abb767) +Signed-off-by: Danilo C. L. de Paula +--- + contrib/libvhost-user/libvhost-user.c | 77 +++++++++++++++------------ + 1 file changed, 43 insertions(+), 34 deletions(-) + +diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c +index b89bf185013..b8350b067e3 100644 +--- a/contrib/libvhost-user/libvhost-user.c ++++ b/contrib/libvhost-user/libvhost-user.c +@@ -42,6 +42,7 @@ + + #include "qemu/atomic.h" + #include "qemu/osdep.h" ++#include "qemu/bswap.h" + #include "qemu/memfd.h" + + #include "libvhost-user.h" +@@ -522,6 +523,14 @@ vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg) + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + dev->features = vmsg->payload.u64; ++ if (!vu_has_feature(dev, VIRTIO_F_VERSION_1)) { ++ /* ++ * We only support devices conforming to VIRTIO 1.0 or ++ * later ++ */ ++ vu_panic(dev, "virtio legacy devices aren't supported by libvhost-user"); ++ return false; ++ } + + if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) { + vu_set_enable_all_rings(dev, true); +@@ -886,7 +895,7 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) + return false; + } + +- vq->used_idx = vq->vring.used->idx; ++ vq->used_idx = lduw_le_p(&vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + bool resume = dev->iface->queue_is_processed_in_order && +@@ -998,7 +1007,7 @@ vu_check_queue_inflights(VuDev *dev, VuVirtq *vq) + return 0; + } + +- vq->used_idx = vq->vring.used->idx; ++ vq->used_idx = lduw_le_p(&vq->vring.used->idx); + vq->resubmit_num = 0; + vq->resubmit_list = NULL; + vq->counter = 0; +@@ -1737,13 +1746,13 @@ vu_queue_started(const VuDev *dev, const VuVirtq *vq) + static inline uint16_t + vring_avail_flags(VuVirtq *vq) + { +- return vq->vring.avail->flags; ++ return lduw_le_p(&vq->vring.avail->flags); + } + + static inline uint16_t + vring_avail_idx(VuVirtq *vq) + { +- vq->shadow_avail_idx = vq->vring.avail->idx; ++ vq->shadow_avail_idx = lduw_le_p(&vq->vring.avail->idx); + + return vq->shadow_avail_idx; + } +@@ -1751,7 +1760,7 @@ vring_avail_idx(VuVirtq *vq) + static inline uint16_t + vring_avail_ring(VuVirtq *vq, int i) + { +- return vq->vring.avail->ring[i]; ++ return lduw_le_p(&vq->vring.avail->ring[i]); + } + + static inline uint16_t +@@ -1839,12 +1848,12 @@ virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) + { + /* If this descriptor says it doesn't chain, we're done. */ +- if (!(desc[i].flags & VRING_DESC_F_NEXT)) { ++ if (!(lduw_le_p(&desc[i].flags) & VRING_DESC_F_NEXT)) { + return VIRTQUEUE_READ_DESC_DONE; + } + + /* Check they're not leading us off end of descriptors. */ +- *next = desc[i].next; ++ *next = lduw_le_p(&desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + +@@ -1887,8 +1896,8 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + } + desc = vq->vring.desc; + +- if (desc[i].flags & VRING_DESC_F_INDIRECT) { +- if (desc[i].len % sizeof(struct vring_desc)) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_INDIRECT) { ++ if (ldl_le_p(&desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + goto err; + } +@@ -1901,8 +1910,8 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + + /* loop over the indirect descriptor table */ + indirect = 1; +- desc_addr = desc[i].addr; +- desc_len = desc[i].len; ++ desc_addr = ldq_le_p(&desc[i].addr); ++ desc_len = ldl_le_p(&desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); +@@ -1929,10 +1938,10 @@ vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + goto err; + } + +- if (desc[i].flags & VRING_DESC_F_WRITE) { +- in_total += desc[i].len; ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_WRITE) { ++ in_total += ldl_le_p(&desc[i].len); + } else { +- out_total += desc[i].len; ++ out_total += ldl_le_p(&desc[i].len); + } + if (in_total >= max_in_bytes && out_total >= max_out_bytes) { + goto done; +@@ -2047,7 +2056,7 @@ vring_used_flags_set_bit(VuVirtq *vq, int mask) + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); +- *flags |= mask; ++ stw_le_p(flags, lduw_le_p(flags) | mask); + } + + static inline void +@@ -2057,7 +2066,7 @@ vring_used_flags_unset_bit(VuVirtq *vq, int mask) + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); +- *flags &= ~mask; ++ stw_le_p(flags, lduw_le_p(flags) & ~mask); + } + + static inline void +@@ -2067,7 +2076,7 @@ vring_set_avail_event(VuVirtq *vq, uint16_t val) + return; + } + +- *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val; ++ stw_le_p(&vq->vring.used->ring[vq->vring.num], val); + } + + void +@@ -2156,14 +2165,14 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + int rc; + +- if (desc[i].flags & VRING_DESC_F_INDIRECT) { +- if (desc[i].len % sizeof(struct vring_desc)) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_INDIRECT) { ++ if (ldl_le_p(&desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ +- desc_addr = desc[i].addr; +- desc_len = desc[i].len; ++ desc_addr = ldq_le_p(&desc[i].addr); ++ desc_len = ldl_le_p(&desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); +@@ -2185,10 +2194,10 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) + + /* Collect all the descriptors */ + do { +- if (desc[i].flags & VRING_DESC_F_WRITE) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_WRITE) { + virtqueue_map_desc(dev, &in_num, iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, +- desc[i].addr, desc[i].len); ++ ldq_le_p(&desc[i].addr), ldl_le_p(&desc[i].len)); + } else { + if (in_num) { + vu_panic(dev, "Incorrect order for descriptors"); +@@ -2196,7 +2205,7 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz) + } + virtqueue_map_desc(dev, &out_num, iov, + VIRTQUEUE_MAX_SIZE, false, +- desc[i].addr, desc[i].len); ++ ldq_le_p(&desc[i].addr), ldl_le_p(&desc[i].len)); + } + + /* If we've got too many, that implies a descriptor loop. */ +@@ -2392,14 +2401,14 @@ vu_log_queue_fill(VuDev *dev, VuVirtq *vq, + max = vq->vring.num; + i = elem->index; + +- if (desc[i].flags & VRING_DESC_F_INDIRECT) { +- if (desc[i].len % sizeof(struct vring_desc)) { ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_INDIRECT) { ++ if (ldl_le_p(&desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ +- desc_addr = desc[i].addr; +- desc_len = desc[i].len; ++ desc_addr = ldq_le_p(&desc[i].addr); ++ desc_len = ldl_le_p(&desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); +@@ -2425,9 +2434,9 @@ vu_log_queue_fill(VuDev *dev, VuVirtq *vq, + return; + } + +- if (desc[i].flags & VRING_DESC_F_WRITE) { +- min = MIN(desc[i].len, len); +- vu_log_write(dev, desc[i].addr, min); ++ if (lduw_le_p(&desc[i].flags) & VRING_DESC_F_WRITE) { ++ min = MIN(ldl_le_p(&desc[i].len), len); ++ vu_log_write(dev, ldq_le_p(&desc[i].addr), min); + len -= min; + } + +@@ -2452,15 +2461,15 @@ vu_queue_fill(VuDev *dev, VuVirtq *vq, + + idx = (idx + vq->used_idx) % vq->vring.num; + +- uelem.id = elem->index; +- uelem.len = len; ++ stl_le_p(&uelem.id, elem->index); ++ stl_le_p(&uelem.len, len); + vring_used_write(dev, vq, &uelem, idx); + } + + static inline + void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val) + { +- vq->vring.used->idx = val; ++ stw_le_p(&vq->vring.used->idx, val); + vu_log_write(dev, + vq->vring.log_guest_addr + offsetof(struct vring_used, idx), + sizeof(vq->vring.used->idx)); +-- +2.27.0 + diff --git a/kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch b/kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch new file mode 100755 index 0000000000000000000000000000000000000000..d9c81cf820653fe2d6e87762611b96c46d0d2075 --- /dev/null +++ b/kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch @@ -0,0 +1,43 @@ +From f844ca939adb619cce8426e104b0039a7eba70a6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 11 May 2021 11:24:04 -0400 +Subject: [PATCH 1/5] linux-headers: Add VFIO_CCW_REQ_IRQ_INDEX + +RH-Author: Thomas Huth +Message-id: <20210511112405.297037-2-thuth@redhat.com> +Patchwork-id: 101537 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] linux-headers: Add VFIO_CCW_REQ_IRQ_INDEX +Bugzilla: 1940450 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1940450 +Upstream-status: N/A + +This is based on upstream commit b3c818a47f ("Update linux headers to +5.11-rc2"), but has been reduced to the single hunk that is required +for the next patch (there were too many unrelated conflicts in the other +files for doing full backport of the original upstream commit). + +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + linux-headers/linux/vfio.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index f660bd7bac..9c8810bef4 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -580,6 +580,7 @@ enum { + enum { + VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_CRW_IRQ_INDEX, ++ VFIO_CCW_REQ_IRQ_INDEX, + VFIO_CCW_NUM_IRQS + }; + +-- +2.27.0 + diff --git a/kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch b/kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch new file mode 100755 index 0000000000000000000000000000000000000000..1217a6cc768f65aa8efd998ef7015840f032539b --- /dev/null +++ b/kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch @@ -0,0 +1,83 @@ +From d9a63d12b5804eb172a040a16d7e725853c41a8c Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:12 -0500 +Subject: [PATCH 12/18] linux-headers: Partial update against Linux 5.9-rc4 + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-9-thuth@redhat.com> +Patchwork-id: 99505 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 08/12] linux-headers: Partial update against Linux 5.9-rc4 +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Upstream-status: N/A + +This is based on upstream commit e6546342a830e520d14ef03aa95677611de0d90c +but only the two files have been included (there were too many conflicts +in the other unrelated files, so they have been dropped from this patch). + +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + linux-headers/asm-s390/kvm.h | 7 +++++-- + linux-headers/linux/kvm.h | 6 ++++++ + 2 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/linux-headers/asm-s390/kvm.h b/linux-headers/asm-s390/kvm.h +index 0138ccb0d89..f053b8304a8 100644 +--- a/linux-headers/asm-s390/kvm.h ++++ b/linux-headers/asm-s390/kvm.h +@@ -231,11 +231,13 @@ struct kvm_guest_debug_arch { + #define KVM_SYNC_GSCB (1UL << 9) + #define KVM_SYNC_BPBC (1UL << 10) + #define KVM_SYNC_ETOKEN (1UL << 11) ++#define KVM_SYNC_DIAG318 (1UL << 12) + + #define KVM_SYNC_S390_VALID_FIELDS \ + (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ + KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ +- KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) ++ KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ ++ KVM_SYNC_DIAG318) + + /* length and alignment of the sdnx as a power of two */ + #define SDNXC 8 +@@ -264,7 +266,8 @@ struct kvm_sync_regs { + __u8 reserved2 : 7; + __u8 padding1[51]; /* riccb needs to be 64byte aligned */ + __u8 riccb[64]; /* runtime instrumentation controls block */ +- __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ ++ __u64 diag318; /* diagnose 0x318 info */ ++ __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ + union { + __u8 sdnx[SDNXL]; /* state description annex */ + struct { +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 578cd97c0d9..6bba4ec136b 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -276,6 +276,7 @@ struct kvm_run { + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; ++ __u32 cpu; + } fail_entry; + /* KVM_EXIT_EXCEPTION */ + struct { +@@ -1011,6 +1012,11 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_S390_VCPU_RESETS 179 + #define KVM_CAP_S390_PROTECTED 180 + #define KVM_CAP_PPC_SECURE_GUEST 181 ++#define KVM_CAP_HALT_POLL 182 ++#define KVM_CAP_ASYNC_PF_INT 183 ++#define KVM_CAP_LAST_CPU 184 ++#define KVM_CAP_SMALLER_MAXPHYADDR 185 ++#define KVM_CAP_S390_DIAG318 186 + + #ifdef KVM_CAP_IRQ_ROUTING + +-- +2.27.0 + diff --git a/kvm-linux-headers-add-vfio-DMA-available-capability.patch b/kvm-linux-headers-add-vfio-DMA-available-capability.patch new file mode 100755 index 0000000000000000000000000000000000000000..f62026dc475cbf8642c695cca88b84e9829d849d --- /dev/null +++ b/kvm-linux-headers-add-vfio-DMA-available-capability.patch @@ -0,0 +1,54 @@ +From b50c47e1a9fbe8876e231afbb5ed85945c8038da Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:40 -0500 +Subject: [PATCH 1/7] linux-headers: add vfio DMA available capability + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-2-cohuck@redhat.com> +Patchwork-id: 100674 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/7] linux-headers: add vfio DMA available capability +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +UPSTREAM: RHEL only + +This is the part of 53ba2eee52bf ("linux-headers: update against +5.10-rc1") required for DMA limiting. + +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + linux-headers/linux/vfio.h | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index 9e227348b30..f660bd7bace 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -751,6 +751,21 @@ struct vfio_iommu_type1_info_cap_iova_range { + struct vfio_iova_range iova_ranges[]; + }; + ++/* ++ * The DMA available capability allows to report the current number of ++ * simultaneously outstanding DMA mappings that are allowed. ++ * ++ * The structure below defines version 1 of this capability. ++ * ++ * avail: specifies the current number of outstanding DMA mappings allowed. ++ */ ++#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3 ++ ++struct vfio_iommu_type1_info_dma_avail { ++ struct vfio_info_cap_header header; ++ __u32 avail; ++}; ++ + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + + /** +-- +2.27.0 + diff --git a/kvm-linux-headers-support-vfio-ccw-features.patch b/kvm-linux-headers-support-vfio-ccw-features.patch new file mode 100755 index 0000000000000000000000000000000000000000..4eb95bf35228a6b29f438fd004a5af97b711211f --- /dev/null +++ b/kvm-linux-headers-support-vfio-ccw-features.patch @@ -0,0 +1,77 @@ +From 1da0eecb9f2086c880fdaf1260ae775bbfbf5f02 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:37 -0400 +Subject: [PATCH 03/12] linux-headers: support vfio-ccw features + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-4-cohuck@redhat.com> +Patchwork-id: 97696 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 3/9] linux-headers: support vfio-ccw features +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +Partial update to support CRW and SCHIB regions. + +Upstream: n/a + +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + linux-headers/linux/vfio.h | 3 +++ + linux-headers/linux/vfio_ccw.h | 19 +++++++++++++++++++ + 2 files changed, 22 insertions(+) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index fb10370d29..9e227348b3 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -378,6 +378,8 @@ struct vfio_region_gfx_edid { + + /* sub-types for VFIO_REGION_TYPE_CCW */ + #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) ++#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) ++#define VFIO_REGION_SUBTYPE_CCW_CRW (3) + + /* + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped +@@ -577,6 +579,7 @@ enum { + + enum { + VFIO_CCW_IO_IRQ_INDEX, ++ VFIO_CCW_CRW_IRQ_INDEX, + VFIO_CCW_NUM_IRQS + }; + +diff --git a/linux-headers/linux/vfio_ccw.h b/linux-headers/linux/vfio_ccw.h +index fcc3e69ef5..6375d6ff25 100644 +--- a/linux-headers/linux/vfio_ccw.h ++++ b/linux-headers/linux/vfio_ccw.h +@@ -34,4 +34,23 @@ struct ccw_cmd_region { + __u32 ret_code; + } __attribute__((packed)); + ++/* ++ * Used for processing commands that read the subchannel-information block ++ * Reading this region triggers a stsch() to hardware ++ * Note: this is controlled by a capability ++ */ ++struct ccw_schib_region { ++#define SCHIB_AREA_SIZE 52 ++ __u8 schib_area[SCHIB_AREA_SIZE]; ++} __attribute__((packed)); ++ ++/* ++ * Used for returning a Channel Report Word to userspace. ++ * Note: this is controlled by a capability ++ */ ++struct ccw_crw_region { ++ __u32 crw; ++ __u32 pad; ++} __attribute__((packed)); ++ + #endif +-- +2.27.0 + diff --git a/kvm-linux-headers-update-kvm.h.patch b/kvm-linux-headers-update-kvm.h.patch new file mode 100755 index 0000000000000000000000000000000000000000..1834e3341cf6248fc73063269e0d9e1a42174b51 --- /dev/null +++ b/kvm-linux-headers-update-kvm.h.patch @@ -0,0 +1,119 @@ +From 9d1b94d3739567245578f30866facc13edb3be92 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:44 -0400 +Subject: [PATCH 02/42] linux-headers: update kvm.h + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-3-thuth@redhat.com> +Patchwork-id: 97020 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 02/38] linux-headers: update kvm.h +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Upstream-status: n/a + +Update kvm.h for the upcoming new s390x reset and protected virtualization +ioctls. This patch is based on commit ddda37483dd17c9936fdde9ebf8f6ca2692b3842 +and commit dc6f8d458a4ccc360723993f31d310d06469f55f, but I dropped all +(unrequired) changes to the other linux-header files. + +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + linux-headers/linux/kvm.h | 55 +++++++++++++++++++++++++++++++++++++-- + 1 file changed, 53 insertions(+), 2 deletions(-) + +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 3d9b18f7f8..578cd97c0d 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -468,12 +468,17 @@ struct kvm_s390_mem_op { + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ +- __u8 ar; /* the access register number */ +- __u8 reserved[31]; /* should be set to 0 */ ++ union { ++ __u8 ar; /* the access register number */ ++ __u32 sida_offset; /* offset into the sida */ ++ __u8 reserved[32]; /* should be set to 0 */ ++ }; + }; + /* types for kvm_s390_mem_op->op */ + #define KVM_S390_MEMOP_LOGICAL_READ 0 + #define KVM_S390_MEMOP_LOGICAL_WRITE 1 ++#define KVM_S390_MEMOP_SIDA_READ 2 ++#define KVM_S390_MEMOP_SIDA_WRITE 3 + /* flags for kvm_s390_mem_op->flags */ + #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) + #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +@@ -1000,6 +1005,12 @@ struct kvm_ppc_resize_hpt { + #define KVM_CAP_PMU_EVENT_FILTER 173 + #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 + #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 ++#define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176 ++#define KVM_CAP_ARM_NISV_TO_USER 177 ++#define KVM_CAP_ARM_INJECT_EXT_DABT 178 ++#define KVM_CAP_S390_VCPU_RESETS 179 ++#define KVM_CAP_S390_PROTECTED 180 ++#define KVM_CAP_PPC_SECURE_GUEST 181 + + #ifdef KVM_CAP_IRQ_ROUTING + +@@ -1461,6 +1472,43 @@ struct kvm_enc_region { + /* Available with KVM_CAP_ARM_SVE */ + #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) + ++/* Available with KVM_CAP_S390_VCPU_RESETS */ ++#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) ++#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) ++ ++struct kvm_s390_pv_sec_parm { ++ __u64 origin; ++ __u64 length; ++}; ++ ++struct kvm_s390_pv_unp { ++ __u64 addr; ++ __u64 size; ++ __u64 tweak; ++}; ++ ++enum pv_cmd_id { ++ KVM_PV_ENABLE, ++ KVM_PV_DISABLE, ++ KVM_PV_SET_SEC_PARMS, ++ KVM_PV_UNPACK, ++ KVM_PV_VERIFY, ++ KVM_PV_PREP_RESET, ++ KVM_PV_UNSHARE_ALL, ++}; ++ ++struct kvm_pv_cmd { ++ __u32 cmd; /* Command to be executed */ ++ __u16 rc; /* Ultravisor return code */ ++ __u16 rrc; /* Ultravisor return reason code */ ++ __u64 data; /* Data or address */ ++ __u32 flags; /* flags for future extensions. Must be 0 for now */ ++ __u32 reserved[3]; ++}; ++ ++/* Available with KVM_CAP_S390_PROTECTED */ ++#define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) ++ + /* Secure Encrypted Virtualization command */ + enum sev_cmd_id { + /* Guest initialization commands */ +@@ -1611,4 +1659,7 @@ struct kvm_hyperv_eventfd { + #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff + #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + ++#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) ++#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) ++ + #endif /* __LINUX_KVM_H */ +-- +2.27.0 + diff --git a/kvm-memory-Add-IOMMUTLBEvent.patch b/kvm-memory-Add-IOMMUTLBEvent.patch new file mode 100755 index 0000000000000000000000000000000000000000..5d73c97b28241bf66f90d6b5b907a6b1e2324b6a --- /dev/null +++ b/kvm-memory-Add-IOMMUTLBEvent.patch @@ -0,0 +1,590 @@ +From 43a460bde62359c3fa2b1fc6c90d9e13ee7b9a6c Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:35 -0500 +Subject: [PATCH 11/17] memory: Add IOMMUTLBEvent +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-11-eperezma@redhat.com> +Patchwork-id: 100603 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 10/13] memory: Add IOMMUTLBEvent +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +This way we can tell between regular IOMMUTLBEntry (entry of IOMMU +hardware) and notifications. + +In the notifications, we set explicitly if it is a MAPs or an UNMAP, +instead of trusting in entry permissions to differentiate them. + +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-3-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Matthew Rosato +Acked-by: David Gibson +(cherry picked from commit 5039caf3c449c49e625d34e134463260cf8e00e0) + +Conflicts: + hw/s390x/s390-pci-inst.c: Context because of the lack of commit + ("37fa32de707 s390x/pci: Honor DMA limits set by vfio"). + hw/virtio/virtio-iommu.c: It does not exist in rhel. + +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 13 +++--- + hw/arm/smmuv3.c | 13 +++--- + hw/i386/intel_iommu.c | 88 ++++++++++++++++++++++------------------ + hw/misc/tz-mpc.c | 32 ++++++++------- + hw/ppc/spapr_iommu.c | 15 +++---- + hw/s390x/s390-pci-inst.c | 27 +++++++----- + include/exec/memory.h | 27 ++++++------ + memory.c | 20 ++++----- + 8 files changed, 127 insertions(+), 108 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index dfabe381182..a519c97614a 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -464,14 +464,15 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) + /* Unmap the whole notifier's range */ + static void smmu_unmap_notifier_range(IOMMUNotifier *n) + { +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + +- entry.target_as = &address_space_memory; +- entry.iova = n->start; +- entry.perm = IOMMU_NONE; +- entry.addr_mask = n->end - n->start; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &address_space_memory; ++ event.entry.iova = n->start; ++ event.entry.perm = IOMMU_NONE; ++ event.entry.addr_mask = n->end - n->start; + +- memory_region_notify_iommu_one(n, &entry); ++ memory_region_notify_iommu_one(n, &event); + } + + /* Unmap all notifiers attached to @mr */ +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ef8a877c5d8..10b8393beeb 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -783,7 +783,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + uint8_t granule = tg; + + if (!tg) { +@@ -806,12 +806,13 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + granule = tt->granule_sz; + } + +- entry.target_as = &address_space_memory; +- entry.iova = iova; +- entry.addr_mask = num_pages * (1 << granule) - 1; +- entry.perm = IOMMU_NONE; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &address_space_memory; ++ event.entry.iova = iova; ++ event.entry.addr_mask = num_pages * (1 << granule) - 1; ++ event.entry.perm = IOMMU_NONE; + +- memory_region_notify_iommu_one(n, &entry); ++ memory_region_notify_iommu_one(n, &event); + } + + /* invalidate an asid/iova range tuple in all mr's */ +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 463f107ad12..9fedbac82de 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -1016,7 +1016,7 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, + } + } + +-typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); ++typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); + + /** + * Constant information used during page walking +@@ -1037,11 +1037,12 @@ typedef struct { + uint16_t domain_id; + } vtd_page_walk_info; + +-static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) ++static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) + { + VTDAddressSpace *as = info->as; + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; ++ IOMMUTLBEntry *entry = &event->entry; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, +@@ -1050,7 +1051,7 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + +- if (entry->perm == IOMMU_NONE && !info->notify_unmap) { ++ if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } +@@ -1058,7 +1059,7 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + assert(hook_fn); + + /* Update local IOVA mapped ranges */ +- if (entry->perm) { ++ if (event->type == IOMMU_NOTIFIER_MAP) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { +@@ -1084,19 +1085,21 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + int ret; + + /* Emulate an UNMAP */ ++ event->type = IOMMU_NOTIFIER_UNMAP; + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); +- ret = hook_fn(entry, private); ++ ret = hook_fn(event, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); +- /* Recover the correct permission */ ++ /* Recover the correct type */ ++ event->type = IOMMU_NOTIFIER_MAP; + entry->perm = cache_perm; + } + } +@@ -1113,7 +1116,7 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); +- return hook_fn(entry, private); ++ return hook_fn(event, private); + } + + /** +@@ -1134,7 +1137,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + uint32_t offset; + uint64_t slpte; + uint64_t subpage_size, subpage_mask; +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + uint64_t iova = start; + uint64_t iova_next; + int ret = 0; +@@ -1188,13 +1191,15 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + * + * In either case, we send an IOTLB notification down. + */ +- entry.target_as = &address_space_memory; +- entry.iova = iova & subpage_mask; +- entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); +- entry.addr_mask = ~subpage_mask; ++ event.entry.target_as = &address_space_memory; ++ event.entry.iova = iova & subpage_mask; ++ event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); ++ event.entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ +- entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); +- ret = vtd_page_walk_one(&entry, info); ++ event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); ++ event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP : ++ IOMMU_NOTIFIER_UNMAP; ++ ret = vtd_page_walk_one(&event, info); + } + + if (ret < 0) { +@@ -1373,10 +1378,10 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, + return 0; + } + +-static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, ++static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event, + void *private) + { +- memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry); ++ memory_region_notify_iommu(private, 0, *event); + return 0; + } + +@@ -1936,14 +1941,17 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, + * page tables. We just deliver the PSI down to + * invalidate caches. + */ +- IOMMUTLBEntry entry = { +- .target_as = &address_space_memory, +- .iova = addr, +- .translated_addr = 0, +- .addr_mask = size - 1, +- .perm = IOMMU_NONE, ++ IOMMUTLBEvent event = { ++ .type = IOMMU_NOTIFIER_UNMAP, ++ .entry = { ++ .target_as = &address_space_memory, ++ .iova = addr, ++ .translated_addr = 0, ++ .addr_mask = size - 1, ++ .perm = IOMMU_NONE, ++ }, + }; +- memory_region_notify_iommu(&vtd_as->iommu, 0, entry); ++ memory_region_notify_iommu(&vtd_as->iommu, 0, event); + } + } + } +@@ -2355,7 +2363,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) + { + VTDAddressSpace *vtd_dev_as; +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + struct VTDBus *vtd_bus; + hwaddr addr; + uint64_t sz; +@@ -2403,12 +2411,13 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + sz = VTD_PAGE_SIZE; + } + +- entry.target_as = &vtd_dev_as->as; +- entry.addr_mask = sz - 1; +- entry.iova = addr; +- entry.perm = IOMMU_NONE; +- entry.translated_addr = 0; +- memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry); ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &vtd_dev_as->as; ++ event.entry.addr_mask = sz - 1; ++ event.entry.iova = addr; ++ event.entry.perm = IOMMU_NONE; ++ event.entry.translated_addr = 0; ++ memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); + + done: + return true; +@@ -3419,19 +3428,20 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) + size = remain = end - start + 1; + + while (remain >= VTD_PAGE_SIZE) { +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + uint64_t mask = get_naturally_aligned_size(start, remain, s->aw_bits); + + assert(mask); + +- entry.iova = start; +- entry.addr_mask = mask - 1; +- entry.target_as = &address_space_memory; +- entry.perm = IOMMU_NONE; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.iova = start; ++ event.entry.addr_mask = mask - 1; ++ event.entry.target_as = &address_space_memory; ++ event.entry.perm = IOMMU_NONE; + /* This field is meaningless for unmap */ +- entry.translated_addr = 0; ++ event.entry.translated_addr = 0; + +- memory_region_notify_iommu_one(n, &entry); ++ memory_region_notify_iommu_one(n, &event); + + start += mask; + remain -= mask; +@@ -3467,9 +3477,9 @@ static void vtd_address_space_refresh_all(IntelIOMMUState *s) + vtd_switch_address_space_all(s); + } + +-static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) ++static int vtd_replay_hook(IOMMUTLBEvent *event, void *private) + { +- memory_region_notify_iommu_one((IOMMUNotifier *)private, entry); ++ memory_region_notify_iommu_one(private, event); + return 0; + } + +diff --git a/hw/misc/tz-mpc.c b/hw/misc/tz-mpc.c +index 49dd6050bd3..e2fbd1065d8 100644 +--- a/hw/misc/tz-mpc.c ++++ b/hw/misc/tz-mpc.c +@@ -82,8 +82,10 @@ static void tz_mpc_iommu_notify(TZMPC *s, uint32_t lutidx, + /* Called when the LUT word at lutidx has changed from oldlut to newlut; + * must call the IOMMU notifiers for the changed blocks. + */ +- IOMMUTLBEntry entry = { +- .addr_mask = s->blocksize - 1, ++ IOMMUTLBEvent event = { ++ .entry = { ++ .addr_mask = s->blocksize - 1, ++ } + }; + hwaddr addr = lutidx * s->blocksize * 32; + int i; +@@ -100,26 +102,28 @@ static void tz_mpc_iommu_notify(TZMPC *s, uint32_t lutidx, + block_is_ns = newlut & (1 << i); + + trace_tz_mpc_iommu_notify(addr); +- entry.iova = addr; +- entry.translated_addr = addr; ++ event.entry.iova = addr; ++ event.entry.translated_addr = addr; + +- entry.perm = IOMMU_NONE; +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, entry); +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, entry); ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.perm = IOMMU_NONE; ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, event); ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, event); + +- entry.perm = IOMMU_RW; ++ event.type = IOMMU_NOTIFIER_MAP; ++ event.entry.perm = IOMMU_RW; + if (block_is_ns) { +- entry.target_as = &s->blocked_io_as; ++ event.entry.target_as = &s->blocked_io_as; + } else { +- entry.target_as = &s->downstream_as; ++ event.entry.target_as = &s->downstream_as; + } +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, entry); ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_S, event); + if (block_is_ns) { +- entry.target_as = &s->downstream_as; ++ event.entry.target_as = &s->downstream_as; + } else { +- entry.target_as = &s->blocked_io_as; ++ event.entry.target_as = &s->blocked_io_as; + } +- memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, entry); ++ memory_region_notify_iommu(&s->upstream, IOMMU_IDX_NS, event); + } + } + +diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c +index 3d3bcc86496..9d3ec7e2c07 100644 +--- a/hw/ppc/spapr_iommu.c ++++ b/hw/ppc/spapr_iommu.c +@@ -445,7 +445,7 @@ static void spapr_tce_reset(DeviceState *dev) + static target_ulong put_tce_emu(SpaprTceTable *tcet, target_ulong ioba, + target_ulong tce) + { +- IOMMUTLBEntry entry; ++ IOMMUTLBEvent event; + hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift); + unsigned long index = (ioba - tcet->bus_offset) >> tcet->page_shift; + +@@ -457,12 +457,13 @@ static target_ulong put_tce_emu(SpaprTceTable *tcet, target_ulong ioba, + + tcet->table[index] = tce; + +- entry.target_as = &address_space_memory, +- entry.iova = (ioba - tcet->bus_offset) & page_mask; +- entry.translated_addr = tce & page_mask; +- entry.addr_mask = ~page_mask; +- entry.perm = spapr_tce_iommu_access_flags(tce); +- memory_region_notify_iommu(&tcet->iommu, 0, entry); ++ event.entry.target_as = &address_space_memory, ++ event.entry.iova = (ioba - tcet->bus_offset) & page_mask; ++ event.entry.translated_addr = tce & page_mask; ++ event.entry.addr_mask = ~page_mask; ++ event.entry.perm = spapr_tce_iommu_access_flags(tce); ++ event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP : IOMMU_NOTIFIER_UNMAP; ++ memory_region_notify_iommu(&tcet->iommu, 0, event); + + return H_SUCCESS; + } +diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c +index 92c7e45df5f..27b189e6d75 100644 +--- a/hw/s390x/s390-pci-inst.c ++++ b/hw/s390x/s390-pci-inst.c +@@ -575,15 +575,18 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + { + S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); +- IOMMUTLBEntry notify = { +- .target_as = &address_space_memory, +- .iova = entry->iova, +- .translated_addr = entry->translated_addr, +- .perm = entry->perm, +- .addr_mask = ~PAGE_MASK, ++ IOMMUTLBEvent event = { ++ .type = entry->perm ? IOMMU_NOTIFIER_MAP : IOMMU_NOTIFIER_UNMAP, ++ .entry = { ++ .target_as = &address_space_memory, ++ .iova = entry->iova, ++ .translated_addr = entry->translated_addr, ++ .perm = entry->perm, ++ .addr_mask = ~PAGE_MASK, ++ }, + }; + +- if (entry->perm == IOMMU_NONE) { ++ if (event.type == IOMMU_NOTIFIER_UNMAP) { + if (!cache) { + return; + } +@@ -595,9 +598,11 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + return; + } + +- notify.perm = IOMMU_NONE; +- memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); +- notify.perm = entry->perm; ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.perm = IOMMU_NONE; ++ memory_region_notify_iommu(&iommu->iommu_mr, 0, event); ++ event.type = IOMMU_NOTIFIER_MAP; ++ event.entry.perm = entry->perm; + } + + cache = g_new(S390IOTLBEntry, 1); +@@ -608,7 +613,7 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + g_hash_table_replace(iommu->iotlb, &cache->iova, cache); + } + +- memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); ++ memory_region_notify_iommu(&iommu->iommu_mr, 0, event); + } + + int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) +diff --git a/include/exec/memory.h b/include/exec/memory.h +index b6466ab6d57..80e36077cdb 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -106,6 +106,11 @@ struct IOMMUNotifier { + }; + typedef struct IOMMUNotifier IOMMUNotifier; + ++typedef struct IOMMUTLBEvent { ++ IOMMUNotifierFlag type; ++ IOMMUTLBEntry entry; ++} IOMMUTLBEvent; ++ + /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ + #define RAM_PREALLOC (1 << 0) + +@@ -1047,24 +1052,18 @@ uint64_t memory_region_iommu_get_min_page_size(IOMMUMemoryRegion *iommu_mr); + /** + * memory_region_notify_iommu: notify a change in an IOMMU translation entry. + * +- * The notification type will be decided by entry.perm bits: +- * +- * - For UNMAP (cache invalidation) notifies: set entry.perm to IOMMU_NONE. +- * - For MAP (newly added entry) notifies: set entry.perm to the +- * permission of the page (which is definitely !IOMMU_NONE). +- * + * Note: for any IOMMU implementation, an in-place mapping change + * should be notified with an UNMAP followed by a MAP. + * + * @iommu_mr: the memory region that was changed + * @iommu_idx: the IOMMU index for the translation table which has changed +- * @entry: the new entry in the IOMMU translation table. The entry +- * replaces all old entries for the same virtual I/O address range. +- * Deleted entries have .@perm == 0. ++ * @event: TLB event with the new entry in the IOMMU translation table. ++ * The entry replaces all old entries for the same virtual I/O address ++ * range. + */ + void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, +- IOMMUTLBEntry entry); ++ IOMMUTLBEvent event); + + /** + * memory_region_notify_iommu_one: notify a change in an IOMMU translation +@@ -1074,12 +1073,12 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + * notifies a specific notifier, not all of them. + * + * @notifier: the notifier to be notified +- * @entry: the new entry in the IOMMU translation table. The entry +- * replaces all old entries for the same virtual I/O address range. +- * Deleted entries have .@perm == 0. ++ * @event: TLB event with the new entry in the IOMMU translation table. ++ * The entry replaces all old entries for the same virtual I/O address ++ * range. + */ + void memory_region_notify_iommu_one(IOMMUNotifier *notifier, +- IOMMUTLBEntry *entry); ++ IOMMUTLBEvent *event); + + /** + * memory_region_register_iommu_notifier: register a notifier for changes to +diff --git a/memory.c b/memory.c +index 43bd3359bf8..3bd99b8ac4a 100644 +--- a/memory.c ++++ b/memory.c +@@ -1912,11 +1912,15 @@ void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + } + + void memory_region_notify_iommu_one(IOMMUNotifier *notifier, +- IOMMUTLBEntry *entry) ++ IOMMUTLBEvent *event) + { +- IOMMUNotifierFlag request_flags; ++ IOMMUTLBEntry *entry = &event->entry; + hwaddr entry_end = entry->iova + entry->addr_mask; + ++ if (event->type == IOMMU_NOTIFIER_UNMAP) { ++ assert(entry->perm == IOMMU_NONE); ++ } ++ + /* + * Skip the notification if the notification does not overlap + * with registered range. +@@ -1927,20 +1931,14 @@ void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + + assert(entry->iova >= notifier->start && entry_end <= notifier->end); + +- if (entry->perm & IOMMU_RW) { +- request_flags = IOMMU_NOTIFIER_MAP; +- } else { +- request_flags = IOMMU_NOTIFIER_UNMAP; +- } +- +- if (notifier->notifier_flags & request_flags) { ++ if (event->type & notifier->notifier_flags) { + notifier->notify(notifier, entry); + } + } + + void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + int iommu_idx, +- IOMMUTLBEntry entry) ++ IOMMUTLBEvent event) + { + IOMMUNotifier *iommu_notifier; + +@@ -1948,7 +1946,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + + IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) { + if (iommu_notifier->iommu_idx == iommu_idx) { +- memory_region_notify_iommu_one(iommu_notifier, &entry); ++ memory_region_notify_iommu_one(iommu_notifier, &event); + } + } + } +-- +2.27.0 + diff --git a/kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch b/kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch new file mode 100755 index 0000000000000000000000000000000000000000..89eb9c9a1511033f954b3cc7f795de3b2267d7a9 --- /dev/null +++ b/kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch @@ -0,0 +1,89 @@ +From f0fa537af2e1e5f827eeb74dc5b3e12776917a67 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:36 -0500 +Subject: [PATCH 12/17] memory: Add IOMMU_NOTIFIER_DEVIOTLB_UNMAP + IOMMUTLBNotificationType +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-12-eperezma@redhat.com> +Patchwork-id: 100604 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 11/13] memory: Add IOMMU_NOTIFIER_DEVIOTLB_UNMAP IOMMUTLBNotificationType +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +This allows us to differentiate between regular IOMMU map/unmap events +and DEVIOTLB unmap. Doing so, notifiers that only need device IOTLB +invalidations will not receive regular IOMMU unmappings. + +Adapt intel and vhost to use it. + +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-4-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit b68ba1ca57677acf870d5ab10579e6105c1f5338) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/i386/intel_iommu.c | 2 +- + hw/virtio/vhost.c | 2 +- + include/exec/memory.h | 7 ++++++- + 3 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 9fedbac82de..3640bc2ed15 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -2411,7 +2411,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + sz = VTD_PAGE_SIZE; + } + +- event.type = IOMMU_NOTIFIER_UNMAP; ++ event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP; + event.entry.target_as = &vtd_dev_as->as; + event.entry.addr_mask = sz - 1; + event.entry.iova = addr; +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 9182a00495e..78a5df3b379 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -704,7 +704,7 @@ static void vhost_iommu_region_add(MemoryListener *listener, + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); + iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, +- IOMMU_NOTIFIER_UNMAP, ++ IOMMU_NOTIFIER_DEVIOTLB_UNMAP, + section->offset_within_region, + int128_get64(end), + iommu_idx); +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 80e36077cdb..403dc0c0572 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -87,9 +87,14 @@ typedef enum { + IOMMU_NOTIFIER_UNMAP = 0x1, + /* Notify entry changes (newly created entries) */ + IOMMU_NOTIFIER_MAP = 0x2, ++ /* Notify changes on device IOTLB entries */ ++ IOMMU_NOTIFIER_DEVIOTLB_UNMAP = 0x04, + } IOMMUNotifierFlag; + +-#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) ++#define IOMMU_NOTIFIER_IOTLB_EVENTS (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) ++#define IOMMU_NOTIFIER_DEVIOTLB_EVENTS IOMMU_NOTIFIER_DEVIOTLB_UNMAP ++#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_IOTLB_EVENTS | \ ++ IOMMU_NOTIFIER_DEVIOTLB_EVENTS) + + struct IOMMUNotifier; + typedef void (*IOMMUNotify)(struct IOMMUNotifier *notifier, +-- +2.27.0 + diff --git a/kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch b/kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch new file mode 100755 index 0000000000000000000000000000000000000000..8921c1457f28144b8933719c694810f56d87f9e3 --- /dev/null +++ b/kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch @@ -0,0 +1,146 @@ +From e876535fd5ed10abf0dbeb55ec7098664412068e Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:34 -0500 +Subject: [PATCH 10/17] memory: Rename memory_region_notify_one to + memory_region_notify_iommu_one +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-10-eperezma@redhat.com> +Patchwork-id: 100602 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 09/13] memory: Rename memory_region_notify_one to memory_region_notify_iommu_one +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +Previous name didn't reflect the iommu operation. + +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: David Gibson +Reviewed-by: Juan Quintela +Reviewed-by: Eric Auger +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-2-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 3b5ebf8532afdc1518bd8b0961ed802bc3f5f07c) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/smmu-common.c | 2 +- + hw/arm/smmuv3.c | 2 +- + hw/i386/intel_iommu.c | 4 ++-- + include/exec/memory.h | 6 +++--- + memory.c | 6 +++--- + 5 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 9780404f002..dfabe381182 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -471,7 +471,7 @@ static void smmu_unmap_notifier_range(IOMMUNotifier *n) + entry.perm = IOMMU_NONE; + entry.addr_mask = n->end - n->start; + +- memory_region_notify_one(n, &entry); ++ memory_region_notify_iommu_one(n, &entry); + } + + /* Unmap all notifiers attached to @mr */ +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index a418fab2aa6..ef8a877c5d8 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -811,7 +811,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + entry.addr_mask = num_pages * (1 << granule) - 1; + entry.perm = IOMMU_NONE; + +- memory_region_notify_one(n, &entry); ++ memory_region_notify_iommu_one(n, &entry); + } + + /* invalidate an asid/iova range tuple in all mr's */ +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 43c94b993b4..463f107ad12 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -3431,7 +3431,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) + /* This field is meaningless for unmap */ + entry.translated_addr = 0; + +- memory_region_notify_one(n, &entry); ++ memory_region_notify_iommu_one(n, &entry); + + start += mask; + remain -= mask; +@@ -3469,7 +3469,7 @@ static void vtd_address_space_refresh_all(IntelIOMMUState *s) + + static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) + { +- memory_region_notify_one((IOMMUNotifier *)private, entry); ++ memory_region_notify_iommu_one((IOMMUNotifier *)private, entry); + return 0; + } + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index e499dc215b3..b6466ab6d57 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -226,7 +226,7 @@ enum IOMMUMemoryRegionAttr { + * The IOMMU implementation must use the IOMMU notifier infrastructure + * to report whenever mappings are changed, by calling + * memory_region_notify_iommu() (or, if necessary, by calling +- * memory_region_notify_one() for each registered notifier). ++ * memory_region_notify_iommu_one() for each registered notifier). + * + * Conceptually an IOMMU provides a mapping from input address + * to an output TLB entry. If the IOMMU is aware of memory transaction +@@ -1067,7 +1067,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + IOMMUTLBEntry entry); + + /** +- * memory_region_notify_one: notify a change in an IOMMU translation ++ * memory_region_notify_iommu_one: notify a change in an IOMMU translation + * entry to a single notifier + * + * This works just like memory_region_notify_iommu(), but it only +@@ -1078,7 +1078,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + * replaces all old entries for the same virtual I/O address range. + * Deleted entries have .@perm == 0. + */ +-void memory_region_notify_one(IOMMUNotifier *notifier, ++void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + IOMMUTLBEntry *entry); + + /** +diff --git a/memory.c b/memory.c +index 06484c2bff2..43bd3359bf8 100644 +--- a/memory.c ++++ b/memory.c +@@ -1911,8 +1911,8 @@ void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + memory_region_update_iommu_notify_flags(iommu_mr, NULL); + } + +-void memory_region_notify_one(IOMMUNotifier *notifier, +- IOMMUTLBEntry *entry) ++void memory_region_notify_iommu_one(IOMMUNotifier *notifier, ++ IOMMUTLBEntry *entry) + { + IOMMUNotifierFlag request_flags; + hwaddr entry_end = entry->iova + entry->addr_mask; +@@ -1948,7 +1948,7 @@ void memory_region_notify_iommu(IOMMUMemoryRegion *iommu_mr, + + IOMMU_NOTIFIER_FOREACH(iommu_notifier, iommu_mr) { + if (iommu_notifier->iommu_idx == iommu_idx) { +- memory_region_notify_one(iommu_notifier, &entry); ++ memory_region_notify_iommu_one(iommu_notifier, &entry); + } + } + } +-- +2.27.0 + diff --git a/kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch b/kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch new file mode 100755 index 0000000000000000000000000000000000000000..f81c86fc04d23c619c5bd5771bb70cab0878d79c --- /dev/null +++ b/kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch @@ -0,0 +1,104 @@ +From 13f4ebe4708f4f4dc20d710e475a42d520459860 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:03 -0400 +Subject: [PATCH 4/7] memory: Revert "memory: accept mismatching sizes in + memory_region_access_valid" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-4-jmaloy@redhat.com> +Patchwork-id: 101480 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 3/6] memory: Revert "memory: accept mismatching sizes in memory_region_access_valid" +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: "Michael S. Tsirkin" + +Memory API documentation documents valid .min_access_size and .max_access_size +fields and explains that any access outside these boundaries is blocked. + +This is what devices seem to assume. + +However this is not what the implementation does: it simply +ignores the boundaries unless there's an "accepts" callback. + +Naturally, this breaks a bunch of devices. + +Revert to the documented behaviour. + +Devices that want to allow any access can just drop the valid field, +or add the impl field to have accesses converted to appropriate +length. + +Cc: qemu-stable@nongnu.org +Reviewed-by: Richard Henderson +Fixes: CVE-2020-13754 +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1842363 +Fixes: a014ed07bd5a ("memory: accept mismatching sizes in memory_region_access_valid") +Signed-off-by: Michael S. Tsirkin +Message-Id: <20200610134731.1514409-1-mst@redhat.com> +Signed-off-by: Paolo Bonzini + +(cherry picked from commit 5d971f9e672507210e77d020d89e0e89165c8fc9) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + memory.c | 29 +++++++++-------------------- + 1 file changed, 9 insertions(+), 20 deletions(-) + +diff --git a/memory.c b/memory.c +index 5a4a80842d..0cfcb72a5a 100644 +--- a/memory.c ++++ b/memory.c +@@ -1351,35 +1351,24 @@ bool memory_region_access_valid(MemoryRegion *mr, + bool is_write, + MemTxAttrs attrs) + { +- int access_size_min, access_size_max; +- int access_size, i; +- +- if (!mr->ops->valid.unaligned && (addr & (size - 1))) { ++ if (mr->ops->valid.accepts ++ && !mr->ops->valid.accepts(mr->opaque, addr, size, is_write, attrs)) { + return false; + } + +- if (!mr->ops->valid.accepts) { +- return true; +- } +- +- access_size_min = mr->ops->valid.min_access_size; +- if (!mr->ops->valid.min_access_size) { +- access_size_min = 1; ++ if (!mr->ops->valid.unaligned && (addr & (size - 1))) { ++ return false; + } + +- access_size_max = mr->ops->valid.max_access_size; ++ /* Treat zero as compatibility all valid */ + if (!mr->ops->valid.max_access_size) { +- access_size_max = 4; ++ return true; + } + +- access_size = MAX(MIN(size, access_size_max), access_size_min); +- for (i = 0; i < size; i += access_size) { +- if (!mr->ops->valid.accepts(mr->opaque, addr + i, access_size, +- is_write, attrs)) { +- return false; +- } ++ if (size > mr->ops->valid.max_access_size ++ || size < mr->ops->valid.min_access_size) { ++ return false; + } +- + return true; + } + +-- +2.27.0 + diff --git a/kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch b/kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch new file mode 100755 index 0000000000000000000000000000000000000000..de569019b24659d628aaea1111543eaf675469ca --- /dev/null +++ b/kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch @@ -0,0 +1,70 @@ +From 8c5154729effda3f762bfb8224f9c61dab8b2986 Mon Sep 17 00:00:00 2001 +From: eperezma +Date: Tue, 12 Jan 2021 14:36:38 -0500 +Subject: [PATCH 14/17] memory: Skip bad range assertion if notifier is + DEVIOTLB_UNMAP type +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: eperezma +Message-id: <20210112143638.374060-14-eperezma@redhat.com> +Patchwork-id: 100606 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 13/13] memory: Skip bad range assertion if notifier is DEVIOTLB_UNMAP type +Bugzilla: 1843852 +RH-Acked-by: Xiao Wang +RH-Acked-by: Peter Xu +RH-Acked-by: Auger Eric + +Device IOTLB invalidations can unmap arbitrary ranges, eiter outside of +the memory region or even [0, ~0ULL] for all the space. The assertion +could be hit by a guest, and rhel7 guest effectively hit it. + +Signed-off-by: Eugenio Pérez +Reviewed-by: Peter Xu +Reviewed-by: Juan Quintela +Acked-by: Jason Wang +Message-Id: <20201116165506.31315-6-eperezma@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 1804857f19f612f6907832e35599cdb51d4ec764) +Signed-off-by: Eugenio Pérez +Signed-off-by: Danilo C. L. de Paula +--- + memory.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/memory.c b/memory.c +index 3bd99b8ac4a..5a4a80842d7 100644 +--- a/memory.c ++++ b/memory.c +@@ -1916,6 +1916,7 @@ void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + { + IOMMUTLBEntry *entry = &event->entry; + hwaddr entry_end = entry->iova + entry->addr_mask; ++ IOMMUTLBEntry tmp = *entry; + + if (event->type == IOMMU_NOTIFIER_UNMAP) { + assert(entry->perm == IOMMU_NONE); +@@ -1929,10 +1930,16 @@ void memory_region_notify_iommu_one(IOMMUNotifier *notifier, + return; + } + +- assert(entry->iova >= notifier->start && entry_end <= notifier->end); ++ if (notifier->notifier_flags & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) { ++ /* Crop (iova, addr_mask) to range */ ++ tmp.iova = MAX(tmp.iova, notifier->start); ++ tmp.addr_mask = MIN(entry_end, notifier->end) - tmp.iova; ++ } else { ++ assert(entry->iova >= notifier->start && entry_end <= notifier->end); ++ } + + if (event->type & notifier->notifier_flags) { +- notifier->notify(notifier, entry); ++ notifier->notify(notifier, &tmp); + } + } + +-- +2.27.0 + diff --git a/kvm-memory-clamp-cached-translation-in-case-it-points-to.patch b/kvm-memory-clamp-cached-translation-in-case-it-points-to.patch new file mode 100755 index 0000000000000000000000000000000000000000..8b8f67adf2f1fbfaefcbb3c431080446af396de8 --- /dev/null +++ b/kvm-memory-clamp-cached-translation-in-case-it-points-to.patch @@ -0,0 +1,87 @@ +From 354946f1e5fee0a69282bdf284c969b03a78a53e Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 13 Jan 2021 00:42:23 -0500 +Subject: [PATCH 15/17] memory: clamp cached translation in case it points to + an MMIO region + +RH-Author: Jon Maloy +Message-id: <20210113004223.871394-2-jmaloy@redhat.com> +Patchwork-id: 100618 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] memory: clamp cached translation in case it points to an MMIO region +Bugzilla: 1904393 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Thomas Huth + +From: Paolo Bonzini + +In using the address_space_translate_internal API, address_space_cache_init +forgot one piece of advice that can be found in the code for +address_space_translate_internal: + + /* MMIO registers can be expected to perform full-width accesses based only + * on their address, without considering adjacent registers that could + * decode to completely different MemoryRegions. When such registers + * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO + * regions overlap wildly. For this reason we cannot clamp the accesses + * here. + * + * If the length is small (as is the case for address_space_ldl/stl), + * everything works fine. If the incoming length is large, however, + * the caller really has to do the clamping through memory_access_size. + */ + +address_space_cache_init is exactly one such case where "the incoming length +is large", therefore we need to clamp the resulting length---not to +memory_access_size though, since we are not doing an access yet, but to +the size of the resulting section. This ensures that subsequent accesses +to the cached MemoryRegionSection will be in range. + +With this patch, the enclosed testcase notices that the used ring does +not fit into the MSI-X table and prints a "qemu-system-x86_64: Cannot map used" +error. + +Signed-off-by: Paolo Bonzini + +(cherry picked from 4bfb024bc76973d40a359476dc0291f46e435442) +- Manually applied to file exec.c, where the code to correct + is located in this version. +- Skipped the fuzzing test part, which is hard to apply on this code. +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. de Paula +--- + exec.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/exec.c b/exec.c +index ffdb5185353..09ed0cfc756 100644 +--- a/exec.c ++++ b/exec.c +@@ -3620,6 +3620,7 @@ int64_t address_space_cache_init(MemoryRegionCache *cache, + AddressSpaceDispatch *d; + hwaddr l; + MemoryRegion *mr; ++ Int128 diff; + + assert(len > 0); + +@@ -3628,6 +3629,15 @@ int64_t address_space_cache_init(MemoryRegionCache *cache, + d = flatview_to_dispatch(cache->fv); + cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true); + ++ /* ++ * cache->xlat is now relative to cache->mrs.mr, not to the section itself. ++ * Take that into account to compute how many bytes are there between ++ * cache->xlat and the end of the section. ++ */ ++ diff = int128_sub(cache->mrs.size, ++ int128_make64(cache->xlat - cache->mrs.offset_within_region)); ++ l = int128_get64(int128_min(diff, int128_make64(l))); ++ + mr = cache->mrs.mr; + memory_region_ref(mr); + if (memory_access_is_direct(mr, is_write)) { +-- +2.27.0 + diff --git a/kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch b/kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch new file mode 100755 index 0000000000000000000000000000000000000000..3477af5b7ac0ba1572547454bf6760011c40038a --- /dev/null +++ b/kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch @@ -0,0 +1,179 @@ +From 38a032829b6b8d523b4cee05f732031e66fc2e41 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 31 Jan 2020 17:12:56 +0000 +Subject: [PATCH 14/15] migration: Change SaveStateEntry.instance_id into + uint32_t + +RH-Author: Peter Xu +Message-id: <20200131171257.1066593-3-peterx@redhat.com> +Patchwork-id: 93629 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/3] migration: Change SaveStateEntry.instance_id into uint32_t +Bugzilla: 1529231 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +It was always used as 32bit, so define it as used to be clear. +Instead of using -1 as the auto-gen magic value, we switch to +UINT32_MAX. We also make sure that we don't auto-gen this value to +avoid overflowed instance IDs without being noticed. + +Suggested-by: Juan Quintela +Signed-off-by: Peter Xu +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 93062e23619e057743757ee53bf7f8e07f7a3710) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + include/migration/vmstate.h + migration/savevm.c + stubs/vmstate.c + Due to missing 3cad405bab ("vmstate: replace DeviceState with + VMStateIf", 2020-01-06) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/apic_common.c | 2 +- + include/migration/register.h | 2 +- + include/migration/vmstate.h | 2 +- + migration/savevm.c | 18 ++++++++++-------- + stubs/vmstate.c | 2 +- + 5 files changed, 14 insertions(+), 12 deletions(-) + +diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c +index f2c3a7f..54b8731 100644 +--- a/hw/intc/apic_common.c ++++ b/hw/intc/apic_common.c +@@ -268,7 +268,7 @@ static void apic_common_realize(DeviceState *dev, Error **errp) + APICCommonState *s = APIC_COMMON(dev); + APICCommonClass *info; + static DeviceState *vapic; +- int instance_id = s->id; ++ uint32_t instance_id = s->id; + + info = APIC_COMMON_GET_CLASS(s); + info->realize(dev, errp); +diff --git a/include/migration/register.h b/include/migration/register.h +index a13359a..f3ba10b 100644 +--- a/include/migration/register.h ++++ b/include/migration/register.h +@@ -69,7 +69,7 @@ typedef struct SaveVMHandlers { + } SaveVMHandlers; + + int register_savevm_live(const char *idstr, +- int instance_id, ++ uint32_t instance_id, + int version_id, + const SaveVMHandlers *ops, + void *opaque); +diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h +index 883f1cf..296609c 100644 +--- a/include/migration/vmstate.h ++++ b/include/migration/vmstate.h +@@ -1158,7 +1158,7 @@ bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque); + #define VMSTATE_INSTANCE_ID_ANY -1 + + /* Returns: 0 on success, -1 on failure */ +-int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, ++int vmstate_register_with_alias_id(DeviceState *dev, uint32_t instance_id, + const VMStateDescription *vmsd, + void *base, int alias_id, + int required_for_version, +diff --git a/migration/savevm.c b/migration/savevm.c +index e2e8e0a..a80bb52 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -233,7 +233,7 @@ typedef struct CompatEntry { + typedef struct SaveStateEntry { + QTAILQ_ENTRY(SaveStateEntry) entry; + char idstr[256]; +- int instance_id; ++ uint32_t instance_id; + int alias_id; + int version_id; + /* version id read from the stream */ +@@ -665,10 +665,10 @@ void dump_vmstate_json_to_file(FILE *out_file) + fclose(out_file); + } + +-static int calculate_new_instance_id(const char *idstr) ++static uint32_t calculate_new_instance_id(const char *idstr) + { + SaveStateEntry *se; +- int instance_id = 0; ++ uint32_t instance_id = 0; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (strcmp(idstr, se->idstr) == 0 +@@ -676,6 +676,8 @@ static int calculate_new_instance_id(const char *idstr) + instance_id = se->instance_id + 1; + } + } ++ /* Make sure we never loop over without being noticed */ ++ assert(instance_id != VMSTATE_INSTANCE_ID_ANY); + return instance_id; + } + +@@ -730,7 +732,7 @@ static void savevm_state_handler_insert(SaveStateEntry *nse) + Meanwhile pass -1 as instance_id if you do not already have a clearly + distinguishing id for all instances of your device class. */ + int register_savevm_live(const char *idstr, +- int instance_id, ++ uint32_t instance_id, + int version_id, + const SaveVMHandlers *ops, + void *opaque) +@@ -784,7 +786,7 @@ void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque) + } + } + +-int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, ++int vmstate_register_with_alias_id(DeviceState *dev, uint32_t instance_id, + const VMStateDescription *vmsd, + void *opaque, int alias_id, + int required_for_version, +@@ -1600,7 +1602,7 @@ int qemu_save_device_state(QEMUFile *f) + return qemu_file_get_error(f); + } + +-static SaveStateEntry *find_se(const char *idstr, int instance_id) ++static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id) + { + SaveStateEntry *se; + +@@ -2267,7 +2269,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis) + /* Find savevm section */ + se = find_se(idstr, instance_id); + if (se == NULL) { +- error_report("Unknown savevm section or instance '%s' %d. " ++ error_report("Unknown savevm section or instance '%s' %"PRIu32". " + "Make sure that your current VM setup matches your " + "saved VM setup, including any hotplugged devices", + idstr, instance_id); +@@ -2291,7 +2293,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis) + + ret = vmstate_load(f, se); + if (ret < 0) { +- error_report("error while loading state for instance 0x%x of" ++ error_report("error while loading state for instance 0x%"PRIx32" of" + " device '%s'", instance_id, idstr); + return ret; + } +diff --git a/stubs/vmstate.c b/stubs/vmstate.c +index e1e89b8..4ed5cc6 100644 +--- a/stubs/vmstate.c ++++ b/stubs/vmstate.c +@@ -4,7 +4,7 @@ + const VMStateDescription vmstate_dummy = {}; + + int vmstate_register_with_alias_id(DeviceState *dev, +- int instance_id, ++ uint32_t instance_id, + const VMStateDescription *vmsd, + void *base, int alias_id, + int required_for_version, +-- +1.8.3.1 + diff --git a/kvm-migration-Create-migration_is_running.patch b/kvm-migration-Create-migration_is_running.patch new file mode 100755 index 0000000000000000000000000000000000000000..c9593de63c2bd6601c73935de3615961153e13d7 --- /dev/null +++ b/kvm-migration-Create-migration_is_running.patch @@ -0,0 +1,119 @@ +From c9e3d13d70a24bf606ce351886b27bdca25ef4dc Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:41 +0000 +Subject: [PATCH 09/18] migration: Create migration_is_running() + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-9-quintela@redhat.com> +Patchwork-id: 94115 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 08/10] migration: Create migration_is_running() +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +This function returns true if we are in the middle of a migration. +It is like migration_is_setup_or_active() with CANCELLING and COLO. +Adapt all callers that are needed. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 392d87e21325fdb01210176faa07472b4985ccf0) +Signed-off-by: Danilo C. L. de Paula +--- + migration/migration.c | 29 ++++++++++++++++++++++++----- + migration/migration.h | 1 + + migration/savevm.c | 4 +--- + 3 files changed, 26 insertions(+), 8 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 30c53c6..eb50d77 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -831,6 +831,27 @@ bool migration_is_setup_or_active(int state) + } + } + ++bool migration_is_running(int state) ++{ ++ switch (state) { ++ case MIGRATION_STATUS_ACTIVE: ++ case MIGRATION_STATUS_POSTCOPY_ACTIVE: ++ case MIGRATION_STATUS_POSTCOPY_PAUSED: ++ case MIGRATION_STATUS_POSTCOPY_RECOVER: ++ case MIGRATION_STATUS_SETUP: ++ case MIGRATION_STATUS_PRE_SWITCHOVER: ++ case MIGRATION_STATUS_DEVICE: ++ case MIGRATION_STATUS_WAIT_UNPLUG: ++ case MIGRATION_STATUS_CANCELLING: ++ case MIGRATION_STATUS_COLO: ++ return true; ++ ++ default: ++ return false; ++ ++ } ++} ++ + static void populate_time_info(MigrationInfo *info, MigrationState *s) + { + info->has_status = true; +@@ -1090,7 +1111,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, + MigrationCapabilityStatusList *cap; + bool cap_list[MIGRATION_CAPABILITY__MAX]; + +- if (migration_is_setup_or_active(s->state)) { ++ if (migration_is_running(s->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return; + } +@@ -1603,7 +1624,7 @@ static void migrate_fd_cancel(MigrationState *s) + + do { + old_state = s->state; +- if (!migration_is_setup_or_active(old_state)) { ++ if (!migration_is_running(old_state)) { + break; + } + /* If the migration is paused, kick it out of the pause */ +@@ -1900,9 +1921,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, + return true; + } + +- if (migration_is_setup_or_active(s->state) || +- s->state == MIGRATION_STATUS_CANCELLING || +- s->state == MIGRATION_STATUS_COLO) { ++ if (migration_is_running(s->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return false; + } +diff --git a/migration/migration.h b/migration/migration.h +index 0b1b0d4..a2b2336 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -279,6 +279,7 @@ void migrate_fd_error(MigrationState *s, const Error *error); + void migrate_fd_connect(MigrationState *s, Error *error_in); + + bool migration_is_setup_or_active(int state); ++bool migration_is_running(int state); + + void migrate_init(MigrationState *s); + bool migration_is_blocked(Error **errp); +diff --git a/migration/savevm.c b/migration/savevm.c +index a80bb52..144ecf0 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -1506,9 +1506,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) + MigrationState *ms = migrate_get_current(); + MigrationStatus status; + +- if (migration_is_setup_or_active(ms->state) || +- ms->state == MIGRATION_STATUS_CANCELLING || +- ms->state == MIGRATION_STATUS_COLO) { ++ if (migration_is_running(ms->state)) { + error_setg(errp, QERR_MIGRATION_ACTIVE); + return -EINVAL; + } +-- +1.8.3.1 + diff --git a/kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch b/kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch new file mode 100755 index 0000000000000000000000000000000000000000..c2ead53afa88a8a9d896fbd01038d0be3ae99370 --- /dev/null +++ b/kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch @@ -0,0 +1,257 @@ +From 2659af9267586fb626f543773bf3f844727e473b Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 31 Jan 2020 17:12:55 +0000 +Subject: [PATCH 13/15] migration: Define VMSTATE_INSTANCE_ID_ANY + +RH-Author: Peter Xu +Message-id: <20200131171257.1066593-2-peterx@redhat.com> +Patchwork-id: 93630 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/3] migration: Define VMSTATE_INSTANCE_ID_ANY +Bugzilla: 1529231 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +Define the new macro VMSTATE_INSTANCE_ID_ANY for callers who wants to +auto-generate the vmstate instance ID. Previously it was hard coded +as -1 instead of this macro. It helps to change this default value in +the follow up patches. No functional change. + +Signed-off-by: Peter Xu +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 1df2c9a26fcb2fa32d099f8e9adcdae4207872e3) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + backends/dbus-vmstate.c + File deleted + hw/core/qdev.c + hw/misc/max111x.c + hw/net/eepro100.c + Due to missing commit 3cad405bab ("vmstate: replace + DeviceState with VMStateIf", 2020-01-06) + +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/stellaris.c | 2 +- + hw/core/qdev.c | 3 ++- + hw/display/ads7846.c | 2 +- + hw/i2c/core.c | 2 +- + hw/input/stellaris_input.c | 3 ++- + hw/intc/apic_common.c | 2 +- + hw/misc/max111x.c | 2 +- + hw/net/eepro100.c | 2 +- + hw/pci/pci.c | 2 +- + hw/ppc/spapr.c | 2 +- + hw/timer/arm_timer.c | 2 +- + hw/tpm/tpm_emulator.c | 3 ++- + include/migration/vmstate.h | 2 ++ + migration/savevm.c | 8 ++++---- + 14 files changed, 21 insertions(+), 16 deletions(-) + +diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c +index b198066..bb025e0 100644 +--- a/hw/arm/stellaris.c ++++ b/hw/arm/stellaris.c +@@ -708,7 +708,7 @@ static int stellaris_sys_init(uint32_t base, qemu_irq irq, + memory_region_init_io(&s->iomem, NULL, &ssys_ops, s, "ssys", 0x00001000); + memory_region_add_subregion(get_system_memory(), base, &s->iomem); + ssys_reset(s); +- vmstate_register(NULL, -1, &vmstate_stellaris_sys, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_stellaris_sys, s); + return 0; + } + +diff --git a/hw/core/qdev.c b/hw/core/qdev.c +index cf1ba28..40f6b2b 100644 +--- a/hw/core/qdev.c ++++ b/hw/core/qdev.c +@@ -890,7 +890,8 @@ static void device_set_realized(Object *obj, bool value, Error **errp) + dev->canonical_path = object_get_canonical_path(OBJECT(dev)); + + if (qdev_get_vmsd(dev)) { +- if (vmstate_register_with_alias_id(dev, -1, qdev_get_vmsd(dev), dev, ++ if (vmstate_register_with_alias_id(dev, VMSTATE_INSTANCE_ID_ANY, ++ qdev_get_vmsd(dev), dev, + dev->instance_id_alias, + dev->alias_required_for_version, + &local_err) < 0) { +diff --git a/hw/display/ads7846.c b/hw/display/ads7846.c +index c12272a..9228b40 100644 +--- a/hw/display/ads7846.c ++++ b/hw/display/ads7846.c +@@ -154,7 +154,7 @@ static void ads7846_realize(SSISlave *d, Error **errp) + + ads7846_int_update(s); + +- vmstate_register(NULL, -1, &vmstate_ads7846, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_ads7846, s); + } + + static void ads7846_class_init(ObjectClass *klass, void *data) +diff --git a/hw/i2c/core.c b/hw/i2c/core.c +index 92cd489..d770035 100644 +--- a/hw/i2c/core.c ++++ b/hw/i2c/core.c +@@ -61,7 +61,7 @@ I2CBus *i2c_init_bus(DeviceState *parent, const char *name) + + bus = I2C_BUS(qbus_create(TYPE_I2C_BUS, parent, name)); + QLIST_INIT(&bus->current_devs); +- vmstate_register(NULL, -1, &vmstate_i2c_bus, bus); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_i2c_bus, bus); + return bus; + } + +diff --git a/hw/input/stellaris_input.c b/hw/input/stellaris_input.c +index 59892b0..e6ee5e1 100644 +--- a/hw/input/stellaris_input.c ++++ b/hw/input/stellaris_input.c +@@ -88,5 +88,6 @@ void stellaris_gamepad_init(int n, qemu_irq *irq, const int *keycode) + } + s->num_buttons = n; + qemu_add_kbd_event_handler(stellaris_gamepad_put_key, s); +- vmstate_register(NULL, -1, &vmstate_stellaris_gamepad, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, ++ &vmstate_stellaris_gamepad, s); + } +diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c +index 375cb6a..f2c3a7f 100644 +--- a/hw/intc/apic_common.c ++++ b/hw/intc/apic_common.c +@@ -284,7 +284,7 @@ static void apic_common_realize(DeviceState *dev, Error **errp) + } + + if (s->legacy_instance_id) { +- instance_id = -1; ++ instance_id = VMSTATE_INSTANCE_ID_ANY; + } + vmstate_register_with_alias_id(NULL, instance_id, &vmstate_apic_common, + s, -1, 0, NULL); +diff --git a/hw/misc/max111x.c b/hw/misc/max111x.c +index a713149..81ee73e 100644 +--- a/hw/misc/max111x.c ++++ b/hw/misc/max111x.c +@@ -146,7 +146,7 @@ static int max111x_init(SSISlave *d, int inputs) + s->input[7] = 0x80; + s->com = 0; + +- vmstate_register(dev, -1, &vmstate_max111x, s); ++ vmstate_register(dev, VMSTATE_INSTANCE_ID_ANY, &vmstate_max111x, s); + return 0; + } + +diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c +index cc2dd8b..39920c6 100644 +--- a/hw/net/eepro100.c ++++ b/hw/net/eepro100.c +@@ -1874,7 +1874,7 @@ static void e100_nic_realize(PCIDevice *pci_dev, Error **errp) + + s->vmstate = g_memdup(&vmstate_eepro100, sizeof(vmstate_eepro100)); + s->vmstate->name = qemu_get_queue(s->nic)->model; +- vmstate_register(&pci_dev->qdev, -1, s->vmstate, s); ++ vmstate_register(&pci_dev->qdev, VMSTATE_INSTANCE_ID_ANY, s->vmstate, s); + } + + static void eepro100_instance_init(Object *obj) +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index cbc7a32..fed019d 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -124,7 +124,7 @@ static void pci_bus_realize(BusState *qbus, Error **errp) + bus->machine_done.notify = pcibus_machine_done; + qemu_add_machine_init_done_notifier(&bus->machine_done); + +- vmstate_register(NULL, -1, &vmstate_pcibus, bus); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_pcibus, bus); + } + + static void pcie_bus_realize(BusState *qbus, Error **errp) +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 8749c72..c12862d 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -3028,7 +3028,7 @@ static void spapr_machine_init(MachineState *machine) + * interface, this is a legacy from the sPAPREnvironment structure + * which predated MachineState but had a similar function */ + vmstate_register(NULL, 0, &vmstate_spapr, spapr); +- register_savevm_live("spapr/htab", -1, 1, ++ register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1, + &savevm_htab_handlers, spapr); + + qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine), +diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c +index af524fa..beaa285 100644 +--- a/hw/timer/arm_timer.c ++++ b/hw/timer/arm_timer.c +@@ -180,7 +180,7 @@ static arm_timer_state *arm_timer_init(uint32_t freq) + s->control = TIMER_CTRL_IE; + + s->timer = ptimer_init(arm_timer_tick, s, PTIMER_POLICY_DEFAULT); +- vmstate_register(NULL, -1, &vmstate_arm_timer, s); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_arm_timer, s); + return s; + } + +diff --git a/hw/tpm/tpm_emulator.c b/hw/tpm/tpm_emulator.c +index 22f9113..da7b490 100644 +--- a/hw/tpm/tpm_emulator.c ++++ b/hw/tpm/tpm_emulator.c +@@ -914,7 +914,8 @@ static void tpm_emulator_inst_init(Object *obj) + tpm_emu->cur_locty_number = ~0; + qemu_mutex_init(&tpm_emu->mutex); + +- vmstate_register(NULL, -1, &vmstate_tpm_emulator, obj); ++ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, ++ &vmstate_tpm_emulator, obj); + } + + /* +diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h +index ac4f46a..883f1cf 100644 +--- a/include/migration/vmstate.h ++++ b/include/migration/vmstate.h +@@ -1155,6 +1155,8 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, + + bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque); + ++#define VMSTATE_INSTANCE_ID_ANY -1 ++ + /* Returns: 0 on success, -1 on failure */ + int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, + const VMStateDescription *vmsd, +diff --git a/migration/savevm.c b/migration/savevm.c +index a71b930..e2e8e0a 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -750,7 +750,7 @@ int register_savevm_live(const char *idstr, + + pstrcat(se->idstr, sizeof(se->idstr), idstr); + +- if (instance_id == -1) { ++ if (instance_id == VMSTATE_INSTANCE_ID_ANY) { + se->instance_id = calculate_new_instance_id(se->idstr); + } else { + se->instance_id = instance_id; +@@ -817,14 +817,14 @@ int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, + + se->compat = g_new0(CompatEntry, 1); + pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name); +- se->compat->instance_id = instance_id == -1 ? ++ se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ? + calculate_compat_instance_id(vmsd->name) : instance_id; +- instance_id = -1; ++ instance_id = VMSTATE_INSTANCE_ID_ANY; + } + } + pstrcat(se->idstr, sizeof(se->idstr), vmsd->name); + +- if (instance_id == -1) { ++ if (instance_id == VMSTATE_INSTANCE_ID_ANY) { + se->instance_id = calculate_new_instance_id(se->idstr); + } else { + se->instance_id = instance_id; +-- +1.8.3.1 + diff --git a/kvm-migration-Don-t-send-data-if-we-have-stopped.patch b/kvm-migration-Don-t-send-data-if-we-have-stopped.patch new file mode 100755 index 0000000000000000000000000000000000000000..9a367141632df7bdc0d8bcb934b8b7debd087028 --- /dev/null +++ b/kvm-migration-Don-t-send-data-if-we-have-stopped.patch @@ -0,0 +1,42 @@ +From ab07e0b41c50a85940d798a9a65a58698fd2edfb Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:40 +0000 +Subject: [PATCH 08/18] migration: Don't send data if we have stopped + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-8-quintela@redhat.com> +Patchwork-id: 94114 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 07/10] migration: Don't send data if we have stopped +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +If we do a cancel, we got out without one error, but we can't do the +rest of the output as in a normal situation. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit b69a0227a803256ad270283872d40ff768f4d56d) +Signed-off-by: Danilo C. L. de Paula +--- + migration/ram.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/migration/ram.c b/migration/ram.c +index a0257ee..902c56c 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3511,7 +3511,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) + ram_control_after_iterate(f, RAM_CONTROL_ROUND); + + out: +- if (ret >= 0) { ++ if (ret >= 0 ++ && migration_is_setup_or_active(migrate_get_current()->state)) { + multifd_send_sync_main(rs); + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + qemu_fflush(f); +-- +1.8.3.1 + diff --git a/kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch b/kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch new file mode 100755 index 0000000000000000000000000000000000000000..01cb0f1a4cdd9ac55d58e6b3cb6e0b6403c7e50e --- /dev/null +++ b/kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch @@ -0,0 +1,94 @@ +From 71b05ab5782aa1e38c016be6264a14f5650d2a87 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:35 +0000 +Subject: [PATCH 03/18] migration: Make sure that we don't call write() in case + of error + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-3-quintela@redhat.com> +Patchwork-id: 94113 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 02/10] migration: Make sure that we don't call write() in case of error +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +If we are exiting due to an error/finish/.... Just don't try to even +touch the channel with one IO operation. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Juan Quintela +(cherry picked from commit 4d65a6216bfc44891ac298b74a6921d479805131) +Signed-off-by: Danilo C. L. de Paula +--- + migration/ram.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/migration/ram.c b/migration/ram.c +index 65580e3..8c783b3 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -899,6 +899,12 @@ struct { + uint64_t packet_num; + /* send channels ready */ + QemuSemaphore channels_ready; ++ /* ++ * Have we already run terminate threads. There is a race when it ++ * happens that we got one error while we are exiting. ++ * We will use atomic operations. Only valid values are 0 and 1. ++ */ ++ int exiting; + } *multifd_send_state; + + /* +@@ -927,6 +933,10 @@ static int multifd_send_pages(RAMState *rs) + MultiFDPages_t *pages = multifd_send_state->pages; + uint64_t transferred; + ++ if (atomic_read(&multifd_send_state->exiting)) { ++ return -1; ++ } ++ + qemu_sem_wait(&multifd_send_state->channels_ready); + for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { + p = &multifd_send_state->params[i]; +@@ -1008,6 +1018,16 @@ static void multifd_send_terminate_threads(Error *err) + } + } + ++ /* ++ * We don't want to exit each threads twice. Depending on where ++ * we get the error, or if there are two independent errors in two ++ * threads at the same time, we can end calling this function ++ * twice. ++ */ ++ if (atomic_xchg(&multifd_send_state->exiting, 1)) { ++ return; ++ } ++ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +@@ -1117,6 +1137,10 @@ static void *multifd_send_thread(void *opaque) + + while (true) { + qemu_sem_wait(&p->sem); ++ ++ if (atomic_read(&multifd_send_state->exiting)) { ++ break; ++ } + qemu_mutex_lock(&p->mutex); + + if (p->pending_job) { +@@ -1225,6 +1249,7 @@ int multifd_save_setup(void) + multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); + multifd_send_state->pages = multifd_pages_init(page_count); + qemu_sem_init(&multifd_send_state->channels_ready, 0); ++ atomic_set(&multifd_send_state->exiting, 0); + + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; +-- +1.8.3.1 + diff --git a/kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch b/kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch new file mode 100755 index 0000000000000000000000000000000000000000..4a7fb28047727ceb64aa1b08962862874eee627c --- /dev/null +++ b/kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch @@ -0,0 +1,70 @@ +From 3c4f6f0c2bf5562f2aa26f964848ae53e6ac4790 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:43 +0000 +Subject: [PATCH 11/18] migration: Maybe VM is paused when migration is + cancelled + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-11-quintela@redhat.com> +Patchwork-id: 94120 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 10/10] migration: Maybe VM is paused when migration is cancelled +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Zhimin Feng + +If the migration is cancelled when it is in the completion phase, +the migration state is set to MIGRATION_STATUS_CANCELLING. +The VM maybe wait for the 'pause_sem' semaphore in migration_maybe_pause +function, so that VM always is paused. + +Reported-by: Euler Robot +Signed-off-by: Zhimin Feng +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 8958338b10abcb346b54a8038a491fda2db1c853) +Signed-off-by: Danilo C. L. de Paula +--- + migration/migration.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index eb50d77..ed18c59 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -2786,14 +2786,22 @@ static int migration_maybe_pause(MigrationState *s, + /* This block intentionally left blank */ + } + +- qemu_mutex_unlock_iothread(); +- migrate_set_state(&s->state, *current_active_state, +- MIGRATION_STATUS_PRE_SWITCHOVER); +- qemu_sem_wait(&s->pause_sem); +- migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, +- new_state); +- *current_active_state = new_state; +- qemu_mutex_lock_iothread(); ++ /* ++ * If the migration is cancelled when it is in the completion phase, ++ * the migration state is set to MIGRATION_STATUS_CANCELLING. ++ * So we don't need to wait a semaphore, otherwise we would always ++ * wait for the 'pause_sem' semaphore. ++ */ ++ if (s->state != MIGRATION_STATUS_CANCELLING) { ++ qemu_mutex_unlock_iothread(); ++ migrate_set_state(&s->state, *current_active_state, ++ MIGRATION_STATUS_PRE_SWITCHOVER); ++ qemu_sem_wait(&s->pause_sem); ++ migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, ++ new_state); ++ *current_active_state = new_state; ++ qemu_mutex_lock_iothread(); ++ } + + return s->state == new_state ? 0 : -EINVAL; + } +-- +1.8.3.1 + diff --git a/kvm-migration-Rate-limit-inside-host-pages.patch b/kvm-migration-Rate-limit-inside-host-pages.patch new file mode 100755 index 0000000000000000000000000000000000000000..2d3d519e71c309eed5c5cbbc9117afb240ee45f5 --- /dev/null +++ b/kvm-migration-Rate-limit-inside-host-pages.patch @@ -0,0 +1,172 @@ +From 8e8f421cce99543081f225acf46541312cfbc371 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Tue, 17 Mar 2020 17:05:18 +0000 +Subject: [PATCH 1/2] migration: Rate limit inside host pages + +RH-Author: Laurent Vivier +Message-id: <20200317170518.9303-1-lvivier@redhat.com> +Patchwork-id: 94374 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] migration: Rate limit inside host pages +Bugzilla: 1814336 +RH-Acked-by: Peter Xu +RH-Acked-by: Juan Quintela +RH-Acked-by: Dr. David Alan Gilbert + +From: "Dr. David Alan Gilbert" + +When using hugepages, rate limiting is necessary within each huge +page, since a 1G huge page can take a significant time to send, so +you end up with bursty behaviour. + +Fixes: 4c011c37ecb3 ("postcopy: Send whole huge pages") +Reported-by: Lin Ma +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Juan Quintela +Reviewed-by: Peter Xu +Signed-off-by: Juan Quintela +(cherry picked from commit 97e1e06780e70f6e98a0d2df881e0c0927d3aeb6) +Signed-off-by: Laurent Vivier + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1814336 +BRANCH: rhel-av-8.2.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27283241 +TESTED: Tested that the migration abort doesn't trigger an error message in + the kernel logs on P9 + +Signed-off-by: Danilo C. L. de Paula +--- + migration/migration.c | 57 ++++++++++++++++++++++++++++---------------------- + migration/migration.h | 1 + + migration/ram.c | 2 ++ + migration/trace-events | 4 ++-- + 4 files changed, 37 insertions(+), 27 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index ed18c59..e31d0f5 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -3253,6 +3253,37 @@ void migration_consume_urgent_request(void) + qemu_sem_wait(&migrate_get_current()->rate_limit_sem); + } + ++/* Returns true if the rate limiting was broken by an urgent request */ ++bool migration_rate_limit(void) ++{ ++ int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); ++ MigrationState *s = migrate_get_current(); ++ ++ bool urgent = false; ++ migration_update_counters(s, now); ++ if (qemu_file_rate_limit(s->to_dst_file)) { ++ /* ++ * Wait for a delay to do rate limiting OR ++ * something urgent to post the semaphore. ++ */ ++ int ms = s->iteration_start_time + BUFFER_DELAY - now; ++ trace_migration_rate_limit_pre(ms); ++ if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) { ++ /* ++ * We were woken by one or more urgent things but ++ * the timedwait will have consumed one of them. ++ * The service routine for the urgent wake will dec ++ * the semaphore itself for each item it consumes, ++ * so add this one we just eat back. ++ */ ++ qemu_sem_post(&s->rate_limit_sem); ++ urgent = true; ++ } ++ trace_migration_rate_limit_post(urgent); ++ } ++ return urgent; ++} ++ + /* + * Master migration thread on the source VM. + * It drives the migration and pumps the data down the outgoing channel. +@@ -3319,8 +3350,6 @@ static void *migration_thread(void *opaque) + trace_migration_thread_setup_complete(); + + while (migration_is_active(s)) { +- int64_t current_time; +- + if (urgent || !qemu_file_rate_limit(s->to_dst_file)) { + MigIterateState iter_state = migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { +@@ -3347,29 +3376,7 @@ static void *migration_thread(void *opaque) + update_iteration_initial_status(s); + } + +- current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); +- +- migration_update_counters(s, current_time); +- +- urgent = false; +- if (qemu_file_rate_limit(s->to_dst_file)) { +- /* Wait for a delay to do rate limiting OR +- * something urgent to post the semaphore. +- */ +- int ms = s->iteration_start_time + BUFFER_DELAY - current_time; +- trace_migration_thread_ratelimit_pre(ms); +- if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) { +- /* We were worken by one or more urgent things but +- * the timedwait will have consumed one of them. +- * The service routine for the urgent wake will dec +- * the semaphore itself for each item it consumes, +- * so add this one we just eat back. +- */ +- qemu_sem_post(&s->rate_limit_sem); +- urgent = true; +- } +- trace_migration_thread_ratelimit_post(urgent); +- } ++ urgent = migration_rate_limit(); + } + + trace_migration_thread_after_loop(); +diff --git a/migration/migration.h b/migration/migration.h +index a2b2336..a15e8d8 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -347,5 +347,6 @@ extern bool migrate_pre_2_2; + + void migration_make_urgent_request(void); + void migration_consume_urgent_request(void); ++bool migration_rate_limit(void); + + #endif +diff --git a/migration/ram.c b/migration/ram.c +index 3891eff..5344c7d 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -2661,6 +2661,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, + + pages += tmppages; + pss->page++; ++ /* Allow rate limiting to happen in the middle of huge pages */ ++ migration_rate_limit(); + } while ((pss->page & (pagesize_bits - 1)) && + offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); + +diff --git a/migration/trace-events b/migration/trace-events +index 6dee7b5..2f9129e 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -138,12 +138,12 @@ migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi6 + migration_completion_file_err(void) "" + migration_completion_postcopy_end(void) "" + migration_completion_postcopy_end_after_complete(void) "" ++migration_rate_limit_pre(int ms) "%d ms" ++migration_rate_limit_post(int urgent) "urgent: %d" + migration_return_path_end_before(void) "" + migration_return_path_end_after(int rp_error) "%d" + migration_thread_after_loop(void) "" + migration_thread_file_err(void) "" +-migration_thread_ratelimit_pre(int ms) "%d ms" +-migration_thread_ratelimit_post(int urgent) "urgent: %d" + migration_thread_setup_complete(void) "" + open_return_path_on_source(void) "" + open_return_path_on_source_continue(void) "" +-- +1.8.3.1 + diff --git a/kvm-migration-multifd-clean-pages-after-filling-packet.patch b/kvm-migration-multifd-clean-pages-after-filling-packet.patch new file mode 100755 index 0000000000000000000000000000000000000000..5fa7fdeb3195372d665b756097f379faf9d99fa8 --- /dev/null +++ b/kvm-migration-multifd-clean-pages-after-filling-packet.patch @@ -0,0 +1,65 @@ +From 32ee75b7f4a31d6080e5659e2a0285a046ef1036 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:34 +0000 +Subject: [PATCH 02/18] migration/multifd: clean pages after filling packet + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-2-quintela@redhat.com> +Patchwork-id: 94112 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 01/10] migration/multifd: clean pages after filling packet +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Wei Yang + +This is a preparation for the next patch: + + not use multifd during postcopy. + +Without enabling postcopy, everything looks good. While after enabling +postcopy, migration may fail even not use multifd during postcopy. The +reason is the pages is not properly cleared and *old* target page will +continue to be transferred. + +After clean pages, migration succeeds. + +Signed-off-by: Wei Yang +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit eab54aa78ffd9fb7895b20fc2761ee998479489b) +Signed-off-by: Danilo C. L. de Paula +--- + migration/ram.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 5078f94..65580e3 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -944,10 +944,10 @@ static int multifd_send_pages(RAMState *rs) + } + qemu_mutex_unlock(&p->mutex); + } +- p->pages->used = 0; ++ assert(!p->pages->used); ++ assert(!p->pages->block); + + p->packet_num = multifd_send_state->packet_num++; +- p->pages->block = NULL; + multifd_send_state->pages = p->pages; + p->pages = pages; + transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len; +@@ -1129,6 +1129,8 @@ static void *multifd_send_thread(void *opaque) + p->flags = 0; + p->num_packets++; + p->num_pages += used; ++ p->pages->used = 0; ++ p->pages->block = NULL; + qemu_mutex_unlock(&p->mutex); + + trace_multifd_send(p->id, packet_num, used, flags, +-- +1.8.3.1 + diff --git a/kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch b/kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch new file mode 100755 index 0000000000000000000000000000000000000000..0c5fe802e4882a7101181c6fb4b4a9223802ec4c --- /dev/null +++ b/kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch @@ -0,0 +1,77 @@ +From 2c14a6831954a59256cc8d1980da0ad705a3a3fa Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:37 +0000 +Subject: [PATCH 05/18] migration/multifd: fix destroyed mutex access in + terminating multifd threads + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-5-quintela@redhat.com> +Patchwork-id: 94119 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 04/10] migration/multifd: fix destroyed mutex access in terminating multifd threads +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Jiahui Cen + +One multifd will lock all the other multifds' IOChannel mutex to inform them +to quit by setting p->quit or shutting down p->c. In this senario, if some +multifds had already been terminated and multifd_load_cleanup/multifd_save_cleanup +had destroyed their mutex, it could cause destroyed mutex access when trying +lock their mutex. + +Here is the coredump stack: + #0 0x00007f81a2794437 in raise () from /usr/lib64/libc.so.6 + #1 0x00007f81a2795b28 in abort () from /usr/lib64/libc.so.6 + #2 0x00007f81a278d1b6 in __assert_fail_base () from /usr/lib64/libc.so.6 + #3 0x00007f81a278d262 in __assert_fail () from /usr/lib64/libc.so.6 + #4 0x000055eb1bfadbd3 in qemu_mutex_lock_impl (mutex=0x55eb1e2d1988, file=, line=) at util/qemu-thread-posix.c:64 + #5 0x000055eb1bb4564a in multifd_send_terminate_threads (err=) at migration/ram.c:1015 + #6 0x000055eb1bb4bb7f in multifd_send_thread (opaque=0x55eb1e2d19f8) at migration/ram.c:1171 + #7 0x000055eb1bfad628 in qemu_thread_start (args=0x55eb1e170450) at util/qemu-thread-posix.c:502 + #8 0x00007f81a2b36df5 in start_thread () from /usr/lib64/libpthread.so.0 + #9 0x00007f81a286048d in clone () from /usr/lib64/libc.so.6 + +To fix it up, let's destroy the mutex after all the other multifd threads had +been terminated. + +Signed-off-by: Jiahui Cen +Signed-off-by: Ying Fang +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 9560a48ecc0c20d87bc458a6db77fba651605819) +Signed-off-by: Danilo C. L. de Paula +--- + migration/ram.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/migration/ram.c b/migration/ram.c +index 860f781..6c55c5d 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1052,6 +1052,10 @@ void multifd_save_cleanup(void) + if (p->running) { + qemu_thread_join(&p->thread); + } ++ } ++ for (i = 0; i < migrate_multifd_channels(); i++) { ++ MultiFDSendParams *p = &multifd_send_state->params[i]; ++ + socket_send_channel_destroy(p->c); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); +@@ -1335,6 +1339,10 @@ int multifd_load_cleanup(Error **errp) + qemu_sem_post(&p->sem_sync); + qemu_thread_join(&p->thread); + } ++ } ++ for (i = 0; i < migrate_multifd_channels(); i++) { ++ MultiFDRecvParams *p = &multifd_recv_state->params[i]; ++ + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); +-- +1.8.3.1 + diff --git a/kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch b/kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch new file mode 100755 index 0000000000000000000000000000000000000000..9e9683c017ccceae43812de21b19f17800a736a5 --- /dev/null +++ b/kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch @@ -0,0 +1,75 @@ +From 517a99c5fba163bf684978fe3d9476b619481391 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:42 +0000 +Subject: [PATCH 10/18] migration/multifd: fix nullptr access in + multifd_send_terminate_threads + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-10-quintela@redhat.com> +Patchwork-id: 94117 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 09/10] migration/multifd: fix nullptr access in multifd_send_terminate_threads +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Zhimin Feng + +If the multifd_send_threads is not created when migration is failed, +multifd_save_cleanup would be called twice. In this senario, the +multifd_send_state is accessed after it has been released, the result +is that the source VM is crashing down. + +Here is the coredump stack: + Program received signal SIGSEGV, Segmentation fault. + 0x00005629333a78ef in multifd_send_terminate_threads (err=err@entry=0x0) at migration/ram.c:1012 + 1012 MultiFDSendParams *p = &multifd_send_state->params[i]; + #0 0x00005629333a78ef in multifd_send_terminate_threads (err=err@entry=0x0) at migration/ram.c:1012 + #1 0x00005629333ab8a9 in multifd_save_cleanup () at migration/ram.c:1028 + #2 0x00005629333abaea in multifd_new_send_channel_async (task=0x562935450e70, opaque=) at migration/ram.c:1202 + #3 0x000056293373a562 in qio_task_complete (task=task@entry=0x562935450e70) at io/task.c:196 + #4 0x000056293373a6e0 in qio_task_thread_result (opaque=0x562935450e70) at io/task.c:111 + #5 0x00007f475d4d75a7 in g_idle_dispatch () from /usr/lib64/libglib-2.0.so.0 + #6 0x00007f475d4da9a9 in g_main_context_dispatch () from /usr/lib64/libglib-2.0.so.0 + #7 0x0000562933785b33 in glib_pollfds_poll () at util/main-loop.c:219 + #8 os_host_main_loop_wait (timeout=) at util/main-loop.c:242 + #9 main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:518 + #10 0x00005629334c5acf in main_loop () at vl.c:1810 + #11 0x000056293334d7bb in main (argc=, argv=, envp=) at vl.c:4471 + +If the multifd_send_threads is not created when migration is failed. +In this senario, we don't call multifd_save_cleanup in multifd_new_send_channel_async. + +Signed-off-by: Zhimin Feng +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit 9c4d333c092e9c26d38f740ff3616deb42f21681) +Signed-off-by: Danilo C. L. de Paula +--- + migration/ram.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 902c56c..3891eff 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1229,7 +1229,15 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + trace_multifd_new_send_channel_async(p->id); + if (qio_task_propagate_error(task, &local_err)) { + migrate_set_error(migrate_get_current(), local_err); +- multifd_save_cleanup(); ++ /* Error happen, we need to tell who pay attention to me */ ++ qemu_sem_post(&multifd_send_state->channels_ready); ++ qemu_sem_post(&p->sem_sync); ++ /* ++ * Although multifd_send_thread is not created, but main migration ++ * thread neet to judge whether it is running, so we need to mark ++ * its status. ++ */ ++ p->quit = true; + } else { + p->c = QIO_CHANNEL(sioc); + qio_channel_set_delay(p->c, false); +-- +1.8.3.1 + diff --git a/kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch b/kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch new file mode 100755 index 0000000000000000000000000000000000000000..e78069824db29a4492c230b01495b09bbb54dc67 --- /dev/null +++ b/kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch @@ -0,0 +1,68 @@ +From 7f664fe26ff67f8131faa7a81a388b8a5b51403f Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:36 +0000 +Subject: [PATCH 04/18] migration/multifd: fix nullptr access in terminating + multifd threads + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-4-quintela@redhat.com> +Patchwork-id: 94110 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 03/10] migration/multifd: fix nullptr access in terminating multifd threads +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +From: Jiahui Cen + +One multifd channel will shutdown all the other multifd's IOChannel when it +fails to receive an IOChannel. In this senario, if some multifds had not +received its IOChannel yet, it would try to shutdown its IOChannel which could +cause nullptr access at qio_channel_shutdown. + +Here is the coredump stack: + #0 object_get_class (obj=obj@entry=0x0) at qom/object.c:908 + #1 0x00005563fdbb8f4a in qio_channel_shutdown (ioc=0x0, how=QIO_CHANNEL_SHUTDOWN_BOTH, errp=0x0) at io/channel.c:355 + #2 0x00005563fd7b4c5f in multifd_recv_terminate_threads (err=) at migration/ram.c:1280 + #3 0x00005563fd7bc019 in multifd_recv_new_channel (ioc=ioc@entry=0x556400255610, errp=errp@entry=0x7ffec07dce00) at migration/ram.c:1478 + #4 0x00005563fda82177 in migration_ioc_process_incoming (ioc=ioc@entry=0x556400255610, errp=errp@entry=0x7ffec07dce30) at migration/migration.c:605 + #5 0x00005563fda8567d in migration_channel_process_incoming (ioc=0x556400255610) at migration/channel.c:44 + #6 0x00005563fda83ee0 in socket_accept_incoming_migration (listener=0x5563fff6b920, cioc=0x556400255610, opaque=) at migration/socket.c:166 + #7 0x00005563fdbc25cd in qio_net_listener_channel_func (ioc=, condition=, opaque=) at io/net-listener.c:54 + #8 0x00007f895b6fe9a9 in g_main_context_dispatch () from /usr/lib64/libglib-2.0.so.0 + #9 0x00005563fdc18136 in glib_pollfds_poll () at util/main-loop.c:218 + #10 0x00005563fdc181b5 in os_host_main_loop_wait (timeout=1000000000) at util/main-loop.c:241 + #11 0x00005563fdc183a2 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:517 + #12 0x00005563fd8edb37 in main_loop () at vl.c:1791 + #13 0x00005563fd74fd45 in main (argc=, argv=, envp=) at vl.c:4473 + +To fix it up, let's check p->c before calling qio_channel_shutdown. + +Signed-off-by: Jiahui Cen +Signed-off-by: Ying Fang +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +(cherry picked from commit f76e32eb05041ab001184ab16afb56524adccd0c) +Signed-off-by: Danilo C. L. de Paula +--- + migration/ram.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 8c783b3..860f781 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1307,7 +1307,9 @@ static void multifd_recv_terminate_threads(Error *err) + - normal quit, i.e. everything went fine, just finished + - error quit: We close the channels so the channel threads + finish the qio_channel_read_all_eof() */ +- qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); ++ if (p->c) { ++ qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); ++ } + qemu_mutex_unlock(&p->mutex); + } + } +-- +1.8.3.1 + diff --git a/kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch b/kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch new file mode 100755 index 0000000000000000000000000000000000000000..c20cb6cf3c2b06313b5c429013d24c300ab2667f --- /dev/null +++ b/kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch @@ -0,0 +1,123 @@ +From 261ee33e0e6711fadd3049e4640bb731ee3d44ff Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 24 Feb 2020 16:57:10 +0000 +Subject: [PATCH 9/9] mirror: Don't let an operation wait for itself + +RH-Author: Kevin Wolf +Message-id: <20200224165710.4830-3-kwolf@redhat.com> +Patchwork-id: 94045 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] mirror: Don't let an operation wait for itself +Bugzilla: 1794692 +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +mirror_wait_for_free_in_flight_slot() just picks a random operation to +wait for. However, when mirror_co_read() waits for free slots, its +MirrorOp is already in s->ops_in_flight, so if not enough slots are +immediately available, an operation can end up waiting for itself to +complete, which results in a hang. + +Fix this by passing the current MirrorOp and skipping this operation +when picking an operation to wait for. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +(cherry picked from commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 8959e42..cacbc70 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -283,11 +283,14 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, + } + + static inline void coroutine_fn +-mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) ++mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) + { + MirrorOp *op; + + QTAILQ_FOREACH(op, &s->ops_in_flight, next) { ++ if (self == op) { ++ continue; ++ } + /* Do not wait on pseudo ops, because it may in turn wait on + * some other operation to start, which may in fact be the + * caller of this function. Since there is only one pseudo op +@@ -302,10 +305,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) + } + + static inline void coroutine_fn +-mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) ++mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self) + { + /* Only non-active operations use up in-flight slots */ +- mirror_wait_for_any_operation(s, false); ++ mirror_wait_for_any_operation(s, self, false); + } + + /* Perform a mirror copy operation. +@@ -348,7 +351,7 @@ static void coroutine_fn mirror_co_read(void *opaque) + + while (s->buf_free_count < nb_chunks) { + trace_mirror_yield_in_flight(s, op->offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, op); + } + + /* Now make a QEMUIOVector taking enough granularity-sized chunks +@@ -555,7 +558,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) + + while (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield_in_flight(s, offset, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, pseudo_op); + } + + if (s->ret < 0) { +@@ -609,7 +612,7 @@ static void mirror_free_init(MirrorBlockJob *s) + static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) + { + while (s->in_flight > 0) { +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, NULL); + } + } + +@@ -794,7 +797,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) + if (s->in_flight >= MAX_IN_FLIGHT) { + trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, + s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, NULL); + continue; + } + +@@ -947,7 +950,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + /* Do not start passive operations while there are active + * writes in progress */ + while (s->in_active_write_counter) { +- mirror_wait_for_any_operation(s, true); ++ mirror_wait_for_any_operation(s, NULL, true); + } + + if (s->ret < 0) { +@@ -973,7 +976,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || + (cnt == 0 && s->in_flight > 0)) { + trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); +- mirror_wait_for_free_in_flight_slot(s); ++ mirror_wait_for_free_in_flight_slot(s, NULL); + continue; + } else if (cnt != 0) { + delay_ns = mirror_iteration(s); +-- +1.8.3.1 + diff --git a/kvm-mirror-Make-sure-that-source-and-target-size-match.patch b/kvm-mirror-Make-sure-that-source-and-target-size-match.patch new file mode 100755 index 0000000000000000000000000000000000000000..09d1152c9e1b8eb9aed69ea5b62ccef5f97dc78d --- /dev/null +++ b/kvm-mirror-Make-sure-that-source-and-target-size-match.patch @@ -0,0 +1,89 @@ +From 98bf67db979927a5c7bbdc4a17c35d60b5f38e71 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 3 Jun 2020 16:03:24 +0100 +Subject: [PATCH 25/26] mirror: Make sure that source and target size match + +RH-Author: Kevin Wolf +Message-id: <20200603160325.67506-11-kwolf@redhat.com> +Patchwork-id: 97110 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH v2 10/11] mirror: Make sure that source and target size match +Bugzilla: 1778593 +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz +RH-Acked-by: Stefano Garzarella + +If the target is shorter than the source, mirror would copy data until +it reaches the end of the target and then fail with an I/O error when +trying to write past the end. + +If the target is longer than the source, the mirror job would complete +successfully, but the target wouldn't actually be an accurate copy of +the source image (it would contain some additional garbage at the end). + +Fix this by checking that both images have the same size when the job +starts. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +Message-Id: <20200511135825.219437-4-kwolf@redhat.com> +Reviewed-by: Max Reitz +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit e83dd6808c6e0975970f37b49b27cc37bb54eea8) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 5e5a521..0d32fca 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -859,6 +859,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + BlockDriverState *target_bs = blk_bs(s->target); + bool need_drain = true; + int64_t length; ++ int64_t target_length; + BlockDriverInfo bdi; + char backing_filename[2]; /* we only need 2 characters because we are only + checking for a NULL string */ +@@ -874,24 +875,26 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + goto immediate_exit; + } + ++ target_length = blk_getlength(s->target); ++ if (target_length < 0) { ++ ret = target_length; ++ goto immediate_exit; ++ } ++ + /* Active commit must resize the base image if its size differs from the + * active layer. */ + if (s->base == blk_bs(s->target)) { +- int64_t base_length; +- +- base_length = blk_getlength(s->target); +- if (base_length < 0) { +- ret = base_length; +- goto immediate_exit; +- } +- +- if (s->bdev_length > base_length) { ++ if (s->bdev_length > target_length) { + ret = blk_truncate(s->target, s->bdev_length, false, + PREALLOC_MODE_OFF, NULL); + if (ret < 0) { + goto immediate_exit; + } + } ++ } else if (s->bdev_length != target_length) { ++ error_setg(errp, "Source and target image have different sizes"); ++ ret = -EINVAL; ++ goto immediate_exit; + } + + if (s->bdev_length == 0) { +-- +1.8.3.1 + diff --git a/kvm-mirror-Store-MirrorOp.co-for-debuggability.patch b/kvm-mirror-Store-MirrorOp.co-for-debuggability.patch new file mode 100755 index 0000000000000000000000000000000000000000..67f3e54d52975f1b1834e6619c9eb74dfdb681f9 --- /dev/null +++ b/kvm-mirror-Store-MirrorOp.co-for-debuggability.patch @@ -0,0 +1,51 @@ +From 27fe3b8d42a2c99de01ce20e4b0727079c12da65 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 24 Feb 2020 16:57:09 +0000 +Subject: [PATCH 8/9] mirror: Store MirrorOp.co for debuggability + +RH-Author: Kevin Wolf +Message-id: <20200224165710.4830-2-kwolf@redhat.com> +Patchwork-id: 94044 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] mirror: Store MirrorOp.co for debuggability +Bugzilla: 1794692 +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +If a coroutine is launched, but the coroutine pointer isn't stored +anywhere, debugging any problems inside the coroutine is quite hard. +Let's store the coroutine pointer of a mirror operation in MirrorOp to +have it available in the debugger. + +Signed-off-by: Kevin Wolf +Reviewed-by: Eric Blake +(cherry picked from commit eed325b92c3e68417121ea23f96e33af6a4654ed) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/block/mirror.c b/block/mirror.c +index f0f2d9d..8959e42 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -103,6 +103,7 @@ struct MirrorOp { + bool is_pseudo_op; + bool is_active_write; + CoQueue waiting_requests; ++ Coroutine *co; + + QTAILQ_ENTRY(MirrorOp) next; + }; +@@ -429,6 +430,7 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset, + default: + abort(); + } ++ op->co = co; + + QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); + qemu_coroutine_enter(co); +-- +1.8.3.1 + diff --git a/kvm-mirror-Wait-only-for-in-flight-operations.patch b/kvm-mirror-Wait-only-for-in-flight-operations.patch new file mode 100755 index 0000000000000000000000000000000000000000..a06d30e19d733a6cc8946ad5f51cc20d883a2b8a --- /dev/null +++ b/kvm-mirror-Wait-only-for-in-flight-operations.patch @@ -0,0 +1,95 @@ +From bddf389330e11fb0ce17413c1bfa2264a281ded2 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 30 Mar 2020 11:19:24 +0100 +Subject: [PATCH 4/4] mirror: Wait only for in-flight operations + +RH-Author: Kevin Wolf +Message-id: <20200330111924.22938-3-kwolf@redhat.com> +Patchwork-id: 94463 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] mirror: Wait only for in-flight operations +Bugzilla: 1794692 +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +mirror_wait_for_free_in_flight_slot() just picks a random operation to +wait for. However, a MirrorOp is already in s->ops_in_flight when +mirror_co_read() waits for free slots, so if not enough slots are +immediately available, an operation can end up waiting for itself, or +two or more operations can wait for each other to complete, which +results in a hang. + +Fix this by adding a flag to MirrorOp that tells us if the request is +already in flight (and therefore occupies slots that it will later +free), and picking only such operations for waiting. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 +Signed-off-by: Kevin Wolf +Message-Id: <20200326153628.4869-3-kwolf@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit ce8cabbd17cf738ddfc68384440c38e5dd2fdf97) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/mirror.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 8959e42..5e5a521 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -102,6 +102,7 @@ struct MirrorOp { + + bool is_pseudo_op; + bool is_active_write; ++ bool is_in_flight; + CoQueue waiting_requests; + Coroutine *co; + +@@ -293,7 +294,9 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) + * caller of this function. Since there is only one pseudo op + * at any given time, we will always find some real operation + * to wait on. */ +- if (!op->is_pseudo_op && op->is_active_write == active) { ++ if (!op->is_pseudo_op && op->is_in_flight && ++ op->is_active_write == active) ++ { + qemu_co_queue_wait(&op->waiting_requests, NULL); + return; + } +@@ -367,6 +370,7 @@ static void coroutine_fn mirror_co_read(void *opaque) + /* Copy the dirty cluster. */ + s->in_flight++; + s->bytes_in_flight += op->bytes; ++ op->is_in_flight = true; + trace_mirror_one_iteration(s, op->offset, op->bytes); + + ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes, +@@ -382,6 +386,7 @@ static void coroutine_fn mirror_co_zero(void *opaque) + op->s->in_flight++; + op->s->bytes_in_flight += op->bytes; + *op->bytes_handled = op->bytes; ++ op->is_in_flight = true; + + ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes, + op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0); +@@ -396,6 +401,7 @@ static void coroutine_fn mirror_co_discard(void *opaque) + op->s->in_flight++; + op->s->bytes_in_flight += op->bytes; + *op->bytes_handled = op->bytes; ++ op->is_in_flight = true; + + ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes); + mirror_write_complete(op, ret); +@@ -1306,6 +1312,7 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, + .offset = offset, + .bytes = bytes, + .is_active_write = true, ++ .is_in_flight = true, + }; + qemu_co_queue_init(&op->waiting_requests); + QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); +-- +1.8.3.1 + diff --git a/kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch b/kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch new file mode 100755 index 0000000000000000000000000000000000000000..eb4e9af5da6413501244489e364090bcd41b77d7 --- /dev/null +++ b/kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch @@ -0,0 +1,255 @@ +From 67878e1306f9ea6ccd30437327147c46de196a36 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:13 -0500 +Subject: [PATCH 13/18] misc: Replace zero-length arrays with flexible array + member (manual) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-10-thuth@redhat.com> +Patchwork-id: 99506 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 09/12] misc: Replace zero-length arrays with flexible array member (manual) +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Philippe Mathieu-Daudé + +Description copied from Linux kernel commit from Gustavo A. R. Silva +(see [3]): + +--v-- description start --v-- + + The current codebase makes use of the zero-length array language + extension to the C90 standard, but the preferred mechanism to + declare variable-length types such as these ones is a flexible + array member [1], introduced in C99: + + struct foo { + int stuff; + struct boo array[]; + }; + + By making use of the mechanism above, we will get a compiler + warning in case the flexible array does not occur last in the + structure, which will help us prevent some kind of undefined + behavior bugs from being unadvertenly introduced [2] to the + Linux codebase from now on. + +--^-- description end --^-- + +Do the similar housekeeping in the QEMU codebase (which uses +C99 since commit 7be41675f7cb). + +All these instances of code were found with the help of the +following command (then manual analysis, without modifying +structures only having a single flexible array member, such +QEDTable in block/qed.h): + + git grep -F '[0];' + +[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html +[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76497732932f +[3] https://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git/commit/?id=17642a2fbd2c1 + +Inspired-by: Gustavo A. R. Silva +Reviewed-by: David Hildenbrand +Signed-off-by: Philippe Mathieu-Daudé +Signed-off-by: Paolo Bonzini +(cherry picked from commit 880a7817c1a82a93d3f83dfb25dce1f0db629c66) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + block/vmdk.c | 2 +- + docs/interop/vhost-user.rst | 4 ++-- + hw/char/sclpconsole-lm.c | 2 +- + hw/char/sclpconsole.c | 2 +- + hw/s390x/virtio-ccw.c | 2 +- + include/hw/acpi/acpi-defs.h | 4 ++-- + include/hw/boards.h | 2 +- + include/hw/s390x/event-facility.h | 2 +- + include/hw/s390x/sclp.h | 8 ++++---- + target/s390x/ioinst.c | 2 +- + 10 files changed, 15 insertions(+), 15 deletions(-) + +diff --git a/block/vmdk.c b/block/vmdk.c +index 1bd39917290..8ec18f35a53 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -187,7 +187,7 @@ typedef struct VmdkMetaData { + typedef struct VmdkGrainMarker { + uint64_t lba; + uint32_t size; +- uint8_t data[0]; ++ uint8_t data[]; + } QEMU_PACKED VmdkGrainMarker; + + enum { +diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst +index 7827b710aa0..71b20ce83dd 100644 +--- a/docs/interop/vhost-user.rst ++++ b/docs/interop/vhost-user.rst +@@ -563,7 +563,7 @@ For split virtqueue, queue region can be implemented as: + uint16_t used_idx; + + /* Used to track the state of each descriptor in descriptor table */ +- DescStateSplit desc[0]; ++ DescStateSplit desc[]; + } QueueRegionSplit; + + To track inflight I/O, the queue region should be processed as follows: +@@ -685,7 +685,7 @@ For packed virtqueue, queue region can be implemented as: + uint8_t padding[7]; + + /* Used to track the state of each descriptor fetched from descriptor ring */ +- DescStatePacked desc[0]; ++ DescStatePacked desc[]; + } QueueRegionPacked; + + To track inflight I/O, the queue region should be processed as follows: +diff --git a/hw/char/sclpconsole-lm.c b/hw/char/sclpconsole-lm.c +index 392606259d5..a9a6f2b204c 100644 +--- a/hw/char/sclpconsole-lm.c ++++ b/hw/char/sclpconsole-lm.c +@@ -31,7 +31,7 @@ + typedef struct OprtnsCommand { + EventBufferHeader header; + MDMSU message_unit; +- char data[0]; ++ char data[]; + } QEMU_PACKED OprtnsCommand; + + /* max size for line-mode data in 4K SCCB page */ +diff --git a/hw/char/sclpconsole.c b/hw/char/sclpconsole.c +index da126f0133f..55697130a0a 100644 +--- a/hw/char/sclpconsole.c ++++ b/hw/char/sclpconsole.c +@@ -25,7 +25,7 @@ + + typedef struct ASCIIConsoleData { + EventBufferHeader ebh; +- char data[0]; ++ char data[]; + } QEMU_PACKED ASCIIConsoleData; + + /* max size for ASCII data in 4K SCCB page */ +diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c +index 6580ce5907d..aa2c75a49c6 100644 +--- a/hw/s390x/virtio-ccw.c ++++ b/hw/s390x/virtio-ccw.c +@@ -193,7 +193,7 @@ typedef struct VirtioThinintInfo { + typedef struct VirtioRevInfo { + uint16_t revision; + uint16_t length; +- uint8_t data[0]; ++ uint8_t data[]; + } QEMU_PACKED VirtioRevInfo; + + /* Specify where the virtqueues for the subchannel are in guest memory. */ +diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h +index 57a3f58b0c9..b80188b430f 100644 +--- a/include/hw/acpi/acpi-defs.h ++++ b/include/hw/acpi/acpi-defs.h +@@ -152,7 +152,7 @@ typedef struct AcpiSerialPortConsoleRedirection + */ + struct AcpiRsdtDescriptorRev1 { + ACPI_TABLE_HEADER_DEF /* ACPI common table header */ +- uint32_t table_offset_entry[0]; /* Array of pointers to other */ ++ uint32_t table_offset_entry[]; /* Array of pointers to other */ + /* ACPI tables */ + } QEMU_PACKED; + typedef struct AcpiRsdtDescriptorRev1 AcpiRsdtDescriptorRev1; +@@ -162,7 +162,7 @@ typedef struct AcpiRsdtDescriptorRev1 AcpiRsdtDescriptorRev1; + */ + struct AcpiXsdtDescriptorRev2 { + ACPI_TABLE_HEADER_DEF /* ACPI common table header */ +- uint64_t table_offset_entry[0]; /* Array of pointers to other */ ++ uint64_t table_offset_entry[]; /* Array of pointers to other */ + /* ACPI tables */ + } QEMU_PACKED; + typedef struct AcpiXsdtDescriptorRev2 AcpiXsdtDescriptorRev2; +diff --git a/include/hw/boards.h b/include/hw/boards.h +index 2920bdef5b4..a5e92f6c373 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -101,7 +101,7 @@ typedef struct CPUArchId { + */ + typedef struct { + int len; +- CPUArchId cpus[0]; ++ CPUArchId cpus[]; + } CPUArchIdList; + + /** +diff --git a/include/hw/s390x/event-facility.h b/include/hw/s390x/event-facility.h +index bdc32a3c091..700a610f33c 100644 +--- a/include/hw/s390x/event-facility.h ++++ b/include/hw/s390x/event-facility.h +@@ -122,7 +122,7 @@ typedef struct MDBO { + + typedef struct MDB { + MdbHeader header; +- MDBO mdbo[0]; ++ MDBO mdbo[]; + } QEMU_PACKED MDB; + + typedef struct SclpMsg { +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index df2fa4169b0..62e2aa1d9f1 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -133,7 +133,7 @@ typedef struct ReadInfo { + uint16_t highest_cpu; + uint8_t _reserved5[124 - 122]; /* 122-123 */ + uint32_t hmfai; +- struct CPUEntry entries[0]; ++ struct CPUEntry entries[]; + } QEMU_PACKED ReadInfo; + + typedef struct ReadCpuInfo { +@@ -143,7 +143,7 @@ typedef struct ReadCpuInfo { + uint16_t nr_standby; /* 12-13 */ + uint16_t offset_standby; /* 14-15 */ + uint8_t reserved0[24-16]; /* 16-23 */ +- struct CPUEntry entries[0]; ++ struct CPUEntry entries[]; + } QEMU_PACKED ReadCpuInfo; + + typedef struct ReadStorageElementInfo { +@@ -152,7 +152,7 @@ typedef struct ReadStorageElementInfo { + uint16_t assigned; + uint16_t standby; + uint8_t _reserved0[16 - 14]; /* 14-15 */ +- uint32_t entries[0]; ++ uint32_t entries[]; + } QEMU_PACKED ReadStorageElementInfo; + + typedef struct AttachStorageElement { +@@ -160,7 +160,7 @@ typedef struct AttachStorageElement { + uint8_t _reserved0[10 - 8]; /* 8-9 */ + uint16_t assigned; + uint8_t _reserved1[16 - 12]; /* 12-15 */ +- uint32_t entries[0]; ++ uint32_t entries[]; + } QEMU_PACKED AttachStorageElement; + + typedef struct AssignStorage { +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index b6be300cc48..a412926d278 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -387,7 +387,7 @@ typedef struct ChscResp { + uint16_t len; + uint16_t code; + uint32_t param; +- char data[0]; ++ char data[]; + } QEMU_PACKED ChscResp; + + #define CHSC_MIN_RESP_LEN 0x0008 +-- +2.27.0 + diff --git a/kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch b/kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch new file mode 100755 index 0000000000000000000000000000000000000000..bca0b4c05389926bb37ce44e30e32c8e2f0d17d2 --- /dev/null +++ b/kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch @@ -0,0 +1,74 @@ +From 78c7fb5afcb298631df47f6b71cf764f921c15f4 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:38 +0000 +Subject: [PATCH 06/18] multifd: Make sure that we don't do any IO after an + error + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-6-quintela@redhat.com> +Patchwork-id: 94118 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 05/10] multifd: Make sure that we don't do any IO after an error +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit 3d4095b222d97393b1c2c6e514951ec7798f1c43) +Signed-off-by: Danilo C. L. de Paula +--- + migration/ram.c | 22 +++++++++++++--------- + 1 file changed, 13 insertions(+), 9 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 6c55c5d..a0257ee 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3440,7 +3440,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) + { + RAMState **temp = opaque; + RAMState *rs = *temp; +- int ret; ++ int ret = 0; + int i; + int64_t t0; + int done = 0; +@@ -3511,12 +3511,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) + ram_control_after_iterate(f, RAM_CONTROL_ROUND); + + out: +- multifd_send_sync_main(rs); +- qemu_put_be64(f, RAM_SAVE_FLAG_EOS); +- qemu_fflush(f); +- ram_counters.transferred += 8; ++ if (ret >= 0) { ++ multifd_send_sync_main(rs); ++ qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ++ qemu_fflush(f); ++ ram_counters.transferred += 8; + +- ret = qemu_file_get_error(f); ++ ret = qemu_file_get_error(f); ++ } + if (ret < 0) { + return ret; + } +@@ -3568,9 +3570,11 @@ static int ram_save_complete(QEMUFile *f, void *opaque) + ram_control_after_iterate(f, RAM_CONTROL_FINISH); + } + +- multifd_send_sync_main(rs); +- qemu_put_be64(f, RAM_SAVE_FLAG_EOS); +- qemu_fflush(f); ++ if (ret >= 0) { ++ multifd_send_sync_main(rs); ++ qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ++ qemu_fflush(f); ++ } + + return ret; + } +-- +1.8.3.1 + diff --git a/kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch b/kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch new file mode 100755 index 0000000000000000000000000000000000000000..94d2c9865b9972a845e13e4541588db565a73dda --- /dev/null +++ b/kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch @@ -0,0 +1,161 @@ +From f49ff2ed5675f1d0cddc404842e9d6e4e572d5a7 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Wed, 10 Jun 2020 18:32:01 -0400 +Subject: [PATCH 1/2] nbd/server: Avoid long error message assertions + CVE-2020-10761 + +RH-Author: Eric Blake +Message-id: <20200610183202.3780750-2-eblake@redhat.com> +Patchwork-id: 97494 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] nbd/server: Avoid long error message assertions CVE-2020-10761 +Bugzilla: 1845384 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Max Reitz +RH-Acked-by: Stefan Hajnoczi + +Ever since commit 36683283 (v2.8), the server code asserts that error +strings sent to the client are well-formed per the protocol by not +exceeding the maximum string length of 4096. At the time the server +first started sending error messages, the assertion could not be +triggered, because messages were completely under our control. +However, over the years, we have added latent scenarios where a client +could trigger the server to attempt an error message that would +include the client's information if it passed other checks first: + +- requesting NBD_OPT_INFO/GO on an export name that is not present + (commit 0cfae925 in v2.12 echoes the name) + +- requesting NBD_OPT_LIST/SET_META_CONTEXT on an export name that is + not present (commit e7b1948d in v2.12 echoes the name) + +At the time, those were still safe because we flagged names larger +than 256 bytes with a different message; but that changed in commit +93676c88 (v4.2) when we raised the name limit to 4096 to match the NBD +string limit. (That commit also failed to change the magic number +4096 in nbd_negotiate_send_rep_err to the just-introduced named +constant.) So with that commit, long client names appended to server +text can now trigger the assertion, and thus be used as a denial of +service attack against a server. As a mitigating factor, if the +server requires TLS, the client cannot trigger the problematic paths +unless it first supplies TLS credentials, and such trusted clients are +less likely to try to intentionally crash the server. + +We may later want to further sanitize the user-supplied strings we +place into our error messages, such as scrubbing out control +characters, but that is less important to the CVE fix, so it can be a +later patch to the new nbd_sanitize_name. + +Consideration was given to changing the assertion in +nbd_negotiate_send_rep_verr to instead merely log a server error and +truncate the message, to avoid leaving a latent path that could +trigger a future CVE DoS on any new error message. However, this +merely complicates the code for something that is already (correctly) +flagging coding errors, and now that we are aware of the long message +pitfall, we are less likely to introduce such errors in the future, +which would make such error handling dead code. + +Reported-by: Xueqiang Wei +CC: qemu-stable@nongnu.org +Fixes: https://bugzilla.redhat.com/1843684 CVE-2020-10761 +Fixes: 93676c88d7 +Signed-off-by: Eric Blake +Message-Id: <20200610163741.3745251-2-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 5c4fe018c025740fef4a0a4421e8162db0c3eefd) +Signed-off-by: Eric Blake +Signed-off-by: Eduardo Lima (Etrunko) +--- + nbd/server.c | 23 ++++++++++++++++++++--- + tests/qemu-iotests/143 | 4 ++++ + tests/qemu-iotests/143.out | 2 ++ + 3 files changed, 26 insertions(+), 3 deletions(-) + +diff --git a/nbd/server.c b/nbd/server.c +index 24ebc1a805..d5b9df092c 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -217,7 +217,7 @@ nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type, + + msg = g_strdup_vprintf(fmt, va); + len = strlen(msg); +- assert(len < 4096); ++ assert(len < NBD_MAX_STRING_SIZE); + trace_nbd_negotiate_send_rep_err(msg); + ret = nbd_negotiate_send_rep_len(client, type, len, errp); + if (ret < 0) { +@@ -231,6 +231,19 @@ nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type, + return 0; + } + ++/* ++ * Return a malloc'd copy of @name suitable for use in an error reply. ++ */ ++static char * ++nbd_sanitize_name(const char *name) ++{ ++ if (strnlen(name, 80) < 80) { ++ return g_strdup(name); ++ } ++ /* XXX Should we also try to sanitize any control characters? */ ++ return g_strdup_printf("%.80s...", name); ++} ++ + /* Send an error reply. + * Return -errno on error, 0 on success. */ + static int GCC_FMT_ATTR(4, 5) +@@ -595,9 +608,11 @@ static int nbd_negotiate_handle_info(NBDClient *client, Error **errp) + + exp = nbd_export_find(name); + if (!exp) { ++ g_autofree char *sane_name = nbd_sanitize_name(name); ++ + return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN, + errp, "export '%s' not present", +- name); ++ sane_name); + } + + /* Don't bother sending NBD_INFO_NAME unless client requested it */ +@@ -995,8 +1010,10 @@ static int nbd_negotiate_meta_queries(NBDClient *client, + + meta->exp = nbd_export_find(export_name); + if (meta->exp == NULL) { ++ g_autofree char *sane_name = nbd_sanitize_name(export_name); ++ + return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp, +- "export '%s' not present", export_name); ++ "export '%s' not present", sane_name); + } + + ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp); +diff --git a/tests/qemu-iotests/143 b/tests/qemu-iotests/143 +index f649b36195..d2349903b1 100755 +--- a/tests/qemu-iotests/143 ++++ b/tests/qemu-iotests/143 +@@ -58,6 +58,10 @@ _send_qemu_cmd $QEMU_HANDLE \ + $QEMU_IO_PROG -f raw -c quit \ + "nbd+unix:///no_such_export?socket=$SOCK_DIR/nbd" 2>&1 \ + | _filter_qemu_io | _filter_nbd ++# Likewise, with longest possible name permitted in NBD protocol ++$QEMU_IO_PROG -f raw -c quit \ ++ "nbd+unix:///$(printf %4096d 1 | tr ' ' a)?socket=$SOCK_DIR/nbd" 2>&1 \ ++ | _filter_qemu_io | _filter_nbd | sed 's/aaaa*aa/aa--aa/' + + _send_qemu_cmd $QEMU_HANDLE \ + "{ 'execute': 'quit' }" \ +diff --git a/tests/qemu-iotests/143.out b/tests/qemu-iotests/143.out +index 1f4001c601..fc9c0a761f 100644 +--- a/tests/qemu-iotests/143.out ++++ b/tests/qemu-iotests/143.out +@@ -5,6 +5,8 @@ QA output created by 143 + {"return": {}} + qemu-io: can't open device nbd+unix:///no_such_export?socket=SOCK_DIR/nbd: Requested export not available + server reported: export 'no_such_export' not present ++qemu-io: can't open device nbd+unix:///aa--aa1?socket=SOCK_DIR/nbd: Requested export not available ++server reported: export 'aa--aa...' not present + { 'execute': 'quit' } + {"return": {}} + {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +-- +2.27.0 + diff --git a/kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch b/kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch new file mode 100755 index 0000000000000000000000000000000000000000..654a64fb7cc1f8988fcbf7a76e3f76423c565d73 --- /dev/null +++ b/kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch @@ -0,0 +1,301 @@ +From 512c7e92808dff66779f7421f1c17a081f18d7e6 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Thu, 29 Jul 2021 04:56:46 -0400 +Subject: [PATCH 13/14] net: check if the file descriptor is valid before using + it +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210726102337.6359-2-lvivier@redhat.com> +Patchwork-id: 101924 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] net: check if the file descriptor is valid before using it +Bugzilla: 1982134 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Philippe Mathieu-Daudé + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1982134 +BRANCH: rhel-8.5.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=38380653 + +qemu_set_nonblock() checks that the file descriptor can be used and, if +not, crashes QEMU. An assert() is used for that. The use of assert() is +used to detect programming error and the coredump will allow to debug +the problem. + +But in the case of the tap device, this assert() can be triggered by +a misconfiguration by the user. At startup, it's not a real problem, but it +can also happen during the hot-plug of a new device, and here it's a +problem because we can crash a perfectly healthy system. + +For instance: + # ip link add link virbr0 name macvtap0 type macvtap mode bridge + # ip link set macvtap0 up + # TAP=/dev/tap$(ip -o link show macvtap0 | cut -d: -f1) + # qemu-system-x86_64 -machine q35 -device pcie-root-port,id=pcie-root-port-0 -monitor stdio 9<> $TAP + (qemu) netdev_add type=tap,id=hostnet0,vhost=on,fd=9 + (qemu) device_add driver=virtio-net-pci,netdev=hostnet0,id=net0,bus=pcie-root-port-0 + (qemu) device_del net0 + (qemu) netdev_del hostnet0 + (qemu) netdev_add type=tap,id=hostnet1,vhost=on,fd=9 + qemu-system-x86_64: .../util/oslib-posix.c:247: qemu_set_nonblock: Assertion `f != -1' failed. + Aborted (core dumped) + +To avoid that, add a function, qemu_try_set_nonblock(), that allows to report the +problem without crashing. + +In the same way, we also update the function for vhostfd in net_init_tap_one() and +for fd in net_init_socket() (both descriptors are provided by the user and can +be wrong). + +Signed-off-by: Laurent Vivier +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang +(cherry picked from commit 894022e616016fe81745753f14adfbd680a1c7ee) +Signed-off-by: Laurent Vivier +Signed-off-by: Miroslav Rezanina +--- + include/qemu/sockets.h | 1 + + net/socket.c | 9 +++++-- + net/tap.c | 25 +++++++++++++++--- + util/oslib-posix.c | 26 +++++++++++++------ + util/oslib-win32.c | 57 ++++++++++++++++++++++++------------------ + 5 files changed, 79 insertions(+), 39 deletions(-) + +diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h +index 57cd049d6e..7d1f813576 100644 +--- a/include/qemu/sockets.h ++++ b/include/qemu/sockets.h +@@ -18,6 +18,7 @@ int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen); + int socket_set_cork(int fd, int v); + int socket_set_nodelay(int fd); + void qemu_set_block(int fd); ++int qemu_try_set_nonblock(int fd); + void qemu_set_nonblock(int fd); + int socket_set_fast_reuse(int fd); + +diff --git a/net/socket.c b/net/socket.c +index c92354049b..2d21fddd9c 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -725,13 +725,18 @@ int net_init_socket(const Netdev *netdev, const char *name, + } + + if (sock->has_fd) { +- int fd; ++ int fd, ret; + + fd = monitor_fd_param(cur_mon, sock->fd, errp); + if (fd == -1) { + return -1; + } +- qemu_set_nonblock(fd); ++ ret = qemu_try_set_nonblock(fd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, fd); ++ return -1; ++ } + if (!net_socket_fd_init(peer, "socket", name, fd, 1, sock->mcast, + errp)) { + return -1; +diff --git a/net/tap.c b/net/tap.c +index 6207f61f84..41a20102fd 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -689,6 +689,8 @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, + } + + if (vhostfdname) { ++ int ret; ++ + vhostfd = monitor_fd_param(cur_mon, vhostfdname, &err); + if (vhostfd == -1) { + if (tap->has_vhostforce && tap->vhostforce) { +@@ -698,7 +700,12 @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, + } + return; + } +- qemu_set_nonblock(vhostfd); ++ ret = qemu_try_set_nonblock(vhostfd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, fd); ++ return; ++ } + } else { + vhostfd = open("/dev/vhost-net", O_RDWR); + if (vhostfd < 0) { +@@ -766,6 +773,7 @@ int net_init_tap(const Netdev *netdev, const char *name, + Error *err = NULL; + const char *vhostfdname; + char ifname[128]; ++ int ret = 0; + + assert(netdev->type == NET_CLIENT_DRIVER_TAP); + tap = &netdev->u.tap; +@@ -795,7 +803,12 @@ int net_init_tap(const Netdev *netdev, const char *name, + return -1; + } + +- qemu_set_nonblock(fd); ++ ret = qemu_try_set_nonblock(fd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, fd); ++ return -1; ++ } + + vnet_hdr = tap_probe_vnet_hdr(fd); + +@@ -810,7 +823,6 @@ int net_init_tap(const Netdev *netdev, const char *name, + char **fds; + char **vhost_fds; + int nfds = 0, nvhosts = 0; +- int ret = 0; + + if (tap->has_ifname || tap->has_script || tap->has_downscript || + tap->has_vnet_hdr || tap->has_helper || tap->has_queues || +@@ -843,7 +855,12 @@ int net_init_tap(const Netdev *netdev, const char *name, + goto free_fail; + } + +- qemu_set_nonblock(fd); ++ ret = qemu_try_set_nonblock(fd); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d", ++ name, fd); ++ goto free_fail; ++ } + + if (i == 0) { + vnet_hdr = tap_probe_vnet_hdr(fd); +diff --git a/util/oslib-posix.c b/util/oslib-posix.c +index 8f88e4dbe1..db70416dbb 100644 +--- a/util/oslib-posix.c ++++ b/util/oslib-posix.c +@@ -240,25 +240,35 @@ void qemu_set_block(int fd) + assert(f != -1); + } + +-void qemu_set_nonblock(int fd) ++int qemu_try_set_nonblock(int fd) + { + int f; + f = fcntl(fd, F_GETFL); +- assert(f != -1); +- f = fcntl(fd, F_SETFL, f | O_NONBLOCK); +-#ifdef __OpenBSD__ + if (f == -1) { ++ return -errno; ++ } ++ if (fcntl(fd, F_SETFL, f | O_NONBLOCK) == -1) { ++#ifdef __OpenBSD__ + /* + * Previous to OpenBSD 6.3, fcntl(F_SETFL) is not permitted on + * memory devices and sets errno to ENODEV. + * It's OK if we fail to set O_NONBLOCK on devices like /dev/null, + * because they will never block anyway. + */ +- assert(errno == ENODEV); +- } +-#else +- assert(f != -1); ++ if (errno == ENODEV) { ++ return 0; ++ } + #endif ++ return -errno; ++ } ++ return 0; ++} ++ ++void qemu_set_nonblock(int fd) ++{ ++ int f; ++ f = qemu_try_set_nonblock(fd); ++ assert(f == 0); + } + + int socket_set_fast_reuse(int fd) +diff --git a/util/oslib-win32.c b/util/oslib-win32.c +index 3b49d27297..7eedbe5859 100644 +--- a/util/oslib-win32.c ++++ b/util/oslib-win32.c +@@ -132,31 +132,6 @@ struct tm *localtime_r(const time_t *timep, struct tm *result) + } + #endif /* CONFIG_LOCALTIME_R */ + +-void qemu_set_block(int fd) +-{ +- unsigned long opt = 0; +- WSAEventSelect(fd, NULL, 0); +- ioctlsocket(fd, FIONBIO, &opt); +-} +- +-void qemu_set_nonblock(int fd) +-{ +- unsigned long opt = 1; +- ioctlsocket(fd, FIONBIO, &opt); +- qemu_fd_register(fd); +-} +- +-int socket_set_fast_reuse(int fd) +-{ +- /* Enabling the reuse of an endpoint that was used by a socket still in +- * TIME_WAIT state is usually performed by setting SO_REUSEADDR. On Windows +- * fast reuse is the default and SO_REUSEADDR does strange things. So we +- * don't have to do anything here. More info can be found at: +- * http://msdn.microsoft.com/en-us/library/windows/desktop/ms740621.aspx */ +- return 0; +-} +- +- + static int socket_error(void) + { + switch (WSAGetLastError()) { +@@ -233,6 +208,38 @@ static int socket_error(void) + } + } + ++void qemu_set_block(int fd) ++{ ++ unsigned long opt = 0; ++ WSAEventSelect(fd, NULL, 0); ++ ioctlsocket(fd, FIONBIO, &opt); ++} ++ ++int qemu_try_set_nonblock(int fd) ++{ ++ unsigned long opt = 1; ++ if (ioctlsocket(fd, FIONBIO, &opt) != NO_ERROR) { ++ return -socket_error(); ++ } ++ qemu_fd_register(fd); ++ return 0; ++} ++ ++void qemu_set_nonblock(int fd) ++{ ++ (void)qemu_try_set_nonblock(fd); ++} ++ ++int socket_set_fast_reuse(int fd) ++{ ++ /* Enabling the reuse of an endpoint that was used by a socket still in ++ * TIME_WAIT state is usually performed by setting SO_REUSEADDR. On Windows ++ * fast reuse is the default and SO_REUSEADDR does strange things. So we ++ * don't have to do anything here. More info can be found at: ++ * http://msdn.microsoft.com/en-us/library/windows/desktop/ms740621.aspx */ ++ return 0; ++} ++ + int inet_aton(const char *cp, struct in_addr *ia) + { + uint32_t addr = inet_addr(cp); +-- +2.27.0 + diff --git a/kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch b/kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch new file mode 100755 index 0000000000000000000000000000000000000000..8718c711727c73a1cbfbc95583ce42e3a1eff410 --- /dev/null +++ b/kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch @@ -0,0 +1,221 @@ +From 3475ea6598896edb689ca8ba6fb81781e2517b6f Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Thu, 29 Jul 2021 04:56:49 -0400 +Subject: [PATCH 14/14] net: detect errors from probing vnet hdr flag for TAP + devices +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210726102337.6359-3-lvivier@redhat.com> +Patchwork-id: 101923 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] net: detect errors from probing vnet hdr flag for TAP devices +Bugzilla: 1982134 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Daniel P. Berrange" + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1982134 +BRANCH: rhel-8.5.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=38380653 + +When QEMU sets up a tap based network device backend, it mostly ignores errors +reported from various ioctl() calls it makes, assuming the TAP file descriptor +is valid. This assumption can easily be violated when the user is passing in a +pre-opened file descriptor. At best, the ioctls may fail with a -EBADF, but if +the user passes in a bogus FD number that happens to clash with a FD number that +QEMU has opened internally for another reason, a wide variety of errnos may +result, as the TUNGETIFF ioctl number may map to a completely different command +on a different type of file. + +By ignoring all these errors, QEMU sets up a zombie network backend that will +never pass any data. Even worse, when QEMU shuts down, or that network backend +is hot-removed, it will close this bogus file descriptor, which could belong to +another QEMU device backend. + +There's no obvious guaranteed reliable way to detect that a FD genuinely is a +TAP device, as opposed to a UNIX socket, or pipe, or something else. Checking +the errno from probing vnet hdr flag though, does catch the big common cases. +ie calling TUNGETIFF will return EBADF for an invalid FD, and ENOTTY when FD is +a UNIX socket, or pipe which catches accidental collisions with FDs used for +stdio, or monitor socket. + +Previously the example below where bogus fd 9 collides with the FD used for the +chardev saw: + +$ ./x86_64-softmmu/qemu-system-x86_64 -netdev tap,id=hostnet0,fd=9 \ + -chardev socket,id=charchannel0,path=/tmp/qga,server,nowait \ + -monitor stdio -vnc :0 +qemu-system-x86_64: -netdev tap,id=hostnet0,fd=9: TUNGETIFF ioctl() failed: Inappropriate ioctl for device +TUNSETOFFLOAD ioctl() failed: Bad address +QEMU 2.9.1 monitor - type 'help' for more information +(qemu) Warning: netdev hostnet0 has no peer + +which gives a running QEMU with a zombie network backend. + +With this change applied we get an error message and QEMU immediately exits +before carrying on and making a bigger disaster: + +$ ./x86_64-softmmu/qemu-system-x86_64 -netdev tap,id=hostnet0,fd=9 \ + -chardev socket,id=charchannel0,path=/tmp/qga,server,nowait \ + -monitor stdio -vnc :0 +qemu-system-x86_64: -netdev tap,id=hostnet0,vhost=on,fd=9: Unable to query TUNGETIFF on FD 9: Inappropriate ioctl for device + +Reported-by: Dr. David Alan Gilbert +Signed-off-by: Daniel P. Berrange +Tested-by: Dr. David Alan Gilbert +Message-id: 20171027085548.3472-1-berrange@redhat.com +[lv: to simplify, don't check on EINVAL with TUNGETIFF as it exists since v2.6.27] +Signed-off-by: Laurent Vivier +Signed-off-by: Jason Wang +(cherry picked from commit e7b347d0bf640adb1c998d317eaf44d2d7cbd973) +Signed-off-by: Laurent Vivier +Signed-off-by: Miroslav Rezanina +--- + net/tap-bsd.c | 2 +- + net/tap-linux.c | 8 +++++--- + net/tap-solaris.c | 2 +- + net/tap-stub.c | 2 +- + net/tap.c | 25 ++++++++++++++++++++----- + net/tap_int.h | 2 +- + 6 files changed, 29 insertions(+), 12 deletions(-) + +diff --git a/net/tap-bsd.c b/net/tap-bsd.c +index a5c3707f80..77aaf674b1 100644 +--- a/net/tap-bsd.c ++++ b/net/tap-bsd.c +@@ -211,7 +211,7 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + { + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + return 0; + } +diff --git a/net/tap-linux.c b/net/tap-linux.c +index e0dd442ee3..b0635e9e32 100644 +--- a/net/tap-linux.c ++++ b/net/tap-linux.c +@@ -147,13 +147,15 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + } + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + struct ifreq ifr; + + if (ioctl(fd, TUNGETIFF, &ifr) != 0) { +- error_report("TUNGETIFF ioctl() failed: %s", strerror(errno)); +- return 0; ++ /* TUNGETIFF is available since kernel v2.6.27 */ ++ error_setg_errno(errp, errno, ++ "Unable to query TUNGETIFF on FD %d", fd); ++ return -1; + } + + return ifr.ifr_flags & IFF_VNET_HDR; +diff --git a/net/tap-solaris.c b/net/tap-solaris.c +index 4725d2314e..ae2ba68284 100644 +--- a/net/tap-solaris.c ++++ b/net/tap-solaris.c +@@ -206,7 +206,7 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + { + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + return 0; + } +diff --git a/net/tap-stub.c b/net/tap-stub.c +index a9ab8f8293..de525a2e69 100644 +--- a/net/tap-stub.c ++++ b/net/tap-stub.c +@@ -37,7 +37,7 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) + { + } + +-int tap_probe_vnet_hdr(int fd) ++int tap_probe_vnet_hdr(int fd, Error **errp) + { + return 0; + } +diff --git a/net/tap.c b/net/tap.c +index 41a20102fd..b37ccae00c 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -597,7 +597,11 @@ int net_init_bridge(const Netdev *netdev, const char *name, + } + + qemu_set_nonblock(fd); +- vnet_hdr = tap_probe_vnet_hdr(fd); ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ close(fd); ++ return -1; ++ } + s = net_tap_fd_init(peer, "bridge", name, fd, vnet_hdr); + + snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s,br=%s", helper, +@@ -810,7 +814,11 @@ int net_init_tap(const Netdev *netdev, const char *name, + return -1; + } + +- vnet_hdr = tap_probe_vnet_hdr(fd); ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ close(fd); ++ return -1; ++ } + + net_init_tap_one(tap, peer, "tap", name, NULL, + script, downscript, +@@ -863,8 +871,11 @@ int net_init_tap(const Netdev *netdev, const char *name, + } + + if (i == 0) { +- vnet_hdr = tap_probe_vnet_hdr(fd); +- } else if (vnet_hdr != tap_probe_vnet_hdr(fd)) { ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ goto free_fail; ++ } ++ } else if (vnet_hdr != tap_probe_vnet_hdr(fd, NULL)) { + error_setg(errp, + "vnet_hdr not consistent across given tap fds"); + ret = -1; +@@ -909,7 +920,11 @@ free_fail: + } + + qemu_set_nonblock(fd); +- vnet_hdr = tap_probe_vnet_hdr(fd); ++ vnet_hdr = tap_probe_vnet_hdr(fd, errp); ++ if (vnet_hdr < 0) { ++ close(fd); ++ return -1; ++ } + + net_init_tap_one(tap, peer, "bridge", name, ifname, + script, downscript, vhostfdname, +diff --git a/net/tap_int.h b/net/tap_int.h +index e3194b23f4..225a49ea48 100644 +--- a/net/tap_int.h ++++ b/net/tap_int.h +@@ -34,7 +34,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen); + + void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp); +-int tap_probe_vnet_hdr(int fd); ++int tap_probe_vnet_hdr(int fd, Error **errp); + int tap_probe_vnet_hdr_len(int fd, int len); + int tap_probe_has_ufo(int fd); + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); +-- +2.27.0 + diff --git a/kvm-net-forbid-the-reentrant-RX.patch b/kvm-net-forbid-the-reentrant-RX.patch new file mode 100755 index 0000000000000000000000000000000000000000..aaf57ed7a9d1e93ebfe0c1615d25b66263830f3d --- /dev/null +++ b/kvm-net-forbid-the-reentrant-RX.patch @@ -0,0 +1,50 @@ +From 1e01e2f96fd5e903394eab59365d5363394c8b18 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 18:59:12 -0400 +Subject: [PATCH 3/5] net: forbid the reentrant RX + +RH-Author: Jon Maloy +Message-id: <20210413185912.3811035-2-jmaloy@redhat.com> +Patchwork-id: 101467 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] net: forbid the reentrant RX +Bugzilla: 1859175 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Xiao Wang + +From: Jason Wang + +The memory API allows DMA into NIC's MMIO area. This means the NIC's +RX routine must be reentrant. Instead of auditing all the NIC, we can +simply detect the reentrancy and return early. The queue->delivering +is set and cleared by qemu_net_queue_deliver() for other queue helpers +to know whether the delivering in on going (NIC's receive is being +called). We can check it and return early in qemu_net_queue_flush() to +forbid reentrant RX. + +Signed-off-by: Jason Wang + +(cherry picked from commit 22dc8663d9fc7baa22100544c600b6285a63c7a3) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + net/queue.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/net/queue.c b/net/queue.c +index 61276ca4be..c679d79f4b 100644 +--- a/net/queue.c ++++ b/net/queue.c +@@ -250,6 +250,9 @@ void qemu_net_queue_purge(NetQueue *queue, NetClientState *from) + + bool qemu_net_queue_flush(NetQueue *queue) + { ++ if (queue->delivering) ++ return false; ++ + while (!QTAILQ_EMPTY(&queue->packets)) { + NetPacket *packet; + int ret; +-- +2.27.0 + diff --git a/kvm-net-introduce-qemu_receive_packet.patch b/kvm-net-introduce-qemu_receive_packet.patch new file mode 100755 index 0000000000000000000000000000000000000000..8de8cae983f767edbd8f48b764b59746fd24b57d --- /dev/null +++ b/kvm-net-introduce-qemu_receive_packet.patch @@ -0,0 +1,187 @@ +From 89732bf03b26daaebbd3e6e031e79459ae3f77e1 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:39 -0400 +Subject: [PATCH 1/9] net: introduce qemu_receive_packet() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-2-jmaloy@redhat.com> +Patchwork-id: 101785 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 1/9] net: introduce qemu_receive_packet() +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +Some NIC supports loopback mode and this is done by calling +nc->info->receive() directly which in fact suppresses the effort of +reentrancy check that is done in qemu_net_queue_send(). + +Unfortunately we can't use qemu_net_queue_send() here since for +loopback there's no sender as peer, so this patch introduce a +qemu_receive_packet() which is used for implementing loopback mode +for a NIC with this check. + +NIC that supports loopback mode will be converted to this helper. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Reviewed-by: Philippe Mathieu-Daudé +Cc: qemu-stable@nongnu.org +Signed-off-by: Jason Wang + +(cherry picked from commit 705df5466c98f3efdd2b68d3b31dad86858acad7) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + include/net/net.h | 5 +++++ + include/net/queue.h | 8 ++++++++ + net/net.c | 38 +++++++++++++++++++++++++++++++------- + net/queue.c | 22 ++++++++++++++++++++++ + 4 files changed, 66 insertions(+), 7 deletions(-) + +diff --git a/include/net/net.h b/include/net/net.h +index e175ba9677..1b32a8aaec 100644 +--- a/include/net/net.h ++++ b/include/net/net.h +@@ -142,12 +142,17 @@ void *qemu_get_nic_opaque(NetClientState *nc); + void qemu_del_net_client(NetClientState *nc); + typedef void (*qemu_nic_foreach)(NICState *nic, void *opaque); + void qemu_foreach_nic(qemu_nic_foreach func, void *opaque); ++int qemu_can_receive_packet(NetClientState *nc); + int qemu_can_send_packet(NetClientState *nc); + ssize_t qemu_sendv_packet(NetClientState *nc, const struct iovec *iov, + int iovcnt); + ssize_t qemu_sendv_packet_async(NetClientState *nc, const struct iovec *iov, + int iovcnt, NetPacketSent *sent_cb); + ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size); ++ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size); ++ssize_t qemu_receive_packet_iov(NetClientState *nc, ++ const struct iovec *iov, ++ int iovcnt); + ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size); + ssize_t qemu_send_packet_async(NetClientState *nc, const uint8_t *buf, + int size, NetPacketSent *sent_cb); +diff --git a/include/net/queue.h b/include/net/queue.h +index c0269bb1dc..9f2f289d77 100644 +--- a/include/net/queue.h ++++ b/include/net/queue.h +@@ -55,6 +55,14 @@ void qemu_net_queue_append_iov(NetQueue *queue, + + void qemu_del_net_queue(NetQueue *queue); + ++ssize_t qemu_net_queue_receive(NetQueue *queue, ++ const uint8_t *data, ++ size_t size); ++ ++ssize_t qemu_net_queue_receive_iov(NetQueue *queue, ++ const struct iovec *iov, ++ int iovcnt); ++ + ssize_t qemu_net_queue_send(NetQueue *queue, + NetClientState *sender, + unsigned flags, +diff --git a/net/net.c b/net/net.c +index 84aa6d8d00..d0b651ca95 100644 +--- a/net/net.c ++++ b/net/net.c +@@ -516,6 +516,17 @@ int qemu_set_vnet_be(NetClientState *nc, bool is_be) + #endif + } + ++int qemu_can_receive_packet(NetClientState *nc) ++{ ++ if (nc->receive_disabled) { ++ return 0; ++ } else if (nc->info->can_receive && ++ !nc->info->can_receive(nc)) { ++ return 0; ++ } ++ return 1; ++} ++ + int qemu_can_send_packet(NetClientState *sender) + { + int vm_running = runstate_is_running(); +@@ -528,13 +539,7 @@ int qemu_can_send_packet(NetClientState *sender) + return 1; + } + +- if (sender->peer->receive_disabled) { +- return 0; +- } else if (sender->peer->info->can_receive && +- !sender->peer->info->can_receive(sender->peer)) { +- return 0; +- } +- return 1; ++ return qemu_can_receive_packet(sender->peer); + } + + static ssize_t filter_receive_iov(NetClientState *nc, +@@ -667,6 +672,25 @@ ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size) + return qemu_send_packet_async(nc, buf, size, NULL); + } + ++ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size) ++{ ++ if (!qemu_can_receive_packet(nc)) { ++ return 0; ++ } ++ ++ return qemu_net_queue_receive(nc->incoming_queue, buf, size); ++} ++ ++ssize_t qemu_receive_packet_iov(NetClientState *nc, const struct iovec *iov, ++ int iovcnt) ++{ ++ if (!qemu_can_receive_packet(nc)) { ++ return 0; ++ } ++ ++ return qemu_net_queue_receive_iov(nc->incoming_queue, iov, iovcnt); ++} ++ + ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size) + { + return qemu_send_packet_async_with_flags(nc, QEMU_NET_PACKET_FLAG_RAW, +diff --git a/net/queue.c b/net/queue.c +index c679d79f4b..5f0f9ffcaf 100644 +--- a/net/queue.c ++++ b/net/queue.c +@@ -182,6 +182,28 @@ static ssize_t qemu_net_queue_deliver_iov(NetQueue *queue, + return ret; + } + ++ssize_t qemu_net_queue_receive(NetQueue *queue, ++ const uint8_t *data, ++ size_t size) ++{ ++ if (queue->delivering) { ++ return 0; ++ } ++ ++ return qemu_net_queue_deliver(queue, NULL, 0, data, size); ++} ++ ++ssize_t qemu_net_queue_receive_iov(NetQueue *queue, ++ const struct iovec *iov, ++ int iovcnt) ++{ ++ if (queue->delivering) { ++ return 0; ++ } ++ ++ return qemu_net_queue_deliver_iov(queue, NULL, 0, iov, iovcnt); ++} ++ + ssize_t qemu_net_queue_send(NetQueue *queue, + NetClientState *sender, + unsigned flags, +-- +2.27.0 + diff --git a/kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch b/kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch new file mode 100755 index 0000000000000000000000000000000000000000..b619e7864a144e1425ae7f7c45e6ee16ee410718 --- /dev/null +++ b/kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch @@ -0,0 +1,59 @@ +From b7de63e72c479df42c324c058a487517210fa069 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 13 Apr 2021 19:21:50 -0400 +Subject: [PATCH 1/5] net: remove an assert call in eth_get_gso_type + +RH-Author: Jon Maloy +Message-id: <20210413192150.3817133-2-jmaloy@redhat.com> +Patchwork-id: 101469 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] net: remove an assert call in eth_get_gso_type +Bugzilla: 1892350 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Xiao Wang + +From: Prasad J Pandit + +eth_get_gso_type() routine returns segmentation offload type based on +L3 protocol type. It calls g_assert_not_reached if L3 protocol is +unknown, making the following return statement unreachable. Remove the +g_assert call, it maybe triggered by a guest user. + +Reported-by: Gaoning Pan +Signed-off-by: Prasad J Pandit +Signed-off-by: Jason Wang + +(cherry picked from commit 7564bf7701f00214cdc8a678a9f7df765244def1) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + net/eth.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/net/eth.c b/net/eth.c +index 0c1d413ee2..1e0821c5f8 100644 +--- a/net/eth.c ++++ b/net/eth.c +@@ -16,6 +16,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/log.h" + #include "net/eth.h" + #include "net/checksum.h" + #include "net/tap.h" +@@ -71,9 +72,8 @@ eth_get_gso_type(uint16_t l3_proto, uint8_t *l3_hdr, uint8_t l4proto) + return VIRTIO_NET_HDR_GSO_TCPV6 | ecn_state; + } + } +- +- /* Unsupported offload */ +- g_assert_not_reached(); ++ qemu_log_mask(LOG_UNIMP, "%s: probably not GSO frame, " ++ "unknown L3 protocol: 0x%04"PRIx16"\n", __func__, l3_proto); + + return VIRTIO_NET_HDR_GSO_NONE | ecn_state; + } +-- +2.27.0 + diff --git a/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch b/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch new file mode 100755 index 0000000000000000000000000000000000000000..6d9382c0ee14a08a8964914ff3fac91eb0327424 --- /dev/null +++ b/kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch @@ -0,0 +1,318 @@ +From 70f8bbb27f9f357ea83ff6639fc00aa60fc902b9 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:47 +0100 +Subject: [PATCH 04/12] numa: Extend CLI to provide initiator information for + numa nodes + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-4-plai@redhat.com> +Patchwork-id: 96736 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 03/11] numa: Extend CLI to provide initiator information for numa nodes +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Tao Xu + +In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT), +The initiator represents processor which access to memory. And in 5.2.27.3 +Memory Proximity Domain Attributes Structure, the attached initiator is +defined as where the memory controller responsible for a memory proximity +domain. With attached initiator information, the topology of heterogeneous +memory can be described. Add new machine property 'hmat' to enable all +HMAT specific options. + +Extend CLI of "-numa node" option to indicate the initiator numa node-id. +In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report +the platform's HMAT tables. Before using initiator option, enable HMAT with +-machine hmat=on. + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Jingqi Liu +Suggested-by: Dan Williams +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-2-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 244b3f4485a07c7ce4b7123d6ce9d8c6012756e8) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/machine.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/core/numa.c | 23 ++++++++++++++++++ + include/sysemu/numa.h | 5 ++++ + qapi/machine.json | 10 +++++++- + qemu-options.hx | 35 ++++++++++++++++++++++++---- + 5 files changed, 131 insertions(+), 6 deletions(-) + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index 19c78c6..cb21ae1 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -688,6 +688,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) + ms->nvdimms_state->is_enabled = value; + } + ++static bool machine_get_hmat(Object *obj, Error **errp) ++{ ++ MachineState *ms = MACHINE(obj); ++ ++ return ms->numa_state->hmat_enabled; ++} ++ ++static void machine_set_hmat(Object *obj, bool value, Error **errp) ++{ ++ MachineState *ms = MACHINE(obj); ++ ++ ms->numa_state->hmat_enabled = value; ++} ++ + static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) + { + MachineState *ms = MACHINE(obj); +@@ -815,6 +829,7 @@ void machine_set_cpu_numa_node(MachineState *machine, + const CpuInstanceProperties *props, Error **errp) + { + MachineClass *mc = MACHINE_GET_CLASS(machine); ++ NodeInfo *numa_info = machine->numa_state->nodes; + bool match = false; + int i; + +@@ -884,6 +899,17 @@ void machine_set_cpu_numa_node(MachineState *machine, + match = true; + slot->props.node_id = props->node_id; + slot->props.has_node_id = props->has_node_id; ++ ++ if (machine->numa_state->hmat_enabled) { ++ if ((numa_info[props->node_id].initiator < MAX_NODES) && ++ (props->node_id != numa_info[props->node_id].initiator)) { ++ error_setg(errp, "The initiator of CPU NUMA node %" PRId64 ++ " should be itself", props->node_id); ++ return; ++ } ++ numa_info[props->node_id].has_cpu = true; ++ numa_info[props->node_id].initiator = props->node_id; ++ } + } + + if (!match) { +@@ -1130,6 +1156,13 @@ static void machine_initfn(Object *obj) + + if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { + ms->numa_state = g_new0(NumaState, 1); ++ object_property_add_bool(obj, "hmat", ++ machine_get_hmat, machine_set_hmat, ++ &error_abort); ++ object_property_set_description(obj, "hmat", ++ "Set on/off to enable/disable " ++ "ACPI Heterogeneous Memory Attribute " ++ "Table (HMAT)", NULL); + } + + /* Register notifier when init is done for sysbus sanity checks */ +@@ -1218,6 +1251,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) + return g_string_free(s, false); + } + ++static void numa_validate_initiator(NumaState *numa_state) ++{ ++ int i; ++ NodeInfo *numa_info = numa_state->nodes; ++ ++ for (i = 0; i < numa_state->num_nodes; i++) { ++ if (numa_info[i].initiator == MAX_NODES) { ++ error_report("The initiator of NUMA node %d is missing, use " ++ "'-numa node,initiator' option to declare it", i); ++ exit(1); ++ } ++ ++ if (!numa_info[numa_info[i].initiator].present) { ++ error_report("NUMA node %" PRIu16 " is missing, use " ++ "'-numa node' option to declare it first", ++ numa_info[i].initiator); ++ exit(1); ++ } ++ ++ if (!numa_info[numa_info[i].initiator].has_cpu) { ++ error_report("The initiator of NUMA node %d is invalid", i); ++ exit(1); ++ } ++ } ++} ++ + static void machine_numa_finish_cpu_init(MachineState *machine) + { + int i; +@@ -1258,6 +1317,11 @@ static void machine_numa_finish_cpu_init(MachineState *machine) + machine_set_cpu_numa_node(machine, &props, &error_fatal); + } + } ++ ++ if (machine->numa_state->hmat_enabled) { ++ numa_validate_initiator(machine->numa_state); ++ } ++ + if (s->len && !qtest_enabled()) { + warn_report("CPU(s) not present in any NUMA nodes: %s", + s->str); +diff --git a/hw/core/numa.c b/hw/core/numa.c +index 19f082d..a07eef9 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -129,6 +129,29 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, + numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL); + numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); + } ++ ++ /* ++ * If not set the initiator, set it to MAX_NODES. And if ++ * HMAT is enabled and this node has no cpus, QEMU will raise error. ++ */ ++ numa_info[nodenr].initiator = MAX_NODES; ++ if (node->has_initiator) { ++ if (!ms->numa_state->hmat_enabled) { ++ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " ++ "(HMAT) is disabled, enable it with -machine hmat=on " ++ "before using any of hmat specific options"); ++ return; ++ } ++ ++ if (node->initiator >= MAX_NODES) { ++ error_report("The initiator id %" PRIu16 " expects an integer " ++ "between 0 and %d", node->initiator, ++ MAX_NODES - 1); ++ return; ++ } ++ ++ numa_info[nodenr].initiator = node->initiator; ++ } + numa_info[nodenr].present = true; + max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); + ms->numa_state->num_nodes++; +diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h +index ae9c41d..788cbec 100644 +--- a/include/sysemu/numa.h ++++ b/include/sysemu/numa.h +@@ -18,6 +18,8 @@ struct NodeInfo { + uint64_t node_mem; + struct HostMemoryBackend *node_memdev; + bool present; ++ bool has_cpu; ++ uint16_t initiator; + uint8_t distance[MAX_NODES]; + }; + +@@ -33,6 +35,9 @@ struct NumaState { + /* Allow setting NUMA distance for different NUMA nodes */ + bool have_numa_distance; + ++ /* Detect if HMAT support is enabled. */ ++ bool hmat_enabled; ++ + /* NUMA nodes information */ + NodeInfo nodes[MAX_NODES]; + }; +diff --git a/qapi/machine.json b/qapi/machine.json +index ca26779..27d0e37 100644 +--- a/qapi/machine.json ++++ b/qapi/machine.json +@@ -463,6 +463,13 @@ + # @memdev: memory backend object. If specified for one node, + # it must be specified for all nodes. + # ++# @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, ++# points to the nodeid which has the memory controller ++# responsible for this NUMA node. This field provides ++# additional information as to the initiator node that ++# is closest (as in directly attached) to this node, and ++# therefore has the best performance (since 5.0) ++# + # Since: 2.1 + ## + { 'struct': 'NumaNodeOptions', +@@ -470,7 +477,8 @@ + '*nodeid': 'uint16', + '*cpus': ['uint16'], + '*mem': 'size', +- '*memdev': 'str' }} ++ '*memdev': 'str', ++ '*initiator': 'uint16' }} + + ## + # @NumaDistOptions: +diff --git a/qemu-options.hx b/qemu-options.hx +index df1d27b..e2ce754 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -43,7 +43,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ + " suppress-vmdesc=on|off disables self-describing migration (default=off)\n" + " nvdimm=on|off controls NVDIMM support (default=off)\n" + " enforce-config-section=on|off enforce configuration section migration (default=off)\n" +- " memory-encryption=@var{} memory encryption object to use (default=none)\n", ++ " memory-encryption=@var{} memory encryption object to use (default=none)\n" ++ " hmat=on|off controls ACPI HMAT support (default=off)\n", + QEMU_ARCH_ALL) + STEXI + @item -machine [type=]@var{name}[,prop=@var{value}[,...]] +@@ -103,6 +104,9 @@ NOTE: this parameter is deprecated. Please use @option{-global} + @option{migration.send-configuration}=@var{on|off} instead. + @item memory-encryption=@var{} + Memory encryption object to use. The default is none. ++@item hmat=on|off ++Enables or disables ACPI Heterogeneous Memory Attribute Table (HMAT) support. ++The default is off. + @end table + ETEXI + +@@ -161,14 +165,14 @@ If any on the three values is given, the total number of CPUs @var{n} can be omi + ETEXI + + DEF("numa", HAS_ARG, QEMU_OPTION_numa, +- "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" +- "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" ++ "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" ++ "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa dist,src=source,dst=destination,val=distance\n" + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", + QEMU_ARCH_ALL) + STEXI +-@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] +-@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] ++@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] ++@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] + @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} + @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] + @findex -numa +@@ -215,6 +219,27 @@ split equally between them. + @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore, + if one node uses @samp{memdev}, all of them have to use it. + ++@samp{initiator} is an additional option that points to an @var{initiator} ++NUMA node that has best performance (the lowest latency or largest bandwidth) ++to this NUMA @var{node}. Note that this option can be set only when ++the machine property 'hmat' is set to 'on'. ++ ++Following example creates a machine with 2 NUMA nodes, node 0 has CPU. ++node 1 has only memory, and its initiator is node 0. Note that because ++node 0 has CPU, by default the initiator of node 0 is itself and must be ++itself. ++@example ++-machine hmat=on \ ++-m 2G,slots=2,maxmem=4G \ ++-object memory-backend-ram,size=1G,id=m0 \ ++-object memory-backend-ram,size=1G,id=m1 \ ++-numa node,nodeid=0,memdev=m0 \ ++-numa node,nodeid=1,memdev=m1,initiator=0 \ ++-smp 2,sockets=2,maxcpus=2 \ ++-numa cpu,node-id=0,socket-id=0 \ ++-numa cpu,node-id=0,socket-id=1 ++@end example ++ + @var{source} and @var{destination} are NUMA node IDs. + @var{distance} is the NUMA distance from @var{source} to @var{destination}. + The distance from a node to itself is always 10. If any pair of nodes is +-- +1.8.3.1 + diff --git a/kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch b/kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch new file mode 100755 index 0000000000000000000000000000000000000000..306abebec73ed0165d39fea5b066ef515b4b146f --- /dev/null +++ b/kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch @@ -0,0 +1,545 @@ +From 32341d8cf680625def040b44d70b197f2399bbdb Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:48 +0100 +Subject: [PATCH 05/12] numa: Extend CLI to provide memory latency and + bandwidth information + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-5-plai@redhat.com> +Patchwork-id: 96731 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 04/11] numa: Extend CLI to provide memory latency and bandwidth information +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +Add -numa hmat-lb option to provide System Locality Latency and +Bandwidth Information. These memory attributes help to build +System Locality Latency and Bandwidth Information Structure(s) +in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using +hmat-lb option, enable HMAT with -machine hmat=on. + +Acked-by: Markus Armbruster +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-3-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Igor Mammedov +(cherry picked from commit 9b12dfa03a94d7f7a4b54eb67229a31e58193384) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/numa.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++ + include/sysemu/numa.h | 53 ++++++++++++++ + qapi/machine.json | 93 +++++++++++++++++++++++- + qemu-options.hx | 47 +++++++++++- + 4 files changed, 384 insertions(+), 3 deletions(-) + +diff --git a/hw/core/numa.c b/hw/core/numa.c +index a07eef9..58fe713 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -23,6 +23,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/units.h" + #include "sysemu/hostmem.h" + #include "sysemu/numa.h" + #include "sysemu/sysemu.h" +@@ -194,6 +195,186 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) + ms->numa_state->have_numa_distance = true; + } + ++void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, ++ Error **errp) ++{ ++ int i, first_bit, last_bit; ++ uint64_t max_entry, temp_base, bitmap_copy; ++ NodeInfo *numa_info = numa_state->nodes; ++ HMAT_LB_Info *hmat_lb = ++ numa_state->hmat_lb[node->hierarchy][node->data_type]; ++ HMAT_LB_Data lb_data = {}; ++ HMAT_LB_Data *lb_temp; ++ ++ /* Error checking */ ++ if (node->initiator > numa_state->num_nodes) { ++ error_setg(errp, "Invalid initiator=%d, it should be less than %d", ++ node->initiator, numa_state->num_nodes); ++ return; ++ } ++ if (node->target > numa_state->num_nodes) { ++ error_setg(errp, "Invalid target=%d, it should be less than %d", ++ node->target, numa_state->num_nodes); ++ return; ++ } ++ if (!numa_info[node->initiator].has_cpu) { ++ error_setg(errp, "Invalid initiator=%d, it isn't an " ++ "initiator proximity domain", node->initiator); ++ return; ++ } ++ if (!numa_info[node->target].present) { ++ error_setg(errp, "The target=%d should point to an existing node", ++ node->target); ++ return; ++ } ++ ++ if (!hmat_lb) { ++ hmat_lb = g_malloc0(sizeof(*hmat_lb)); ++ numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; ++ hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); ++ } ++ hmat_lb->hierarchy = node->hierarchy; ++ hmat_lb->data_type = node->data_type; ++ lb_data.initiator = node->initiator; ++ lb_data.target = node->target; ++ ++ if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { ++ /* Input latency data */ ++ ++ if (!node->has_latency) { ++ error_setg(errp, "Missing 'latency' option"); ++ return; ++ } ++ if (node->has_bandwidth) { ++ error_setg(errp, "Invalid option 'bandwidth' since " ++ "the data type is latency"); ++ return; ++ } ++ ++ /* Detect duplicate configuration */ ++ for (i = 0; i < hmat_lb->list->len; i++) { ++ lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); ++ ++ if (node->initiator == lb_temp->initiator && ++ node->target == lb_temp->target) { ++ error_setg(errp, "Duplicate configuration of the latency for " ++ "initiator=%d and target=%d", node->initiator, ++ node->target); ++ return; ++ } ++ } ++ ++ hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX; ++ ++ if (node->latency) { ++ /* Calculate the temporary base and compressed latency */ ++ max_entry = node->latency; ++ temp_base = 1; ++ while (QEMU_IS_ALIGNED(max_entry, 10)) { ++ max_entry /= 10; ++ temp_base *= 10; ++ } ++ ++ /* Calculate the max compressed latency */ ++ temp_base = MIN(hmat_lb->base, temp_base); ++ max_entry = node->latency / hmat_lb->base; ++ max_entry = MAX(hmat_lb->range_bitmap, max_entry); ++ ++ /* ++ * For latency hmat_lb->range_bitmap record the max compressed ++ * latency which should be less than 0xFFFF (UINT16_MAX) ++ */ ++ if (max_entry >= UINT16_MAX) { ++ error_setg(errp, "Latency %" PRIu64 " between initiator=%d and " ++ "target=%d should not differ from previously entered " ++ "min or max values on more than %d", node->latency, ++ node->initiator, node->target, UINT16_MAX - 1); ++ return; ++ } else { ++ hmat_lb->base = temp_base; ++ hmat_lb->range_bitmap = max_entry; ++ } ++ ++ /* ++ * Set lb_info_provided bit 0 as 1, ++ * latency information is provided ++ */ ++ numa_info[node->target].lb_info_provided |= BIT(0); ++ } ++ lb_data.data = node->latency; ++ } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) { ++ /* Input bandwidth data */ ++ if (!node->has_bandwidth) { ++ error_setg(errp, "Missing 'bandwidth' option"); ++ return; ++ } ++ if (node->has_latency) { ++ error_setg(errp, "Invalid option 'latency' since " ++ "the data type is bandwidth"); ++ return; ++ } ++ if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) { ++ error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and " ++ "target=%d should be 1MB aligned", node->bandwidth, ++ node->initiator, node->target); ++ return; ++ } ++ ++ /* Detect duplicate configuration */ ++ for (i = 0; i < hmat_lb->list->len; i++) { ++ lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); ++ ++ if (node->initiator == lb_temp->initiator && ++ node->target == lb_temp->target) { ++ error_setg(errp, "Duplicate configuration of the bandwidth for " ++ "initiator=%d and target=%d", node->initiator, ++ node->target); ++ return; ++ } ++ } ++ ++ hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1; ++ ++ if (node->bandwidth) { ++ /* Keep bitmap unchanged when bandwidth out of range */ ++ bitmap_copy = hmat_lb->range_bitmap; ++ bitmap_copy |= node->bandwidth; ++ first_bit = ctz64(bitmap_copy); ++ temp_base = UINT64_C(1) << first_bit; ++ max_entry = node->bandwidth / temp_base; ++ last_bit = 64 - clz64(bitmap_copy); ++ ++ /* ++ * For bandwidth, first_bit record the base unit of bandwidth bits, ++ * last_bit record the last bit of the max bandwidth. The max ++ * compressed bandwidth should be less than 0xFFFF (UINT16_MAX) ++ */ ++ if ((last_bit - first_bit) > UINT16_BITS || ++ max_entry >= UINT16_MAX) { ++ error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d " ++ "and target=%d should not differ from previously " ++ "entered values on more than %d", node->bandwidth, ++ node->initiator, node->target, UINT16_MAX - 1); ++ return; ++ } else { ++ hmat_lb->base = temp_base; ++ hmat_lb->range_bitmap = bitmap_copy; ++ } ++ ++ /* ++ * Set lb_info_provided bit 1 as 1, ++ * bandwidth information is provided ++ */ ++ numa_info[node->target].lb_info_provided |= BIT(1); ++ } ++ lb_data.data = node->bandwidth; ++ } else { ++ assert(0); ++ } ++ ++ g_array_append_val(hmat_lb->list, lb_data); ++} ++ + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + { + Error *err = NULL; +@@ -231,6 +412,19 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu), + &err); + break; ++ case NUMA_OPTIONS_TYPE_HMAT_LB: ++ if (!ms->numa_state->hmat_enabled) { ++ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " ++ "(HMAT) is disabled, enable it with -machine hmat=on " ++ "before using any of hmat specific options"); ++ return; ++ } ++ ++ parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err); ++ if (err) { ++ goto end; ++ } ++ break; + default: + abort(); + } +diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h +index 788cbec..70f93c8 100644 +--- a/include/sysemu/numa.h ++++ b/include/sysemu/numa.h +@@ -14,11 +14,34 @@ struct CPUArchId; + #define NUMA_DISTANCE_MAX 254 + #define NUMA_DISTANCE_UNREACHABLE 255 + ++/* the value of AcpiHmatLBInfo flags */ ++enum { ++ HMAT_LB_MEM_MEMORY = 0, ++ HMAT_LB_MEM_CACHE_1ST_LEVEL = 1, ++ HMAT_LB_MEM_CACHE_2ND_LEVEL = 2, ++ HMAT_LB_MEM_CACHE_3RD_LEVEL = 3, ++ HMAT_LB_LEVELS /* must be the last entry */ ++}; ++ ++/* the value of AcpiHmatLBInfo data type */ ++enum { ++ HMAT_LB_DATA_ACCESS_LATENCY = 0, ++ HMAT_LB_DATA_READ_LATENCY = 1, ++ HMAT_LB_DATA_WRITE_LATENCY = 2, ++ HMAT_LB_DATA_ACCESS_BANDWIDTH = 3, ++ HMAT_LB_DATA_READ_BANDWIDTH = 4, ++ HMAT_LB_DATA_WRITE_BANDWIDTH = 5, ++ HMAT_LB_TYPES /* must be the last entry */ ++}; ++ ++#define UINT16_BITS 16 ++ + struct NodeInfo { + uint64_t node_mem; + struct HostMemoryBackend *node_memdev; + bool present; + bool has_cpu; ++ uint8_t lb_info_provided; + uint16_t initiator; + uint8_t distance[MAX_NODES]; + }; +@@ -28,6 +51,31 @@ struct NumaNodeMem { + uint64_t node_plugged_mem; + }; + ++struct HMAT_LB_Data { ++ uint8_t initiator; ++ uint8_t target; ++ uint64_t data; ++}; ++typedef struct HMAT_LB_Data HMAT_LB_Data; ++ ++struct HMAT_LB_Info { ++ /* Indicates it's memory or the specified level memory side cache. */ ++ uint8_t hierarchy; ++ ++ /* Present the type of data, access/read/write latency or bandwidth. */ ++ uint8_t data_type; ++ ++ /* The range bitmap of bandwidth for calculating common base */ ++ uint64_t range_bitmap; ++ ++ /* The common base unit for latencies or bandwidths */ ++ uint64_t base; ++ ++ /* Array to store the latencies or bandwidths */ ++ GArray *list; ++}; ++typedef struct HMAT_LB_Info HMAT_LB_Info; ++ + struct NumaState { + /* Number of NUMA nodes */ + int num_nodes; +@@ -40,11 +88,16 @@ struct NumaState { + + /* NUMA nodes information */ + NodeInfo nodes[MAX_NODES]; ++ ++ /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ ++ HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; + }; + typedef struct NumaState NumaState; + + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); + void parse_numa_opts(MachineState *ms); ++void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, ++ Error **errp); + void numa_complete_configuration(MachineState *ms); + void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); + extern QemuOptsList qemu_numa_opts; +diff --git a/qapi/machine.json b/qapi/machine.json +index 27d0e37..cf8faf5 100644 +--- a/qapi/machine.json ++++ b/qapi/machine.json +@@ -426,10 +426,12 @@ + # + # @cpu: property based CPU(s) to node mapping (Since: 2.10) + # ++# @hmat-lb: memory latency and bandwidth information (Since: 5.0) ++# + # Since: 2.1 + ## + { 'enum': 'NumaOptionsType', +- 'data': [ 'node', 'dist', 'cpu' ] } ++ 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } + + ## + # @NumaOptions: +@@ -444,7 +446,8 @@ + 'data': { + 'node': 'NumaNodeOptions', + 'dist': 'NumaDistOptions', +- 'cpu': 'NumaCpuOptions' }} ++ 'cpu': 'NumaCpuOptions', ++ 'hmat-lb': 'NumaHmatLBOptions' }} + + ## + # @NumaNodeOptions: +@@ -558,6 +561,92 @@ + 'data' : {} } + + ## ++# @HmatLBMemoryHierarchy: ++# ++# The memory hierarchy in the System Locality Latency and Bandwidth ++# Information Structure of HMAT (Heterogeneous Memory Attribute Table) ++# ++# For more information about @HmatLBMemoryHierarchy, see chapter ++# 5.2.27.4: Table 5-146: Field "Flags" of ACPI 6.3 spec. ++# ++# @memory: the structure represents the memory performance ++# ++# @first-level: first level of memory side cache ++# ++# @second-level: second level of memory side cache ++# ++# @third-level: third level of memory side cache ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatLBMemoryHierarchy', ++ 'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] } ++ ++## ++# @HmatLBDataType: ++# ++# Data type in the System Locality Latency and Bandwidth ++# Information Structure of HMAT (Heterogeneous Memory Attribute Table) ++# ++# For more information about @HmatLBDataType, see chapter ++# 5.2.27.4: Table 5-146: Field "Data Type" of ACPI 6.3 spec. ++# ++# @access-latency: access latency (nanoseconds) ++# ++# @read-latency: read latency (nanoseconds) ++# ++# @write-latency: write latency (nanoseconds) ++# ++# @access-bandwidth: access bandwidth (Bytes per second) ++# ++# @read-bandwidth: read bandwidth (Bytes per second) ++# ++# @write-bandwidth: write bandwidth (Bytes per second) ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatLBDataType', ++ 'data': [ 'access-latency', 'read-latency', 'write-latency', ++ 'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] } ++ ++## ++# @NumaHmatLBOptions: ++# ++# Set the system locality latency and bandwidth information ++# between Initiator and Target proximity Domains. ++# ++# For more information about @NumaHmatLBOptions, see chapter ++# 5.2.27.4: Table 5-146 of ACPI 6.3 spec. ++# ++# @initiator: the Initiator Proximity Domain. ++# ++# @target: the Target Proximity Domain. ++# ++# @hierarchy: the Memory Hierarchy. Indicates the performance ++# of memory or side cache. ++# ++# @data-type: presents the type of data, access/read/write ++# latency or hit latency. ++# ++# @latency: the value of latency from @initiator to @target ++# proximity domain, the latency unit is "ns(nanosecond)". ++# ++# @bandwidth: the value of bandwidth between @initiator and @target ++# proximity domain, the bandwidth unit is ++# "Bytes per second". ++# ++# Since: 5.0 ++## ++{ 'struct': 'NumaHmatLBOptions', ++ 'data': { ++ 'initiator': 'uint16', ++ 'target': 'uint16', ++ 'hierarchy': 'HmatLBMemoryHierarchy', ++ 'data-type': 'HmatLBDataType', ++ '*latency': 'uint64', ++ '*bandwidth': 'size' }} ++ ++## + # @HostMemPolicy: + # + # Host memory policy types +diff --git a/qemu-options.hx b/qemu-options.hx +index e2ce754..86d9d8a 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -168,16 +168,19 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, + "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa dist,src=source,dst=destination,val=distance\n" +- "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", ++ "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" ++ "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n", + QEMU_ARCH_ALL) + STEXI + @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] + @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] + @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} + @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] ++@itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}] + @findex -numa + Define a NUMA node and assign RAM and VCPUs to it. + Set the NUMA distance from a source node to a destination node. ++Set the ACPI Heterogeneous Memory Attributes for the given nodes. + + Legacy VCPU assignment uses @samp{cpus} option where + @var{firstcpu} and @var{lastcpu} are CPU indexes. Each +@@ -256,6 +259,48 @@ specified resources, it just assigns existing resources to NUMA + nodes. This means that one still has to use the @option{-m}, + @option{-smp} options to allocate RAM and VCPUs respectively. + ++Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information ++between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory Table (HMAT). ++Initiator NUMA node can create memory requests, usually it has one or more processors. ++Target NUMA node contains addressable memory. ++ ++In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{hierarchy} is the memory ++hierarchy of the target NUMA node: if @var{hierarchy} is 'memory', the structure ++represents the memory performance; if @var{hierarchy} is 'first-level|second-level|third-level', ++this structure represents aggregated performance of memory side caches for each domain. ++@var{type} of 'data-type' is type of data represented by this structure instance: ++if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency or 'access|read|write' ++bandwidth of the target memory; if 'hierarchy' is 'first-level|second-level|third-level', ++'data-type' is 'access|read|write' hit latency or 'access|read|write' hit bandwidth of the ++target memory side cache. ++ ++@var{lat} is latency value in nanoseconds. @var{bw} is bandwidth value, ++the possible value and units are NUM[M|G|T], mean that the bandwidth value are ++NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix). ++Note that if latency or bandwidth value is 0, means the corresponding latency or ++bandwidth information is not provided. ++ ++For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and ++a ram, node 1 has only a ram. The processors in node 0 access memory in node ++0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s; ++The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10 ++nanoseconds, access-bandwidth is 100 MB/s. ++@example ++-machine hmat=on \ ++-m 2G \ ++-object memory-backend-ram,size=1G,id=m0 \ ++-object memory-backend-ram,size=1G,id=m1 \ ++-smp 2 \ ++-numa node,nodeid=0,memdev=m0 \ ++-numa node,nodeid=1,memdev=m1,initiator=0 \ ++-numa cpu,node-id=0,socket-id=0 \ ++-numa cpu,node-id=0,socket-id=1 \ ++-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \ ++-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \ ++-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \ ++-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M ++@end example ++ + ETEXI + + DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd, +-- +1.8.3.1 + diff --git a/kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch b/kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch new file mode 100755 index 0000000000000000000000000000000000000000..a17db22473f32e4430cef3b9076b97f21bd032fe --- /dev/null +++ b/kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch @@ -0,0 +1,326 @@ +From 8cd3544b1347b248b9d04eb3d6c9b9bde3a13655 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:49 +0100 +Subject: [PATCH 06/12] numa: Extend CLI to provide memory side cache + information + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-6-plai@redhat.com> +Patchwork-id: 96740 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 05/11] numa: Extend CLI to provide memory side cache information +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Liu Jingqi + +Add -numa hmat-cache option to provide Memory Side Cache Information. +These memory attributes help to build Memory Side Cache Information +Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). +Before using hmat-cache option, enable HMAT with -machine hmat=on. + +Acked-by: Markus Armbruster +Signed-off-by: Liu Jingqi +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-4-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Igor Mammedov +(cherry picked from commit c412a48d4d91e8f8b89aae02de0f44f1f0b729e5) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/numa.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ + include/sysemu/numa.h | 5 ++++ + qapi/machine.json | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-- + qemu-options.hx | 17 +++++++++-- + 4 files changed, 179 insertions(+), 4 deletions(-) + +diff --git a/hw/core/numa.c b/hw/core/numa.c +index 58fe713..0d1b4be 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -375,6 +375,73 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + g_array_append_val(hmat_lb->list, lb_data); + } + ++void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, ++ Error **errp) ++{ ++ int nb_numa_nodes = ms->numa_state->num_nodes; ++ NodeInfo *numa_info = ms->numa_state->nodes; ++ NumaHmatCacheOptions *hmat_cache = NULL; ++ ++ if (node->node_id >= nb_numa_nodes) { ++ error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " ++ "than %d", node->node_id, nb_numa_nodes); ++ return; ++ } ++ ++ if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { ++ error_setg(errp, "The latency and bandwidth information of " ++ "node-id=%" PRIu32 " should be provided before memory side " ++ "cache attributes", node->node_id); ++ return; ++ } ++ ++ if (node->level < 1 || node->level >= HMAT_LB_LEVELS) { ++ error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 " ++ "and less than or equal to %d", node->level, ++ HMAT_LB_LEVELS - 1); ++ return; ++ } ++ ++ assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX); ++ assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); ++ if (ms->numa_state->hmat_cache[node->node_id][node->level]) { ++ error_setg(errp, "Duplicate configuration of the side cache for " ++ "node-id=%" PRIu32 " and level=%" PRIu8, ++ node->node_id, node->level); ++ return; ++ } ++ ++ if ((node->level > 1) && ++ ms->numa_state->hmat_cache[node->node_id][node->level - 1] && ++ (node->size >= ++ ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { ++ error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 ++ " should be less than the size(%" PRIu64 ") of " ++ "level=%u", node->size, node->level, ++ ms->numa_state->hmat_cache[node->node_id] ++ [node->level - 1]->size, ++ node->level - 1); ++ return; ++ } ++ ++ if ((node->level < HMAT_LB_LEVELS - 1) && ++ ms->numa_state->hmat_cache[node->node_id][node->level + 1] && ++ (node->size <= ++ ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { ++ error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 ++ " should be larger than the size(%" PRIu64 ") of " ++ "level=%u", node->size, node->level, ++ ms->numa_state->hmat_cache[node->node_id] ++ [node->level + 1]->size, ++ node->level + 1); ++ return; ++ } ++ ++ hmat_cache = g_malloc0(sizeof(*hmat_cache)); ++ memcpy(hmat_cache, node, sizeof(*hmat_cache)); ++ ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; ++} ++ + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + { + Error *err = NULL; +@@ -425,6 +492,19 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + goto end; + } + break; ++ case NUMA_OPTIONS_TYPE_HMAT_CACHE: ++ if (!ms->numa_state->hmat_enabled) { ++ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " ++ "(HMAT) is disabled, enable it with -machine hmat=on " ++ "before using any of hmat specific options"); ++ return; ++ } ++ ++ parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err); ++ if (err) { ++ goto end; ++ } ++ break; + default: + abort(); + } +diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h +index 70f93c8..ba693cc 100644 +--- a/include/sysemu/numa.h ++++ b/include/sysemu/numa.h +@@ -91,6 +91,9 @@ struct NumaState { + + /* NUMA nodes HMAT Locality Latency and Bandwidth Information */ + HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES]; ++ ++ /* Memory Side Cache Information Structure */ ++ NumaHmatCacheOptions *hmat_cache[MAX_NODES][HMAT_LB_LEVELS]; + }; + typedef struct NumaState NumaState; + +@@ -98,6 +101,8 @@ void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); + void parse_numa_opts(MachineState *ms); + void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, + Error **errp); ++void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, ++ Error **errp); + void numa_complete_configuration(MachineState *ms); + void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms); + extern QemuOptsList qemu_numa_opts; +diff --git a/qapi/machine.json b/qapi/machine.json +index cf8faf5..b3d30bc 100644 +--- a/qapi/machine.json ++++ b/qapi/machine.json +@@ -428,10 +428,12 @@ + # + # @hmat-lb: memory latency and bandwidth information (Since: 5.0) + # ++# @hmat-cache: memory side cache information (Since: 5.0) ++# + # Since: 2.1 + ## + { 'enum': 'NumaOptionsType', +- 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } ++ 'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] } + + ## + # @NumaOptions: +@@ -447,7 +449,8 @@ + 'node': 'NumaNodeOptions', + 'dist': 'NumaDistOptions', + 'cpu': 'NumaCpuOptions', +- 'hmat-lb': 'NumaHmatLBOptions' }} ++ 'hmat-lb': 'NumaHmatLBOptions', ++ 'hmat-cache': 'NumaHmatCacheOptions' }} + + ## + # @NumaNodeOptions: +@@ -647,6 +650,80 @@ + '*bandwidth': 'size' }} + + ## ++# @HmatCacheAssociativity: ++# ++# Cache associativity in the Memory Side Cache Information Structure ++# of HMAT ++# ++# For more information of @HmatCacheAssociativity, see chapter ++# 5.2.27.5: Table 5-147 of ACPI 6.3 spec. ++# ++# @none: None (no memory side cache in this proximity domain, ++# or cache associativity unknown) ++# ++# @direct: Direct Mapped ++# ++# @complex: Complex Cache Indexing (implementation specific) ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatCacheAssociativity', ++ 'data': [ 'none', 'direct', 'complex' ] } ++ ++## ++# @HmatCacheWritePolicy: ++# ++# Cache write policy in the Memory Side Cache Information Structure ++# of HMAT ++# ++# For more information of @HmatCacheWritePolicy, see chapter ++# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. ++# ++# @none: None (no memory side cache in this proximity domain, ++# or cache write policy unknown) ++# ++# @write-back: Write Back (WB) ++# ++# @write-through: Write Through (WT) ++# ++# Since: 5.0 ++## ++{ 'enum': 'HmatCacheWritePolicy', ++ 'data': [ 'none', 'write-back', 'write-through' ] } ++ ++## ++# @NumaHmatCacheOptions: ++# ++# Set the memory side cache information for a given memory domain. ++# ++# For more information of @NumaHmatCacheOptions, see chapter ++# 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. ++# ++# @node-id: the memory proximity domain to which the memory belongs. ++# ++# @size: the size of memory side cache in bytes. ++# ++# @level: the cache level described in this structure. ++# ++# @associativity: the cache associativity, ++# none/direct-mapped/complex(complex cache indexing). ++# ++# @policy: the write policy, none/write-back/write-through. ++# ++# @line: the cache Line size in bytes. ++# ++# Since: 5.0 ++## ++{ 'struct': 'NumaHmatCacheOptions', ++ 'data': { ++ 'node-id': 'uint32', ++ 'size': 'size', ++ 'level': 'uint8', ++ 'associativity': 'HmatCacheAssociativity', ++ 'policy': 'HmatCacheWritePolicy', ++ 'line': 'uint16' }} ++ ++## + # @HostMemPolicy: + # + # Host memory policy types +diff --git a/qemu-options.hx b/qemu-options.hx +index 86d9d8a..8fe05b6 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -169,7 +169,8 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa dist,src=source,dst=destination,val=distance\n" + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" +- "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n", ++ "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n" ++ "-numa hmat-cache,node-id=node,size=size,level=level[,associativity=none|direct|complex][,policy=none|write-back|write-through][,line=size]\n", + QEMU_ARCH_ALL) + STEXI + @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}] +@@ -177,6 +178,7 @@ STEXI + @itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} + @itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] + @itemx -numa hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}] ++@itemx -numa hmat-cache,node-id=@var{node},size=@var{size},level=@var{level}[,associativity=@var{str}][,policy=@var{str}][,line=@var{size}] + @findex -numa + Define a NUMA node and assign RAM and VCPUs to it. + Set the NUMA distance from a source node to a destination node. +@@ -280,11 +282,20 @@ NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix). + Note that if latency or bandwidth value is 0, means the corresponding latency or + bandwidth information is not provided. + ++In @samp{hmat-cache} option, @var{node-id} is the NUMA-id of the memory belongs. ++@var{size} is the size of memory side cache in bytes. @var{level} is the cache ++level described in this structure, note that the cache level 0 should not be used ++with @samp{hmat-cache} option. @var{associativity} is the cache associativity, ++the possible value is 'none/direct(direct-mapped)/complex(complex cache indexing)'. ++@var{policy} is the write policy. @var{line} is the cache Line size in bytes. ++ + For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and + a ram, node 1 has only a ram. The processors in node 0 access memory in node + 0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s; + The processors in NUMA node 0 access memory in NUMA node 1 with access-latency 10 + nanoseconds, access-bandwidth is 100 MB/s. ++And for memory side cache information, NUMA node 0 and 1 both have 1 level memory ++cache, size is 10KB, policy is write-back, the cache Line size is 8 bytes: + @example + -machine hmat=on \ + -m 2G \ +@@ -298,7 +309,9 @@ nanoseconds, access-bandwidth is 100 MB/s. + -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5 \ + -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M \ + -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10 \ +--numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M ++-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M \ ++-numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8 \ ++-numa hmat-cache,node-id=1,size=10K,level=1,associativity=direct,policy=write-back,line=8 + @end example + + ETEXI +-- +1.8.3.1 + diff --git a/kvm-numa-properly-check-if-numa-is-supported.patch b/kvm-numa-properly-check-if-numa-is-supported.patch new file mode 100755 index 0000000000000000000000000000000000000000..c6022563c21f51196f3623ccd02193deb51d142e --- /dev/null +++ b/kvm-numa-properly-check-if-numa-is-supported.patch @@ -0,0 +1,81 @@ +From e3a1c2ff0d7b930b1782d59d093fd15471d3aee1 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:46 +0100 +Subject: [PATCH 03/12] numa: properly check if numa is supported + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-3-plai@redhat.com> +Patchwork-id: 96732 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 02/11] numa: properly check if numa is supported +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Igor Mammedov + +Commit aa57020774b, by mistake used MachineClass::numa_mem_supported +to check if NUMA is supported by machine and also as unrelated change +set it to true for sbsa-ref board. + +Luckily change didn't break machines that support NUMA, as the field +is set to true for them. + +But the field is not intended for checking if NUMA is supported and +will be flipped to false within this release for new machine types. + +Fix it: + - by using previously used condition + !mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id + the first time and then use MachineState::numa_state down the road + to check if NUMA is supported + - dropping stray sbsa-ref chunk + +Fixes: aa57020774b690a22be72453b8e91c9b5a68c516 +Signed-off-by: Igor Mammedov +Message-Id: <1576154936-178362-3-git-send-email-imammedo@redhat.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit fcd3f2cc124600385dba46c69a80626985c15b50) +Signed-off-by: Danilo C. L. de Paula +--- + hw/arm/sbsa-ref.c | 1 - + hw/core/machine.c | 4 ++-- + 2 files changed, 2 insertions(+), 3 deletions(-) + +diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c +index 27046cc..c6261d4 100644 +--- a/hw/arm/sbsa-ref.c ++++ b/hw/arm/sbsa-ref.c +@@ -791,7 +791,6 @@ static void sbsa_ref_class_init(ObjectClass *oc, void *data) + mc->possible_cpu_arch_ids = sbsa_ref_possible_cpu_arch_ids; + mc->cpu_index_to_instance_props = sbsa_ref_cpu_index_to_props; + mc->get_default_cpu_node_id = sbsa_ref_get_default_cpu_node_id; +- mc->numa_mem_supported = true; + } + + static const TypeInfo sbsa_ref_info = { +diff --git a/hw/core/machine.c b/hw/core/machine.c +index 5a025d1..19c78c6 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -1128,7 +1128,7 @@ static void machine_initfn(Object *obj) + NULL); + } + +- if (mc->numa_mem_supported) { ++ if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { + ms->numa_state = g_new0(NumaState, 1); + } + +@@ -1272,7 +1272,7 @@ void machine_run_board_init(MachineState *machine) + { + MachineClass *machine_class = MACHINE_GET_CLASS(machine); + +- if (machine_class->numa_mem_supported) { ++ if (machine->numa_state) { + numa_complete_configuration(machine); + if (machine->numa_state->num_nodes) { + machine_numa_finish_cpu_init(machine); +-- +1.8.3.1 + diff --git a/kvm-numa-remove-not-needed-check.patch b/kvm-numa-remove-not-needed-check.patch new file mode 100755 index 0000000000000000000000000000000000000000..cbe677fce8d9ef37def47e9754446168b935d491 --- /dev/null +++ b/kvm-numa-remove-not-needed-check.patch @@ -0,0 +1,59 @@ +From 348115bbd0d60fada6f7d9fa27848044690a4bc3 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:45 +0100 +Subject: [PATCH 02/12] numa: remove not needed check + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-2-plai@redhat.com> +Patchwork-id: 96738 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 01/11] numa: remove not needed check +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Igor Mammedov + +Currently parse_numa_node() is always called from already numa +enabled context. +Drop unnecessary check if numa is supported. + +Signed-off-by: Igor Mammedov +Message-Id: <1576154936-178362-2-git-send-email-imammedo@redhat.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 5275db59aa7ff8a26bd6aa5d07cb4d53de5cfab5) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + hw/core/numa.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/hw/core/numa.c b/hw/core/numa.c +index e3332a9..19f082d 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -83,10 +83,6 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, + return; + } + +- if (!mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id) { +- error_setg(errp, "NUMA is not supported by this machine-type"); +- return; +- } + for (cpus = node->cpus; cpus; cpus = cpus->next) { + CpuInstanceProperties props; + if (cpus->value >= max_cpus) { +@@ -178,9 +174,8 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) + void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) + { + Error *err = NULL; +- MachineClass *mc = MACHINE_GET_CLASS(ms); + +- if (!mc->numa_mem_supported) { ++ if (!ms->numa_state) { + error_setg(errp, "NUMA is not supported by this machine-type"); + goto end; + } +-- +1.8.3.1 + diff --git a/kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch b/kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch new file mode 100755 index 0000000000000000000000000000000000000000..008874f51801a3faf4920edc4fa924711abe351a --- /dev/null +++ b/kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch @@ -0,0 +1,250 @@ +From aac48d07764ce73c2ba23e3f05ccd29db190024a Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Thu, 8 Oct 2020 11:06:43 -0400 +Subject: [PATCH 04/14] nvram: Exit QEMU if NVRAM cannot contain all -prom-env + data + +RH-Author: Greg Kurz +Message-id: <20201008110643.155902-2-gkurz@redhat.com> +Patchwork-id: 98577 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] nvram: Exit QEMU if NVRAM cannot contain all -prom-env data +Bugzilla: 1874780 +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +Since commit 61f20b9dc5b7 ("spapr_nvram: Pre-initialize the NVRAM to +support the -prom-env parameter"), pseries machines can pre-initialize +the "system" partition in the NVRAM with the data passed to all -prom-env +parameters on the QEMU command line. + +In this case it is assumed that all the data fits in 64 KiB, but the user +can easily pass more and crash QEMU: + +$ qemu-system-ppc64 -M pseries $(for ((x=0;x<128;x++)); do \ + echo -n " -prom-env " ; printf "%0.sx" {1..1024}; \ + done) # this requires ~128 Kib +malloc(): corrupted top size +Aborted (core dumped) + +This happens because we don't check if all the prom-env data fits in +the NVRAM and chrp_nvram_set_var() happily memcpy() it passed the +buffer. + +This crash affects basically all ppc/ppc64 machine types that use -prom-env: +- pseries (all versions) +- g3beige +- mac99 + +and also sparc/sparc64 machine types: +- LX +- SPARCClassic +- SPARCbook +- SS-10 +- SS-20 +- SS-4 +- SS-5 +- SS-600MP +- Voyager +- sun4u +- sun4v + +Add a max_len argument to chrp_nvram_create_system_partition() so that +it can check the available size before writing to memory. + +Since NVRAM is populated at machine init, it seems reasonable to consider +this error as fatal. So, instead of reporting an error when we detect that +the NVRAM is too small and adapt all machine types to handle it, we simply +exit QEMU in all cases. This is still better than crashing. If someone +wants another behavior, I guess this can be reworked later. + +Tested with: + +$ yes q | \ + (for arch in ppc ppc64 sparc sparc64; do \ + echo == $arch ==; \ + qemu=${arch}-softmmu/qemu-system-$arch; \ + for mach in $($qemu -M help | awk '! /^Supported/ { print $1 }'); do \ + echo $mach; \ + $qemu -M $mach -monitor stdio -nodefaults -nographic \ + $(for ((x=0;x<128;x++)); do \ + echo -n " -prom-env " ; printf "%0.sx" {1..1024}; \ + done) >/dev/null; \ + done; echo; \ + done) + +Without the patch, affected machine types cause QEMU to report some +memory corruption and crash: + +malloc(): corrupted top size + +free(): invalid size + +*** stack smashing detected ***: terminated + +With the patch, QEMU prints the following message and exits: + +NVRAM is too small. Try to pass less data to -prom-env + +It seems that the conditions for the crash have always existed, but it +affects pseries, the machine type I care for, since commit 61f20b9dc5b7 +only. + +Fixes: 61f20b9dc5b7 ("spapr_nvram: Pre-initialize the NVRAM to support the -prom-env parameter") +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1867739 +Reported-by: John Snow +Reviewed-by: Laurent Vivier +Signed-off-by: Greg Kurz +Message-Id: <159736033937.350502.12402444542194031035.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 37035df51eaabb8d26b71da75b88a1c6727de8fa) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L. de Paula +--- + hw/nvram/chrp_nvram.c | 24 +++++++++++++++++++++--- + hw/nvram/mac_nvram.c | 2 +- + hw/nvram/spapr_nvram.c | 3 ++- + hw/sparc/sun4m.c | 2 +- + hw/sparc64/sun4u.c | 2 +- + include/hw/nvram/chrp_nvram.h | 3 ++- + 6 files changed, 28 insertions(+), 8 deletions(-) + +diff --git a/hw/nvram/chrp_nvram.c b/hw/nvram/chrp_nvram.c +index d969f26704..d4d10a7c03 100644 +--- a/hw/nvram/chrp_nvram.c ++++ b/hw/nvram/chrp_nvram.c +@@ -21,14 +21,21 @@ + + #include "qemu/osdep.h" + #include "qemu/cutils.h" ++#include "qemu/error-report.h" + #include "hw/nvram/chrp_nvram.h" + #include "sysemu/sysemu.h" + +-static int chrp_nvram_set_var(uint8_t *nvram, int addr, const char *str) ++static int chrp_nvram_set_var(uint8_t *nvram, int addr, const char *str, ++ int max_len) + { + int len; + + len = strlen(str) + 1; ++ ++ if (max_len < len) { ++ return -1; ++ } ++ + memcpy(&nvram[addr], str, len); + + return addr + len; +@@ -38,19 +45,26 @@ static int chrp_nvram_set_var(uint8_t *nvram, int addr, const char *str) + * Create a "system partition", used for the Open Firmware + * environment variables. + */ +-int chrp_nvram_create_system_partition(uint8_t *data, int min_len) ++int chrp_nvram_create_system_partition(uint8_t *data, int min_len, int max_len) + { + ChrpNvramPartHdr *part_header; + unsigned int i; + int end; + ++ if (max_len < sizeof(*part_header)) { ++ goto fail; ++ } ++ + part_header = (ChrpNvramPartHdr *)data; + part_header->signature = CHRP_NVPART_SYSTEM; + pstrcpy(part_header->name, sizeof(part_header->name), "system"); + + end = sizeof(ChrpNvramPartHdr); + for (i = 0; i < nb_prom_envs; i++) { +- end = chrp_nvram_set_var(data, end, prom_envs[i]); ++ end = chrp_nvram_set_var(data, end, prom_envs[i], max_len - end); ++ if (end == -1) { ++ goto fail; ++ } + } + + /* End marker */ +@@ -65,6 +79,10 @@ int chrp_nvram_create_system_partition(uint8_t *data, int min_len) + chrp_nvram_finish_partition(part_header, end); + + return end; ++ ++fail: ++ error_report("NVRAM is too small. Try to pass less data to -prom-env"); ++ exit(EXIT_FAILURE); + } + + /** +diff --git a/hw/nvram/mac_nvram.c b/hw/nvram/mac_nvram.c +index 9a47e35b8e..ecfb36182f 100644 +--- a/hw/nvram/mac_nvram.c ++++ b/hw/nvram/mac_nvram.c +@@ -152,7 +152,7 @@ static void pmac_format_nvram_partition_of(MacIONVRAMState *nvr, int off, + + /* OpenBIOS nvram variables partition */ + sysp_end = chrp_nvram_create_system_partition(&nvr->data[off], +- DEF_SYSTEM_SIZE) + off; ++ DEF_SYSTEM_SIZE, len) + off; + + /* Free space partition */ + chrp_nvram_create_free_partition(&nvr->data[sysp_end], len - sysp_end); +diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c +index 838082b451..225cd69b49 100644 +--- a/hw/nvram/spapr_nvram.c ++++ b/hw/nvram/spapr_nvram.c +@@ -188,7 +188,8 @@ static void spapr_nvram_realize(SpaprVioDevice *dev, Error **errp) + } + } else if (nb_prom_envs > 0) { + /* Create a system partition to pass the -prom-env variables */ +- chrp_nvram_create_system_partition(nvram->buf, MIN_NVRAM_SIZE / 4); ++ chrp_nvram_create_system_partition(nvram->buf, MIN_NVRAM_SIZE / 4, ++ nvram->size); + chrp_nvram_create_free_partition(&nvram->buf[MIN_NVRAM_SIZE / 4], + nvram->size - MIN_NVRAM_SIZE / 4); + } +diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c +index 2aaa5bf1ae..cf2d0762d9 100644 +--- a/hw/sparc/sun4m.c ++++ b/hw/sparc/sun4m.c +@@ -142,7 +142,7 @@ static void nvram_init(Nvram *nvram, uint8_t *macaddr, + memset(image, '\0', sizeof(image)); + + /* OpenBIOS nvram variables partition */ +- sysp_end = chrp_nvram_create_system_partition(image, 0); ++ sysp_end = chrp_nvram_create_system_partition(image, 0, 0x1fd0); + + /* Free space partition */ + chrp_nvram_create_free_partition(&image[sysp_end], 0x1fd0 - sysp_end); +diff --git a/hw/sparc64/sun4u.c b/hw/sparc64/sun4u.c +index 955082773b..f5295a687e 100644 +--- a/hw/sparc64/sun4u.c ++++ b/hw/sparc64/sun4u.c +@@ -137,7 +137,7 @@ static int sun4u_NVRAM_set_params(Nvram *nvram, uint16_t NVRAM_size, + memset(image, '\0', sizeof(image)); + + /* OpenBIOS nvram variables partition */ +- sysp_end = chrp_nvram_create_system_partition(image, 0); ++ sysp_end = chrp_nvram_create_system_partition(image, 0, 0x1fd0); + + /* Free space partition */ + chrp_nvram_create_free_partition(&image[sysp_end], 0x1fd0 - sysp_end); +diff --git a/include/hw/nvram/chrp_nvram.h b/include/hw/nvram/chrp_nvram.h +index 09941a9be4..4a0f5c21b8 100644 +--- a/include/hw/nvram/chrp_nvram.h ++++ b/include/hw/nvram/chrp_nvram.h +@@ -50,7 +50,8 @@ chrp_nvram_finish_partition(ChrpNvramPartHdr *header, uint32_t size) + header->checksum = sum & 0xff; + } + +-int chrp_nvram_create_system_partition(uint8_t *data, int min_len); ++/* chrp_nvram_create_system_partition() failure is fatal */ ++int chrp_nvram_create_system_partition(uint8_t *data, int min_len, int max_len); + int chrp_nvram_create_free_partition(uint8_t *data, int len); + + #endif +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch b/kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch new file mode 100755 index 0000000000000000000000000000000000000000..270b92688316f47698b68d74ef3e2662df0e470f --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch @@ -0,0 +1,112 @@ +From e46aaac6f1ad67753face896e827ad1da920b9e5 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:47 -0400 +Subject: [PATCH 11/14] pc-bios/s390-ccw: Allow booting in case the first + virtio-blk disk is bad + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-8-thuth@redhat.com> +Patchwork-id: 98601 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 7/9] pc-bios/s390-ccw: Allow booting in case the first virtio-blk disk is bad +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +If you try to boot with two virtio-blk disks (without bootindex), and +only the second one is bootable, the s390-ccw bios currently stops at +the first disk and does not continue booting from the second one. This +is annoying - and all other major QEMU firmwares succeed to boot from +the second disk in this case, so we should do the same in the s390-ccw +bios, too. + +Reviewed-by: Cornelia Huck +Message-Id: <20200806105349.632-8-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 5dc739f343cd06ecb9b058294564ce7504856f3f) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/bootmap.c | 34 +++++++++++++++++++++++----------- + pc-bios/s390-ccw/main.c | 2 +- + 2 files changed, 24 insertions(+), 12 deletions(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index d13b7cbd15..e91ea719ff 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -289,11 +289,18 @@ static void ipl_eckd_cdl(void) + read_block(1, ipl2, "Cannot read IPL2 record at block 1"); + + mbr = &ipl2->mbr; +- IPL_assert(magic_match(mbr, ZIPL_MAGIC), "No zIPL section in IPL2 record."); +- IPL_assert(block_size_ok(mbr->blockptr.xeckd.bptr.size), +- "Bad block size in zIPL section of IPL2 record."); +- IPL_assert(mbr->dev_type == DEV_TYPE_ECKD, +- "Non-ECKD device type in zIPL section of IPL2 record."); ++ if (!magic_match(mbr, ZIPL_MAGIC)) { ++ sclp_print("No zIPL section in IPL2 record.\n"); ++ return; ++ } ++ if (!block_size_ok(mbr->blockptr.xeckd.bptr.size)) { ++ sclp_print("Bad block size in zIPL section of IPL2 record.\n"); ++ return; ++ } ++ if (!mbr->dev_type == DEV_TYPE_ECKD) { ++ sclp_print("Non-ECKD device type in zIPL section of IPL2 record.\n"); ++ return; ++ } + + /* save pointer to Boot Map Table */ + bmt_block_nr = eckd_block_num(&mbr->blockptr.xeckd.bptr.chs); +@@ -303,10 +310,14 @@ static void ipl_eckd_cdl(void) + + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(2, vlbl, "Cannot read Volume Label at block 2"); +- IPL_assert(magic_match(vlbl->key, VOL1_MAGIC), +- "Invalid magic of volume label block"); +- IPL_assert(magic_match(vlbl->f.key, VOL1_MAGIC), +- "Invalid magic of volser block"); ++ if (!magic_match(vlbl->key, VOL1_MAGIC)) { ++ sclp_print("Invalid magic of volume label block.\n"); ++ return; ++ } ++ if (!magic_match(vlbl->f.key, VOL1_MAGIC)) { ++ sclp_print("Invalid magic of volser block.\n"); ++ return; ++ } + print_volser(vlbl->f.volser); + + run_eckd_boot_script(bmt_block_nr, s1b_block_nr); +@@ -400,7 +411,8 @@ static void ipl_eckd(void) + read_block(0, mbr, "Cannot read block 0 on DASD"); + + if (magic_match(mbr->magic, IPL1_MAGIC)) { +- ipl_eckd_cdl(); /* no return */ ++ ipl_eckd_cdl(); /* only returns in case of error */ ++ return; + } + + /* LDL/CMS? */ +@@ -827,5 +839,5 @@ void zipl_load(void) + panic("\n! Unknown IPL device type !\n"); + } + +- panic("\n* this can never happen *\n"); ++ sclp_print("zIPL load failed.\n"); + } +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 5c1c98341d..b5c721c395 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -249,7 +249,7 @@ static void ipl_boot_device(void) + break; + case CU_TYPE_VIRTIO: + if (virtio_setup() == 0) { +- zipl_load(); /* no return */ ++ zipl_load(); /* Only returns in case of errors */ + } + break; + default: +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch b/kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch new file mode 100755 index 0000000000000000000000000000000000000000..4a295ca7b01be24f8ad6b55ac22a296ec648a535 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch @@ -0,0 +1,214 @@ +From 6f44767aeda52048e7c9ee4b5fcc30353c71cbc1 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:45 -0400 +Subject: [PATCH 09/14] pc-bios/s390-ccw: Do not bail out early if not finding + a SCSI disk + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-6-thuth@redhat.com> +Patchwork-id: 98599 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 5/9] pc-bios/s390-ccw: Do not bail out early if not finding a SCSI disk +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +In case the user did not specify a boot device, we want to continue +looking for other devices if there are no valid SCSI disks on a virtio- +scsi controller. As a first step, do not panic in this case and let +the control flow carry the error to the upper functions instead. + +Message-Id: <20200806105349.632-6-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Signed-off-by: Thomas Huth +(cherry picked from commit 605751b5a5334e187761b0b8a8266a216897bf70) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 14 ++++++++++---- + pc-bios/s390-ccw/s390-ccw.h | 2 +- + pc-bios/s390-ccw/virtio-blkdev.c | 7 +++++-- + pc-bios/s390-ccw/virtio-scsi.c | 28 ++++++++++++++++++++-------- + pc-bios/s390-ccw/virtio-scsi.h | 2 +- + 5 files changed, 37 insertions(+), 16 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index d6fd218074..456733fbee 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -227,7 +227,7 @@ static void find_boot_device(void) + IPL_assert(found, "Boot device not found\n"); + } + +-static void virtio_setup(void) ++static int virtio_setup(void) + { + VDev *vdev = virtio_get_device(); + QemuIplParameters *early_qipl = (QemuIplParameters *)QIPL_ADDRESS; +@@ -242,9 +242,14 @@ static void virtio_setup(void) + sclp_print("Network boot device detected\n"); + vdev->netboot_start_addr = qipl.netboot_start_addr; + } else { +- virtio_blk_setup_device(blk_schid); ++ int ret = virtio_blk_setup_device(blk_schid); ++ if (ret) { ++ return ret; ++ } + IPL_assert(virtio_ipl_disk_is_valid(), "No valid IPL device detected"); + } ++ ++ return 0; + } + + static void ipl_boot_device(void) +@@ -255,8 +260,9 @@ static void ipl_boot_device(void) + dasd_ipl(blk_schid, cutype); /* no return */ + break; + case CU_TYPE_VIRTIO: +- virtio_setup(); +- zipl_load(); /* no return */ ++ if (virtio_setup() == 0) { ++ zipl_load(); /* no return */ ++ } + break; + default: + print_int("Attempting to boot from unexpected device type", cutype); +diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h +index ae432c40b8..e7cf36eb91 100644 +--- a/pc-bios/s390-ccw/s390-ccw.h ++++ b/pc-bios/s390-ccw/s390-ccw.h +@@ -70,7 +70,7 @@ int sclp_read(char *str, size_t count); + unsigned long virtio_load_direct(ulong rec_list1, ulong rec_list2, + ulong subchan_id, void *load_addr); + bool virtio_is_supported(SubChannelId schid); +-void virtio_blk_setup_device(SubChannelId schid); ++int virtio_blk_setup_device(SubChannelId schid); + int virtio_read(ulong sector, void *load_addr); + u64 get_clock(void); + ulong get_second(void); +diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c +index 11c56261ca..7d35050292 100644 +--- a/pc-bios/s390-ccw/virtio-blkdev.c ++++ b/pc-bios/s390-ccw/virtio-blkdev.c +@@ -263,9 +263,10 @@ uint64_t virtio_get_blocks(void) + return 0; + } + +-void virtio_blk_setup_device(SubChannelId schid) ++int virtio_blk_setup_device(SubChannelId schid) + { + VDev *vdev = virtio_get_device(); ++ int ret = 0; + + vdev->schid = schid; + virtio_setup_ccw(vdev); +@@ -288,9 +289,11 @@ void virtio_blk_setup_device(SubChannelId schid) + "Config: CDB size mismatch"); + + sclp_print("Using virtio-scsi.\n"); +- virtio_scsi_setup(vdev); ++ ret = virtio_scsi_setup(vdev); + break; + default: + panic("\n! No IPL device available !\n"); + } ++ ++ return ret; + } +diff --git a/pc-bios/s390-ccw/virtio-scsi.c b/pc-bios/s390-ccw/virtio-scsi.c +index 4fe4b9d261..88691edb89 100644 +--- a/pc-bios/s390-ccw/virtio-scsi.c ++++ b/pc-bios/s390-ccw/virtio-scsi.c +@@ -192,7 +192,12 @@ static bool scsi_read_capacity(VDev *vdev, + + /* virtio-scsi routines */ + +-static void virtio_scsi_locate_device(VDev *vdev) ++/* ++ * Tries to locate a SCSI device and and adds the information for the found ++ * device to the vdev->scsi_device structure. ++ * Returns 0 if SCSI device could be located, or a error code < 0 otherwise ++ */ ++static int virtio_scsi_locate_device(VDev *vdev) + { + const uint16_t channel = 0; /* again, it's what QEMU does */ + uint16_t target; +@@ -218,7 +223,7 @@ static void virtio_scsi_locate_device(VDev *vdev) + IPL_check(sdev->channel == 0, "non-zero channel requested"); + IPL_check(sdev->target <= vdev->config.scsi.max_target, "target# high"); + IPL_check(sdev->lun <= vdev->config.scsi.max_lun, "LUN# high"); +- return; ++ return 0; + } + + for (target = 0; target <= vdev->config.scsi.max_target; target++) { +@@ -245,18 +250,20 @@ static void virtio_scsi_locate_device(VDev *vdev) + */ + sdev->lun = r->lun[0].v16[0]; /* it's returned this way */ + debug_print_int("Have to use LUN", sdev->lun); +- return; /* we have to use this device */ ++ return 0; /* we have to use this device */ + } + for (i = 0; i < luns; i++) { + if (r->lun[i].v64) { + /* Look for non-zero LUN - we have where to choose from */ + sdev->lun = r->lun[i].v16[0]; + debug_print_int("Will use LUN", sdev->lun); +- return; /* we have found a device */ ++ return 0; /* we have found a device */ + } + } + } +- panic("\n! Cannot locate virtio-scsi device !\n"); ++ ++ sclp_print("Warning: Could not locate a usable virtio-scsi device\n"); ++ return -ENODEV; + } + + int virtio_scsi_read_many(VDev *vdev, +@@ -320,17 +327,20 @@ static void scsi_parse_capacity_report(void *data, + } + } + +-void virtio_scsi_setup(VDev *vdev) ++int virtio_scsi_setup(VDev *vdev) + { + int retry_test_unit_ready = 3; + uint8_t data[256]; + uint32_t data_size = sizeof(data); + ScsiInquiryEvpdPages *evpd = &scsi_inquiry_evpd_pages_response; + ScsiInquiryEvpdBl *evpd_bl = &scsi_inquiry_evpd_bl_response; +- int i; ++ int i, ret; + + vdev->scsi_device = &default_scsi_device; +- virtio_scsi_locate_device(vdev); ++ ret = virtio_scsi_locate_device(vdev); ++ if (ret < 0) { ++ return ret; ++ } + + /* We have to "ping" the device before it becomes readable */ + while (!scsi_test_unit_ready(vdev)) { +@@ -415,4 +425,6 @@ void virtio_scsi_setup(VDev *vdev) + } + scsi_parse_capacity_report(data, &vdev->scsi_last_block, + (uint32_t *) &vdev->scsi_block_size); ++ ++ return 0; + } +diff --git a/pc-bios/s390-ccw/virtio-scsi.h b/pc-bios/s390-ccw/virtio-scsi.h +index 4c4f4bbc31..4b14c2c2f9 100644 +--- a/pc-bios/s390-ccw/virtio-scsi.h ++++ b/pc-bios/s390-ccw/virtio-scsi.h +@@ -67,7 +67,7 @@ static inline bool virtio_scsi_response_ok(const VirtioScsiCmdResp *r) + return r->response == VIRTIO_SCSI_S_OK && r->status == CDB_STATUS_GOOD; + } + +-void virtio_scsi_setup(VDev *vdev); ++int virtio_scsi_setup(VDev *vdev); + int virtio_scsi_read_many(VDev *vdev, + ulong sector, void *load_addr, int sec_num); + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch b/kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch new file mode 100755 index 0000000000000000000000000000000000000000..4385267e23a431448acf58659e8b5414d8af962e --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch @@ -0,0 +1,54 @@ +From 7b3a7cbfc5872e088f13e11f5c38dc5ac80c3330 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:43 -0400 +Subject: [PATCH 07/14] pc-bios/s390-ccw: Introduce ENODEV define and remove + guards of others + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-4-thuth@redhat.com> +Patchwork-id: 98597 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/9] pc-bios/s390-ccw: Introduce ENODEV define and remove guards of others +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Remove the "#ifndef E..." guards from the defines here - the header +guard S390_CCW_H at the top of the file should avoid double definition, +and if the error code is defined in a different file already, we're in +trouble anyway, then it's better to see the error at compile time instead +of hunting weird behavior during runtime later. +Also define ENODEV - we will use this in a later patch. + +Message-Id: <20200806105349.632-4-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Reviewed-by: Janosch Frank +Signed-off-by: Thomas Huth +(cherry picked from commit f3180b0266386b31deb7bb83fcaea68af7d1bcee) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/s390-ccw.h | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h +index 21f27e7990..ae432c40b8 100644 +--- a/pc-bios/s390-ccw/s390-ccw.h ++++ b/pc-bios/s390-ccw/s390-ccw.h +@@ -27,12 +27,10 @@ typedef unsigned long long __u64; + #define false 0 + #define PAGE_SIZE 4096 + +-#ifndef EIO + #define EIO 1 +-#endif +-#ifndef EBUSY + #define EBUSY 2 +-#endif ++#define ENODEV 3 ++ + #ifndef NULL + #define NULL 0 + #endif +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch b/kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch new file mode 100755 index 0000000000000000000000000000000000000000..8f44646dc239736027f58d823da47f562b2e2f55 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch @@ -0,0 +1,60 @@ +From eda3b6620e779ff89df46a0fb9022016bffd7f44 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:41 -0400 +Subject: [PATCH 05/14] pc-bios/s390-ccw/Makefile: Compile with -std=gnu99, + -fwrapv and -fno-common + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-2-thuth@redhat.com> +Patchwork-id: 98595 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/9] pc-bios/s390-ccw/Makefile: Compile with -std=gnu99, -fwrapv and -fno-common +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +The main QEMU code is compiled with -std=gnu99, -fwrapv and -fno-common. +We should use the same flags for the s390-ccw bios, too, to avoid that +we get different behavior with different compiler versions that changed +their default settings in the course of time (it happened at least with +-std=... and -fno-common in the past already). + +While we're at it, also group the other flags here in a little bit nicer +fashion: Move the two "-m" flags out of the "-f" area and specify them on +a separate line. + +Reviewed-by: Claudio Imbrenda +Acked-by: Cornelia Huck +Acked-by: Janosch Frank +Message-Id: <20200806105349.632-2-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 4f6a1eb886961f1f9da2d553c4b0e5ef69cd3801) +Conflicts: Simple contextual conflict due to meson reworks in upstream +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/Makefile | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/pc-bios/s390-ccw/Makefile b/pc-bios/s390-ccw/Makefile +index a048b6b077..e776a2a5ec 100644 +--- a/pc-bios/s390-ccw/Makefile ++++ b/pc-bios/s390-ccw/Makefile +@@ -13,10 +13,11 @@ OBJECTS = start.o main.o bootmap.o jump2ipl.o sclp.o menu.o \ + virtio.o virtio-scsi.o virtio-blkdev.o libc.o cio.o dasd-ipl.o + + QEMU_CFLAGS := $(filter -W%, $(QEMU_CFLAGS)) +-QEMU_CFLAGS += -ffreestanding -fno-delete-null-pointer-checks -msoft-float +-QEMU_CFLAGS += -march=z900 -fPIE -fno-strict-aliasing +-QEMU_CFLAGS += -fno-asynchronous-unwind-tables ++QEMU_CFLAGS += -ffreestanding -fno-delete-null-pointer-checks -fno-common -fPIE ++QEMU_CFLAGS += -fwrapv -fno-strict-aliasing -fno-asynchronous-unwind-tables + QEMU_CFLAGS += $(call cc-option, $(QEMU_CFLAGS), -fno-stack-protector) ++QEMU_CFLAGS += -msoft-float -march=z900 ++QEMU_CFLAGS += -std=gnu99 + LDFLAGS += -Wl,-pie -nostdlib + + build-all: s390-ccw.img s390-netboot.img +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch b/kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch new file mode 100755 index 0000000000000000000000000000000000000000..bbeac9e33c8e66df8e824c9894a9f0da809cbf98 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch @@ -0,0 +1,72 @@ +From 740590240bec03dc6ca208963112d3c2999f353e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:42 -0400 +Subject: [PATCH 06/14] pc-bios/s390-ccw: Move ipl-related code from main() + into a separate function + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-3-thuth@redhat.com> +Patchwork-id: 98596 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/9] pc-bios/s390-ccw: Move ipl-related code from main() into a separate function +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Let's move this part of the code into a separate function to be able +to use it from multiple spots later. + +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Reviewed-by: Janosch Frank +Message-Id: <20200806105349.632-3-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit d1f060a8b515a0b1d14c38f2c8f86ab54e79c3dc) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 4e65b411e1..5e565be5b1 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -232,14 +232,8 @@ static void virtio_setup(void) + } + } + +-int main(void) ++static void ipl_boot_device(void) + { +- sclp_setup(); +- css_setup(); +- boot_setup(); +- find_boot_device(); +- enable_subchannel(blk_schid); +- + switch (cutype) { + case CU_TYPE_DASD_3990: + case CU_TYPE_DASD_2107: +@@ -251,8 +245,18 @@ int main(void) + break; + default: + print_int("Attempting to boot from unexpected device type", cutype); +- panic(""); ++ panic("\nBoot failed.\n"); + } ++} ++ ++int main(void) ++{ ++ sclp_setup(); ++ css_setup(); ++ boot_setup(); ++ find_boot_device(); ++ enable_subchannel(blk_schid); ++ ipl_boot_device(); + + panic("Failed to load OS from hard disk\n"); + return 0; /* make compiler happy */ +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch b/kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch new file mode 100755 index 0000000000000000000000000000000000000000..3aa5dfd57070b52413344460798822979830352a --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch @@ -0,0 +1,154 @@ +From d90cbb55fe3ec232091a24137cab45419aac8bc5 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:44 -0400 +Subject: [PATCH 08/14] pc-bios/s390-ccw: Move the inner logic of find_subch() + to a separate function + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-5-thuth@redhat.com> +Patchwork-id: 98598 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/9] pc-bios/s390-ccw: Move the inner logic of find_subch() to a separate function +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Move the code to a separate function to be able to re-use it from a +different spot later. + +Reviewed-by: Claudio Imbrenda +Message-Id: <20200806105349.632-5-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Reviewed-by: Janosch Frank +Signed-off-by: Thomas Huth +(cherry picked from commit d2cf4af1f4af02f6f2d5827d9a06c31690084d3b) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 99 ++++++++++++++++++++++++----------------- + 1 file changed, 57 insertions(+), 42 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 5e565be5b1..d6fd218074 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -60,6 +60,60 @@ unsigned int get_loadparm_index(void) + return atoui(loadparm_str); + } + ++static int is_dev_possibly_bootable(int dev_no, int sch_no) ++{ ++ bool is_virtio; ++ Schib schib; ++ int r; ++ ++ blk_schid.sch_no = sch_no; ++ r = stsch_err(blk_schid, &schib); ++ if (r == 3 || r == -EIO) { ++ return -ENODEV; ++ } ++ if (!schib.pmcw.dnv) { ++ return false; ++ } ++ ++ enable_subchannel(blk_schid); ++ cutype = cu_type(blk_schid); ++ ++ /* ++ * Note: we always have to run virtio_is_supported() here to make ++ * sure that the vdev.senseid data gets pre-initialized correctly ++ */ ++ is_virtio = virtio_is_supported(blk_schid); ++ ++ /* No specific devno given, just return whether the device is possibly bootable */ ++ if (dev_no < 0) { ++ switch (cutype) { ++ case CU_TYPE_VIRTIO: ++ if (is_virtio) { ++ /* ++ * Skip net devices since no IPLB is created and therefore ++ * no network bootloader has been loaded ++ */ ++ if (virtio_get_device_type() != VIRTIO_ID_NET) { ++ return true; ++ } ++ } ++ return false; ++ case CU_TYPE_DASD_3990: ++ case CU_TYPE_DASD_2107: ++ return true; ++ default: ++ return false; ++ } ++ } ++ ++ /* Caller asked for a specific devno */ ++ if (schib.pmcw.dev == dev_no) { ++ return true; ++ } ++ ++ return false; ++} ++ + /* + * Find the subchannel connected to the given device (dev_no) and fill in the + * subchannel information block (schib) with the connected subchannel's info. +@@ -71,53 +125,14 @@ unsigned int get_loadparm_index(void) + */ + static bool find_subch(int dev_no) + { +- Schib schib; + int i, r; +- bool is_virtio; + + for (i = 0; i < 0x10000; i++) { +- blk_schid.sch_no = i; +- r = stsch_err(blk_schid, &schib); +- if ((r == 3) || (r == -EIO)) { ++ r = is_dev_possibly_bootable(dev_no, i); ++ if (r < 0) { + break; + } +- if (!schib.pmcw.dnv) { +- continue; +- } +- +- enable_subchannel(blk_schid); +- cutype = cu_type(blk_schid); +- +- /* +- * Note: we always have to run virtio_is_supported() here to make +- * sure that the vdev.senseid data gets pre-initialized correctly +- */ +- is_virtio = virtio_is_supported(blk_schid); +- +- /* No specific devno given, just return 1st possibly bootable device */ +- if (dev_no < 0) { +- switch (cutype) { +- case CU_TYPE_VIRTIO: +- if (is_virtio) { +- /* +- * Skip net devices since no IPLB is created and therefore +- * no network bootloader has been loaded +- */ +- if (virtio_get_device_type() != VIRTIO_ID_NET) { +- return true; +- } +- } +- continue; +- case CU_TYPE_DASD_3990: +- case CU_TYPE_DASD_2107: +- return true; +- default: +- continue; +- } +- } +- +- /* Caller asked for a specific devno */ +- if (schib.pmcw.dev == dev_no) { ++ if (r == true) { + return true; + } + } +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch b/kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch new file mode 100755 index 0000000000000000000000000000000000000000..c8e3017e1033eeedd8414e999ce6a0d608307b92 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch @@ -0,0 +1,116 @@ +From 911dc631f9ab68c6acfd4b401fbcfaa3b58a4fb6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:46 -0400 +Subject: [PATCH 10/14] pc-bios/s390-ccw: Scan through all devices if no boot + device specified + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-7-thuth@redhat.com> +Patchwork-id: 98600 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 6/9] pc-bios/s390-ccw: Scan through all devices if no boot device specified +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +If no boot device has been specified (via "bootindex=..."), the s390-ccw +bios scans through all devices to find a bootable device. But so far, it +stops at the very first block device (including virtio-scsi controllers +without attached devices) that it finds, no matter whether it is bootable +or not. That leads to some weird situatation where it is e.g. possible +to boot via: + + qemu-system-s390x -hda /path/to/disk.qcow2 + +but not if there is e.g. a virtio-scsi controller specified before: + + qemu-system-s390x -device virtio-scsi -hda /path/to/disk.qcow2 + +While using "bootindex=..." is clearly the preferred way of booting +on s390x, we still can make the life for the users at least a little +bit easier if we look at all available devices to find a bootable one. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1846975 +Reviewed-by: Cornelia Huck +Message-Id: <20200806105349.632-7-thuth@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 869d0e2f593dd37297c366203f006b9acd1b7b45) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 46 +++++++++++++++++++++++++++-------------- + 1 file changed, 31 insertions(+), 15 deletions(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index 456733fbee..5c1c98341d 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -191,20 +191,8 @@ static void boot_setup(void) + static void find_boot_device(void) + { + VDev *vdev = virtio_get_device(); +- int ssid; + bool found; + +- if (!have_iplb) { +- for (ssid = 0; ssid < 0x3; ssid++) { +- blk_schid.ssid = ssid; +- found = find_subch(-1); +- if (found) { +- return; +- } +- } +- panic("Could not find a suitable boot device (none specified)\n"); +- } +- + switch (iplb.pbt) { + case S390_IPL_TYPE_CCW: + debug_print_int("device no. ", iplb.ccw.devno); +@@ -270,14 +258,42 @@ static void ipl_boot_device(void) + } + } + ++/* ++ * No boot device has been specified, so we have to scan through the ++ * channels to find one. ++ */ ++static void probe_boot_device(void) ++{ ++ int ssid, sch_no, ret; ++ ++ for (ssid = 0; ssid < 0x3; ssid++) { ++ blk_schid.ssid = ssid; ++ for (sch_no = 0; sch_no < 0x10000; sch_no++) { ++ ret = is_dev_possibly_bootable(-1, sch_no); ++ if (ret < 0) { ++ break; ++ } ++ if (ret == true) { ++ ipl_boot_device(); /* Only returns if unsuccessful */ ++ } ++ } ++ } ++ ++ sclp_print("Could not find a suitable boot device (none specified)\n"); ++} ++ + int main(void) + { + sclp_setup(); + css_setup(); + boot_setup(); +- find_boot_device(); +- enable_subchannel(blk_schid); +- ipl_boot_device(); ++ if (have_iplb) { ++ find_boot_device(); ++ enable_subchannel(blk_schid); ++ ipl_boot_device(); ++ } else { ++ probe_boot_device(); ++ } + + panic("Failed to load OS from hard disk\n"); + return 0; /* make compiler happy */ +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch b/kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch new file mode 100755 index 0000000000000000000000000000000000000000..414cc13c32bfeeb2491daa7adf89a722bd50f0d9 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch @@ -0,0 +1,50 @@ +From 56ae2d8a1ee3a35e2eed4f4baa61f97184189b47 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 18 May 2021 13:51:24 -0400 +Subject: [PATCH 4/5] pc-bios/s390-ccw: break loop if a null block number is + reached +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20210518135125.191329-3-thuth@redhat.com> +Patchwork-id: 101549 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/3] pc-bios/s390-ccw: break loop if a null block number is reached +Bugzilla: 1942880 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Break the loop if `cur_block_nr` is a null block number because this +means that the end of chunk is reached. In this case we will try to +boot the default entry. + +Fixes: ba831b25262a ("s390-ccw: read stage2 boot loader data to find menu") +Reviewed-by: Collin Walling +Signed-off-by: Marc Hartmayer +Message-Id: <20200924085926.21709-3-mhartmay@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 468184ec9024f4f7b55247f70ec57554e8a500d7) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/bootmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index bb6e003270..624f524331 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -192,7 +192,7 @@ static int eckd_get_boot_menu_index(block_number_t s1b_block_nr) + for (i = 0; i < STAGE2_BLK_CNT_MAX; i++) { + cur_block_nr = eckd_block_num(&s1b->seek[i].chs); + +- if (!cur_block_nr) { ++ if (!cur_block_nr || is_null_block_number(cur_block_nr)) { + break; + } + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch b/kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch new file mode 100755 index 0000000000000000000000000000000000000000..25971180a4e95012a333b58cde4026777d6b2f7d --- /dev/null +++ b/kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch @@ -0,0 +1,48 @@ +From 52ba1903b2c8ce69e8cd1de2a78c2c63cc60383b Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 18 May 2021 13:51:25 -0400 +Subject: [PATCH 5/5] pc-bios/s390-ccw: don't try to read the next block if end + of chunk is reached +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20210518135125.191329-4-thuth@redhat.com> +Patchwork-id: 101550 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 3/3] pc-bios/s390-ccw: don't try to read the next block if end of chunk is reached +Bugzilla: 1942880 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +Don't read the block if a null block number is reached, because this means that +the end of chunk is reached. + +Reviewed-by: Collin Walling +Signed-off-by: Marc Hartmayer +Message-Id: <20210416074736.17409-1-mhartmay@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit a6625d38cce3901a7c1cba069f0abcf743a293f1) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/bootmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index 624f524331..8458b15cb6 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -212,7 +212,7 @@ static int eckd_get_boot_menu_index(block_number_t s1b_block_nr) + next_block_nr = eckd_block_num(&s1b->seek[i + 1].chs); + } + +- if (next_block_nr) { ++ if (next_block_nr && !is_null_block_number(next_block_nr)) { + read_block(next_block_nr, s2_next_blk, + "Cannot read stage2 boot loader"); + } +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch b/kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch new file mode 100755 index 0000000000000000000000000000000000000000..691bed44fe03e7c3075fafa9768104145f3b45de --- /dev/null +++ b/kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch @@ -0,0 +1,51 @@ +From 0e9bdb960045f98d70f765bbb585f1647e5fea08 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 18 May 2021 13:51:23 -0400 +Subject: [PATCH 3/5] pc-bios/s390-ccw: fix off-by-one error +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20210518135125.191329-2-thuth@redhat.com> +Patchwork-id: 101548 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/3] pc-bios/s390-ccw: fix off-by-one error +Bugzilla: 1942880 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +This error takes effect when the magic value "zIPL" is located at the +end of a block. For example if s2_cur_blk = 0x7fe18000 and the magic +value "zIPL" is located at 0x7fe18ffc - 0x7fe18fff. + +Fixes: ba831b25262a ("s390-ccw: read stage2 boot loader data to find menu") +Reviewed-by: Collin Walling +Signed-off-by: Marc Hartmayer +Message-Id: <20200924085926.21709-2-mhartmay@linux.ibm.com> +Reviewed-by: Thomas Huth +[thuth: Use "<= ... - 4" instead of "< ... - 3"] +Signed-off-by: Thomas Huth +(cherry picked from commit 5f97ba0c74ccace0a4014460de9751ff3c6f454a) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/bootmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c +index e91ea719ff..bb6e003270 100644 +--- a/pc-bios/s390-ccw/bootmap.c ++++ b/pc-bios/s390-ccw/bootmap.c +@@ -163,7 +163,7 @@ static bool find_zipl_boot_menu_banner(int *offset) + int i; + + /* Menu banner starts with "zIPL" */ +- for (i = 0; i < virtio_get_block_size() - 4; i++) { ++ for (i = 0; i <= virtio_get_block_size() - 4; i++) { + if (magic_match(s2_cur_blk + i, ZIPL_MAGIC_EBCDIC)) { + *offset = i; + return true; +-- +2.27.0 + diff --git a/kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch b/kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch new file mode 100755 index 0000000000000000000000000000000000000000..cf1466a3e9fd5a452d25b9195caf587de397ade0 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch @@ -0,0 +1,43 @@ +From 541d06b7dc1cd3ad4722850f3a7f5df12b8d6fba Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 9 Oct 2020 10:08:48 -0400 +Subject: [PATCH 12/14] pc-bios/s390-ccw/main: Remove superfluous call to + enable_subchannel() + +RH-Author: Thomas Huth +Message-id: <20201009100849.264994-9-thuth@redhat.com> +Patchwork-id: 98602 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 8/9] pc-bios/s390-ccw/main: Remove superfluous call to enable_subchannel() +Bugzilla: 1846975 +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +enable_subchannel() is already done during is_dev_possibly_bootable() +(which is called from find_boot_device() -> find_subch()), so there +is no need to do this again in the main() function. + +Message-Id: <20200806105349.632-9-thuth@redhat.com> +Reviewed-by: Cornelia Huck +Signed-off-by: Thomas Huth +(cherry picked from commit 49d4388ec03fd8c7701b907a4e11c437a28f8572) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/main.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index b5c721c395..e3a1a3053d 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -289,7 +289,6 @@ int main(void) + boot_setup(); + if (have_iplb) { + find_boot_device(); +- enable_subchannel(blk_schid); + ipl_boot_device(); + } else { + probe_boot_device(); +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch b/kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch new file mode 100755 index 0000000000000000000000000000000000000000..8334b7b6e4f66becedcbbf45ae54f7c8fba42985 --- /dev/null +++ b/kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch @@ -0,0 +1,87 @@ +From c6f62870f27ece45e944d1818f6aa04b3e024959 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:41 -0500 +Subject: [PATCH 5/5] pc-bios: s390x: Clear out leftover S390EP string + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-5-thuth@redhat.com> +Patchwork-id: 100369 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/4] pc-bios: s390x: Clear out leftover S390EP string +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Eric Farman + +A Linux binary will have the string "S390EP" at address 0x10008, +which is important in getting the guest up off the ground. In the +case of a reboot (specifically chreipl going to a new device), +we should defer to the PSW at address zero for the new config, +which will re-write "S390EP" from the new image. + +Let's clear it out at this point so that a reipl to, say, a DASD +passthrough device drives the IPL path from scratch without disrupting +disrupting the order of operations for other boots. + +Rather than hardcoding the address of this magic (again), let's +define it somewhere so that the two users are visibly related. + +Signed-off-by: Eric Farman +Message-Id: <20201120160117.59366-3-farman@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 3d6519968bb10260fc724c491fb4275f7c0b78ac) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 2 +- + pc-bios/s390-ccw/main.c | 6 ++++++ + pc-bios/s390-ccw/s390-arch.h | 3 +++ + 3 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index 767012bf0c9..6c6823b5db8 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -78,7 +78,7 @@ void jump_to_low_kernel(void) + * kernel start address (when jumping to the PSW-at-zero address instead, + * the kernel startup code fails when we booted from a network device). + */ +- if (!memcmp((char *)0x10008, "S390EP", 6)) { ++ if (!memcmp((char *)S390EP, "S390EP", 6)) { + jump_to_IPL_code(KERN_IMAGE_START); + } + +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index e3a1a3053d0..c04b910082b 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -185,6 +185,12 @@ static void boot_setup(void) + memcpy(lpmsg + 10, loadparm_str, 8); + sclp_print(lpmsg); + ++ /* ++ * Clear out any potential S390EP magic (see jump_to_low_kernel()), ++ * so we don't taint our decision-making process during a reboot. ++ */ ++ memset((char *)S390EP, 0, 6); ++ + have_iplb = store_iplb(&iplb); + } + +diff --git a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 6da44d4436c..a741488aaa1 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -95,6 +95,9 @@ typedef struct LowCore { + + extern LowCore *lowcore; + ++/* Location of "S390EP" in a Linux binary (see arch/s390/boot/head.S) */ ++#define S390EP 0x10008 ++ + static inline void set_prefix(uint32_t address) + { + asm volatile("spx %0" : : "m" (address) : "memory"); +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch b/kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch new file mode 100755 index 0000000000000000000000000000000000000000..9d09be3e9e7091d7aebc6c098bac724a057e1d22 --- /dev/null +++ b/kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch @@ -0,0 +1,63 @@ +From 6b19062226ecebf63d2d0b0ff05b5bcfa7a05818 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:40 -0500 +Subject: [PATCH 4/5] pc-bios: s390x: Ensure Read IPL memory is clean + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-4-thuth@redhat.com> +Patchwork-id: 100372 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/4] pc-bios: s390x: Ensure Read IPL memory is clean +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Eric Farman + +If, for example, we boot off a virtio device and chreipl to a vfio-ccw +device, the space at lowcore will be non-zero. We build a Read IPL CCW +at address zero, but it will have leftover PSW data that will conflict +with the Format-0 CCW being generated: + +0x0: 00080000 80010000 + ------ Ccw0.cda + -- Ccw0.chainData + -- Reserved bits + +The data address will be overwritten with the correct value (0x0), but +the apparent data chain bit will cause subsequent memory to be used as +the target of the data store, which may not be where we expect (0x0). + +Clear out this space when we boot from DASD, so that we know it exists +exactly as we expect. + +Signed-off-by: Eric Farman +Reviewed-by: Jason J. Herne +Reviewed-by: Janosch Frank +Acked-by: Christian Borntraeger +Acked-by: Cornelia Huck +Message-Id: <20201120160117.59366-2-farman@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit d8e5bbdd0d6fa8d9b5ac15de62c87105d92ff558) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/dasd-ipl.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/pc-bios/s390-ccw/dasd-ipl.c b/pc-bios/s390-ccw/dasd-ipl.c +index 0fc879bb8e8..71cbae2f16e 100644 +--- a/pc-bios/s390-ccw/dasd-ipl.c ++++ b/pc-bios/s390-ccw/dasd-ipl.c +@@ -100,6 +100,9 @@ static void make_readipl(void) + { + Ccw0 *ccwIplRead = (Ccw0 *)0x00; + ++ /* Clear out any existing data */ ++ memset(ccwIplRead, 0, sizeof(Ccw0)); ++ + /* Create Read IPL ccw at address 0 */ + ccwIplRead->cmd_code = CCW_CMD_READ_IPL; + ccwIplRead->cda = 0x00; /* Read into address 0x00 in main memory */ +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Fix-reset-psw-mask.patch b/kvm-pc-bios-s390x-Fix-reset-psw-mask.patch new file mode 100755 index 0000000000000000000000000000000000000000..9c45e927849ea84b98836fe1a55744cb781aa081 --- /dev/null +++ b/kvm-pc-bios-s390x-Fix-reset-psw-mask.patch @@ -0,0 +1,75 @@ +From 38ba55dd27a3b8308f0ce2e82a4c3eba3f197d20 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:53 -0400 +Subject: [PATCH 11/42] pc-bios/s390x: Fix reset psw mask + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-12-thuth@redhat.com> +Patchwork-id: 97034 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 11/38] pc-bios/s390x: Fix reset psw mask +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +We need to set the short psw indication bit in the reset psw, as it is +a short psw. + +Exposed by "s390x: Properly fetch and test the short psw on diag308 +subc 0/1". + +Fixes: 962982329029 ("pc-bios/s390-ccw: do a subsystem reset before running the guest") +Signed-off-by: Janosch Frank +Message-Id: <20191203132813.2734-5-frankja@linux.ibm.com> +Acked-by: Christian Borntraeger +Signed-off-by: Cornelia Huck +(cherry picked from commit 5c6f0d5f46a77d77460dfb518cf1e1e4145c276e) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index 266f1502b9..da13c43cc0 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -12,11 +12,11 @@ + #define KERN_IMAGE_START 0x010000UL + #define PSW_MASK_64 0x0000000100000000ULL + #define PSW_MASK_32 0x0000000080000000ULL +-#define IPL_PSW_MASK (PSW_MASK_32 | PSW_MASK_64) ++#define PSW_MASK_SHORTPSW 0x0008000000000000ULL ++#define RESET_PSW_MASK (PSW_MASK_SHORTPSW | PSW_MASK_32 | PSW_MASK_64) + + typedef struct ResetInfo { +- uint32_t ipl_mask; +- uint32_t ipl_addr; ++ uint64_t ipl_psw; + uint32_t ipl_continue; + } ResetInfo; + +@@ -50,7 +50,9 @@ void jump_to_IPL_code(uint64_t address) + ResetInfo *current = 0; + + save = *current; +- current->ipl_addr = (uint32_t) (uint64_t) &jump_to_IPL_2; ++ ++ current->ipl_psw = (uint64_t) &jump_to_IPL_2; ++ current->ipl_psw |= RESET_PSW_MASK; + current->ipl_continue = address & 0x7fffffff; + + debug_print_int("set IPL addr to", current->ipl_continue); +@@ -82,7 +84,7 @@ void jump_to_low_kernel(void) + } + + /* Trying to get PSW at zero address */ +- if (*((uint64_t *)0) & IPL_PSW_MASK) { ++ if (*((uint64_t *)0) & RESET_PSW_MASK) { + jump_to_IPL_code((*((uint64_t *)0)) & 0x7fffffff); + } + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch b/kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch new file mode 100755 index 0000000000000000000000000000000000000000..8ba45304332c4654bec73f358586741595d12486 --- /dev/null +++ b/kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch @@ -0,0 +1,45 @@ +From 494ce6ed658a806af36d4f50600e44740a446011 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:38 -0500 +Subject: [PATCH 2/5] pc-bios: s390x: Rename PSW_MASK_ZMODE to PSW_MASK_64 + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-2-thuth@redhat.com> +Patchwork-id: 100370 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/4] pc-bios: s390x: Rename PSW_MASK_ZMODE to PSW_MASK_64 +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +This constant enables 64 bit addressing, not the ESAME architecture, +so it shouldn't be named ZMODE. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Message-Id: <20200624075226.92728-7-frankja@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit b88faa1c899db2fae8b5b168aeb6c47bef090f27) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/s390-arch.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 5f36361c022..73852029d4e 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -29,7 +29,7 @@ _Static_assert(sizeof(struct PSWLegacy) == 8, "PSWLegacy size incorrect"); + #define PSW_MASK_WAIT 0x0002000000000000ULL + #define PSW_MASK_EAMODE 0x0000000100000000ULL + #define PSW_MASK_BAMODE 0x0000000080000000ULL +-#define PSW_MASK_ZMODE (PSW_MASK_EAMODE | PSW_MASK_BAMODE) ++#define PSW_MASK_64 (PSW_MASK_EAMODE | PSW_MASK_BAMODE) + + /* Low core mapping */ + typedef struct LowCore { +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch b/kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch new file mode 100755 index 0000000000000000000000000000000000000000..2db2f931845e89411c79c378ca51bb7dd566a022 --- /dev/null +++ b/kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch @@ -0,0 +1,145 @@ +From 8350ad9c0f54519a06ec396c2997330615f4b470 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:58 -0400 +Subject: [PATCH 16/42] pc-bios: s390x: Save iplb location in lowcore + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-17-thuth@redhat.com> +Patchwork-id: 97027 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 16/38] pc-bios: s390x: Save iplb location in lowcore +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The POP states that for a list directed IPL the IPLB is stored into +memory by the machine loader and its address is stored at offset 0x14 +of the lowcore. + +ZIPL currently uses the address in offset 0x14 to access the IPLB and +acquire flags about secure boot. If the IPLB address points into +memory which has an unsupported mix of flags set, ZIPL will panic +instead of booting the OS. + +As the lowcore can have quite a high entropy for a guest that did drop +out of protected mode (i.e. rebooted) we encountered the ZIPL panic +quite often. + +Signed-off-by: Janosch Frank +Tested-by: Marc Hartmayer +Message-Id: <20200304114231.23493-19-frankja@linux.ibm.com> +Reviewed-by: Christian Borntraeger +Reviewed-by: David Hildenbrand +Signed-off-by: Christian Borntraeger +(cherry picked from commit 9bfc04f9ef6802fff0fc77130ff345a541783363) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 1 + + pc-bios/s390-ccw/main.c | 8 +++++++- + pc-bios/s390-ccw/netmain.c | 1 + + pc-bios/s390-ccw/s390-arch.h | 10 ++++++++-- + pc-bios/s390-ccw/s390-ccw.h | 1 + + 5 files changed, 18 insertions(+), 3 deletions(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index da13c43cc0..4eba2510b0 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -35,6 +35,7 @@ void jump_to_IPL_code(uint64_t address) + { + /* store the subsystem information _after_ the bootmap was loaded */ + write_subsystem_identification(); ++ write_iplb_location(); + + /* prevent unknown IPL types in the guest */ + if (iplb.pbt == S390_IPL_TYPE_QEMU_SCSI) { +diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c +index a21b386280..4e65b411e1 100644 +--- a/pc-bios/s390-ccw/main.c ++++ b/pc-bios/s390-ccw/main.c +@@ -9,6 +9,7 @@ + */ + + #include "libc.h" ++#include "helper.h" + #include "s390-arch.h" + #include "s390-ccw.h" + #include "cio.h" +@@ -22,7 +23,7 @@ QemuIplParameters qipl; + IplParameterBlock iplb __attribute__((__aligned__(PAGE_SIZE))); + static bool have_iplb; + static uint16_t cutype; +-LowCore const *lowcore; /* Yes, this *is* a pointer to address 0 */ ++LowCore *lowcore; /* Yes, this *is* a pointer to address 0 */ + + #define LOADPARM_PROMPT "PROMPT " + #define LOADPARM_EMPTY " " +@@ -42,6 +43,11 @@ void write_subsystem_identification(void) + *zeroes = 0; + } + ++void write_iplb_location(void) ++{ ++ lowcore->ptr_iplb = ptr2u32(&iplb); ++} ++ + void panic(const char *string) + { + sclp_print(string); +diff --git a/pc-bios/s390-ccw/netmain.c b/pc-bios/s390-ccw/netmain.c +index f2dcc01e27..309ffa30d9 100644 +--- a/pc-bios/s390-ccw/netmain.c ++++ b/pc-bios/s390-ccw/netmain.c +@@ -40,6 +40,7 @@ + #define DEFAULT_TFTP_RETRIES 20 + + extern char _start[]; ++void write_iplb_location(void) {} + + #define KERNEL_ADDR ((void *)0L) + #define KERNEL_MAX_SIZE ((long)_start) +diff --git a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 504fc7c2f0..5f36361c02 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -36,7 +36,13 @@ typedef struct LowCore { + /* prefix area: defined by architecture */ + PSWLegacy ipl_psw; /* 0x000 */ + uint32_t ccw1[2]; /* 0x008 */ +- uint32_t ccw2[2]; /* 0x010 */ ++ union { ++ uint32_t ccw2[2]; /* 0x010 */ ++ struct { ++ uint32_t reserved10; ++ uint32_t ptr_iplb; ++ }; ++ }; + uint8_t pad1[0x80 - 0x18]; /* 0x018 */ + uint32_t ext_params; /* 0x080 */ + uint16_t cpu_addr; /* 0x084 */ +@@ -85,7 +91,7 @@ typedef struct LowCore { + PSW io_new_psw; /* 0x1f0 */ + } __attribute__((packed, aligned(8192))) LowCore; + +-extern LowCore const *lowcore; ++extern LowCore *lowcore; + + static inline void set_prefix(uint32_t address) + { +diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h +index 11bce7d73c..21f27e7990 100644 +--- a/pc-bios/s390-ccw/s390-ccw.h ++++ b/pc-bios/s390-ccw/s390-ccw.h +@@ -57,6 +57,7 @@ void consume_io_int(void); + /* main.c */ + void panic(const char *string); + void write_subsystem_identification(void); ++void write_iplb_location(void); + extern char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE))); + unsigned int get_loadparm_index(void); + +-- +2.27.0 + diff --git a/kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch b/kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch new file mode 100755 index 0000000000000000000000000000000000000000..576447d0362c7197ca0c156d58333f6c6323c8b6 --- /dev/null +++ b/kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch @@ -0,0 +1,89 @@ +From 35891c9334058c02f3ee83eee1a986802387c18b Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 10 Dec 2020 08:32:39 -0500 +Subject: [PATCH 3/5] pc-bios: s390x: Use PSW masks where possible and + introduce PSW_MASK_SHORT_ADDR + +RH-Author: Thomas Huth +Message-id: <20201210083241.173509-3-thuth@redhat.com> +Patchwork-id: 100371 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/4] pc-bios: s390x: Use PSW masks where possible and introduce PSW_MASK_SHORT_ADDR +Bugzilla: 1903135 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jens Freimann +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's move some of the PSW mask defines into s390-arch.h and use them +in jump2ipl.c. Also let's introduce a new constant for the address +mask of 8 byte (short) PSWs. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Thomas Huth +Message-Id: <20200624075226.92728-8-frankja@linux.ibm.com> +Signed-off-by: Thomas Huth +(cherry picked from commit fe75c657b8ee962da79f5d3518b139e26dc69c24) +Signed-off-by: Danilo C. L. de Paula +--- + pc-bios/s390-ccw/jump2ipl.c | 10 ++++------ + pc-bios/s390-ccw/s390-arch.h | 2 ++ + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/pc-bios/s390-ccw/jump2ipl.c b/pc-bios/s390-ccw/jump2ipl.c +index 4eba2510b04..767012bf0c9 100644 +--- a/pc-bios/s390-ccw/jump2ipl.c ++++ b/pc-bios/s390-ccw/jump2ipl.c +@@ -8,12 +8,10 @@ + + #include "libc.h" + #include "s390-ccw.h" ++#include "s390-arch.h" + + #define KERN_IMAGE_START 0x010000UL +-#define PSW_MASK_64 0x0000000100000000ULL +-#define PSW_MASK_32 0x0000000080000000ULL +-#define PSW_MASK_SHORTPSW 0x0008000000000000ULL +-#define RESET_PSW_MASK (PSW_MASK_SHORTPSW | PSW_MASK_32 | PSW_MASK_64) ++#define RESET_PSW_MASK (PSW_MASK_SHORTPSW | PSW_MASK_64) + + typedef struct ResetInfo { + uint64_t ipl_psw; +@@ -54,7 +52,7 @@ void jump_to_IPL_code(uint64_t address) + + current->ipl_psw = (uint64_t) &jump_to_IPL_2; + current->ipl_psw |= RESET_PSW_MASK; +- current->ipl_continue = address & 0x7fffffff; ++ current->ipl_continue = address & PSW_MASK_SHORT_ADDR; + + debug_print_int("set IPL addr to", current->ipl_continue); + +@@ -86,7 +84,7 @@ void jump_to_low_kernel(void) + + /* Trying to get PSW at zero address */ + if (*((uint64_t *)0) & RESET_PSW_MASK) { +- jump_to_IPL_code((*((uint64_t *)0)) & 0x7fffffff); ++ jump_to_IPL_code((*((uint64_t *)0)) & PSW_MASK_SHORT_ADDR); + } + + /* No other option left, so use the Linux kernel start address */ +diff --git a/pc-bios/s390-ccw/s390-arch.h b/pc-bios/s390-ccw/s390-arch.h +index 73852029d4e..6da44d4436c 100644 +--- a/pc-bios/s390-ccw/s390-arch.h ++++ b/pc-bios/s390-ccw/s390-arch.h +@@ -26,9 +26,11 @@ _Static_assert(sizeof(struct PSWLegacy) == 8, "PSWLegacy size incorrect"); + + /* s390 psw bit masks */ + #define PSW_MASK_IOINT 0x0200000000000000ULL ++#define PSW_MASK_SHORTPSW 0x0008000000000000ULL + #define PSW_MASK_WAIT 0x0002000000000000ULL + #define PSW_MASK_EAMODE 0x0000000100000000ULL + #define PSW_MASK_BAMODE 0x0000000080000000ULL ++#define PSW_MASK_SHORT_ADDR 0x000000007fffffffULL + #define PSW_MASK_64 (PSW_MASK_EAMODE | PSW_MASK_BAMODE) + + /* Low core mapping */ +-- +2.27.0 + diff --git a/kvm-pcie_root_port-Add-hotplug-disabling-option.patch b/kvm-pcie_root_port-Add-hotplug-disabling-option.patch new file mode 100755 index 0000000000000000000000000000000000000000..57f3c3bc653e27ea0e6ff4322fddcdab845562b6 --- /dev/null +++ b/kvm-pcie_root_port-Add-hotplug-disabling-option.patch @@ -0,0 +1,153 @@ +From 8587278a20283851081d4d282d11ef6bafd17dc2 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Tue, 17 Mar 2020 13:56:39 -0400 +Subject: [PATCH 1/2] pcie_root_port: Add hotplug disabling option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Julia Suvorova +Message-id: <20200317135639.65085-1-jusual@redhat.com> +Patchwork-id: 94367 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] pcie_root_port: Add hotplug disabling option +Bugzilla: 1790899 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Peter Xu + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1790899 +BRANCH: rhel-av-8.2.1 +UPSTREAM: merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=27302449 + +Make hot-plug/hot-unplug on PCIe Root Ports optional to allow libvirt +manage it and restrict unplug for the whole machine. This is going to +prevent user-initiated unplug in guests (Windows mostly). +Hotplug is enabled by default. +Usage: + -device pcie-root-port,hotplug=off,... + +If you want to disable hot-unplug on some downstream ports of one +switch, disable hot-unplug on PCIe Root Port connected to the upstream +port as well as on the selected downstream ports. + +Discussion related: + https://lists.gnu.org/archive/html/qemu-devel/2020-02/msg00530.html + +Signed-off-by: Julia Suvorova +Message-Id: <20200226174607.205941-1-jusual@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Ján Tomko +(cherry picked from commit 530a0963184e57e71a5b538e9161f115df533e96) +Signed-off-by: Jon Maloy +--- + hw/pci-bridge/pcie_root_port.c | 2 +- + hw/pci-bridge/xio3130_downstream.c | 2 +- + hw/pci/pcie.c | 11 +++++++---- + hw/pci/pcie_port.c | 1 + + include/hw/pci/pcie.h | 2 +- + include/hw/pci/pcie_port.h | 3 +++ + 6 files changed, 14 insertions(+), 7 deletions(-) + +diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c +index 012c2cb12c..db80e2ec23 100644 +--- a/hw/pci-bridge/pcie_root_port.c ++++ b/hw/pci-bridge/pcie_root_port.c +@@ -94,7 +94,7 @@ static void rp_realize(PCIDevice *d, Error **errp) + + pcie_cap_arifwd_init(d); + pcie_cap_deverr_init(d); +- pcie_cap_slot_init(d, s->slot); ++ pcie_cap_slot_init(d, s); + pcie_cap_root_init(d); + + pcie_chassis_create(s->chassis); +diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c +index a9f084b863..4489ce4a40 100644 +--- a/hw/pci-bridge/xio3130_downstream.c ++++ b/hw/pci-bridge/xio3130_downstream.c +@@ -94,7 +94,7 @@ static void xio3130_downstream_realize(PCIDevice *d, Error **errp) + } + pcie_cap_flr_init(d); + pcie_cap_deverr_init(d); +- pcie_cap_slot_init(d, s->slot); ++ pcie_cap_slot_init(d, s); + pcie_cap_arifwd_init(d); + + pcie_chassis_create(s->chassis); +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index 08718188bb..0eb3a2a5d2 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -495,7 +495,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + + /* pci express slot for pci express root/downstream port + PCI express capability slot registers */ +-void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot) ++void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s) + { + uint32_t pos = dev->exp.exp_cap; + +@@ -505,13 +505,16 @@ void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot) + pci_long_test_and_clear_mask(dev->config + pos + PCI_EXP_SLTCAP, + ~PCI_EXP_SLTCAP_PSN); + pci_long_test_and_set_mask(dev->config + pos + PCI_EXP_SLTCAP, +- (slot << PCI_EXP_SLTCAP_PSN_SHIFT) | ++ (s->slot << PCI_EXP_SLTCAP_PSN_SHIFT) | + PCI_EXP_SLTCAP_EIP | +- PCI_EXP_SLTCAP_HPS | +- PCI_EXP_SLTCAP_HPC | + PCI_EXP_SLTCAP_PIP | + PCI_EXP_SLTCAP_AIP | + PCI_EXP_SLTCAP_ABP); ++ if (s->hotplug) { ++ pci_long_test_and_set_mask(dev->config + pos + PCI_EXP_SLTCAP, ++ PCI_EXP_SLTCAP_HPS | ++ PCI_EXP_SLTCAP_HPC); ++ } + + if (dev->cap_present & QEMU_PCIE_SLTCAP_PCP) { + pci_long_test_and_set_mask(dev->config + pos + PCI_EXP_SLTCAP, +diff --git a/hw/pci/pcie_port.c b/hw/pci/pcie_port.c +index c19a9be592..36dac33d98 100644 +--- a/hw/pci/pcie_port.c ++++ b/hw/pci/pcie_port.c +@@ -147,6 +147,7 @@ static const TypeInfo pcie_port_type_info = { + static Property pcie_slot_props[] = { + DEFINE_PROP_UINT8("chassis", PCIESlot, chassis, 0), + DEFINE_PROP_UINT16("slot", PCIESlot, slot, 0), ++ DEFINE_PROP_BOOL("hotplug", PCIESlot, hotplug, true), + DEFINE_PROP_END_OF_LIST() + }; + +diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h +index 7064875835..14c58ebdb6 100644 +--- a/include/hw/pci/pcie.h ++++ b/include/hw/pci/pcie.h +@@ -104,7 +104,7 @@ void pcie_cap_deverr_reset(PCIDevice *dev); + void pcie_cap_lnkctl_init(PCIDevice *dev); + void pcie_cap_lnkctl_reset(PCIDevice *dev); + +-void pcie_cap_slot_init(PCIDevice *dev, uint16_t slot); ++void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s); + void pcie_cap_slot_reset(PCIDevice *dev); + void pcie_cap_slot_get(PCIDevice *dev, uint16_t *slt_ctl, uint16_t *slt_sta); + void pcie_cap_slot_write_config(PCIDevice *dev, +diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h +index 7515430087..7072cc8731 100644 +--- a/include/hw/pci/pcie_port.h ++++ b/include/hw/pci/pcie_port.h +@@ -55,6 +55,9 @@ struct PCIESlot { + + /* Disable ACS (really for a pcie_root_port) */ + bool disable_acs; ++ ++ /* Indicates whether hot-plug is enabled on the slot */ ++ bool hotplug; + QLIST_ENTRY(PCIESlot) next; + }; + +-- +2.18.2 + diff --git a/kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch b/kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch new file mode 100755 index 0000000000000000000000000000000000000000..8c333343e0791704ea2920d8f1af9616884bed11 --- /dev/null +++ b/kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch @@ -0,0 +1,54 @@ +From b36a9259e085b4d32532d896e485889181b130ae Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:45 -0400 +Subject: [PATCH 7/9] pcnet: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-8-jmaloy@redhat.com> +Patchwork-id: 101791 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 7/9] pcnet: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Buglink: https://bugs.launchpad.net/qemu/+bug/1917085 +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 99ccfaa1edafd79f7a3a0ff7b58ae4da7c514928) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/pcnet.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c +index f3f18d8598..dcd3fc4948 100644 +--- a/hw/net/pcnet.c ++++ b/hw/net/pcnet.c +@@ -1250,7 +1250,7 @@ txagain: + if (BCR_SWSTYLE(s) == 1) + add_crc = !GET_FIELD(tmd.status, TMDS, NOFCS); + s->looptest = add_crc ? PCNET_LOOPTEST_CRC : PCNET_LOOPTEST_NOCRC; +- pcnet_receive(qemu_get_queue(s->nic), s->buffer, s->xmit_pos); ++ qemu_receive_packet(qemu_get_queue(s->nic), s->buffer, s->xmit_pos); + s->looptest = 0; + } else { + if (s->nic) { +-- +2.27.0 + diff --git a/kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch b/kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch new file mode 100755 index 0000000000000000000000000000000000000000..2dbdb167384004bd1f15a95633e15948479c7f62 --- /dev/null +++ b/kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch @@ -0,0 +1,107 @@ +From 22fc9bd7e7ae0b72c6f6e483eb66cf996f519766 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Tue, 21 Jan 2020 05:16:11 +0000 +Subject: [PATCH 01/15] ppc: Deassert the external interrupt pin in KVM on + reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200121051613.388295-2-dgibson@redhat.com> +Patchwork-id: 93429 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 1/3] ppc: Deassert the external interrupt pin in KVM on reset +Bugzilla: 1776638 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +When a CPU is reset, QEMU makes sure no interrupt is pending by clearing +CPUPPCstate::pending_interrupts in ppc_cpu_reset(). In the case of a +complete machine emulation, eg. a sPAPR machine, an external interrupt +request could still be pending in KVM though, eg. an IPI. It will be +eventually presented to the guest, which is supposed to acknowledge it at +the interrupt controller. If the interrupt controller is emulated in QEMU, +either XICS or XIVE, ppc_set_irq() won't deassert the external interrupt +pin in KVM since it isn't pending anymore for QEMU. When the vCPU re-enters +the guest, the interrupt request is still pending and the vCPU will try +again to acknowledge it. This causes an infinite loop and eventually hangs +the guest. + +The code has been broken since the beginning. The issue wasn't hit before +because accel=kvm,kernel-irqchip=off is an awkward setup that never got +used until recently with the LC92x IBM systems (aka, Boston). + +Add a ppc_irq_reset() function to do the necessary cleanup, ie. deassert +the IRQ pins of the CPU in QEMU and most importantly the external interrupt +pin for this vCPU in KVM. + +Reported-by: Satheesh Rajendran +Signed-off-by: Greg Kurz +Message-Id: <157548861740.3650476.16879693165328764758.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 401774387aeb37f2ada9bb18f7c7e307b21a3e93) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1776638 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/ppc.c | 8 ++++++++ + include/hw/ppc/ppc.h | 2 ++ + target/ppc/translate_init.inc.c | 1 + + 3 files changed, 11 insertions(+) + +diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c +index 52a18eb..d554b64 100644 +--- a/hw/ppc/ppc.c ++++ b/hw/ppc/ppc.c +@@ -1510,3 +1510,11 @@ PowerPCCPU *ppc_get_vcpu_by_pir(int pir) + + return NULL; + } ++ ++void ppc_irq_reset(PowerPCCPU *cpu) ++{ ++ CPUPPCState *env = &cpu->env; ++ ++ env->irq_input_state = 0; ++ kvmppc_set_interrupt(cpu, PPC_INTERRUPT_EXT, 0); ++} +diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h +index 4bdcb8b..5dd7531 100644 +--- a/include/hw/ppc/ppc.h ++++ b/include/hw/ppc/ppc.h +@@ -76,6 +76,7 @@ static inline void ppc970_irq_init(PowerPCCPU *cpu) {} + static inline void ppcPOWER7_irq_init(PowerPCCPU *cpu) {} + static inline void ppcPOWER9_irq_init(PowerPCCPU *cpu) {} + static inline void ppce500_irq_init(PowerPCCPU *cpu) {} ++static inline void ppc_irq_reset(PowerPCCPU *cpu) {} + #else + void ppc40x_irq_init(PowerPCCPU *cpu); + void ppce500_irq_init(PowerPCCPU *cpu); +@@ -83,6 +84,7 @@ void ppc6xx_irq_init(PowerPCCPU *cpu); + void ppc970_irq_init(PowerPCCPU *cpu); + void ppcPOWER7_irq_init(PowerPCCPU *cpu); + void ppcPOWER9_irq_init(PowerPCCPU *cpu); ++void ppc_irq_reset(PowerPCCPU *cpu); + #endif + + /* PPC machines for OpenBIOS */ +diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c +index ba726de..64a8380 100644 +--- a/target/ppc/translate_init.inc.c ++++ b/target/ppc/translate_init.inc.c +@@ -10461,6 +10461,7 @@ static void ppc_cpu_reset(CPUState *s) + env->pending_interrupts = 0; + s->exception_index = POWERPC_EXCP_NONE; + env->error_code = 0; ++ ppc_irq_reset(cpu); + + /* tininess for underflow is detected before rounding */ + set_float_detect_tininess(float_tininess_before_rounding, +-- +1.8.3.1 + diff --git a/kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch b/kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch new file mode 100755 index 0000000000000000000000000000000000000000..457d14973cc6acf6e35b17d970973fcbec64fb77 --- /dev/null +++ b/kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch @@ -0,0 +1,112 @@ +From f2f57c1ed926384e074d2048cdbdc30ee2f426eb Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Tue, 21 Jan 2020 05:16:13 +0000 +Subject: [PATCH 03/15] ppc: Don't use CPUPPCState::irq_input_state with modern + Book3s CPU models +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200121051613.388295-4-dgibson@redhat.com> +Patchwork-id: 93431 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 3/3] ppc: Don't use CPUPPCState::irq_input_state with modern Book3s CPU models +Bugzilla: 1776638 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +The power7_set_irq() and power9_set_irq() functions set this but it is +never used actually. Modern Book3s compatible CPUs are only supported +by the pnv and spapr machines. They have an interrupt controller, XICS +for POWER7/8 and XIVE for POWER9, whose models don't require to track +IRQ input states at the CPU level. + +Drop these lines to avoid confusion. + +Signed-off-by: Greg Kurz +Message-Id: <157548862861.3650476.16622818876928044450.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit c1ad0b892ce20cf2b5e619c79e8a0c4c66b235dc) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1776638 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/ppc.c | 16 ++-------------- + target/ppc/cpu.h | 4 +++- + 2 files changed, 5 insertions(+), 15 deletions(-) + +diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c +index d554b64..730a41f 100644 +--- a/hw/ppc/ppc.c ++++ b/hw/ppc/ppc.c +@@ -275,10 +275,9 @@ void ppc970_irq_init(PowerPCCPU *cpu) + static void power7_set_irq(void *opaque, int pin, int level) + { + PowerPCCPU *cpu = opaque; +- CPUPPCState *env = &cpu->env; + + LOG_IRQ("%s: env %p pin %d level %d\n", __func__, +- env, pin, level); ++ &cpu->env, pin, level); + + switch (pin) { + case POWER7_INPUT_INT: +@@ -292,11 +291,6 @@ static void power7_set_irq(void *opaque, int pin, int level) + LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); + return; + } +- if (level) { +- env->irq_input_state |= 1 << pin; +- } else { +- env->irq_input_state &= ~(1 << pin); +- } + } + + void ppcPOWER7_irq_init(PowerPCCPU *cpu) +@@ -311,10 +305,9 @@ void ppcPOWER7_irq_init(PowerPCCPU *cpu) + static void power9_set_irq(void *opaque, int pin, int level) + { + PowerPCCPU *cpu = opaque; +- CPUPPCState *env = &cpu->env; + + LOG_IRQ("%s: env %p pin %d level %d\n", __func__, +- env, pin, level); ++ &cpu->env, pin, level); + + switch (pin) { + case POWER9_INPUT_INT: +@@ -334,11 +327,6 @@ static void power9_set_irq(void *opaque, int pin, int level) + LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); + return; + } +- if (level) { +- env->irq_input_state |= 1 << pin; +- } else { +- env->irq_input_state &= ~(1 << pin); +- } + } + + void ppcPOWER9_irq_init(PowerPCCPU *cpu) +diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h +index 5c53801..8887f76 100644 +--- a/target/ppc/cpu.h ++++ b/target/ppc/cpu.h +@@ -1090,7 +1090,9 @@ struct CPUPPCState { + #if !defined(CONFIG_USER_ONLY) + /* + * This is the IRQ controller, which is implementation dependent +- * and only relevant when emulating a complete machine. ++ * and only relevant when emulating a complete machine. Note that ++ * this isn't used by recent Book3s compatible CPUs (POWER7 and ++ * newer). + */ + uint32_t irq_input_state; + void **irq_inputs; +-- +1.8.3.1 + diff --git a/kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch b/kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch new file mode 100755 index 0000000000000000000000000000000000000000..380007cf635e8989bc36f1b946647cabfd849bf5 --- /dev/null +++ b/kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch @@ -0,0 +1,82 @@ +From 5b826e7ed09ecf3b2837d147fec6b593f629e450 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Fri, 4 Dec 2020 15:07:59 -0500 +Subject: [PATCH 01/14] ppc/spapr: Add hotremovable flag on DIMM LMBs on + drmem_v2 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Greg Kurz +Message-id: <20201204150800.264829-2-gkurz@redhat.com> +Patchwork-id: 100217 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/2] ppc/spapr: Add hotremovable flag on DIMM LMBs on drmem_v2 +Bugzilla: 1901837 +RH-Acked-by: Danilo de Paula +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier + +From: Leonardo Bras + +On reboot, all memory that was previously added using object_add and +device_add is placed in this DIMM area. + +The new SPAPR_LMB_FLAGS_HOTREMOVABLE flag helps Linux to put this memory in +the correct memory zone, so no unmovable allocations are made there, +allowing the object to be easily hot-removed by device_del and +object_del. + +This new flag was accepted in Power Architecture documentation. + +Signed-off-by: Leonardo Bras +Reviewed-by: Bharata B Rao +Message-Id: <20200511200201.58537-1-leobras.c@gmail.com> +[dwg: Fixed syntax error spotted by Cédric Le Goater] +Signed-off-by: David Gibson +(cherry picked from commit 0911a60c76b8598f1863c6951b2b690059465153) +Signed-off-by: Greg Kurz + +Conflicts: + hw/ppc/pnv.c + +The changes in this file clearly don't belong to this +patch. Same goes for the changes in target/ppc/cpu.h and +target/ppc/excp_helper.c. Something went wrong when the +patch was applied. Anyway, downstream doesn't especially +care for pnv, so just drop the changes. + +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr.c | 3 ++- + include/hw/ppc/spapr.h | 1 + + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index a330f038b95..c74079702d0 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -690,7 +690,8 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt, + g_assert(drc); + elem = spapr_get_drconf_cell(size / lmb_size, addr, + spapr_drc_index(drc), node, +- SPAPR_LMB_FLAGS_ASSIGNED); ++ (SPAPR_LMB_FLAGS_ASSIGNED | ++ SPAPR_LMB_FLAGS_HOTREMOVABLE)); + QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry); + nr_entries++; + cur_addr = addr + size; +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index aa89cc4a95c..e047dabf300 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -847,6 +847,7 @@ int spapr_rtc_import_offset(SpaprRtcState *rtc, int64_t legacy_offset); + #define SPAPR_LMB_FLAGS_ASSIGNED 0x00000008 + #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020 + #define SPAPR_LMB_FLAGS_RESERVED 0x00000080 ++#define SPAPR_LMB_FLAGS_HOTREMOVABLE 0x00000100 + + void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg); + +-- +2.27.0 + diff --git a/kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch b/kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch new file mode 100755 index 0000000000000000000000000000000000000000..ee0b19a88f3063a9b233ce2b34596c961db04f37 --- /dev/null +++ b/kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch @@ -0,0 +1,67 @@ +From e4065c7739c8ea3f6f88898295ed899a1059806e Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Fri, 4 Dec 2020 15:08:00 -0500 +Subject: [PATCH 02/14] ppc/spapr: re-assert IRQs during event-scan if there + are pending + +RH-Author: Greg Kurz +Message-id: <20201204150800.264829-3-gkurz@redhat.com> +Patchwork-id: 100216 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/2] ppc/spapr: re-assert IRQs during event-scan if there are pending +Bugzilla: 1901837 +RH-Acked-by: Danilo de Paula +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier + +From: Laurent Vivier + +If we hotplug a CPU during the first second of the kernel boot, +the IRQ can be sent to the kernel while the RTAS event handler +is not installed. The event is queued, but the kernel doesn't +collect it and ignores the new CPU. + +As the code relies on edge-triggered IRQ, we can re-assert it +during the event-scan RTAS call if there are still pending +events (as it is already done in check-exception). + +Signed-off-by: Laurent Vivier +Message-Id: <20201015210318.117386-1-lvivier@redhat.com> +Reviewed-by: Greg Kurz +Signed-off-by: David Gibson +(cherry picked from commit dff669d6a15fb92b063cb5aa691b4bb498727404) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_events.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c +index e355e000d07..15b92b63adb 100644 +--- a/hw/ppc/spapr_events.c ++++ b/hw/ppc/spapr_events.c +@@ -692,10 +692,22 @@ static void event_scan(PowerPCCPU *cpu, SpaprMachineState *spapr, + target_ulong args, + uint32_t nret, target_ulong rets) + { ++ int i; + if (nargs != 4 || nret != 1) { + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); + return; + } ++ ++ for (i = 0; i < EVENT_CLASS_MAX; i++) { ++ if (rtas_event_log_contains(EVENT_CLASS_MASK(i))) { ++ const SpaprEventSource *source = ++ spapr_event_sources_get_source(spapr->event_sources, i); ++ ++ g_assert(source->enabled); ++ qemu_irq_pulse(spapr_qirq(spapr, source->irq)); ++ } ++ } ++ + rtas_st(rets, 0, RTAS_OUT_NO_ERRORS_FOUND); + } + +-- +2.27.0 + diff --git a/kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch b/kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch new file mode 100755 index 0000000000000000000000000000000000000000..9c25b7664bc20221bc897514da7d062508fc68ae --- /dev/null +++ b/kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch @@ -0,0 +1,64 @@ +From 428eb7260718b69b1f3f421d03bce10b8785fc49 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 13 Mar 2020 12:34:39 +0000 +Subject: [PATCH 19/20] qapi: Add '@allow-write-only-overlay' feature for + 'blockdev-snapshot' + +RH-Author: Kevin Wolf +Message-id: <20200313123439.10548-14-kwolf@redhat.com> +Patchwork-id: 94290 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 13/13] qapi: Add '@allow-write-only-overlay' feature for 'blockdev-snapshot' +Bugzilla: 1790482 1805143 +RH-Acked-by: John Snow +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Peter Krempa + +From: Peter Krempa + +Anounce that 'blockdev-snapshot' command's permissions allow changing +of the backing file if the 'consistent_read' permission is not required. + +This is useful for libvirt to allow late opening of the backing chain +during a blockdev-mirror. + +Signed-off-by: Peter Krempa +Signed-off-by: Kevin Wolf +Message-Id: <20200310113831.27293-8-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit c6bdc312f30d5c7326aa2fdca3e0f98c15eb541a) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + qapi/block-core.json | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/qapi/block-core.json b/qapi/block-core.json +index a1e85b0..a64ad81 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -1541,6 +1541,12 @@ + # + # For the arguments, see the documentation of BlockdevSnapshot. + # ++# Features: ++# @allow-write-only-overlay: If present, the check whether this operation is safe ++# was relaxed so that it can be used to change ++# backing file of a destination of a blockdev-mirror. ++# (since 5.0) ++# + # Since: 2.5 + # + # Example: +@@ -1561,7 +1567,8 @@ + # + ## + { 'command': 'blockdev-snapshot', +- 'data': 'BlockdevSnapshot' } ++ 'data': 'BlockdevSnapshot', ++ 'features': [ 'allow-write-only-overlay' ] } + + ## + # @change-backing-file: +-- +1.8.3.1 + diff --git a/kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch b/kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch new file mode 100755 index 0000000000000000000000000000000000000000..bf296d885415e6e7e3b58848c3593b2b2f3a7cab --- /dev/null +++ b/kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch @@ -0,0 +1,237 @@ +From 34f664093db2a6275fcddd768684c7319cfc01b4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:06 -0500 +Subject: [PATCH 05/14] qapi: enable use of g_autoptr with QAPI types +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-2-marcandre.lureau@redhat.com> +Patchwork-id: 100472 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 01/10] qapi: enable use of g_autoptr with QAPI types +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Daniel P. Berrangé + +Currently QAPI generates a type and function for free'ing it: + + typedef struct QCryptoBlockCreateOptions QCryptoBlockCreateOptions; + void qapi_free_QCryptoBlockCreateOptions(QCryptoBlockCreateOptions *obj); + +This is used in the traditional manner: + + QCryptoBlockCreateOptions *opts = NULL; + + opts = g_new0(QCryptoBlockCreateOptions, 1); + + ....do stuff with opts... + + qapi_free_QCryptoBlockCreateOptions(opts); + +Since bumping the min glib to 2.48, QEMU has incrementally adopted the +use of g_auto/g_autoptr. This allows the compiler to run a function to +free a variable when it goes out of scope, the benefit being the +compiler can guarantee it is freed in all possible code ptahs. + +This benefit is applicable to QAPI types too, and given the seriously +long method names for some qapi_free_XXXX() functions, is much less +typing. This change thus makes the code generator emit: + + G_DEFINE_AUTOPTR_CLEANUP_FUNC(QCryptoBlockCreateOptions, + qapi_free_QCryptoBlockCreateOptions) + +The above code example now becomes + + g_autoptr(QCryptoBlockCreateOptions) opts = NULL; + + opts = g_new0(QCryptoBlockCreateOptions, 1); + + ....do stuff with opts... + +Note, if the local pointer needs to live beyond the scope holding the +variable, then g_steal_pointer can be used. This is useful to return the +pointer to the caller in the success codepath, while letting it be freed +in all error codepaths. + + return g_steal_pointer(&opts); + +The crypto/block.h header needs updating to avoid symbol clash now that +the g_autoptr support is a standard QAPI feature. + +Signed-off-by: Daniel P. Berrangé +Message-Id: <20200723153845.2934357-1-berrange@redhat.com> +Reviewed-by: Markus Armbruster +Reviewed-by: Eric Blake +Signed-off-by: Markus Armbruster + +(cherry picked from commit 221db5daf6b3666f1c8e4ca06ae45892e99a112f) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + docs/devel/qapi-code-gen.txt | 2 ++ + scripts/qapi/types.py | 1 + + tests/test-qobject-input-visitor.c | 23 +++++++---------------- + 3 files changed, 10 insertions(+), 16 deletions(-) + +diff --git a/docs/devel/qapi-code-gen.txt b/docs/devel/qapi-code-gen.txt +index 45c93a43cc3..ca59c695fac 100644 +--- a/docs/devel/qapi-code-gen.txt ++++ b/docs/devel/qapi-code-gen.txt +@@ -1278,6 +1278,7 @@ Example: + }; + + void qapi_free_UserDefOne(UserDefOne *obj); ++ G_DEFINE_AUTOPTR_CLEANUP_FUNC(UserDefOne, qapi_free_UserDefOne) + + struct UserDefOneList { + UserDefOneList *next; +@@ -1285,6 +1286,7 @@ Example: + }; + + void qapi_free_UserDefOneList(UserDefOneList *obj); ++ G_DEFINE_AUTOPTR_CLEANUP_FUNC(UserDefOneList, qapi_free_UserDefOneList) + + struct q_obj_my_command_arg { + UserDefOneList *arg1; +diff --git a/scripts/qapi/types.py b/scripts/qapi/types.py +index d8751daa049..c3be141dc90 100644 +--- a/scripts/qapi/types.py ++++ b/scripts/qapi/types.py +@@ -213,6 +213,7 @@ def gen_type_cleanup_decl(name): + ret = mcgen(''' + + void qapi_free_%(c_name)s(%(c_name)s *obj); ++G_DEFINE_AUTOPTR_CLEANUP_FUNC(%(c_name)s, qapi_free_%(c_name)s) + ''', + c_name=c_name(name)) + return ret +diff --git a/tests/test-qobject-input-visitor.c b/tests/test-qobject-input-visitor.c +index 6bacabf0632..e41b91a2a6f 100644 +--- a/tests/test-qobject-input-visitor.c ++++ b/tests/test-qobject-input-visitor.c +@@ -417,7 +417,7 @@ static void test_visitor_in_struct(TestInputVisitorData *data, + static void test_visitor_in_struct_nested(TestInputVisitorData *data, + const void *unused) + { +- UserDefTwo *udp = NULL; ++ g_autoptr(UserDefTwo) udp = NULL; + Visitor *v; + + v = visitor_input_test_init(data, "{ 'string0': 'string0', " +@@ -433,8 +433,6 @@ static void test_visitor_in_struct_nested(TestInputVisitorData *data, + g_assert_cmpstr(udp->dict1->dict2->userdef->string, ==, "string"); + g_assert_cmpstr(udp->dict1->dict2->string, ==, "string2"); + g_assert(udp->dict1->has_dict3 == false); +- +- qapi_free_UserDefTwo(udp); + } + + static void test_visitor_in_list(TestInputVisitorData *data, +@@ -546,7 +544,7 @@ static void test_visitor_in_union_flat(TestInputVisitorData *data, + const void *unused) + { + Visitor *v; +- UserDefFlatUnion *tmp; ++ g_autoptr(UserDefFlatUnion) tmp = NULL; + UserDefUnionBase *base; + + v = visitor_input_test_init(data, +@@ -563,8 +561,6 @@ static void test_visitor_in_union_flat(TestInputVisitorData *data, + + base = qapi_UserDefFlatUnion_base(tmp); + g_assert(&base->enum1 == &tmp->enum1); +- +- qapi_free_UserDefFlatUnion(tmp); + } + + static void test_visitor_in_alternate(TestInputVisitorData *data, +@@ -690,7 +686,7 @@ static void test_list_union_integer_helper(TestInputVisitorData *data, + const void *unused, + UserDefListUnionKind kind) + { +- UserDefListUnion *cvalue = NULL; ++ g_autoptr(UserDefListUnion) cvalue = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); + GString *gstr_union = g_string_new(""); +@@ -782,7 +778,6 @@ static void test_list_union_integer_helper(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + static void test_visitor_in_list_union_int(TestInputVisitorData *data, +@@ -851,7 +846,7 @@ static void test_visitor_in_list_union_uint64(TestInputVisitorData *data, + static void test_visitor_in_list_union_bool(TestInputVisitorData *data, + const void *unused) + { +- UserDefListUnion *cvalue = NULL; ++ g_autoptr(UserDefListUnion) cvalue = NULL; + boolList *elem = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); +@@ -879,13 +874,12 @@ static void test_visitor_in_list_union_bool(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + static void test_visitor_in_list_union_string(TestInputVisitorData *data, + const void *unused) + { +- UserDefListUnion *cvalue = NULL; ++ g_autoptr(UserDefListUnion) cvalue = NULL; + strList *elem = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); +@@ -914,7 +908,6 @@ static void test_visitor_in_list_union_string(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + #define DOUBLE_STR_MAX 16 +@@ -922,7 +915,7 @@ static void test_visitor_in_list_union_string(TestInputVisitorData *data, + static void test_visitor_in_list_union_number(TestInputVisitorData *data, + const void *unused) + { +- UserDefListUnion *cvalue = NULL; ++ g_autoptr(UserDefListUnion) cvalue = NULL; + numberList *elem = NULL; + Visitor *v; + GString *gstr_list = g_string_new(""); +@@ -957,7 +950,6 @@ static void test_visitor_in_list_union_number(TestInputVisitorData *data, + + g_string_free(gstr_union, true); + g_string_free(gstr_list, true); +- qapi_free_UserDefListUnion(cvalue); + } + + static void input_visitor_test_add(const char *testpath, +@@ -1253,7 +1245,7 @@ static void test_visitor_in_fail_alternate(TestInputVisitorData *data, + static void do_test_visitor_in_qmp_introspect(TestInputVisitorData *data, + const QLitObject *qlit) + { +- SchemaInfoList *schema = NULL; ++ g_autoptr(SchemaInfoList) schema = NULL; + QObject *obj = qobject_from_qlit(qlit); + Visitor *v; + +@@ -1262,7 +1254,6 @@ static void do_test_visitor_in_qmp_introspect(TestInputVisitorData *data, + visit_type_SchemaInfoList(v, NULL, &schema, &error_abort); + g_assert(schema); + +- qapi_free_SchemaInfoList(schema); + qobject_unref(obj); + visit_free(v); + } +-- +2.27.0 + diff --git a/kvm-qcow2-Expose-bitmaps-size-during-measure.patch b/kvm-qcow2-Expose-bitmaps-size-during-measure.patch new file mode 100755 index 0000000000000000000000000000000000000000..48c15c5ab86a3519ff04d7738ea0e67f0f346030 --- /dev/null +++ b/kvm-qcow2-Expose-bitmaps-size-during-measure.patch @@ -0,0 +1,495 @@ +From af4d66e07c86d7593f7d18ae4b6a2151123b529b Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:17 +0100 +Subject: [PATCH 12/26] qcow2: Expose bitmaps' size during measure + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-10-eblake@redhat.com> +Patchwork-id: 97072 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 09/12] qcow2: Expose bitmaps' size during measure +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +It's useful to know how much space can be occupied by qcow2 persistent +bitmaps, even though such metadata is unrelated to the guest-visible +data. Report this value as an additional QMP field, present when +measuring an existing image and output format that both support +bitmaps. Update iotest 178 and 190 to updated output, as well as new +coverage in 190 demonstrating non-zero values made possible with the +recently-added qemu-img bitmap command (see 3b51ab4b). + +The new 'bitmaps size:' field is displayed automatically as part of +'qemu-img measure' any time it is present in QMP (that is, any time +both the source image being measured and destination format support +bitmaps, even if the measurement is 0 because there are no bitmaps +present). If the field is absent, it means that no bitmaps can be +copied (source, destination, or both lack bitmaps, including when +measuring based on size rather than on a source image). This behavior +is compatible with an upcoming patch adding 'qemu-img convert +--bitmaps': that command will fail in the same situations where this +patch omits the field. + +The addition of a new field demonstrates why we should always +zero-initialize qapi C structs; while the qcow2 driver still fully +populates all fields, the raw and crypto drivers had to be tweaked to +avoid uninitialized data. + +Consideration was also given towards having a 'qemu-img measure +--bitmaps' which errors out when bitmaps are not possible, and +otherwise sums the bitmaps into the existing allocation totals rather +than displaying as a separate field, as a potential convenience +factor. But this was ultimately decided to be more complexity than +necessary when the QMP interface was sufficient enough with bitmaps +remaining a separate field. + +See also: https://bugzilla.redhat.com/1779904 + +Reported-by: Nir Soffer +Signed-off-by: Eric Blake +Message-Id: <20200521192137.1120211-3-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 5d72c68b49769c927e90b78af6d90f6a384b26ac) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + block/crypto.c - commit a9da6e49 not present (no measure support) + docs/tools/qemu-img.rst - changes in qemu-img.texi instead +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2-bitmap.c | 36 ++++++++++++++++++++++++++++++ + block/qcow2.c | 14 +++++++++--- + block/qcow2.h | 2 ++ + block/raw-format.c | 2 +- + qapi/block-core.json | 16 +++++++++----- + qemu-img.c | 3 +++ + qemu-img.texi | 7 ++++++ + tests/qemu-iotests/178.out.qcow2 | 16 ++++++++++++++ + tests/qemu-iotests/190 | 47 ++++++++++++++++++++++++++++++++++++++-- + tests/qemu-iotests/190.out | 27 ++++++++++++++++++++++- + 10 files changed, 158 insertions(+), 12 deletions(-) + +diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c +index cbac905..10d1297 100644 +--- a/block/qcow2-bitmap.c ++++ b/block/qcow2-bitmap.c +@@ -1766,3 +1766,39 @@ bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs) + + return s->qcow_version >= 3; + } ++ ++/* ++ * Compute the space required for bitmaps in @bs. ++ * ++ * The computation is based as if copying to a new image with the ++ * given @cluster_size, which may differ from the cluster size in @bs. ++ */ ++uint64_t qcow2_get_persistent_dirty_bitmap_size(BlockDriverState *bs, ++ uint32_t cluster_size) ++{ ++ uint64_t bitmaps_size = 0; ++ BdrvDirtyBitmap *bm; ++ size_t bitmap_dir_size = 0; ++ ++ FOR_EACH_DIRTY_BITMAP(bs, bm) { ++ if (bdrv_dirty_bitmap_get_persistence(bm)) { ++ const char *name = bdrv_dirty_bitmap_name(bm); ++ uint32_t granularity = bdrv_dirty_bitmap_granularity(bm); ++ uint64_t bmbytes = ++ get_bitmap_bytes_needed(bdrv_dirty_bitmap_size(bm), ++ granularity); ++ uint64_t bmclusters = DIV_ROUND_UP(bmbytes, cluster_size); ++ ++ /* Assume the entire bitmap is allocated */ ++ bitmaps_size += bmclusters * cluster_size; ++ /* Also reserve space for the bitmap table entries */ ++ bitmaps_size += ROUND_UP(bmclusters * sizeof(uint64_t), ++ cluster_size); ++ /* And space for contribution to bitmap directory size */ ++ bitmap_dir_size += calc_dir_entry_size(strlen(name), 0); ++ } ++ } ++ bitmaps_size += ROUND_UP(bitmap_dir_size, cluster_size); ++ ++ return bitmaps_size; ++} +diff --git a/block/qcow2.c b/block/qcow2.c +index 36b0f7d..dbd870a 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -4751,16 +4751,24 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs, + required = virtual_size; + } + +- info = g_new(BlockMeasureInfo, 1); ++ info = g_new0(BlockMeasureInfo, 1); + info->fully_allocated = + qcow2_calc_prealloc_size(virtual_size, cluster_size, + ctz32(refcount_bits)) + luks_payload_size; + +- /* Remove data clusters that are not required. This overestimates the ++ /* ++ * Remove data clusters that are not required. This overestimates the + * required size because metadata needed for the fully allocated file is +- * still counted. ++ * still counted. Show bitmaps only if both source and destination ++ * would support them. + */ + info->required = info->fully_allocated - virtual_size + required; ++ info->has_bitmaps = version >= 3 && in_bs && ++ bdrv_supports_persistent_dirty_bitmap(in_bs); ++ if (info->has_bitmaps) { ++ info->bitmaps = qcow2_get_persistent_dirty_bitmap_size(in_bs, ++ cluster_size); ++ } + return info; + + err: +diff --git a/block/qcow2.h b/block/qcow2.h +index ceb1ceb..3297e6b 100644 +--- a/block/qcow2.h ++++ b/block/qcow2.h +@@ -768,6 +768,8 @@ int qcow2_co_remove_persistent_dirty_bitmap(BlockDriverState *bs, + const char *name, + Error **errp); + bool qcow2_supports_persistent_dirty_bitmap(BlockDriverState *bs); ++uint64_t qcow2_get_persistent_dirty_bitmap_size(BlockDriverState *bs, ++ uint32_t cluster_size); + + ssize_t coroutine_fn + qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, +diff --git a/block/raw-format.c b/block/raw-format.c +index 93b25e1..4bb54f4 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -346,7 +346,7 @@ static BlockMeasureInfo *raw_measure(QemuOpts *opts, BlockDriverState *in_bs, + BDRV_SECTOR_SIZE); + } + +- info = g_new(BlockMeasureInfo, 1); ++ info = g_new0(BlockMeasureInfo, 1); + info->required = required; + + /* Unallocated sectors count towards the file size in raw images */ +diff --git a/qapi/block-core.json b/qapi/block-core.json +index a64ad81..2893209 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -689,18 +689,24 @@ + # efficiently so file size may be smaller than virtual disk size. + # + # The values are upper bounds that are guaranteed to fit the new image file. +-# Subsequent modification, such as internal snapshot or bitmap creation, may +-# require additional space and is not covered here. ++# Subsequent modification, such as internal snapshot or further bitmap ++# creation, may require additional space and is not covered here. + # +-# @required: Size required for a new image file, in bytes. ++# @required: Size required for a new image file, in bytes, when copying just ++# allocated guest-visible contents. + # + # @fully-allocated: Image file size, in bytes, once data has been written +-# to all sectors. ++# to all sectors, when copying just guest-visible contents. ++# ++# @bitmaps: Additional size required if all the top-level bitmap metadata ++# in the source image were to be copied to the destination, ++# present only when source and destination both support ++# persistent bitmaps. (since 5.1) + # + # Since: 2.10 + ## + { 'struct': 'BlockMeasureInfo', +- 'data': {'required': 'int', 'fully-allocated': 'int'} } ++ 'data': {'required': 'int', 'fully-allocated': 'int', '*bitmaps': 'int'} } + + ## + # @query-block: +diff --git a/qemu-img.c b/qemu-img.c +index 11a4537..b57856e 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -5212,6 +5212,9 @@ static int img_measure(int argc, char **argv) + if (output_format == OFORMAT_HUMAN) { + printf("required size: %" PRIu64 "\n", info->required); + printf("fully allocated size: %" PRIu64 "\n", info->fully_allocated); ++ if (info->has_bitmaps) { ++ printf("bitmaps size: %" PRIu64 "\n", info->bitmaps); ++ } + } else { + dump_json_block_measure_info(info); + } +diff --git a/qemu-img.texi b/qemu-img.texi +index abf2771..3670b96 100644 +--- a/qemu-img.texi ++++ b/qemu-img.texi +@@ -576,6 +576,7 @@ The following fields are reported: + @example + required size: 524288 + fully allocated size: 1074069504 ++bitmaps size: 0 + @end example + + The @code{required size} is the file size of the new image. It may be smaller +@@ -586,6 +587,12 @@ been written to all sectors. This is the maximum size that the image file can + occupy with the exception of internal snapshots, dirty bitmaps, vmstate data, + and other advanced image format features. + ++The @code{bitmaps size} is the additional size required in order to ++copy bitmaps from a source image in addition to the guest-visible ++data; the line is omitted if either source or destination lacks ++bitmap support, or 0 if bitmaps are supported but there is nothing to ++copy. ++ + @item snapshot [--object @var{objectdef}] [--image-opts] [-U] [-q] [-l | -a @var{snapshot} | -c @var{snapshot} | -d @var{snapshot}] @var{filename} + + List, apply, create or delete snapshots in image @var{filename}. +diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2 +index 345eab3..b9ed41b 100644 +--- a/tests/qemu-iotests/178.out.qcow2 ++++ b/tests/qemu-iotests/178.out.qcow2 +@@ -37,6 +37,7 @@ qemu-img: The image size is too large (try using a larger cluster size) + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=0 + required size: 196608 + fully allocated size: 196608 ++bitmaps size: 0 + + converted image file size in bytes: 196608 + +@@ -45,6 +46,7 @@ converted image file size in bytes: 196608 + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 + required size: 393216 + fully allocated size: 1074135040 ++bitmaps size: 0 + wrote 512/512 bytes at offset 512 + 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + wrote 65536/65536 bytes at offset 65536 +@@ -53,6 +55,7 @@ wrote 64512/64512 bytes at offset 134217728 + 63 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + required size: 589824 + fully allocated size: 1074135040 ++bitmaps size: 0 + + converted image file size in bytes: 524288 + +@@ -60,6 +63,7 @@ converted image file size in bytes: 524288 + + required size: 524288 + fully allocated size: 1074135040 ++bitmaps size: 0 + + converted image file size in bytes: 458752 + +@@ -67,16 +71,19 @@ converted image file size in bytes: 458752 + + required size: 1074135040 + fully allocated size: 1074135040 ++bitmaps size: 0 + + == qcow2 input image and LUKS encryption == + + required size: 2686976 + fully allocated size: 1076232192 ++bitmaps size: 0 + + == qcow2 input image and preallocation (human) == + + required size: 1074135040 + fully allocated size: 1074135040 ++bitmaps size: 0 + + converted image file size in bytes: 1074135040 + +@@ -87,6 +94,7 @@ wrote 8388608/8388608 bytes at offset 0 + 8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + required size: 8716288 + fully allocated size: 8716288 ++bitmaps size: 0 + + converted image file size in bytes: 8716288 + +@@ -173,6 +181,7 @@ qemu-img: The image size is too large (try using a larger cluster size) + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=0 + { ++ "bitmaps": 0, + "required": 196608, + "fully-allocated": 196608 + } +@@ -183,6 +192,7 @@ converted image file size in bytes: 196608 + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 + { ++ "bitmaps": 0, + "required": 393216, + "fully-allocated": 1074135040 + } +@@ -193,6 +203,7 @@ wrote 65536/65536 bytes at offset 65536 + wrote 64512/64512 bytes at offset 134217728 + 63 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { ++ "bitmaps": 0, + "required": 589824, + "fully-allocated": 1074135040 + } +@@ -202,6 +213,7 @@ converted image file size in bytes: 524288 + == qcow2 input image with internal snapshot (json) == + + { ++ "bitmaps": 0, + "required": 524288, + "fully-allocated": 1074135040 + } +@@ -211,6 +223,7 @@ converted image file size in bytes: 458752 + == qcow2 input image and a backing file (json) == + + { ++ "bitmaps": 0, + "required": 1074135040, + "fully-allocated": 1074135040 + } +@@ -218,6 +231,7 @@ converted image file size in bytes: 458752 + == qcow2 input image and LUKS encryption == + + { ++ "bitmaps": 0, + "required": 2686976, + "fully-allocated": 1076232192 + } +@@ -225,6 +239,7 @@ converted image file size in bytes: 458752 + == qcow2 input image and preallocation (json) == + + { ++ "bitmaps": 0, + "required": 1074135040, + "fully-allocated": 1074135040 + } +@@ -237,6 +252,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608 + wrote 8388608/8388608 bytes at offset 0 + 8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + { ++ "bitmaps": 0, + "required": 8716288, + "fully-allocated": 8716288 + } +diff --git a/tests/qemu-iotests/190 b/tests/qemu-iotests/190 +index eb766ad..5084ccd 100755 +--- a/tests/qemu-iotests/190 ++++ b/tests/qemu-iotests/190 +@@ -2,7 +2,7 @@ + # + # qemu-img measure sub-command tests on huge qcow2 files + # +-# Copyright (C) 2017 Red Hat, Inc. ++# Copyright (C) 2017-2020 Red Hat, Inc. + # + # This program is free software; you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by +@@ -42,7 +42,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 + _supported_fmt qcow2 + _supported_proto file + +-echo "== Huge file ==" ++echo "== Huge file without bitmaps ==" + echo + + IMGOPTS='cluster_size=2M' _make_test_img 2T +@@ -51,6 +51,49 @@ $QEMU_IMG measure -O raw -f qcow2 "$TEST_IMG" + $QEMU_IMG measure -O qcow2 -o cluster_size=64k -f qcow2 "$TEST_IMG" + $QEMU_IMG measure -O qcow2 -o cluster_size=2M -f qcow2 "$TEST_IMG" + ++echo ++echo "== Huge file with bitmaps ==" ++echo ++ ++$QEMU_IMG bitmap --add --granularity 512 -f qcow2 "$TEST_IMG" b1 ++$QEMU_IMG bitmap --add -g 2M -f qcow2 "$TEST_IMG" b2 ++ ++# No bitmap without a source ++$QEMU_IMG measure -O qcow2 --size 10M ++# No bitmap output, since raw does not support it ++$QEMU_IMG measure -O raw -f qcow2 "$TEST_IMG" ++# No bitmap output, since no bitmaps on raw source. Munge required size, as ++# some filesystems store the qcow2 file with less sparseness than others ++$QEMU_IMG measure -O qcow2 -f raw "$TEST_IMG" | ++ sed '/^required size:/ s/[0-9][0-9]*/SIZE/' ++# No bitmap output, since v2 does not support it ++$QEMU_IMG measure -O qcow2 -o compat=0.10 -f qcow2 "$TEST_IMG" ++ ++# Compute expected output: bitmap clusters + bitmap tables + bitmaps directory ++echo ++val2T=$((2*1024*1024*1024*1024)) ++cluster=$((64*1024)) ++b1clusters=$(( (val2T/512/8 + cluster - 1) / cluster )) ++b2clusters=$(( (val2T/2/1024/1024/8 + cluster - 1) / cluster )) ++echo expected bitmap $((b1clusters * cluster + ++ (b1clusters * 8 + cluster - 1) / cluster * cluster + ++ b2clusters * cluster + ++ (b2clusters * 8 + cluster - 1) / cluster * cluster + ++ cluster)) ++$QEMU_IMG measure -O qcow2 -o cluster_size=64k -f qcow2 "$TEST_IMG" ++ ++# Compute expected output: bitmap clusters + bitmap tables + bitmaps directory ++echo ++cluster=$((2*1024*1024)) ++b1clusters=$(( (val2T/512/8 + cluster - 1) / cluster )) ++b2clusters=$(( (val2T/2/1024/1024/8 + cluster - 1) / cluster )) ++echo expected bitmap $((b1clusters * cluster + ++ (b1clusters * 8 + cluster - 1) / cluster * cluster + ++ b2clusters * cluster + ++ (b2clusters * 8 + cluster - 1) / cluster * cluster + ++ cluster)) ++$QEMU_IMG measure --output=json -O qcow2 -o cluster_size=2M -f qcow2 "$TEST_IMG" ++ + # success, all done + echo "*** done" + rm -f $seq.full +diff --git a/tests/qemu-iotests/190.out b/tests/qemu-iotests/190.out +index d001942..ed9d821 100644 +--- a/tests/qemu-iotests/190.out ++++ b/tests/qemu-iotests/190.out +@@ -1,11 +1,36 @@ + QA output created by 190 +-== Huge file == ++== Huge file without bitmaps == + + Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=2199023255552 + required size: 2199023255552 + fully allocated size: 2199023255552 + required size: 335806464 + fully allocated size: 2199359062016 ++bitmaps size: 0 + required size: 18874368 + fully allocated size: 2199042129920 ++bitmaps size: 0 ++ ++== Huge file with bitmaps == ++ ++required size: 327680 ++fully allocated size: 10813440 ++required size: 2199023255552 ++fully allocated size: 2199023255552 ++required size: SIZE ++fully allocated size: 17170432 ++required size: 335806464 ++fully allocated size: 2199359062016 ++ ++expected bitmap 537198592 ++required size: 335806464 ++fully allocated size: 2199359062016 ++bitmaps size: 537198592 ++ ++expected bitmap 545259520 ++{ ++ "bitmaps": 545259520, ++ "required": 18874368, ++ "fully-allocated": 2199042129920 ++} + *** done +-- +1.8.3.1 + diff --git a/kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch b/kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch new file mode 100755 index 0000000000000000000000000000000000000000..43ff282ac90c2832783d6fecde0a338bc7672303 --- /dev/null +++ b/kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch @@ -0,0 +1,47 @@ +From bd97bbbce54da301407d51cae35e09ba2a12b160 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Mon, 13 Jul 2020 14:24:48 -0400 +Subject: [PATCH 1/4] qcow2: Fix alloc_cluster_abort() for pre-existing + clusters + +RH-Author: Max Reitz +Message-id: <20200713142451.289703-2-mreitz@redhat.com> +Patchwork-id: 97954 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/4] qcow2: Fix alloc_cluster_abort() for pre-existing clusters +Bugzilla: 1807057 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf + +handle_alloc() reuses preallocated zero clusters. If anything goes +wrong during the data write, we do not change their L2 entry, so we +must not let qcow2_alloc_cluster_abort() free them. + +Fixes: 8b24cd141549b5b264baeddd4e72902cfb5de23b +Cc: qemu-stable@nongnu.org +Signed-off-by: Max Reitz +Message-Id: <20200225143130.111267-2-mreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 3ede935fdbbd5f7b24b4724bbfb8938acb5956d8) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2-cluster.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c +index 9d04f8d77b..1970797ce5 100644 +--- a/block/qcow2-cluster.c ++++ b/block/qcow2-cluster.c +@@ -1015,7 +1015,7 @@ err: + void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) + { + BDRVQcow2State *s = bs->opaque; +- if (!has_data_file(bs)) { ++ if (!has_data_file(bs) && !m->keep_old_clusters) { + qcow2_free_clusters(bs, m->alloc_offset, + m->nb_clusters << s->cluster_bits, + QCOW2_DISCARD_NEVER); +-- +2.27.0 + diff --git a/kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch b/kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch new file mode 100755 index 0000000000000000000000000000000000000000..1a7ace53027bdc631315f06301285a92dea5e69d --- /dev/null +++ b/kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch @@ -0,0 +1,52 @@ +From ecc4fb6e1941035e1d9def1f69b779fbea216caf Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 24 Feb 2020 16:13:07 +0000 +Subject: [PATCH 7/9] qcow2: Fix qcow2_alloc_cluster_abort() for external data + file + +RH-Author: Kevin Wolf +Message-id: <20200224161307.29783-2-kwolf@redhat.com> +Patchwork-id: 94042 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] qcow2: Fix qcow2_alloc_cluster_abort() for external data file +Bugzilla: 1703907 +RH-Acked-by: John Snow +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz + +For external data file, cluster allocations return an offset in the data +file and are not refcounted. In this case, there is nothing to do for +qcow2_alloc_cluster_abort(). Freeing the same offset in the qcow2 file +is wrong and causes crashes in the better case or image corruption in +the worse case. + +Signed-off-by: Kevin Wolf +Message-Id: <20200211094900.17315-3-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit c3b6658c1a5a3fb24d6c27b2594cf86146f75b22) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2-cluster.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c +index 8982b7b..dc3c270 100644 +--- a/block/qcow2-cluster.c ++++ b/block/qcow2-cluster.c +@@ -1015,8 +1015,11 @@ err: + void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) + { + BDRVQcow2State *s = bs->opaque; +- qcow2_free_clusters(bs, m->alloc_offset, m->nb_clusters << s->cluster_bits, +- QCOW2_DISCARD_NEVER); ++ if (!has_data_file(bs)) { ++ qcow2_free_clusters(bs, m->alloc_offset, ++ m->nb_clusters << s->cluster_bits, ++ QCOW2_DISCARD_NEVER); ++ } + } + + /* +-- +1.8.3.1 + diff --git a/kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch b/kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch new file mode 100755 index 0000000000000000000000000000000000000000..522ba60e9c39809639f50acc7fea071c88fc9299 --- /dev/null +++ b/kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch @@ -0,0 +1,98 @@ +From 4290173219e15065e9a7c2e95774ac979b5fd869 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:40 +0100 +Subject: [PATCH 12/17] qcow2: Forward ZERO_WRITE flag for full preallocation + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-12-kwolf@redhat.com> +Patchwork-id: 97456 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 11/11] qcow2: Forward ZERO_WRITE flag for full preallocation +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +The BDRV_REQ_ZERO_WRITE is currently implemented in a way that first the +image is possibly preallocated and then the zero flag is added to all +clusters. This means that a copy-on-write operation may be needed when +writing to these clusters, despite having used preallocation, negating +one of the major benefits of preallocation. + +Instead, try to forward the BDRV_REQ_ZERO_WRITE to the protocol driver, +and if the protocol driver can ensure that the new area reads as zeros, +we can skip setting the zero flag in the qcow2 layer. + +Unfortunately, the same approach doesn't work for metadata +preallocation, so we'll still set the zero flag there. + +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +Message-Id: <20200424142701.67053-1-kwolf@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Kevin Wolf +(cherry picked from commit eb8a0cf3ba26611f3981f8f45ac6a868975a68cc) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2.c | 22 +++++++++++++++++++--- + tests/qemu-iotests/274.out | 4 ++-- + 2 files changed, 21 insertions(+), 5 deletions(-) + +diff --git a/block/qcow2.c b/block/qcow2.c +index f3d6cb0..b783662 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -4153,9 +4153,25 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + /* Allocate the data area */ + new_file_size = allocation_start + + nb_new_data_clusters * s->cluster_size; +- /* Image file grows, so @exact does not matter */ +- ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0, +- errp); ++ /* ++ * Image file grows, so @exact does not matter. ++ * ++ * If we need to zero out the new area, try first whether the protocol ++ * driver can already take care of this. ++ */ ++ if (flags & BDRV_REQ_ZERO_WRITE) { ++ ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, ++ BDRV_REQ_ZERO_WRITE, NULL); ++ if (ret >= 0) { ++ flags &= ~BDRV_REQ_ZERO_WRITE; ++ } ++ } else { ++ ret = -1; ++ } ++ if (ret < 0) { ++ ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0, ++ errp); ++ } + if (ret < 0) { + error_prepend(errp, "Failed to resize underlying file: "); + qcow2_free_clusters(bs, allocation_start, +diff --git a/tests/qemu-iotests/274.out b/tests/qemu-iotests/274.out +index 1a796fd..9d6fdeb 100644 +--- a/tests/qemu-iotests/274.out ++++ b/tests/qemu-iotests/274.out +@@ -187,7 +187,7 @@ read 65536/65536 bytes at offset 9437184 + 10 MiB (0xa00000) bytes allocated at offset 5 MiB (0x500000) + + [{ "start": 0, "length": 5242880, "depth": 1, "zero": true, "data": false}, +-{ "start": 5242880, "length": 10485760, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++{ "start": 5242880, "length": 10485760, "depth": 0, "zero": false, "data": true, "offset": 327680}] + + === preallocation=full === + Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=16777216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 +@@ -206,7 +206,7 @@ read 65536/65536 bytes at offset 11534336 + 4 MiB (0x400000) bytes allocated at offset 8 MiB (0x800000) + + [{ "start": 0, "length": 8388608, "depth": 1, "zero": true, "data": false}, +-{ "start": 8388608, "length": 4194304, "depth": 0, "zero": true, "data": false, "offset": 327680}] ++{ "start": 8388608, "length": 4194304, "depth": 0, "zero": false, "data": true, "offset": 327680}] + + === preallocation=off === + Formatting 'TEST_DIR/PID-base', fmt=qcow2 size=393216 cluster_size=65536 lazy_refcounts=off refcount_bits=16 +-- +1.8.3.1 + diff --git a/kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch b/kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch new file mode 100755 index 0000000000000000000000000000000000000000..454759e4d4b437276595b8adbd1538af850499e8 --- /dev/null +++ b/kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch @@ -0,0 +1,101 @@ +From 3e603e344b81b3ecfea6fb9589ba91f70a22139d Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:33 +0100 +Subject: [PATCH 05/17] qcow2: Support BDRV_REQ_ZERO_WRITE for truncate + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-5-kwolf@redhat.com> +Patchwork-id: 97449 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 04/11] qcow2: Support BDRV_REQ_ZERO_WRITE for truncate +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +If BDRV_REQ_ZERO_WRITE is set and we're extending the image, calling +qcow2_cluster_zeroize() with flags=0 does the right thing: It doesn't +undo any previous preallocation, but just adds the zero flag to all +relevant L2 entries. If an external data file is in use, a write_zeroes +request to the data file is made instead. + +Signed-off-by: Kevin Wolf +Message-Id: <20200424125448.63318-5-kwolf@redhat.com> +Reviewed-by: Eric Blake +Reviewed-by: Max Reitz +Signed-off-by: Kevin Wolf +(cherry picked from commit f01643fb8b47e8a70c04bbf45e0f12a9e5bc54de) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/qcow2-cluster.c | 2 +- + block/qcow2.c | 34 ++++++++++++++++++++++++++++++++++ + 2 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c +index dc3c270..9d04f8d 100644 +--- a/block/qcow2-cluster.c ++++ b/block/qcow2-cluster.c +@@ -1784,7 +1784,7 @@ int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset, + /* Caller must pass aligned values, except at image end */ + assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); + assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || +- end_offset == bs->total_sectors << BDRV_SECTOR_BITS); ++ end_offset >= bs->total_sectors << BDRV_SECTOR_BITS); + + /* The zero flag is only supported by version 3 and newer */ + if (s->qcow_version < 3) { +diff --git a/block/qcow2.c b/block/qcow2.c +index 86aa74a..f3d6cb0 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -1726,6 +1726,7 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, + } + + bs->supported_zero_flags = header.version >= 3 ? BDRV_REQ_MAY_UNMAP : 0; ++ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; + + /* Repair image if dirty */ + if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only && +@@ -4197,6 +4198,39 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset, + g_assert_not_reached(); + } + ++ if ((flags & BDRV_REQ_ZERO_WRITE) && offset > old_length) { ++ uint64_t zero_start = QEMU_ALIGN_UP(old_length, s->cluster_size); ++ ++ /* ++ * Use zero clusters as much as we can. qcow2_cluster_zeroize() ++ * requires a cluster-aligned start. The end may be unaligned if it is ++ * at the end of the image (which it is here). ++ */ ++ ret = qcow2_cluster_zeroize(bs, zero_start, offset - zero_start, 0); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "Failed to zero out new clusters"); ++ goto fail; ++ } ++ ++ /* Write explicit zeros for the unaligned head */ ++ if (zero_start > old_length) { ++ uint64_t len = zero_start - old_length; ++ uint8_t *buf = qemu_blockalign0(bs, len); ++ QEMUIOVector qiov; ++ qemu_iovec_init_buf(&qiov, buf, len); ++ ++ qemu_co_mutex_unlock(&s->lock); ++ ret = qcow2_co_pwritev_part(bs, old_length, len, &qiov, 0, 0); ++ qemu_co_mutex_lock(&s->lock); ++ ++ qemu_vfree(buf); ++ if (ret < 0) { ++ error_setg_errno(errp, -ret, "Failed to zero out the new area"); ++ goto fail; ++ } ++ } ++ } ++ + if (prealloc != PREALLOC_MODE_OFF) { + /* Flush metadata before actually changing the image size */ + ret = qcow2_write_caches(bs); +-- +1.8.3.1 + diff --git a/kvm-qemu-file-Don-t-do-IO-after-shutdown.patch b/kvm-qemu-file-Don-t-do-IO-after-shutdown.patch new file mode 100755 index 0000000000000000000000000000000000000000..88a6e31cfadabd3290d4c9b61153ed3ed20803cd --- /dev/null +++ b/kvm-qemu-file-Don-t-do-IO-after-shutdown.patch @@ -0,0 +1,92 @@ +From d84814e298e3b05fb5bc61cc8e641a5e104d32d5 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Tue, 3 Mar 2020 14:51:39 +0000 +Subject: [PATCH 07/18] qemu-file: Don't do IO after shutdown + +RH-Author: Juan Quintela +Message-id: <20200303145143.149290-7-quintela@redhat.com> +Patchwork-id: 94116 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 06/10] qemu-file: Don't do IO after shutdown +Bugzilla: 1738451 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Peter Xu +RH-Acked-by: Dr. David Alan Gilbert + +Be sure that we are not doing neither read/write after shutdown of the +QEMUFile. + +Signed-off-by: Juan Quintela +Reviewed-by: Dr. David Alan Gilbert +(cherry picked from commit a555b8092abc6f1bbe4b64c516679cbd68fcfbd8) +Signed-off-by: Danilo C. L. de Paula +--- + migration/qemu-file.c | 22 +++++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +diff --git a/migration/qemu-file.c b/migration/qemu-file.c +index 26fb25d..bbb2b63 100644 +--- a/migration/qemu-file.c ++++ b/migration/qemu-file.c +@@ -53,6 +53,8 @@ struct QEMUFile { + + int last_error; + Error *last_error_obj; ++ /* has the file has been shutdown */ ++ bool shutdown; + }; + + /* +@@ -61,10 +63,18 @@ struct QEMUFile { + */ + int qemu_file_shutdown(QEMUFile *f) + { ++ int ret; ++ ++ f->shutdown = true; + if (!f->ops->shut_down) { + return -ENOSYS; + } +- return f->ops->shut_down(f->opaque, true, true, NULL); ++ ret = f->ops->shut_down(f->opaque, true, true, NULL); ++ ++ if (!f->last_error) { ++ qemu_file_set_error(f, -EIO); ++ } ++ return ret; + } + + /* +@@ -214,6 +224,9 @@ void qemu_fflush(QEMUFile *f) + return; + } + ++ if (f->shutdown) { ++ return; ++ } + if (f->iovcnt > 0) { + expect = iov_size(f->iov, f->iovcnt); + ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos, +@@ -328,6 +341,10 @@ static ssize_t qemu_fill_buffer(QEMUFile *f) + f->buf_index = 0; + f->buf_size = pending; + ++ if (f->shutdown) { ++ return 0; ++ } ++ + len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos, + IO_BUF_SIZE - pending, &local_error); + if (len > 0) { +@@ -642,6 +659,9 @@ int64_t qemu_ftell(QEMUFile *f) + + int qemu_file_rate_limit(QEMUFile *f) + { ++ if (f->shutdown) { ++ return 1; ++ } + if (qemu_file_get_error(f)) { + return 1; + } +-- +1.8.3.1 + diff --git a/kvm-qemu-img-Add-bitmap-sub-command.patch b/kvm-qemu-img-Add-bitmap-sub-command.patch new file mode 100755 index 0000000000000000000000000000000000000000..eb801886aa8e3df424bf098ef05efb7ee73b371a --- /dev/null +++ b/kvm-qemu-img-Add-bitmap-sub-command.patch @@ -0,0 +1,398 @@ +From 53baacb72e8561391841363b2acbd85a783cbc66 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:15 +0100 +Subject: [PATCH 10/26] qemu-img: Add bitmap sub-command + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-8-eblake@redhat.com> +Patchwork-id: 97074 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 07/12] qemu-img: Add bitmap sub-command +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Include actions for --add, --remove, --clear, --enable, --disable, and +--merge (note that --clear is a bit of fluff, because the same can be +accomplished by removing a bitmap and then adding a new one in its +place, but it matches what QMP commands exist). Listing is omitted, +because it does not require a bitmap name and because it was already +possible with 'qemu-img info'. A single command line can play one or +more bitmap commands in sequence on the same bitmap name (although all +added bitmaps share the same granularity, and and all merged bitmaps +come from the same source file). Merge defaults to other bitmaps in +the primary image, but can also be told to merge bitmaps from a +distinct image. + +While this supports --image-opts for the file being modified, I did +not think it worth the extra complexity to support that for the source +file in a cross-file merges. Likewise, I chose to have --merge only +take a single source rather than following the QMP support for +multiple merges in one go (although you can still use more than one +--merge in the command line); in part because qemu-img is offline and +therefore atomicity is not an issue. + +Upcoming patches will add iotest coverage of these commands while +also testing other features. + +Signed-off-by: Eric Blake +Reviewed-by: Max Reitz +Message-Id: <20200513011648.166876-7-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 3b51ab4bf0f49a01cc2db7b954e0669e081719b5) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + docs/tools/qemu-img.rst - lives in qemu-img.texi instead; plus + fix a typo in the text for --merge rather than waiting for + a one-line upstream followup patch + qemu-img-cmds.hx - context, use texi instead of rst + qemu-img.c - context +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img-cmds.hx | 6 ++ + qemu-img.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + qemu-img.texi | 27 ++++++ + 3 files changed, 281 insertions(+) + +diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx +index 1c93e6d..1a6a8e9 100644 +--- a/qemu-img-cmds.hx ++++ b/qemu-img-cmds.hx +@@ -25,6 +25,12 @@ STEXI + @item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} + ETEXI + ++DEF("bitmap", img_bitmap, ++ "bitmap (--merge SOURCE | --add | --remove | --clear | --enable | --disable)... [-b source_file [-F source_fmt]] [-g granularity] [--object objectdef] [--image-opts | -f fmt] filename bitmap") ++STEXI ++.. option:: bitmap (--merge @var{source} | --add | --remove | --clear | --enable | --disable)... [-b @var{source_file} [-F @var{source_fmt}]] [-g @var{granularity}] [--object @var{objectdef}] [--image-opts | -f @var{fmt}] @var{filename} @var{bitmap} ++ETEXI ++ + DEF("check", img_check, + "check [--object objectdef] [--image-opts] [-q] [-f fmt] [--output=ofmt] [-r [leaks | all]] [-T src_cache] [-U] filename") + STEXI +diff --git a/qemu-img.c b/qemu-img.c +index e69529b..11a4537 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -28,6 +28,7 @@ + #include "qemu-common.h" + #include "qemu-version.h" + #include "qapi/error.h" ++#include "qapi/qapi-commands-block-core.h" + #include "qapi/qapi-visit-block-core.h" + #include "qapi/qobject-output-visitor.h" + #include "qapi/qmp/qjson.h" +@@ -70,6 +71,12 @@ enum { + OPTION_PREALLOCATION = 265, + OPTION_SHRINK = 266, + OPTION_SALVAGE = 267, ++ OPTION_ADD = 269, ++ OPTION_REMOVE = 270, ++ OPTION_CLEAR = 271, ++ OPTION_ENABLE = 272, ++ OPTION_DISABLE = 273, ++ OPTION_MERGE = 274, + }; + + typedef enum OutputFormat { +@@ -168,6 +175,14 @@ static void QEMU_NORETURN help(void) + " '-n' skips the target volume creation (useful if the volume is created\n" + " prior to running qemu-img)\n" + "\n" ++ "Parameters to bitmap subcommand:\n" ++ " 'bitmap' is the name of the bitmap to manipulate, through one or more\n" ++ " actions from '--add', '--remove', '--clear', '--enable', '--disable',\n" ++ " or '--merge source'\n" ++ " '-g granularity' sets the granularity for '--add' actions\n" ++ " '-b source' and '-F src_fmt' tell '--merge' actions to find the source\n" ++ " bitmaps from an alternative file\n" ++ "\n" + "Parameters to check subcommand:\n" + " '-r' tries to repair any inconsistencies that are found during the check.\n" + " '-r leaks' repairs only cluster leaks, whereas '-r all' fixes all\n" +@@ -4402,6 +4417,239 @@ out: + return 0; + } + ++enum ImgBitmapAct { ++ BITMAP_ADD, ++ BITMAP_REMOVE, ++ BITMAP_CLEAR, ++ BITMAP_ENABLE, ++ BITMAP_DISABLE, ++ BITMAP_MERGE, ++}; ++typedef struct ImgBitmapAction { ++ enum ImgBitmapAct act; ++ const char *src; /* only used for merge */ ++ QSIMPLEQ_ENTRY(ImgBitmapAction) next; ++} ImgBitmapAction; ++ ++static int img_bitmap(int argc, char **argv) ++{ ++ Error *err = NULL; ++ int c, ret = 1; ++ QemuOpts *opts = NULL; ++ const char *fmt = NULL, *src_fmt = NULL, *src_filename = NULL; ++ const char *filename, *bitmap; ++ BlockBackend *blk = NULL, *src = NULL; ++ BlockDriverState *bs = NULL, *src_bs = NULL; ++ bool image_opts = false; ++ int64_t granularity = 0; ++ bool add = false, merge = false; ++ QSIMPLEQ_HEAD(, ImgBitmapAction) actions; ++ ImgBitmapAction *act, *act_next; ++ const char *op; ++ ++ QSIMPLEQ_INIT(&actions); ++ ++ for (;;) { ++ static const struct option long_options[] = { ++ {"help", no_argument, 0, 'h'}, ++ {"object", required_argument, 0, OPTION_OBJECT}, ++ {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS}, ++ {"add", no_argument, 0, OPTION_ADD}, ++ {"remove", no_argument, 0, OPTION_REMOVE}, ++ {"clear", no_argument, 0, OPTION_CLEAR}, ++ {"enable", no_argument, 0, OPTION_ENABLE}, ++ {"disable", no_argument, 0, OPTION_DISABLE}, ++ {"merge", required_argument, 0, OPTION_MERGE}, ++ {"granularity", required_argument, 0, 'g'}, ++ {"source-file", required_argument, 0, 'b'}, ++ {"source-format", required_argument, 0, 'F'}, ++ {0, 0, 0, 0} ++ }; ++ c = getopt_long(argc, argv, ":b:f:F:g:h", long_options, NULL); ++ if (c == -1) { ++ break; ++ } ++ ++ switch (c) { ++ case ':': ++ missing_argument(argv[optind - 1]); ++ break; ++ case '?': ++ unrecognized_option(argv[optind - 1]); ++ break; ++ case 'h': ++ help(); ++ break; ++ case 'b': ++ src_filename = optarg; ++ break; ++ case 'f': ++ fmt = optarg; ++ break; ++ case 'F': ++ src_fmt = optarg; ++ break; ++ case 'g': ++ granularity = cvtnum("granularity", optarg); ++ if (granularity < 0) { ++ return 1; ++ } ++ break; ++ case OPTION_ADD: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_ADD; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ add = true; ++ break; ++ case OPTION_REMOVE: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_REMOVE; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_CLEAR: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_CLEAR; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_ENABLE: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_ENABLE; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_DISABLE: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_DISABLE; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ break; ++ case OPTION_MERGE: ++ act = g_new0(ImgBitmapAction, 1); ++ act->act = BITMAP_MERGE; ++ act->src = optarg; ++ QSIMPLEQ_INSERT_TAIL(&actions, act, next); ++ merge = true; ++ break; ++ case OPTION_OBJECT: ++ opts = qemu_opts_parse_noisily(&qemu_object_opts, optarg, true); ++ if (!opts) { ++ goto out; ++ } ++ break; ++ case OPTION_IMAGE_OPTS: ++ image_opts = true; ++ break; ++ } ++ } ++ ++ if (qemu_opts_foreach(&qemu_object_opts, ++ user_creatable_add_opts_foreach, ++ qemu_img_object_print_help, &error_fatal)) { ++ goto out; ++ } ++ ++ if (QSIMPLEQ_EMPTY(&actions)) { ++ error_report("Need at least one of --add, --remove, --clear, " ++ "--enable, --disable, or --merge"); ++ goto out; ++ } ++ ++ if (granularity && !add) { ++ error_report("granularity only supported with --add"); ++ goto out; ++ } ++ if (src_fmt && !src_filename) { ++ error_report("-F only supported with -b"); ++ goto out; ++ } ++ if (src_filename && !merge) { ++ error_report("Merge bitmap source file only supported with " ++ "--merge"); ++ goto out; ++ } ++ ++ if (optind != argc - 2) { ++ error_report("Expecting filename and bitmap name"); ++ goto out; ++ } ++ ++ filename = argv[optind]; ++ bitmap = argv[optind + 1]; ++ ++ blk = img_open(image_opts, filename, fmt, BDRV_O_RDWR, false, false, ++ false); ++ if (!blk) { ++ goto out; ++ } ++ bs = blk_bs(blk); ++ if (src_filename) { ++ src = img_open(false, src_filename, src_fmt, 0, false, false, false); ++ if (!src) { ++ goto out; ++ } ++ src_bs = blk_bs(src); ++ } else { ++ src_bs = bs; ++ } ++ ++ QSIMPLEQ_FOREACH_SAFE(act, &actions, next, act_next) { ++ switch (act->act) { ++ case BITMAP_ADD: ++ qmp_block_dirty_bitmap_add(bs->node_name, bitmap, ++ !!granularity, granularity, true, true, ++ false, false, &err); ++ op = "add"; ++ break; ++ case BITMAP_REMOVE: ++ qmp_block_dirty_bitmap_remove(bs->node_name, bitmap, &err); ++ op = "remove"; ++ break; ++ case BITMAP_CLEAR: ++ qmp_block_dirty_bitmap_clear(bs->node_name, bitmap, &err); ++ op = "clear"; ++ break; ++ case BITMAP_ENABLE: ++ qmp_block_dirty_bitmap_enable(bs->node_name, bitmap, &err); ++ op = "enable"; ++ break; ++ case BITMAP_DISABLE: ++ qmp_block_dirty_bitmap_disable(bs->node_name, bitmap, &err); ++ op = "disable"; ++ break; ++ case BITMAP_MERGE: { ++ BlockDirtyBitmapMergeSource *merge_src; ++ BlockDirtyBitmapMergeSourceList *list; ++ ++ merge_src = g_new0(BlockDirtyBitmapMergeSource, 1); ++ merge_src->type = QTYPE_QDICT; ++ merge_src->u.external.node = g_strdup(src_bs->node_name); ++ merge_src->u.external.name = g_strdup(act->src); ++ list = g_new0(BlockDirtyBitmapMergeSourceList, 1); ++ list->value = merge_src; ++ qmp_block_dirty_bitmap_merge(bs->node_name, bitmap, list, &err); ++ qapi_free_BlockDirtyBitmapMergeSourceList(list); ++ op = "merge"; ++ break; ++ } ++ default: ++ g_assert_not_reached(); ++ } ++ ++ if (err) { ++ error_reportf_err(err, "Operation %s on bitmap %s failed: ", ++ op, bitmap); ++ goto out; ++ } ++ g_free(act); ++ } ++ ++ ret = 0; ++ ++ out: ++ blk_unref(src); ++ blk_unref(blk); ++ qemu_opts_del(opts); ++ return ret; ++} ++ + #define C_BS 01 + #define C_COUNT 02 + #define C_IF 04 +diff --git a/qemu-img.texi b/qemu-img.texi +index b5156d6..abf2771 100644 +--- a/qemu-img.texi ++++ b/qemu-img.texi +@@ -230,6 +230,33 @@ specified as well. + For write tests, by default a buffer filled with zeros is written. This can be + overridden with a pattern byte specified by @var{pattern}. + ++@item bitmap (--merge @var{source} | --add | --remove | --clear | --enable | --disable)... [-b @var{source_file} [-F @var{source_fmt}]] [-g @var{granularity}] [--object @var{objectdef}] [--image-opts | -f @var{fmt}] @var{filename} @var{bitmap} ++ ++Perform one or more modifications of the persistent bitmap @var{bitmap} ++in the disk image @var{filename}. The various modifications are: ++ ++@table @option ++@item add ++create @var{bitmap}, enabled to record future edits. ++@item remove ++remove @var{bitmap}. ++@item clear ++clear @var{bitmap}. ++@item enable ++change @var{bitmap} to start recording future edits. ++@item disable ++change @var{bitmap} to stop recording future edits. ++@item merge @var{source} ++merge the contents of the @var{source} bitmap into @var{bitmap}. ++@end table ++ ++Additional options include @option{-g} which sets a non-default ++@var{granularity} for @option{--add}, and @option{-b} and @option{-F} ++which select an alternative source file for all @var{source} bitmaps used by ++@option{--merge}. ++ ++To see what bitmaps are present in an image, use @code{qemu-img info}. ++ + @item check [--object @var{objectdef}] [--image-opts] [-q] [-f @var{fmt}] [--output=@var{ofmt}] [-r [leaks | all]] [-T @var{src_cache}] [-U] @var{filename} + + Perform a consistency check on the disk image @var{filename}. The command can +-- +1.8.3.1 + diff --git a/kvm-qemu-img-Add-convert-bitmaps-option.patch b/kvm-qemu-img-Add-convert-bitmaps-option.patch new file mode 100755 index 0000000000000000000000000000000000000000..20eca9f23b04a1d662c51ea5d36d5a0e0544ce04 --- /dev/null +++ b/kvm-qemu-img-Add-convert-bitmaps-option.patch @@ -0,0 +1,244 @@ +From f2add7d5955770318824c3eee774bec2dd850936 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:19 +0100 +Subject: [PATCH 14/26] qemu-img: Add convert --bitmaps option + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-12-eblake@redhat.com> +Patchwork-id: 97076 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 11/12] qemu-img: Add convert --bitmaps option +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +Make it easier to copy all the persistent bitmaps of (the top layer +of) a source image along with its guest-visible contents, by adding a +boolean flag for use with qemu-img convert. This is basically +shorthand, as the same effect could be accomplished with a series of +'qemu-img bitmap --add' and 'qemu-img bitmap --merge -b source' +commands, or by their corresponding QMP commands. + +Note that this command will fail in the same scenarios where 'qemu-img +measure' omits a 'bitmaps size:' line, namely, when either the source +or the destination lacks persistent bitmap support altogether. + +See also https://bugzilla.redhat.com/show_bug.cgi?id=1779893 + +While touching this, clean up a couple coding issues spotted in the +same function: an extra blank line, and merging back-to-back 'if +(!skip_create)' blocks. + +Signed-off-by: Eric Blake +Message-Id: <20200521192137.1120211-5-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 15e39ad95078d528dfb9a75417453cab60332b77) + +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + docs/tools/qemu-img.rst - qemu-img.texi instead + qemu-img.c - context: no --target-is-zero + qemu-img-cmds.hx - context: texi instead of rst +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img-cmds.hx | 4 ++-- + qemu-img.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- + qemu-img.texi | 4 +++- + 3 files changed, 72 insertions(+), 6 deletions(-) + +diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx +index 1a6a8e9..48144aa 100644 +--- a/qemu-img-cmds.hx ++++ b/qemu-img-cmds.hx +@@ -50,9 +50,9 @@ STEXI + ETEXI + + DEF("convert", img_convert, +- "convert [--object objectdef] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") ++ "convert [--object objectdef] [--image-opts] [--target-image-opts] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") + STEXI +-@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] [--salvage] @var{filename} [@var{filename2} [...]] @var{output_filename} ++@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] [--salvage] @var{filename} [@var{filename2} [...]] @var{output_filename} + ETEXI + + DEF("create", img_create, +diff --git a/qemu-img.c b/qemu-img.c +index 39e1586..6dc881b 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -77,6 +77,7 @@ enum { + OPTION_ENABLE = 272, + OPTION_DISABLE = 273, + OPTION_MERGE = 274, ++ OPTION_BITMAPS = 275, + }; + + typedef enum OutputFormat { +@@ -190,6 +191,7 @@ static void QEMU_NORETURN help(void) + " hiding corruption that has already occurred.\n" + "\n" + "Parameters to convert subcommand:\n" ++ " '--bitmaps' copies all top-level persistent bitmaps to destination\n" + " '-m' specifies how many coroutines work in parallel during the convert\n" + " process (defaults to 8)\n" + " '-W' allow to write to the target out of order rather than sequential\n" +@@ -2084,6 +2086,39 @@ static int convert_do_copy(ImgConvertState *s) + return s->ret; + } + ++static int convert_copy_bitmaps(BlockDriverState *src, BlockDriverState *dst) ++{ ++ BdrvDirtyBitmap *bm; ++ Error *err = NULL; ++ ++ FOR_EACH_DIRTY_BITMAP(src, bm) { ++ const char *name; ++ ++ if (!bdrv_dirty_bitmap_get_persistence(bm)) { ++ continue; ++ } ++ name = bdrv_dirty_bitmap_name(bm); ++ qmp_block_dirty_bitmap_add(dst->node_name, name, ++ true, bdrv_dirty_bitmap_granularity(bm), ++ true, true, ++ true, !bdrv_dirty_bitmap_enabled(bm), ++ &err); ++ if (err) { ++ error_reportf_err(err, "Failed to create bitmap %s: ", name); ++ return -1; ++ } ++ ++ do_dirty_bitmap_merge(dst->node_name, name, src->node_name, name, ++ &err); ++ if (err) { ++ error_reportf_err(err, "Failed to populate bitmap %s: ", name); ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ + #define MAX_BUF_SECTORS 32768 + + static int img_convert(int argc, char **argv) +@@ -2105,6 +2140,7 @@ static int img_convert(int argc, char **argv) + int64_t ret = -EINVAL; + bool force_share = false; + bool explict_min_sparse = false; ++ bool bitmaps = false; + + ImgConvertState s = (ImgConvertState) { + /* Need at least 4k of zeros for sparse detection */ +@@ -2123,6 +2159,7 @@ static int img_convert(int argc, char **argv) + {"force-share", no_argument, 0, 'U'}, + {"target-image-opts", no_argument, 0, OPTION_TARGET_IMAGE_OPTS}, + {"salvage", no_argument, 0, OPTION_SALVAGE}, ++ {"bitmaps", no_argument, 0, OPTION_BITMAPS}, + {0, 0, 0, 0} + }; + c = getopt_long(argc, argv, ":hf:O:B:Cco:l:S:pt:T:qnm:WU", +@@ -2248,6 +2285,9 @@ static int img_convert(int argc, char **argv) + case OPTION_TARGET_IMAGE_OPTS: + tgt_image_opts = true; + break; ++ case OPTION_BITMAPS: ++ bitmaps = true; ++ break; + } + } + +@@ -2304,7 +2344,6 @@ static int img_convert(int argc, char **argv) + goto fail_getopt; + } + +- + /* ret is still -EINVAL until here */ + ret = bdrv_parse_cache_mode(src_cache, &src_flags, &src_writethrough); + if (ret < 0) { +@@ -2458,6 +2497,20 @@ static int img_convert(int argc, char **argv) + } + } + ++ /* Determine if bitmaps need copying */ ++ if (bitmaps) { ++ if (s.src_num > 1) { ++ error_report("Copying bitmaps only possible with single source"); ++ ret = -1; ++ goto out; ++ } ++ if (!bdrv_supports_persistent_dirty_bitmap(blk_bs(s.src[0]))) { ++ error_report("Source lacks bitmap support"); ++ ret = -1; ++ goto out; ++ } ++ } ++ + /* + * The later open call will need any decryption secrets, and + * bdrv_create() will purge "opts", so extract them now before +@@ -2466,9 +2519,7 @@ static int img_convert(int argc, char **argv) + if (!skip_create) { + open_opts = qdict_new(); + qemu_opt_foreach(opts, img_add_key_secrets, open_opts, &error_abort); +- } + +- if (!skip_create) { + /* Create the new image */ + ret = bdrv_create(drv, out_filename, opts, &local_err); + if (ret < 0) { +@@ -2506,6 +2557,13 @@ static int img_convert(int argc, char **argv) + } + out_bs = blk_bs(s.target); + ++ if (bitmaps && !bdrv_supports_persistent_dirty_bitmap(out_bs)) { ++ error_report("Format driver '%s' does not support bitmaps", ++ out_bs->drv->format_name); ++ ret = -1; ++ goto out; ++ } ++ + if (s.compressed && !block_driver_can_compress(out_bs->drv)) { + error_report("Compression not supported for this file format"); + ret = -1; +@@ -2565,6 +2623,12 @@ static int img_convert(int argc, char **argv) + } + + ret = convert_do_copy(&s); ++ ++ /* Now copy the bitmaps */ ++ if (bitmaps && ret == 0) { ++ ret = convert_copy_bitmaps(blk_bs(s.src[0]), out_bs); ++ } ++ + out: + if (!ret) { + qemu_progress_print(100, 0); +diff --git a/qemu-img.texi b/qemu-img.texi +index 3670b96..b95d019 100644 +--- a/qemu-img.texi ++++ b/qemu-img.texi +@@ -161,6 +161,8 @@ Parameters to convert subcommand: + + @table @option + ++@item --bitmaps ++Additionally copy all persistent bitmaps from the top layer of the source + @item -n + Skip the creation of the target volume + @item -m +@@ -357,7 +359,7 @@ Error on reading data + + @end table + +-@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] @var{filename} [@var{filename2} [...]] @var{output_filename} ++@item convert [--object @var{objectdef}] [--image-opts] [--target-image-opts] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-B @var{backing_file}] [-o @var{options}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] @var{filename} [@var{filename2} [...]] @var{output_filename} + + Convert the disk image @var{filename} or a snapshot @var{snapshot_param} + to disk image @var{output_filename} using format @var{output_fmt}. It can be optionally compressed (@code{-c} +-- +1.8.3.1 + diff --git a/kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch b/kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch new file mode 100755 index 0000000000000000000000000000000000000000..c4012b7d1b14adb99f751c3b0932ca464755f739 --- /dev/null +++ b/kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch @@ -0,0 +1,89 @@ +From 58816c3709e5058e8805333ca011cc4e793d67ff Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:18 +0100 +Subject: [PATCH 13/26] qemu-img: Factor out code for merging bitmaps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-11-eblake@redhat.com> +Patchwork-id: 97078 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 10/12] qemu-img: Factor out code for merging bitmaps +Bugzilla: 1779893 1779904 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +The next patch will add another client that wants to merge dirty +bitmaps; it will be easier to refactor the code to construct the QAPI +struct correctly into a helper function. + +Signed-off-by: Eric Blake +Message-Id: <20200521192137.1120211-4-eblake@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 6c729dd832207d7347ecb074912f538e2942f269) +Signed-off-by: Eric Blake +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img.c | 34 +++++++++++++++++++++------------- + 1 file changed, 21 insertions(+), 13 deletions(-) + +diff --git a/qemu-img.c b/qemu-img.c +index b57856e..39e1586 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -1582,6 +1582,24 @@ out4: + return ret; + } + ++/* Convenience wrapper around qmp_block_dirty_bitmap_merge */ ++static void do_dirty_bitmap_merge(const char *dst_node, const char *dst_name, ++ const char *src_node, const char *src_name, ++ Error **errp) ++{ ++ BlockDirtyBitmapMergeSource *merge_src; ++ BlockDirtyBitmapMergeSourceList *list; ++ ++ merge_src = g_new0(BlockDirtyBitmapMergeSource, 1); ++ merge_src->type = QTYPE_QDICT; ++ merge_src->u.external.node = g_strdup(src_node); ++ merge_src->u.external.name = g_strdup(src_name); ++ list = g_new0(BlockDirtyBitmapMergeSourceList, 1); ++ list->value = merge_src; ++ qmp_block_dirty_bitmap_merge(dst_node, dst_name, list, errp); ++ qapi_free_BlockDirtyBitmapMergeSourceList(list); ++} ++ + enum ImgConvertBlockStatus { + BLK_DATA, + BLK_ZERO, +@@ -4614,21 +4632,11 @@ static int img_bitmap(int argc, char **argv) + qmp_block_dirty_bitmap_disable(bs->node_name, bitmap, &err); + op = "disable"; + break; +- case BITMAP_MERGE: { +- BlockDirtyBitmapMergeSource *merge_src; +- BlockDirtyBitmapMergeSourceList *list; +- +- merge_src = g_new0(BlockDirtyBitmapMergeSource, 1); +- merge_src->type = QTYPE_QDICT; +- merge_src->u.external.node = g_strdup(src_bs->node_name); +- merge_src->u.external.name = g_strdup(act->src); +- list = g_new0(BlockDirtyBitmapMergeSourceList, 1); +- list->value = merge_src; +- qmp_block_dirty_bitmap_merge(bs->node_name, bitmap, list, &err); +- qapi_free_BlockDirtyBitmapMergeSourceList(list); ++ case BITMAP_MERGE: ++ do_dirty_bitmap_merge(bs->node_name, bitmap, src_bs->node_name, ++ act->src, &err); + op = "merge"; + break; +- } + default: + g_assert_not_reached(); + } +-- +1.8.3.1 + diff --git a/kvm-qemu-img-convert-Don-t-pre-zero-images.patch b/kvm-qemu-img-convert-Don-t-pre-zero-images.patch new file mode 100755 index 0000000000000000000000000000000000000000..28311f4e474accfd2de36407e6ef66c9ec0c246a --- /dev/null +++ b/kvm-qemu-img-convert-Don-t-pre-zero-images.patch @@ -0,0 +1,73 @@ +From eea45924903f03dc6d8f20576be0a4a84d5acce4 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 10 Feb 2021 10:16:11 -0500 +Subject: [PATCH 4/5] qemu-img convert: Don't pre-zero images + +RH-Author: Kevin Wolf +Message-id: <20210210101611.137928-2-kwolf@redhat.com> +Patchwork-id: 101030 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] qemu-img convert: Don't pre-zero images +Bugzilla: 1855250 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Max Reitz + +Since commit 5a37b60a61c, qemu-img create will pre-zero the target image +if it isn't already zero-initialised (most importantly, for host block +devices, but also iscsi etc.), so that writing explicit zeros wouldn't +be necessary later. + +This could speed up the operation significantly, in particular when the +source image file was only sparsely populated. However, it also means +that some block are written twice: Once when pre-zeroing them, and then +when they are overwritten with actual data. On a full image, the +pre-zeroing is wasted work because everything will be overwritten. + +In practice, write_zeroes typically turns out faster than writing +explicit zero buffers, but slow enough that first zeroing everything and +then overwriting parts can be a significant net loss. + +Meanwhile, qemu-img convert was rewritten in 690c7301600 and zero blocks +are now written to the target using bdrv_co_pwrite_zeroes() if the +target could be pre-zeroed. This way we already make use of the faster +write_zeroes operation, but avoid writing any blocks twice. + +Remove the pre-zeroing because these days this former optimisation has +actually turned into a pessimisation in the common case. + +Reported-by: Nir Soffer +Signed-off-by: Kevin Wolf +Message-Id: <20200622151203.35624-1-kwolf@redhat.com> +Tested-by: Nir Soffer +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit edafc70c0c8510862f2f213a3acf7067113bcd08) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img.c | 9 --------- + 1 file changed, 9 deletions(-) + +diff --git a/qemu-img.c b/qemu-img.c +index a27ad70851..b10dc5129b 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -2029,15 +2029,6 @@ static int convert_do_copy(ImgConvertState *s) + s->has_zero_init = false; + } + +- if (!s->has_zero_init && !s->target_has_backing && +- bdrv_can_write_zeroes_with_unmap(blk_bs(s->target))) +- { +- ret = blk_make_zero(s->target, BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK); +- if (ret == 0) { +- s->has_zero_init = true; +- } +- } +- + /* Allocate buffer for copied data. For compressed images, only one cluster + * can be copied at a time. */ + if (s->compressed) { +-- +2.27.0 + diff --git a/kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch b/kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch new file mode 100755 index 0000000000000000000000000000000000000000..b4180b91290424a7a3a9ace3006c3667d8fea3a5 --- /dev/null +++ b/kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch @@ -0,0 +1,241 @@ +From 1a8a4ece5def912e7cfa5ef8565fc8ecef6e72c3 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 2 Jun 2020 02:34:11 +0100 +Subject: [PATCH 06/26] qemu_img: add cvtnum_full to print error reports + +RH-Author: Eric Blake +Message-id: <20200602023420.2133649-4-eblake@redhat.com> +Patchwork-id: 97067 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 03/12] qemu_img: add cvtnum_full to print error reports +Bugzilla: 1779893 1779904 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Kevin Wolf + +From: Eyal Moscovici + +All calls to cvtnum check the return value and print the same error +message more or less. And so error reporting moved to cvtnum_full to +reduce code duplication and provide a single error +message. Additionally, cvtnum now wraps cvtnum_full with the existing +default range of 0 to MAX_INT64. + +Acked-by: Mark Kanda +Signed-off-by: Eyal Moscovici +Message-Id: <20200513133629.18508-2-eyal.moscovici@oracle.com> +Reviewed-by: Eric Blake +[eblake: fix printf formatting, avoid trailing space, change error wording, +reformat commit message] +Signed-off-by: Eric Blake +(cherry picked from commit 43d589b074370ebc9b340340b5f641b385da9df8) +Signed-off-by: Eric Blake + +Signed-off-by: Danilo C. L. de Paula +--- + qemu-img.c | 76 +++++++++++++++++++++------------------------- + tests/qemu-iotests/049.out | 8 ++--- + 2 files changed, 38 insertions(+), 46 deletions(-) + +diff --git a/qemu-img.c b/qemu-img.c +index 95a24b9..e69529b 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -422,19 +422,31 @@ static int add_old_style_options(const char *fmt, QemuOpts *opts, + return 0; + } + +-static int64_t cvtnum(const char *s) ++static int64_t cvtnum_full(const char *name, const char *value, int64_t min, ++ int64_t max) + { + int err; +- uint64_t value; +- +- err = qemu_strtosz(s, NULL, &value); +- if (err < 0) { ++ uint64_t res; ++ ++ err = qemu_strtosz(value, NULL, &res); ++ if (err < 0 && err != -ERANGE) { ++ error_report("Invalid %s specified. You may use " ++ "k, M, G, T, P or E suffixes for", name); ++ error_report("kilobytes, megabytes, gigabytes, terabytes, " ++ "petabytes and exabytes."); + return err; + } +- if (value > INT64_MAX) { ++ if (err == -ERANGE || res > max || res < min) { ++ error_report("Invalid %s specified. Must be between %" PRId64 ++ " and %" PRId64 ".", name, min, max); + return -ERANGE; + } +- return value; ++ return res; ++} ++ ++static int64_t cvtnum(const char *name, const char *value) ++{ ++ return cvtnum_full(name, value, 0, INT64_MAX); + } + + static int img_create(int argc, char **argv) +@@ -532,16 +544,8 @@ static int img_create(int argc, char **argv) + if (optind < argc) { + int64_t sval; + +- sval = cvtnum(argv[optind++]); ++ sval = cvtnum("image size", argv[optind++]); + if (sval < 0) { +- if (sval == -ERANGE) { +- error_report("Image size must be less than 8 EiB!"); +- } else { +- error_report("Invalid image size specified! You may use k, M, " +- "G, T, P or E suffixes for "); +- error_report("kilobytes, megabytes, gigabytes, terabytes, " +- "petabytes and exabytes."); +- } + goto fail; + } + img_size = (uint64_t)sval; +@@ -2148,8 +2152,10 @@ static int img_convert(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); +- if (sval < 0 || !QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) || ++ sval = cvtnum("buffer size for sparse output", optarg); ++ if (sval < 0) { ++ goto fail_getopt; ++ } else if (!QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) || + sval / BDRV_SECTOR_SIZE > MAX_BUF_SECTORS) { + error_report("Invalid buffer size for sparse output specified. " + "Valid sizes are multiples of %llu up to %llu. Select " +@@ -4229,9 +4235,8 @@ static int img_bench(int argc, char **argv) + break; + case 'o': + { +- offset = cvtnum(optarg); ++ offset = cvtnum("offset", optarg); + if (offset < 0) { +- error_report("Invalid offset specified"); + return 1; + } + break; +@@ -4244,9 +4249,8 @@ static int img_bench(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); +- if (sval < 0 || sval > INT_MAX) { +- error_report("Invalid buffer size specified"); ++ sval = cvtnum_full("buffer size", optarg, 0, INT_MAX); ++ if (sval < 0) { + return 1; + } + +@@ -4257,9 +4261,8 @@ static int img_bench(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); +- if (sval < 0 || sval > INT_MAX) { +- error_report("Invalid step size specified"); ++ sval = cvtnum_full("step_size", optarg, 0, INT_MAX); ++ if (sval < 0) { + return 1; + } + +@@ -4429,10 +4432,9 @@ static int img_dd_bs(const char *arg, + { + int64_t res; + +- res = cvtnum(arg); ++ res = cvtnum_full("bs", arg, 1, INT_MAX); + +- if (res <= 0 || res > INT_MAX) { +- error_report("invalid number: '%s'", arg); ++ if (res < 0) { + return 1; + } + in->bsz = out->bsz = res; +@@ -4444,10 +4446,9 @@ static int img_dd_count(const char *arg, + struct DdIo *in, struct DdIo *out, + struct DdInfo *dd) + { +- dd->count = cvtnum(arg); ++ dd->count = cvtnum("count", arg); + + if (dd->count < 0) { +- error_report("invalid number: '%s'", arg); + return 1; + } + +@@ -4476,10 +4477,9 @@ static int img_dd_skip(const char *arg, + struct DdIo *in, struct DdIo *out, + struct DdInfo *dd) + { +- in->offset = cvtnum(arg); ++ in->offset = cvtnum("skip", arg); + + if (in->offset < 0) { +- error_report("invalid number: '%s'", arg); + return 1; + } + +@@ -4869,16 +4869,8 @@ static int img_measure(int argc, char **argv) + { + int64_t sval; + +- sval = cvtnum(optarg); ++ sval = cvtnum("image size", optarg); + if (sval < 0) { +- if (sval == -ERANGE) { +- error_report("Image size must be less than 8 EiB!"); +- } else { +- error_report("Invalid image size specified! You may use " +- "k, M, G, T, P or E suffixes for "); +- error_report("kilobytes, megabytes, gigabytes, terabytes, " +- "petabytes and exabytes."); +- } + goto out; + } + img_size = (uint64_t)sval; +diff --git a/tests/qemu-iotests/049.out b/tests/qemu-iotests/049.out +index 6b50540..8b35f3d 100644 +--- a/tests/qemu-iotests/049.out ++++ b/tests/qemu-iotests/049.out +@@ -92,19 +92,19 @@ Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=1649267441664 cluster_size=65536 l + == 3. Invalid sizes == + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- -1024 +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. + + qemu-img create -f qcow2 -o size=-1024 TEST_DIR/t.qcow2 + qemu-img: TEST_DIR/t.qcow2: Value '-1024' is out of range for parameter 'size' + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- -1k +-qemu-img: Image size must be less than 8 EiB! ++qemu-img: Invalid image size specified. Must be between 0 and 9223372036854775807. + + qemu-img create -f qcow2 -o size=-1k TEST_DIR/t.qcow2 + qemu-img: TEST_DIR/t.qcow2: Value '-1k' is out of range for parameter 'size' + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- 1kilobyte +-qemu-img: Invalid image size specified! You may use k, M, G, T, P or E suffixes for ++qemu-img: Invalid image size specified. You may use k, M, G, T, P or E suffixes for + qemu-img: kilobytes, megabytes, gigabytes, terabytes, petabytes and exabytes. + + qemu-img create -f qcow2 -o size=1kilobyte TEST_DIR/t.qcow2 +@@ -113,7 +113,7 @@ Optional suffix k, M, G, T, P or E means kilo-, mega-, giga-, tera-, peta- + and exabytes, respectively. + + qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- foobar +-qemu-img: Invalid image size specified! You may use k, M, G, T, P or E suffixes for ++qemu-img: Invalid image size specified. You may use k, M, G, T, P or E suffixes for + qemu-img: kilobytes, megabytes, gigabytes, terabytes, petabytes and exabytes. + + qemu-img create -f qcow2 -o size=foobar TEST_DIR/t.qcow2 +-- +1.8.3.1 + diff --git a/kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch b/kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch new file mode 100755 index 0000000000000000000000000000000000000000..3b533a5adc3e5a0f2f94b412b6b29c3a46376973 --- /dev/null +++ b/kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch @@ -0,0 +1,73 @@ +From c5f90436555d7ab2c1c28bf1cfdb5f5f8ca97816 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 24 Dec 2020 12:53:04 -0500 +Subject: [PATCH 4/5] qga: Use qemu_get_host_name() instead of + g_get_host_name() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201224125304.62697-4-marcandre.lureau@redhat.com> +Patchwork-id: 100500 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/3] qga: Use qemu_get_host_name() instead of g_get_host_name() +Bugzilla: 1910326 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Michal Privoznik + +Problem with g_get_host_name() is that on the first call it saves +the hostname into a global variable and from then on, every +subsequent call returns the saved hostname. Even if the hostname +changes. This doesn't play nicely with guest agent, because if +the hostname is acquired before the guest is set up (e.g. on the +first boot, or before DHCP) we will report old, invalid hostname. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1845127 + +Signed-off-by: Michal Privoznik +Reviewed-by: Daniel P. Berrangé +Cc: qemu-stable@nongnu.org +Signed-off-by: Michael Roth + +(cherry picked from commit 0d3a8f32b1e0eca279da1b0cc793efc7250c3daf) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +diff --git a/qga/commands.c b/qga/commands.c +index 43c323ceada..93bed292d08 100644 +--- a/qga/commands.c ++++ b/qga/commands.c +@@ -502,11 +502,20 @@ int ga_parse_whence(GuestFileWhence *whence, Error **errp) + GuestHostName *qmp_guest_get_host_name(Error **errp) + { + GuestHostName *result = NULL; +- gchar const *hostname = g_get_host_name(); +- if (hostname != NULL) { +- result = g_new0(GuestHostName, 1); +- result->host_name = g_strdup(hostname); ++ g_autofree char *hostname = qemu_get_host_name(errp); ++ ++ /* ++ * We want to avoid using g_get_host_name() because that ++ * caches the result and we wouldn't reflect changes in the ++ * host name. ++ */ ++ ++ if (!hostname) { ++ hostname = g_strdup("localhost"); + } ++ ++ result = g_new0(GuestHostName, 1); ++ result->host_name = g_steal_pointer(&hostname); + return result; + } + +-- +2.27.0 + diff --git a/kvm-qga-add-command-guest-get-disks.patch b/kvm-qga-add-command-guest-get-disks.patch new file mode 100755 index 0000000000000000000000000000000000000000..360301dc826d65ec0127a44aef815094915e6725 --- /dev/null +++ b/kvm-qga-add-command-guest-get-disks.patch @@ -0,0 +1,115 @@ +From 58688d868656e77f67ea915544b0bb3bb60f33d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:11 -0500 +Subject: [PATCH 10/14] qga: add command guest-get-disks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-7-marcandre.lureau@redhat.com> +Patchwork-id: 100475 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 06/10] qga: add command guest-get-disks +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Tomáš Golembiovský + +Add API and stubs for new guest-get-disks command. + +The command guest-get-fsinfo can be used to list information about disks +and partitions but it is limited only to mounted disks with filesystem. +This new command should allow listing information about disks of the VM +regardles whether they are mounted or not. This can be usefull for +management applications for mapping virtualized devices or pass-through +devices to device names in the guest OS. + +Signed-off-by: Tomáš Golembiovský +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Marc-André Lureau +Signed-off-by: Michael Roth + +(cherry-picked from commit c27ea3f9ef7c7f29e55bde91879f8514abce9c38) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 6 ++++++ + qga/commands-win32.c | 6 ++++++ + qga/qapi-schema.json | 31 +++++++++++++++++++++++++++++++ + 3 files changed, 43 insertions(+) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index c86c87ed522..5095104afc0 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -3039,3 +3039,9 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} +diff --git a/qga/commands-win32.c b/qga/commands-win32.c +index 55ba5b263af..be63fa2b208 100644 +--- a/qga/commands-win32.c ++++ b/qga/commands-win32.c +@@ -2234,3 +2234,9 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index fb4605cc19c..22df375c92f 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -852,6 +852,37 @@ + 'bus': 'int', 'target': 'int', 'unit': 'int', + '*serial': 'str', '*dev': 'str'} } + ++## ++# @GuestDiskInfo: ++# ++# @name: device node (Linux) or device UNC (Windows) ++# @partition: whether this is a partition or disk ++# @dependents: list of dependent devices; e.g. for LVs of the LVM this will ++# hold the list of PVs, for LUKS encrypted volume this will ++# contain the disk where the volume is placed. (Linux) ++# @address: disk address information (only for non-virtual devices) ++# @alias: optional alias assigned to the disk, on Linux this is a name assigned ++# by device mapper ++# ++# Since 5.2 ++## ++{ 'struct': 'GuestDiskInfo', ++ 'data': {'name': 'str', 'partition': 'bool', 'dependents': ['str'], ++ '*address': 'GuestDiskAddress', '*alias': 'str'} } ++ ++## ++# @guest-get-disks: ++# ++# Returns: The list of disks in the guest. For Windows these are only the ++# physical disks. On Linux these are all root block devices of ++# non-zero size including e.g. removable devices, loop devices, ++# NBD, etc. ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-get-disks', ++ 'returns': ['GuestDiskInfo'] } ++ + ## + # @GuestFilesystemInfo: + # +-- +2.27.0 + diff --git a/kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch b/kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch new file mode 100755 index 0000000000000000000000000000000000000000..939a2126b3356cb1c2fbd3aad9baf1c5628794bb --- /dev/null +++ b/kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch @@ -0,0 +1,427 @@ +From 086957b970a8f4165249589e2bc0cc08d1800db3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:12 -0500 +Subject: [PATCH 11/14] qga: add implementation of guest-get-disks for Linux +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-8-marcandre.lureau@redhat.com> +Patchwork-id: 100478 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 07/10] qga: add implementation of guest-get-disks for Linux +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Tomáš Golembiovský + +The command lists all disks (real and virtual) as well as disk +partitions. For each disk the list of dependent disks is also listed and +/dev path is used as a handle so it can be matched with "name" field of +other returned disk entries. For disk partitions the "dependents" list +is populated with the the parent device for easier tracking of +hierarchy. + +Example output: +{ + "return": [ + ... + { + "name": "/dev/dm-0", + "partition": false, + "dependents": [ + "/dev/sda2" + ], + "alias": "luks-7062202e-5b9b-433e-81e8-6628c40da9f7" + }, + { + "name": "/dev/sda2", + "partition": true, + "dependents": [ + "/dev/sda" + ] + }, + { + "name": "/dev/sda", + "partition": false, + "address": { + "serial": "SAMSUNG_MZ7LN512HCHP-000L1_S1ZKNXAG822493", + "bus-type": "sata", + ... + "dev": "/dev/sda", + "target": 0 + }, + "dependents": [] + }, + ... + ] +} + +Signed-off-by: Tomáš Golembiovský +Reviewed-by: Marc-André Lureau +*add missing stub for !defined(CONFIG_FSFREEZE) +*remove unused deps_dir variable +Signed-off-by: Michael Roth +(cherry picked from commit fed3956429d560a06fc2d2fcf1a01efb58659f87) +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 303 +++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 292 insertions(+), 11 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 5095104afc0..96f5ddafd3a 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1152,13 +1152,27 @@ static void build_guest_fsinfo_for_virtual_device(char const *syspath, + closedir(dir); + } + ++static bool is_disk_virtual(const char *devpath, Error **errp) ++{ ++ g_autofree char *syspath = realpath(devpath, NULL); ++ ++ if (!syspath) { ++ error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); ++ return false; ++ } ++ return strstr(syspath, "/devices/virtual/block/") != NULL; ++} ++ + /* Dispatch to functions for virtual/real device */ + static void build_guest_fsinfo_for_device(char const *devpath, + GuestFilesystemInfo *fs, + Error **errp) + { +- char *syspath = realpath(devpath, NULL); ++ ERRP_GUARD(); ++ g_autofree char *syspath = NULL; ++ bool is_virtual = false; + ++ syspath = realpath(devpath, NULL); + if (!syspath) { + error_setg_errno(errp, errno, "realpath(\"%s\")", devpath); + return; +@@ -1169,16 +1183,281 @@ static void build_guest_fsinfo_for_device(char const *devpath, + } + + g_debug(" parse sysfs path '%s'", syspath); +- +- if (strstr(syspath, "/devices/virtual/block/")) { ++ is_virtual = is_disk_virtual(syspath, errp); ++ if (*errp != NULL) { ++ return; ++ } ++ if (is_virtual) { + build_guest_fsinfo_for_virtual_device(syspath, fs, errp); + } else { + build_guest_fsinfo_for_real_device(syspath, fs, errp); + } ++} ++ ++#ifdef CONFIG_LIBUDEV ++ ++/* ++ * Wrapper around build_guest_fsinfo_for_device() for getting just ++ * the disk address. ++ */ ++static GuestDiskAddress *get_disk_address(const char *syspath, Error **errp) ++{ ++ g_autoptr(GuestFilesystemInfo) fs = NULL; + +- free(syspath); ++ fs = g_new0(GuestFilesystemInfo, 1); ++ build_guest_fsinfo_for_device(syspath, fs, errp); ++ if (fs->disk != NULL) { ++ return g_steal_pointer(&fs->disk->value); ++ } ++ return NULL; + } + ++static char *get_alias_for_syspath(const char *syspath) ++{ ++ struct udev *udev = NULL; ++ struct udev_device *udevice = NULL; ++ char *ret = NULL; ++ ++ udev = udev_new(); ++ if (udev == NULL) { ++ g_debug("failed to query udev"); ++ goto out; ++ } ++ udevice = udev_device_new_from_syspath(udev, syspath); ++ if (udevice == NULL) { ++ g_debug("failed to query udev for path: %s", syspath); ++ goto out; ++ } else { ++ const char *alias = udev_device_get_property_value( ++ udevice, "DM_NAME"); ++ /* ++ * NULL means there was an error and empty string means there is no ++ * alias. In case of no alias we return NULL instead of empty string. ++ */ ++ if (alias == NULL) { ++ g_debug("failed to query udev for device alias for: %s", ++ syspath); ++ } else if (*alias != 0) { ++ ret = g_strdup(alias); ++ } ++ } ++ ++out: ++ udev_unref(udev); ++ udev_device_unref(udevice); ++ return ret; ++} ++ ++static char *get_device_for_syspath(const char *syspath) ++{ ++ struct udev *udev = NULL; ++ struct udev_device *udevice = NULL; ++ char *ret = NULL; ++ ++ udev = udev_new(); ++ if (udev == NULL) { ++ g_debug("failed to query udev"); ++ goto out; ++ } ++ udevice = udev_device_new_from_syspath(udev, syspath); ++ if (udevice == NULL) { ++ g_debug("failed to query udev for path: %s", syspath); ++ goto out; ++ } else { ++ ret = g_strdup(udev_device_get_devnode(udevice)); ++ } ++ ++out: ++ udev_unref(udev); ++ udev_device_unref(udevice); ++ return ret; ++} ++ ++static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) ++{ ++ g_autofree char *deps_dir = NULL; ++ const gchar *dep; ++ GDir *dp_deps = NULL; ++ ++ /* List dependent disks */ ++ deps_dir = g_strdup_printf("%s/slaves", disk_dir); ++ g_debug(" listing entries in: %s", deps_dir); ++ dp_deps = g_dir_open(deps_dir, 0, NULL); ++ if (dp_deps == NULL) { ++ g_debug("failed to list entries in %s", deps_dir); ++ return; ++ } ++ while ((dep = g_dir_read_name(dp_deps)) != NULL) { ++ g_autofree char *dep_dir = NULL; ++ strList *dep_item = NULL; ++ char *dev_name; ++ ++ /* Add dependent disks */ ++ dep_dir = g_strdup_printf("%s/%s", deps_dir, dep); ++ dev_name = get_device_for_syspath(dep_dir); ++ if (dev_name != NULL) { ++ g_debug(" adding dependent device: %s", dev_name); ++ dep_item = g_new0(strList, 1); ++ dep_item->value = dev_name; ++ dep_item->next = disk->dependents; ++ disk->dependents = dep_item; ++ } ++ } ++ g_dir_close(dp_deps); ++} ++ ++/* ++ * Detect partitions subdirectory, name is "" or ++ * "p" ++ * ++ * @disk_name -- last component of /sys path (e.g. sda) ++ * @disk_dir -- sys path of the disk (e.g. /sys/block/sda) ++ * @disk_dev -- device node of the disk (e.g. /dev/sda) ++ */ ++static GuestDiskInfoList *get_disk_partitions( ++ GuestDiskInfoList *list, ++ const char *disk_name, const char *disk_dir, ++ const char *disk_dev) ++{ ++ GuestDiskInfoList *item, *ret = list; ++ struct dirent *de_disk; ++ DIR *dp_disk = NULL; ++ size_t len = strlen(disk_name); ++ ++ dp_disk = opendir(disk_dir); ++ while ((de_disk = readdir(dp_disk)) != NULL) { ++ g_autofree char *partition_dir = NULL; ++ char *dev_name; ++ GuestDiskInfo *partition; ++ ++ if (!(de_disk->d_type & DT_DIR)) { ++ continue; ++ } ++ ++ if (!(strncmp(disk_name, de_disk->d_name, len) == 0 && ++ ((*(de_disk->d_name + len) == 'p' && ++ isdigit(*(de_disk->d_name + len + 1))) || ++ isdigit(*(de_disk->d_name + len))))) { ++ continue; ++ } ++ ++ partition_dir = g_strdup_printf("%s/%s", ++ disk_dir, de_disk->d_name); ++ dev_name = get_device_for_syspath(partition_dir); ++ if (dev_name == NULL) { ++ g_debug("Failed to get device name for syspath: %s", ++ disk_dir); ++ continue; ++ } ++ partition = g_new0(GuestDiskInfo, 1); ++ partition->name = dev_name; ++ partition->partition = true; ++ /* Add parent disk as dependent for easier tracking of hierarchy */ ++ partition->dependents = g_new0(strList, 1); ++ partition->dependents->value = g_strdup(disk_dev); ++ ++ item = g_new0(GuestDiskInfoList, 1); ++ item->value = partition; ++ item->next = ret; ++ ret = item; ++ ++ } ++ closedir(dp_disk); ++ ++ return ret; ++} ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ GuestDiskInfoList *item, *ret = NULL; ++ GuestDiskInfo *disk; ++ DIR *dp = NULL; ++ struct dirent *de = NULL; ++ ++ g_debug("listing /sys/block directory"); ++ dp = opendir("/sys/block"); ++ if (dp == NULL) { ++ error_setg_errno(errp, errno, "Can't open directory \"/sys/block\""); ++ return NULL; ++ } ++ while ((de = readdir(dp)) != NULL) { ++ g_autofree char *disk_dir = NULL, *line = NULL, ++ *size_path = NULL; ++ char *dev_name; ++ Error *local_err = NULL; ++ if (de->d_type != DT_LNK) { ++ g_debug(" skipping entry: %s", de->d_name); ++ continue; ++ } ++ ++ /* Check size and skip zero-sized disks */ ++ g_debug(" checking disk size"); ++ size_path = g_strdup_printf("/sys/block/%s/size", de->d_name); ++ if (!g_file_get_contents(size_path, &line, NULL, NULL)) { ++ g_debug(" failed to read disk size"); ++ continue; ++ } ++ if (g_strcmp0(line, "0\n") == 0) { ++ g_debug(" skipping zero-sized disk"); ++ continue; ++ } ++ ++ g_debug(" adding %s", de->d_name); ++ disk_dir = g_strdup_printf("/sys/block/%s", de->d_name); ++ dev_name = get_device_for_syspath(disk_dir); ++ if (dev_name == NULL) { ++ g_debug("Failed to get device name for syspath: %s", ++ disk_dir); ++ continue; ++ } ++ disk = g_new0(GuestDiskInfo, 1); ++ disk->name = dev_name; ++ disk->partition = false; ++ disk->alias = get_alias_for_syspath(disk_dir); ++ disk->has_alias = (disk->alias != NULL); ++ item = g_new0(GuestDiskInfoList, 1); ++ item->value = disk; ++ item->next = ret; ++ ret = item; ++ ++ /* Get address for non-virtual devices */ ++ bool is_virtual = is_disk_virtual(disk_dir, &local_err); ++ if (local_err != NULL) { ++ g_debug(" failed to check disk path, ignoring error: %s", ++ error_get_pretty(local_err)); ++ error_free(local_err); ++ local_err = NULL; ++ /* Don't try to get the address */ ++ is_virtual = true; ++ } ++ if (!is_virtual) { ++ disk->address = get_disk_address(disk_dir, &local_err); ++ if (local_err != NULL) { ++ g_debug(" failed to get device info, ignoring error: %s", ++ error_get_pretty(local_err)); ++ error_free(local_err); ++ local_err = NULL; ++ } else if (disk->address != NULL) { ++ disk->has_address = true; ++ } ++ } ++ ++ get_disk_deps(disk_dir, disk); ++ ret = get_disk_partitions(ret, de->d_name, disk_dir, dev_name); ++ } ++ return ret; ++} ++ ++#else ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} ++ ++#endif ++ + /* Return a list of the disk device(s)' info which @mount lies on */ + static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, + Error **errp) +@@ -2770,6 +3049,13 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) + + return 0; + } ++ ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} ++ + #endif /* CONFIG_FSFREEZE */ + + #if !defined(CONFIG_FSTRIM) +@@ -2806,7 +3092,8 @@ GList *ga_command_blacklist_init(GList *blacklist) + const char *list[] = { + "guest-get-fsinfo", "guest-fsfreeze-status", + "guest-fsfreeze-freeze", "guest-fsfreeze-freeze-list", +- "guest-fsfreeze-thaw", "guest-get-fsinfo", NULL}; ++ "guest-fsfreeze-thaw", "guest-get-fsinfo", ++ "guest-get-disks", NULL}; + char **p = (char **)list; + + while (*p) { +@@ -3039,9 +3326,3 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } +- +-GuestDiskInfoList *qmp_guest_get_disks(Error **errp) +-{ +- error_setg(errp, QERR_UNSUPPORTED); +- return NULL; +-} +-- +2.27.0 + diff --git a/kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch b/kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch new file mode 100755 index 0000000000000000000000000000000000000000..f82d95d410176c28c36967e0659928b020355501 --- /dev/null +++ b/kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch @@ -0,0 +1,181 @@ +From 925163bf8498e26c19742dbd34b6b324e49c07b6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:13 -0500 +Subject: [PATCH 12/14] qga: add implementation of guest-get-disks for Windows +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-9-marcandre.lureau@redhat.com> +Patchwork-id: 100479 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 08/10] qga: add implementation of guest-get-disks for Windows +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Tomáš Golembiovský + +The command lists all the physical disk drives. Unlike for Linux +partitions and virtual volumes are not listed. + +Example output: + +{ + "return": [ + { + "name": "\\\\.\\PhysicalDrive0", + "partition": false, + "address": { + "serial": "QM00001", + "bus-type": "sata", + ... + }, + "dependents": [] + } + ] +} + +Signed-off-by: Tomáš Golembiovský +Signed-off-by: Michael Roth + +(cherry picked from commit c67d2efd9d1771fd886e3b58771adaa62897f3d9) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-win32.c | 107 ++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 101 insertions(+), 6 deletions(-) + +diff --git a/qga/commands-win32.c b/qga/commands-win32.c +index be63fa2b208..a07725e874b 100644 +--- a/qga/commands-win32.c ++++ b/qga/commands-win32.c +@@ -960,6 +960,101 @@ out: + return list; + } + ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ ERRP_GUARD(); ++ GuestDiskInfoList *new = NULL, *ret = NULL; ++ HDEVINFO dev_info; ++ SP_DEVICE_INTERFACE_DATA dev_iface_data; ++ int i; ++ ++ dev_info = SetupDiGetClassDevs(&GUID_DEVINTERFACE_DISK, 0, 0, ++ DIGCF_PRESENT | DIGCF_DEVICEINTERFACE); ++ if (dev_info == INVALID_HANDLE_VALUE) { ++ error_setg_win32(errp, GetLastError(), "failed to get device tree"); ++ return NULL; ++ } ++ ++ g_debug("enumerating devices"); ++ dev_iface_data.cbSize = sizeof(SP_DEVICE_INTERFACE_DATA); ++ for (i = 0; ++ SetupDiEnumDeviceInterfaces(dev_info, NULL, &GUID_DEVINTERFACE_DISK, ++ i, &dev_iface_data); ++ i++) { ++ GuestDiskAddress *address = NULL; ++ GuestDiskInfo *disk = NULL; ++ Error *local_err = NULL; ++ g_autofree PSP_DEVICE_INTERFACE_DETAIL_DATA ++ pdev_iface_detail_data = NULL; ++ STORAGE_DEVICE_NUMBER sdn; ++ HANDLE dev_file; ++ DWORD size = 0; ++ BOOL result; ++ int attempt; ++ ++ g_debug(" getting device path"); ++ for (attempt = 0, result = FALSE; attempt < 2 && !result; attempt++) { ++ result = SetupDiGetDeviceInterfaceDetail(dev_info, ++ &dev_iface_data, pdev_iface_detail_data, size, &size, NULL); ++ if (result) { ++ break; ++ } ++ if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { ++ pdev_iface_detail_data = g_realloc(pdev_iface_detail_data, ++ size); ++ pdev_iface_detail_data->cbSize = ++ sizeof(*pdev_iface_detail_data); ++ } else { ++ g_debug("failed to get device interface details"); ++ break; ++ } ++ } ++ if (!result) { ++ g_debug("skipping device"); ++ continue; ++ } ++ ++ g_debug(" device: %s", pdev_iface_detail_data->DevicePath); ++ dev_file = CreateFile(pdev_iface_detail_data->DevicePath, 0, ++ FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL); ++ if (!DeviceIoControl(dev_file, IOCTL_STORAGE_GET_DEVICE_NUMBER, ++ NULL, 0, &sdn, sizeof(sdn), &size, NULL)) { ++ CloseHandle(dev_file); ++ debug_error("failed to get storage device number"); ++ continue; ++ } ++ CloseHandle(dev_file); ++ ++ disk = g_new0(GuestDiskInfo, 1); ++ disk->name = g_strdup_printf("\\\\.\\PhysicalDrive%lu", ++ sdn.DeviceNumber); ++ ++ g_debug(" number: %lu", sdn.DeviceNumber); ++ address = g_malloc0(sizeof(GuestDiskAddress)); ++ address->has_dev = true; ++ address->dev = g_strdup(disk->name); ++ get_single_disk_info(sdn.DeviceNumber, address, &local_err); ++ if (local_err) { ++ g_debug("failed to get disk info: %s", ++ error_get_pretty(local_err)); ++ error_free(local_err); ++ qapi_free_GuestDiskAddress(address); ++ address = NULL; ++ } else { ++ disk->address = address; ++ disk->has_address = true; ++ } ++ ++ new = g_malloc0(sizeof(GuestDiskInfoList)); ++ new->value = disk; ++ new->next = ret; ++ ret = new; ++ } ++ ++ SetupDiDestroyDeviceInfoList(dev_info); ++ return ret; ++} ++ + #else + + static GuestDiskAddressList *build_guest_disk_info(char *guid, Error **errp) +@@ -967,6 +1062,12 @@ static GuestDiskAddressList *build_guest_disk_info(char *guid, Error **errp) + return NULL; + } + ++GuestDiskInfoList *qmp_guest_get_disks(Error **errp) ++{ ++ error_setg(errp, QERR_UNSUPPORTED); ++ return NULL; ++} ++ + #endif /* CONFIG_QGA_NTDDSCSI */ + + static GuestFilesystemInfo *build_guest_fsinfo(char *guid, Error **errp) +@@ -2234,9 +2335,3 @@ GuestOSInfo *qmp_guest_get_osinfo(Error **errp) + + return info; + } +- +-GuestDiskInfoList *qmp_guest_get_disks(Error **errp) +-{ +- error_setg(errp, QERR_UNSUPPORTED); +- return NULL; +-} +-- +2.27.0 + diff --git a/kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch b/kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch new file mode 100755 index 0000000000000000000000000000000000000000..dec7f7ba4a4b236b709b5b31b619201e1d4f6caf --- /dev/null +++ b/kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch @@ -0,0 +1,176 @@ +From 7f8888f2c53060c4536856859d5ea94d23ea9e45 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:54 -0400 +Subject: [PATCH 03/14] qga: add *reset argument to ssh-add-authorized-keys +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-4-marcandre.lureau@redhat.com> +Patchwork-id: 101689 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 3/4] qga: add *reset argument to ssh-add-authorized-keys +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Michael Roth + +I prefer 'reset' over 'clear', since 'clear' and keys may have some +other relations or meaning. + +Signed-off-by: Marc-André Lureau +*fix disallowed g_assert* usage reported by checkpatch +Signed-off-by: Michael Roth + +(cherry picked from commit 0e3c94758e3851f0ab30d2a1e63a73284499775d) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + qga/commands-posix-ssh.c | 53 ++++++++++++++++++++++++++++++++++++---- + qga/qapi-schema.json | 3 ++- + 2 files changed, 50 insertions(+), 6 deletions(-) + +diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c +index f74d89679c..362c9e8816 100644 +--- a/qga/commands-posix-ssh.c ++++ b/qga/commands-posix-ssh.c +@@ -168,6 +168,7 @@ read_authkeys(const char *path, Error **errp) + + void + qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, ++ bool has_reset, bool reset, + Error **errp) + { + g_autofree struct passwd *p = NULL; +@@ -178,6 +179,7 @@ qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, + size_t nkeys, nauthkeys; + + ERRP_GUARD(); ++ reset = has_reset && reset; + + if (!check_openssh_pub_keys(keys, &nkeys, errp)) { + return; +@@ -191,7 +193,9 @@ qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, + ssh_path = g_build_filename(p->pw_dir, ".ssh", NULL); + authkeys_path = g_build_filename(ssh_path, "authorized_keys", NULL); + +- authkeys = read_authkeys(authkeys_path, NULL); ++ if (!reset) { ++ authkeys = read_authkeys(authkeys_path, NULL); ++ } + if (authkeys == NULL) { + if (!g_file_test(ssh_path, G_FILE_TEST_IS_DIR) && + !mkdir_for_user(ssh_path, p, 0700, errp)) { +@@ -318,7 +322,7 @@ test_invalid_user(void) + { + Error *err = NULL; + +- qmp_guest_ssh_add_authorized_keys("", NULL, &err); ++ qmp_guest_ssh_add_authorized_keys("", NULL, FALSE, FALSE, &err); + error_free_or_abort(&err); + + qmp_guest_ssh_remove_authorized_keys("", NULL, &err); +@@ -333,7 +337,8 @@ test_invalid_key(void) + }; + Error *err = NULL; + +- qmp_guest_ssh_add_authorized_keys(g_get_user_name(), &key, &err); ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), &key, ++ FALSE, FALSE, &err); + error_free_or_abort(&err); + + qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), &key, &err); +@@ -346,13 +351,17 @@ test_add_keys(void) + Error *err = NULL; + + qmp_guest_ssh_add_authorized_keys(g_get_user_name(), +- (strList *)&test_key2, &err); ++ (strList *)&test_key2, ++ FALSE, FALSE, ++ &err); + g_assert(err == NULL); + + test_authorized_keys_equal("algo key2 comments"); + + qmp_guest_ssh_add_authorized_keys(g_get_user_name(), +- (strList *)&test_key1_2, &err); ++ (strList *)&test_key1_2, ++ FALSE, FALSE, ++ &err); + g_assert(err == NULL); + + /* key2 came first, and should'nt be duplicated */ +@@ -360,6 +369,39 @@ test_add_keys(void) + "algo key1 comments"); + } + ++static void ++test_add_reset_keys(void) ++{ ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key1_2, ++ FALSE, FALSE, ++ &err); ++ g_assert(err == NULL); ++ ++ /* reset with key2 only */ ++ test_authorized_keys_equal("algo key1 comments\n" ++ "algo key2 comments"); ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key2, ++ TRUE, TRUE, ++ &err); ++ g_assert(err == NULL); ++ ++ test_authorized_keys_equal("algo key2 comments"); ++ ++ /* empty should clear file */ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)NULL, ++ TRUE, TRUE, ++ &err); ++ g_assert(err == NULL); ++ ++ test_authorized_keys_equal(""); ++} ++ + static void + test_remove_keys(void) + { +@@ -393,6 +435,7 @@ int main(int argc, char *argv[]) + g_test_add_func("/qga/ssh/invalid_user", test_invalid_user); + g_test_add_func("/qga/ssh/invalid_key", test_invalid_key); + g_test_add_func("/qga/ssh/add_keys", test_add_keys); ++ g_test_add_func("/qga/ssh/add_reset_keys", test_add_reset_keys); + g_test_add_func("/qga/ssh/remove_keys", test_remove_keys); + + return g_test_run(); +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index 3b85f5a03f..a70ea5da77 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -1279,6 +1279,7 @@ + # + # @username: the user account to add the authorized keys + # @keys: the public keys to add (in OpenSSH/sshd(8) authorized_keys format) ++# @reset: ignore the existing content, set it with the given keys only + # + # Append public keys to user .ssh/authorized_keys on Unix systems (not + # implemented for other systems). +@@ -1288,7 +1289,7 @@ + # Since: 5.2 + ## + { 'command': 'guest-ssh-add-authorized-keys', +- 'data': { 'username': 'str', 'keys': ['str'] }, ++ 'data': { 'username': 'str', 'keys': ['str'], '*reset': 'bool' }, + 'if': 'defined(CONFIG_POSIX)' } + + ## +-- +2.27.0 + diff --git a/kvm-qga-add-ssh-add-remove-authorized-keys.patch b/kvm-qga-add-ssh-add-remove-authorized-keys.patch new file mode 100755 index 0000000000000000000000000000000000000000..b767d429cbb89a2fb84eba31eea0ecab21c4f8d9 --- /dev/null +++ b/kvm-qga-add-ssh-add-remove-authorized-keys.patch @@ -0,0 +1,525 @@ +From 4be6cb23235b29d6ce450c2dacaef09c52d1aeea Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:52 -0400 +Subject: [PATCH 02/14] qga: add ssh-{add, remove}-authorized-keys +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-3-marcandre.lureau@redhat.com> +Patchwork-id: 101688 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/4] qga: add ssh-{add, remove}-authorized-keys +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Marc-André Lureau + +Add new commands to add and remove SSH public keys from +~/.ssh/authorized_keys. + +I took a different approach for testing, including the unit tests right +with the code. I wanted to overwrite the function to get the user +details, I couldn't easily do that over QMP. Furthermore, I prefer +having unit tests very close to the code, and unit files that are domain +specific (commands-posix is too crowded already). FWIW, that +coding/testing style is Rust-style (where tests can or should even be +part of the documentation!). + +Fixes: +https://bugzilla.redhat.com/show_bug.cgi?id=1885332 + +Signed-off-by: Marc-André Lureau +Reviewed-by: Michal Privoznik +Reviewed-by: Daniel P. Berrangé +*squashed in fix-ups for setting file ownership and use of QAPI + conditionals for CONFIG_POSIX instead of stub definitions +*disable qga-ssh-test for now due to G_TEST_OPTION_ISOLATE_DIRS + triggering leak detector in build-oss-fuzz +*fix disallowed g_assert* usage reported by checkpatch +Signed-off-by: Michael Roth + +(cherry picked from commit 8d769ec777dccbff199711aba43aa6297fe4a0e0) +[ Fixes trivial backport conflicts and use Makefile.objs build-sys ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + qga/Makefile.objs | 2 +- + qga/commands-posix-ssh.c | 407 +++++++++++++++++++++++++++++++++++++++ + qga/qapi-schema.json | 35 ++++ + 3 files changed, 443 insertions(+), 1 deletion(-) + create mode 100644 qga/commands-posix-ssh.c + +diff --git a/qga/Makefile.objs b/qga/Makefile.objs +index 80e6bb3c2e..c8da634db0 100644 +--- a/qga/Makefile.objs ++++ b/qga/Makefile.objs +@@ -1,6 +1,6 @@ + commands-posix.o-libs := $(LIBUDEV_LIBS) + qga-obj-y = commands.o guest-agent-command-state.o main.o +-qga-obj-$(CONFIG_POSIX) += commands-posix.o channel-posix.o ++qga-obj-$(CONFIG_POSIX) += commands-posix.o channel-posix.o commands-posix-ssh.o + qga-obj-$(CONFIG_WIN32) += commands-win32.o channel-win32.o service-win32.o + qga-obj-$(CONFIG_WIN32) += vss-win32.o + qga-obj-y += qapi-generated/qga-qapi-types.o qapi-generated/qga-qapi-visit.o +diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c +new file mode 100644 +index 0000000000..f74d89679c +--- /dev/null ++++ b/qga/commands-posix-ssh.c +@@ -0,0 +1,407 @@ ++ /* ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++#include "qemu/osdep.h" ++ ++#include ++#include ++#include ++#include ++ ++#include "qapi/error.h" ++#include "qga-qapi-commands.h" ++ ++#ifdef QGA_BUILD_UNIT_TEST ++static struct passwd * ++test_get_passwd_entry(const gchar *user_name, GError **error) ++{ ++ struct passwd *p; ++ int ret; ++ ++ if (!user_name || g_strcmp0(user_name, g_get_user_name())) { ++ g_set_error(error, G_UNIX_ERROR, 0, "Invalid user name"); ++ return NULL; ++ } ++ ++ p = g_new0(struct passwd, 1); ++ p->pw_dir = (char *)g_get_home_dir(); ++ p->pw_uid = geteuid(); ++ p->pw_gid = getegid(); ++ ++ ret = g_mkdir_with_parents(p->pw_dir, 0700); ++ g_assert(ret == 0); ++ ++ return p; ++} ++ ++#define g_unix_get_passwd_entry_qemu(username, err) \ ++ test_get_passwd_entry(username, err) ++#endif ++ ++static struct passwd * ++get_passwd_entry(const char *username, Error **errp) ++{ ++ g_autoptr(GError) err = NULL; ++ struct passwd *p; ++ ++ ERRP_GUARD(); ++ ++ p = g_unix_get_passwd_entry_qemu(username, &err); ++ if (p == NULL) { ++ error_setg(errp, "failed to lookup user '%s': %s", ++ username, err->message); ++ return NULL; ++ } ++ ++ return p; ++} ++ ++static bool ++mkdir_for_user(const char *path, const struct passwd *p, ++ mode_t mode, Error **errp) ++{ ++ ERRP_GUARD(); ++ ++ if (g_mkdir(path, mode) == -1) { ++ error_setg(errp, "failed to create directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ if (chown(path, p->pw_uid, p->pw_gid) == -1) { ++ error_setg(errp, "failed to set ownership of directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ if (chmod(path, mode) == -1) { ++ error_setg(errp, "failed to set permissions of directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++check_openssh_pub_key(const char *key, Error **errp) ++{ ++ ERRP_GUARD(); ++ ++ /* simple sanity-check, we may want more? */ ++ if (!key || key[0] == '#' || strchr(key, '\n')) { ++ error_setg(errp, "invalid OpenSSH public key: '%s'", key); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++check_openssh_pub_keys(strList *keys, size_t *nkeys, Error **errp) ++{ ++ size_t n = 0; ++ strList *k; ++ ++ ERRP_GUARD(); ++ ++ for (k = keys; k != NULL; k = k->next) { ++ if (!check_openssh_pub_key(k->value, errp)) { ++ return false; ++ } ++ n++; ++ } ++ ++ if (nkeys) { ++ *nkeys = n; ++ } ++ return true; ++} ++ ++static bool ++write_authkeys(const char *path, const GStrv keys, ++ const struct passwd *p, Error **errp) ++{ ++ g_autofree char *contents = NULL; ++ g_autoptr(GError) err = NULL; ++ ++ ERRP_GUARD(); ++ ++ contents = g_strjoinv("\n", keys); ++ if (!g_file_set_contents(path, contents, -1, &err)) { ++ error_setg(errp, "failed to write to '%s': %s", path, err->message); ++ return false; ++ } ++ ++ if (chown(path, p->pw_uid, p->pw_gid) == -1) { ++ error_setg(errp, "failed to set ownership of directory '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ if (chmod(path, 0600) == -1) { ++ error_setg(errp, "failed to set permissions of '%s': %s", ++ path, g_strerror(errno)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static GStrv ++read_authkeys(const char *path, Error **errp) ++{ ++ g_autoptr(GError) err = NULL; ++ g_autofree char *contents = NULL; ++ ++ ERRP_GUARD(); ++ ++ if (!g_file_get_contents(path, &contents, NULL, &err)) { ++ error_setg(errp, "failed to read '%s': %s", path, err->message); ++ return NULL; ++ } ++ ++ return g_strsplit(contents, "\n", -1); ++ ++} ++ ++void ++qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, ++ Error **errp) ++{ ++ g_autofree struct passwd *p = NULL; ++ g_autofree char *ssh_path = NULL; ++ g_autofree char *authkeys_path = NULL; ++ g_auto(GStrv) authkeys = NULL; ++ strList *k; ++ size_t nkeys, nauthkeys; ++ ++ ERRP_GUARD(); ++ ++ if (!check_openssh_pub_keys(keys, &nkeys, errp)) { ++ return; ++ } ++ ++ p = get_passwd_entry(username, errp); ++ if (p == NULL) { ++ return; ++ } ++ ++ ssh_path = g_build_filename(p->pw_dir, ".ssh", NULL); ++ authkeys_path = g_build_filename(ssh_path, "authorized_keys", NULL); ++ ++ authkeys = read_authkeys(authkeys_path, NULL); ++ if (authkeys == NULL) { ++ if (!g_file_test(ssh_path, G_FILE_TEST_IS_DIR) && ++ !mkdir_for_user(ssh_path, p, 0700, errp)) { ++ return; ++ } ++ } ++ ++ nauthkeys = authkeys ? g_strv_length(authkeys) : 0; ++ authkeys = g_realloc_n(authkeys, nauthkeys + nkeys + 1, sizeof(char *)); ++ memset(authkeys + nauthkeys, 0, (nkeys + 1) * sizeof(char *)); ++ ++ for (k = keys; k != NULL; k = k->next) { ++ if (g_strv_contains((const gchar * const *)authkeys, k->value)) { ++ continue; ++ } ++ authkeys[nauthkeys++] = g_strdup(k->value); ++ } ++ ++ write_authkeys(authkeys_path, authkeys, p, errp); ++} ++ ++void ++qmp_guest_ssh_remove_authorized_keys(const char *username, strList *keys, ++ Error **errp) ++{ ++ g_autofree struct passwd *p = NULL; ++ g_autofree char *authkeys_path = NULL; ++ g_autofree GStrv new_keys = NULL; /* do not own the strings */ ++ g_auto(GStrv) authkeys = NULL; ++ GStrv a; ++ size_t nkeys = 0; ++ ++ ERRP_GUARD(); ++ ++ if (!check_openssh_pub_keys(keys, NULL, errp)) { ++ return; ++ } ++ ++ p = get_passwd_entry(username, errp); ++ if (p == NULL) { ++ return; ++ } ++ ++ authkeys_path = g_build_filename(p->pw_dir, ".ssh", ++ "authorized_keys", NULL); ++ if (!g_file_test(authkeys_path, G_FILE_TEST_EXISTS)) { ++ return; ++ } ++ authkeys = read_authkeys(authkeys_path, errp); ++ if (authkeys == NULL) { ++ return; ++ } ++ ++ new_keys = g_new0(char *, g_strv_length(authkeys) + 1); ++ for (a = authkeys; *a != NULL; a++) { ++ strList *k; ++ ++ for (k = keys; k != NULL; k = k->next) { ++ if (g_str_equal(k->value, *a)) { ++ break; ++ } ++ } ++ if (k != NULL) { ++ continue; ++ } ++ ++ new_keys[nkeys++] = *a; ++ } ++ ++ write_authkeys(authkeys_path, new_keys, p, errp); ++} ++ ++ ++#ifdef QGA_BUILD_UNIT_TEST ++#if GLIB_CHECK_VERSION(2, 60, 0) ++static const strList test_key2 = { ++ .value = (char *)"algo key2 comments" ++}; ++ ++static const strList test_key1_2 = { ++ .value = (char *)"algo key1 comments", ++ .next = (strList *)&test_key2, ++}; ++ ++static char * ++test_get_authorized_keys_path(void) ++{ ++ return g_build_filename(g_get_home_dir(), ".ssh", "authorized_keys", NULL); ++} ++ ++static void ++test_authorized_keys_set(const char *contents) ++{ ++ g_autoptr(GError) err = NULL; ++ g_autofree char *path = NULL; ++ int ret; ++ ++ path = g_build_filename(g_get_home_dir(), ".ssh", NULL); ++ ret = g_mkdir_with_parents(path, 0700); ++ g_assert(ret == 0); ++ g_free(path); ++ ++ path = test_get_authorized_keys_path(); ++ g_file_set_contents(path, contents, -1, &err); ++ g_assert(err == NULL); ++} ++ ++static void ++test_authorized_keys_equal(const char *expected) ++{ ++ g_autoptr(GError) err = NULL; ++ g_autofree char *path = NULL; ++ g_autofree char *contents = NULL; ++ ++ path = test_get_authorized_keys_path(); ++ g_file_get_contents(path, &contents, NULL, &err); ++ g_assert(err == NULL); ++ ++ g_assert(g_strcmp0(contents, expected) == 0); ++} ++ ++static void ++test_invalid_user(void) ++{ ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys("", NULL, &err); ++ error_free_or_abort(&err); ++ ++ qmp_guest_ssh_remove_authorized_keys("", NULL, &err); ++ error_free_or_abort(&err); ++} ++ ++static void ++test_invalid_key(void) ++{ ++ strList key = { ++ .value = (char *)"not a valid\nkey" ++ }; ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), &key, &err); ++ error_free_or_abort(&err); ++ ++ qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), &key, &err); ++ error_free_or_abort(&err); ++} ++ ++static void ++test_add_keys(void) ++{ ++ Error *err = NULL; ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key2, &err); ++ g_assert(err == NULL); ++ ++ test_authorized_keys_equal("algo key2 comments"); ++ ++ qmp_guest_ssh_add_authorized_keys(g_get_user_name(), ++ (strList *)&test_key1_2, &err); ++ g_assert(err == NULL); ++ ++ /* key2 came first, and should'nt be duplicated */ ++ test_authorized_keys_equal("algo key2 comments\n" ++ "algo key1 comments"); ++} ++ ++static void ++test_remove_keys(void) ++{ ++ Error *err = NULL; ++ static const char *authkeys = ++ "algo key1 comments\n" ++ /* originally duplicated */ ++ "algo key1 comments\n" ++ "# a commented line\n" ++ "algo some-key another\n"; ++ ++ test_authorized_keys_set(authkeys); ++ qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), ++ (strList *)&test_key2, &err); ++ g_assert(err == NULL); ++ test_authorized_keys_equal(authkeys); ++ ++ qmp_guest_ssh_remove_authorized_keys(g_get_user_name(), ++ (strList *)&test_key1_2, &err); ++ g_assert(err == NULL); ++ test_authorized_keys_equal("# a commented line\n" ++ "algo some-key another\n"); ++} ++ ++int main(int argc, char *argv[]) ++{ ++ setlocale(LC_ALL, ""); ++ ++ g_test_init(&argc, &argv, G_TEST_OPTION_ISOLATE_DIRS, NULL); ++ ++ g_test_add_func("/qga/ssh/invalid_user", test_invalid_user); ++ g_test_add_func("/qga/ssh/invalid_key", test_invalid_key); ++ g_test_add_func("/qga/ssh/add_keys", test_add_keys); ++ g_test_add_func("/qga/ssh/remove_keys", test_remove_keys); ++ ++ return g_test_run(); ++} ++#else ++int main(int argc, char *argv[]) ++{ ++ g_test_message("test skipped, needs glib >= 2.60"); ++ return 0; ++} ++#endif /* GLIB_2_60 */ ++#endif /* BUILD_UNIT_TEST */ +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index 4222cb92d3..3b85f5a03f 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -1273,3 +1273,38 @@ + ## + { 'command': 'guest-get-osinfo', + 'returns': 'GuestOSInfo' } ++ ++## ++# @guest-ssh-add-authorized-keys: ++# ++# @username: the user account to add the authorized keys ++# @keys: the public keys to add (in OpenSSH/sshd(8) authorized_keys format) ++# ++# Append public keys to user .ssh/authorized_keys on Unix systems (not ++# implemented for other systems). ++# ++# Returns: Nothing on success. ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-ssh-add-authorized-keys', ++ 'data': { 'username': 'str', 'keys': ['str'] }, ++ 'if': 'defined(CONFIG_POSIX)' } ++ ++## ++# @guest-ssh-remove-authorized-keys: ++# ++# @username: the user account to remove the authorized keys ++# @keys: the public keys to remove (in OpenSSH/sshd(8) authorized_keys format) ++# ++# Remove public keys from the user .ssh/authorized_keys on Unix systems (not ++# implemented for other systems). It's not an error if the key is already ++# missing. ++# ++# Returns: Nothing on success. ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-ssh-remove-authorized-keys', ++ 'data': { 'username': 'str', 'keys': ['str'] }, ++ 'if': 'defined(CONFIG_POSIX)' } +-- +2.27.0 + diff --git a/kvm-qga-add-ssh-get-authorized-keys.patch b/kvm-qga-add-ssh-get-authorized-keys.patch new file mode 100755 index 0000000000000000000000000000000000000000..2b4c3771300203b09dc315e39b3c6baeec11b244 --- /dev/null +++ b/kvm-qga-add-ssh-get-authorized-keys.patch @@ -0,0 +1,170 @@ +From 1ed102f5489e6cf3168d9014e9a082909193b6fc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:55:57 -0400 +Subject: [PATCH 04/14] qga: add ssh-get-authorized-keys +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210609100615.2501448-5-marcandre.lureau@redhat.com> +Patchwork-id: 101690 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 4/4] qga: add ssh-get-authorized-keys +Bugzilla: 1967716 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michal Privoznik + +From: Marc-André Lureau + +Signed-off-by: Marc-André Lureau +*fix-up merge conflicts due to qga-ssh-test being disabled in earlier + patch due to G_TEST_OPTION_ISOLATE_DIRS triggering build-oss-fuzz + leak detector. +*fix up style and disallowed g_assert* usage reported by checkpatch +Signed-off-by: Michael Roth + +(cherry picked from commit cad97c08a1c17830d77a46780088bc0199df89d1) +[ Fix trivial schema conflict ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + qga/commands-posix-ssh.c | 66 ++++++++++++++++++++++++++++++++++++++++ + qga/qapi-schema.json | 30 ++++++++++++++++++ + 2 files changed, 96 insertions(+) + +diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c +index 362c9e8816..749167e82d 100644 +--- a/qga/commands-posix-ssh.c ++++ b/qga/commands-posix-ssh.c +@@ -268,6 +268,46 @@ qmp_guest_ssh_remove_authorized_keys(const char *username, strList *keys, + write_authkeys(authkeys_path, new_keys, p, errp); + } + ++GuestAuthorizedKeys * ++qmp_guest_ssh_get_authorized_keys(const char *username, Error **errp) ++{ ++ g_autofree struct passwd *p = NULL; ++ g_autofree char *authkeys_path = NULL; ++ g_auto(GStrv) authkeys = NULL; ++ g_autoptr(GuestAuthorizedKeys) ret = NULL; ++ int i; ++ ++ ERRP_GUARD(); ++ ++ p = get_passwd_entry(username, errp); ++ if (p == NULL) { ++ return NULL; ++ } ++ ++ authkeys_path = g_build_filename(p->pw_dir, ".ssh", ++ "authorized_keys", NULL); ++ authkeys = read_authkeys(authkeys_path, errp); ++ if (authkeys == NULL) { ++ return NULL; ++ } ++ ++ ret = g_new0(GuestAuthorizedKeys, 1); ++ for (i = 0; authkeys[i] != NULL; i++) { ++ strList *new; ++ ++ g_strstrip(authkeys[i]); ++ if (!authkeys[i][0] || authkeys[i][0] == '#') { ++ continue; ++ } ++ ++ new = g_new0(strList, 1); ++ new->value = g_strdup(authkeys[i]); ++ new->next = ret->keys; ++ ret->keys = new; ++ } ++ ++ return g_steal_pointer(&ret); ++} + + #ifdef QGA_BUILD_UNIT_TEST + #if GLIB_CHECK_VERSION(2, 60, 0) +@@ -426,6 +466,31 @@ test_remove_keys(void) + "algo some-key another\n"); + } + ++static void ++test_get_keys(void) ++{ ++ Error *err = NULL; ++ static const char *authkeys = ++ "algo key1 comments\n" ++ "# a commented line\n" ++ "algo some-key another\n"; ++ g_autoptr(GuestAuthorizedKeys) ret = NULL; ++ strList *k; ++ size_t len = 0; ++ ++ test_authorized_keys_set(authkeys); ++ ++ ret = qmp_guest_ssh_get_authorized_keys(g_get_user_name(), &err); ++ g_assert(err == NULL); ++ ++ for (len = 0, k = ret->keys; k != NULL; k = k->next) { ++ g_assert(g_str_has_prefix(k->value, "algo ")); ++ len++; ++ } ++ ++ g_assert(len == 2); ++} ++ + int main(int argc, char *argv[]) + { + setlocale(LC_ALL, ""); +@@ -437,6 +502,7 @@ int main(int argc, char *argv[]) + g_test_add_func("/qga/ssh/add_keys", test_add_keys); + g_test_add_func("/qga/ssh/add_reset_keys", test_add_reset_keys); + g_test_add_func("/qga/ssh/remove_keys", test_remove_keys); ++ g_test_add_func("/qga/ssh/get_keys", test_get_keys); + + return g_test_run(); + } +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index a70ea5da77..97bf96712e 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -1274,6 +1274,36 @@ + { 'command': 'guest-get-osinfo', + 'returns': 'GuestOSInfo' } + ++## ++# @GuestAuthorizedKeys: ++# ++# @keys: public keys (in OpenSSH/sshd(8) authorized_keys format) ++# ++# Since: 5.2 ++## ++{ 'struct': 'GuestAuthorizedKeys', ++ 'data': { ++ 'keys': ['str'] ++ }, ++ 'if': 'defined(CONFIG_POSIX)' } ++ ++## ++# @guest-ssh-get-authorized-keys: ++# ++# @username: the user account to add the authorized keys ++# ++# Return the public keys from user .ssh/authorized_keys on Unix systems (not ++# implemented for other systems). ++# ++# Returns: @GuestAuthorizedKeys ++# ++# Since: 5.2 ++## ++{ 'command': 'guest-ssh-get-authorized-keys', ++ 'data': { 'username': 'str' }, ++ 'returns': 'GuestAuthorizedKeys', ++ 'if': 'defined(CONFIG_POSIX)' } ++ + ## + # @guest-ssh-add-authorized-keys: + # +-- +2.27.0 + diff --git a/kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch b/kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch new file mode 100755 index 0000000000000000000000000000000000000000..0aa244075b695774bb26aa11714ea018ee9cbe10 --- /dev/null +++ b/kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch @@ -0,0 +1,140 @@ +From 3a63e2d29bb2fd92577d42aeb8fa956ae18df22e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 2 Oct 2020 10:17:41 -0400 +Subject: [PATCH 02/14] qga/commands-posix: Move the udev code from the pci to + the generic function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20201002101742.249169-3-thuth@redhat.com> +Patchwork-id: 98526 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/3] qga/commands-posix: Move the udev code from the pci to the generic function +Bugzilla: 1755075 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +The libudev-related code is independent from the other pci-related code +and can be re-used for non-pci devices (like ccw devices on s390x). Thus +move this part to the generic function. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1755075 +Signed-off-by: Thomas Huth +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Michael Roth +(cherry picked from commit 43dadc431bacbc5a5baee7e256288a98a3e95ce3) +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 62 +++++++++++++++++++++++--------------------- + 1 file changed, 33 insertions(+), 29 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 99d6b1c8c1..6db76aadd1 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -878,10 +878,6 @@ static bool build_guest_fsinfo_for_pci_dev(char const *syspath, + GuestPCIAddress *pciaddr = disk->pci_controller; + bool has_ata = false, has_host = false, has_tgt = false; + char *p, *q, *driver = NULL; +-#ifdef CONFIG_LIBUDEV +- struct udev *udev = NULL; +- struct udev_device *udevice = NULL; +-#endif + bool ret = false; + + p = strstr(syspath, "/devices/pci"); +@@ -940,26 +936,6 @@ static bool build_guest_fsinfo_for_pci_dev(char const *syspath, + pciaddr->slot = pci[2]; + pciaddr->function = pci[3]; + +-#ifdef CONFIG_LIBUDEV +- udev = udev_new(); +- udevice = udev_device_new_from_syspath(udev, syspath); +- if (udev == NULL || udevice == NULL) { +- g_debug("failed to query udev"); +- } else { +- const char *devnode, *serial; +- devnode = udev_device_get_devnode(udevice); +- if (devnode != NULL) { +- disk->dev = g_strdup(devnode); +- disk->has_dev = true; +- } +- serial = udev_device_get_property_value(udevice, "ID_SERIAL"); +- if (serial != NULL && *serial != 0) { +- disk->serial = g_strdup(serial); +- disk->has_serial = true; +- } +- } +-#endif +- + if (strcmp(driver, "ata_piix") == 0) { + /* a host per ide bus, target*:0::0 */ + if (!has_host || !has_tgt) { +@@ -1021,10 +997,6 @@ static bool build_guest_fsinfo_for_pci_dev(char const *syspath, + + cleanup: + g_free(driver); +-#ifdef CONFIG_LIBUDEV +- udev_unref(udev); +- udev_device_unref(udevice); +-#endif + return ret; + } + +@@ -1037,18 +1009,50 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + GuestPCIAddress *pciaddr; + GuestDiskAddressList *list = NULL; + bool has_hwinf; ++#ifdef CONFIG_LIBUDEV ++ struct udev *udev = NULL; ++ struct udev_device *udevice = NULL; ++#endif + + pciaddr = g_new0(GuestPCIAddress, 1); ++ pciaddr->domain = -1; /* -1 means field is invalid */ ++ pciaddr->bus = -1; ++ pciaddr->slot = -1; ++ pciaddr->function = -1; + + disk = g_new0(GuestDiskAddress, 1); + disk->pci_controller = pciaddr; ++ disk->bus_type = GUEST_DISK_BUS_TYPE_UNKNOWN; + + list = g_new0(GuestDiskAddressList, 1); + list->value = disk; + ++#ifdef CONFIG_LIBUDEV ++ udev = udev_new(); ++ udevice = udev_device_new_from_syspath(udev, syspath); ++ if (udev == NULL || udevice == NULL) { ++ g_debug("failed to query udev"); ++ } else { ++ const char *devnode, *serial; ++ devnode = udev_device_get_devnode(udevice); ++ if (devnode != NULL) { ++ disk->dev = g_strdup(devnode); ++ disk->has_dev = true; ++ } ++ serial = udev_device_get_property_value(udevice, "ID_SERIAL"); ++ if (serial != NULL && *serial != 0) { ++ disk->serial = g_strdup(serial); ++ disk->has_serial = true; ++ } ++ } ++ ++ udev_unref(udev); ++ udev_device_unref(udevice); ++#endif ++ + has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); + +- if (has_hwinf) { ++ if (has_hwinf || disk->has_dev || disk->has_serial) { + list->next = fs->disk; + fs->disk = list; + } else { +-- +2.27.0 + diff --git a/kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch b/kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch new file mode 100755 index 0000000000000000000000000000000000000000..9915334955220eb7bfe8b7d4b18015caf8ab8a13 --- /dev/null +++ b/kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch @@ -0,0 +1,156 @@ +From 84bc86fdf47729bca77957a04161862ffbedbf2f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 2 Oct 2020 10:17:40 -0400 +Subject: [PATCH 01/14] qga/commands-posix: Rework + build_guest_fsinfo_for_real_device() function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Message-id: <20201002101742.249169-2-thuth@redhat.com> +Patchwork-id: 98527 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/3] qga/commands-posix: Rework build_guest_fsinfo_for_real_device() function +Bugzilla: 1755075 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +We are going to support non-PCI devices soon. For this we need to split +the generic GuestDiskAddress and GuestDiskAddressList memory allocation +and list chaining into a separate function first. + +Signed-off-by: Thomas Huth +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Michael Roth +(cherry picked from commit d9fe4f0fea31f0560dc40d3576bc6c48ad97109f) +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 65 ++++++++++++++++++++++++++++---------------- + 1 file changed, 41 insertions(+), 24 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 1c1a165dae..99d6b1c8c1 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -865,28 +865,30 @@ static int build_hosts(char const *syspath, char const *host, bool ata, + return i; + } + +-/* Store disk device info specified by @sysfs into @fs */ +-static void build_guest_fsinfo_for_real_device(char const *syspath, +- GuestFilesystemInfo *fs, +- Error **errp) ++/* ++ * Store disk device info for devices on the PCI bus. ++ * Returns true if information has been stored, or false for failure. ++ */ ++static bool build_guest_fsinfo_for_pci_dev(char const *syspath, ++ GuestDiskAddress *disk, ++ Error **errp) + { + unsigned int pci[4], host, hosts[8], tgt[3]; + int i, nhosts = 0, pcilen; +- GuestDiskAddress *disk; +- GuestPCIAddress *pciaddr; +- GuestDiskAddressList *list = NULL; ++ GuestPCIAddress *pciaddr = disk->pci_controller; + bool has_ata = false, has_host = false, has_tgt = false; + char *p, *q, *driver = NULL; + #ifdef CONFIG_LIBUDEV + struct udev *udev = NULL; + struct udev_device *udevice = NULL; + #endif ++ bool ret = false; + + p = strstr(syspath, "/devices/pci"); + if (!p || sscanf(p + 12, "%*x:%*x/%x:%x:%x.%x%n", + pci, pci + 1, pci + 2, pci + 3, &pcilen) < 4) { + g_debug("only pci device is supported: sysfs path '%s'", syspath); +- return; ++ return false; + } + + p += 12 + pcilen; +@@ -907,7 +909,7 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + } + + g_debug("unsupported driver or sysfs path '%s'", syspath); +- return; ++ return false; + } + + p = strstr(syspath, "/target"); +@@ -933,18 +935,11 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + } + } + +- pciaddr = g_malloc0(sizeof(*pciaddr)); + pciaddr->domain = pci[0]; + pciaddr->bus = pci[1]; + pciaddr->slot = pci[2]; + pciaddr->function = pci[3]; + +- disk = g_malloc0(sizeof(*disk)); +- disk->pci_controller = pciaddr; +- +- list = g_malloc0(sizeof(*list)); +- list->value = disk; +- + #ifdef CONFIG_LIBUDEV + udev = udev_new(); + udevice = udev_device_new_from_syspath(udev, syspath); +@@ -1022,21 +1017,43 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + goto cleanup; + } + +- list->next = fs->disk; +- fs->disk = list; +- goto out; ++ ret = true; + + cleanup: +- if (list) { +- qapi_free_GuestDiskAddressList(list); +- } +-out: + g_free(driver); + #ifdef CONFIG_LIBUDEV + udev_unref(udev); + udev_device_unref(udevice); + #endif +- return; ++ return ret; ++} ++ ++/* Store disk device info specified by @sysfs into @fs */ ++static void build_guest_fsinfo_for_real_device(char const *syspath, ++ GuestFilesystemInfo *fs, ++ Error **errp) ++{ ++ GuestDiskAddress *disk; ++ GuestPCIAddress *pciaddr; ++ GuestDiskAddressList *list = NULL; ++ bool has_hwinf; ++ ++ pciaddr = g_new0(GuestPCIAddress, 1); ++ ++ disk = g_new0(GuestDiskAddress, 1); ++ disk->pci_controller = pciaddr; ++ ++ list = g_new0(GuestDiskAddressList, 1); ++ list->value = disk; ++ ++ has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); ++ ++ if (has_hwinf) { ++ list->next = fs->disk; ++ fs->disk = list; ++ } else { ++ qapi_free_GuestDiskAddressList(list); ++ } + } + + static void build_guest_fsinfo_for_device(char const *devpath, +-- +2.27.0 + diff --git a/kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch b/kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch new file mode 100755 index 0000000000000000000000000000000000000000..0d37a64d199a45b8d951bb577cbe05db5c750fd6 --- /dev/null +++ b/kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch @@ -0,0 +1,94 @@ +From 250227a53c1d43d2bd8346922edb3452f3534be6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 2 Oct 2020 10:17:42 -0400 +Subject: [PATCH 03/14] qga/commands-posix: Support fsinfo for non-PCI virtio + devices, too +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20201002101742.249169-4-thuth@redhat.com> +Patchwork-id: 98528 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/3] qga/commands-posix: Support fsinfo for non-PCI virtio devices, too +Bugzilla: 1755075 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +QEMU on s390x uses virtio via channel I/O instead of PCI by default. +Add a function to detect and provide information for virtio-scsi and +virtio-block devices here, too. + +Signed-off-by: Thomas Huth +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Michael Roth +(cherry picked from commit 23843c129d5e1ca360605e511a43a34faebb47c4) +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 42 +++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 41 insertions(+), 1 deletion(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 6db76aadd1..c86c87ed52 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1000,6 +1000,39 @@ cleanup: + return ret; + } + ++/* ++ * Store disk device info for non-PCI virtio devices (for example s390x ++ * channel I/O devices). Returns true if information has been stored, or ++ * false for failure. ++ */ ++static bool build_guest_fsinfo_for_nonpci_virtio(char const *syspath, ++ GuestDiskAddress *disk, ++ Error **errp) ++{ ++ unsigned int tgt[3]; ++ char *p; ++ ++ if (!strstr(syspath, "/virtio") || !strstr(syspath, "/block")) { ++ g_debug("Unsupported virtio device '%s'", syspath); ++ return false; ++ } ++ ++ p = strstr(syspath, "/target"); ++ if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u", ++ &tgt[0], &tgt[1], &tgt[2]) == 3) { ++ /* virtio-scsi: target*:0:: */ ++ disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI; ++ disk->bus = tgt[0]; ++ disk->target = tgt[1]; ++ disk->unit = tgt[2]; ++ } else { ++ /* virtio-blk: 1 disk per 1 device */ ++ disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO; ++ } ++ ++ return true; ++} ++ + /* Store disk device info specified by @sysfs into @fs */ + static void build_guest_fsinfo_for_real_device(char const *syspath, + GuestFilesystemInfo *fs, +@@ -1050,7 +1083,14 @@ static void build_guest_fsinfo_for_real_device(char const *syspath, + udev_device_unref(udevice); + #endif + +- has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); ++ if (strstr(syspath, "/devices/pci")) { ++ has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp); ++ } else if (strstr(syspath, "/virtio")) { ++ has_hwinf = build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp); ++ } else { ++ g_debug("Unsupported device type for '%s'", syspath); ++ has_hwinf = false; ++ } + + if (has_hwinf || disk->has_dev || disk->has_serial) { + list->next = fs->disk; +-- +2.27.0 + diff --git a/kvm-qga-fix-assert-regression-on-guest-shutdown.patch b/kvm-qga-fix-assert-regression-on-guest-shutdown.patch new file mode 100755 index 0000000000000000000000000000000000000000..7db6e1fe49afd59c144630f5206c50d40738a102 --- /dev/null +++ b/kvm-qga-fix-assert-regression-on-guest-shutdown.patch @@ -0,0 +1,61 @@ +From 93b37bad75d14ed4b9e96cc3587d8ae16cb96ba3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 2 Oct 2020 17:46:08 -0400 +Subject: [PATCH 01/18] qga: fix assert regression on guest-shutdown +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201002174608.943992-2-marcandre.lureau@redhat.com> +Patchwork-id: 98534 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] qga: fix assert regression on guest-shutdown +Bugzilla: 1884531 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Marc-André Lureau + +Since commit 781f2b3d1e ("qga: process_event() simplification"), +send_response() is called unconditionally, but will assert when "rsp" is +NULL. This may happen with QCO_NO_SUCCESS_RESP commands, such as +"guest-shutdown". + +Fixes: 781f2b3d1e5ef389b44016a897fd55e7a780bf35 +Cc: Michael Roth +Reported-by: Christian Ehrhardt +Signed-off-by: Marc-André Lureau +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Christian Ehrhardt +Tested-by: Christian Ehrhardt +Cc: qemu-stable@nongnu.org +Signed-off-by: Michael Roth + +(cherry picked from commit 844bd70b5652f30bbace89499f513e3fbbb6457a) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/main.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/qga/main.c b/qga/main.c +index c35c2a21209..12fa463f4cd 100644 +--- a/qga/main.c ++++ b/qga/main.c +@@ -529,7 +529,11 @@ static int send_response(GAState *s, const QDict *rsp) + QString *payload_qstr, *response_qstr; + GIOStatus status; + +- g_assert(rsp && s->channel); ++ g_assert(s->channel); ++ ++ if (!rsp) { ++ return 0; ++ } + + payload_qstr = qobject_to_json(QOBJECT(rsp)); + if (!payload_qstr) { +-- +2.27.0 + diff --git a/kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch b/kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch new file mode 100755 index 0000000000000000000000000000000000000000..6ffc5bdcaa9b17bcb8d7dbfc70f9e530a833c6ff --- /dev/null +++ b/kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch @@ -0,0 +1,54 @@ +From c9b1eb9d6c0da9098d5410d90d290d6fca6ea7dc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:14 -0500 +Subject: [PATCH 13/14] qga: fix missing closedir() in qmp_guest_get_disks() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-10-marcandre.lureau@redhat.com> +Patchwork-id: 100481 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 09/10] qga: fix missing closedir() in qmp_guest_get_disks() +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Michael Roth + +We opendir("/sys/block") at the beginning of the function, but we never +close it prior to returning. + +Fixes: Coverity CID 1436130 +Fixes: fed3956429d5 ("qga: add implementation of guest-get-disks for Linux") +Reported-by: Peter Maydell +Cc: Marc-André Lureau +Cc: Tomáš Golembiovský +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Michael Roth + +(cherry-picked from commit b1b9ab1c04d560f86d8da3dfca4d8b21de75fee6) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 96f5ddafd3a..9a170dee14c 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1445,6 +1445,9 @@ GuestDiskInfoList *qmp_guest_get_disks(Error **errp) + get_disk_deps(disk_dir, disk); + ret = get_disk_partitions(ret, de->d_name, disk_dir, dev_name); + } ++ ++ closedir(dp); ++ + return ret; + } + +-- +2.27.0 + diff --git a/kvm-qga-rename-Error-parameter-to-more-common-errp.patch b/kvm-qga-rename-Error-parameter-to-more-common-errp.patch new file mode 100755 index 0000000000000000000000000000000000000000..2528d26d634da2ebbb2f19fb8fc055a140896cad --- /dev/null +++ b/kvm-qga-rename-Error-parameter-to-more-common-errp.patch @@ -0,0 +1,121 @@ +From 457ba062cc1026a88a70ab3cb9a52acd62c5a2a8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 24 Dec 2020 12:53:02 -0500 +Subject: [PATCH 2/5] qga: rename Error ** parameter to more common errp +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201224125304.62697-2-marcandre.lureau@redhat.com> +Patchwork-id: 100498 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/3] qga: rename Error ** parameter to more common errp +Bugzilla: 1910326 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Vladimir Sementsov-Ogievskiy + +Signed-off-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20191205174635.18758-13-vsementsov@virtuozzo.com> +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Markus Armbruster +Signed-off-by: Markus Armbruster + +(cherry picked from commit b90abbac0b95f68a7ebac5545ab77b98f598a9c7) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 2 +- + qga/commands-win32.c | 2 +- + qga/commands.c | 12 ++++++------ + 3 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index c02373cdf7d..29353e90c8f 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -3134,7 +3134,7 @@ static double ga_get_login_time(struct utmpx *user_info) + return seconds + useconds; + } + +-GuestUserList *qmp_guest_get_users(Error **err) ++GuestUserList *qmp_guest_get_users(Error **errp) + { + GHashTable *cache = NULL; + GuestUserList *head = NULL, *cur_item = NULL; +diff --git a/qga/commands-win32.c b/qga/commands-win32.c +index a07725e874b..618ccdfadaa 100644 +--- a/qga/commands-win32.c ++++ b/qga/commands-win32.c +@@ -2047,7 +2047,7 @@ typedef struct _GA_WTSINFOA { + + } GA_WTSINFOA; + +-GuestUserList *qmp_guest_get_users(Error **err) ++GuestUserList *qmp_guest_get_users(Error **errp) + { + #define QGA_NANOSECONDS 10000000 + +diff --git a/qga/commands.c b/qga/commands.c +index 0c7d1385c23..43c323ceada 100644 +--- a/qga/commands.c ++++ b/qga/commands.c +@@ -143,7 +143,7 @@ static GuestExecInfo *guest_exec_info_find(int64_t pid_numeric) + return NULL; + } + +-GuestExecStatus *qmp_guest_exec_status(int64_t pid, Error **err) ++GuestExecStatus *qmp_guest_exec_status(int64_t pid, Error **errp) + { + GuestExecInfo *gei; + GuestExecStatus *ges; +@@ -152,7 +152,7 @@ GuestExecStatus *qmp_guest_exec_status(int64_t pid, Error **err) + + gei = guest_exec_info_find(pid); + if (gei == NULL) { +- error_setg(err, QERR_INVALID_PARAMETER, "pid"); ++ error_setg(errp, QERR_INVALID_PARAMETER, "pid"); + return NULL; + } + +@@ -385,7 +385,7 @@ GuestExec *qmp_guest_exec(const char *path, + bool has_env, strList *env, + bool has_input_data, const char *input_data, + bool has_capture_output, bool capture_output, +- Error **err) ++ Error **errp) + { + GPid pid; + GuestExec *ge = NULL; +@@ -405,7 +405,7 @@ GuestExec *qmp_guest_exec(const char *path, + arglist.next = has_arg ? arg : NULL; + + if (has_input_data) { +- input = qbase64_decode(input_data, -1, &ninput, err); ++ input = qbase64_decode(input_data, -1, &ninput, errp); + if (!input) { + return NULL; + } +@@ -424,7 +424,7 @@ GuestExec *qmp_guest_exec(const char *path, + guest_exec_task_setup, NULL, &pid, has_input_data ? &in_fd : NULL, + has_output ? &out_fd : NULL, has_output ? &err_fd : NULL, &gerr); + if (!ret) { +- error_setg(err, QERR_QGA_COMMAND_FAILED, gerr->message); ++ error_setg(errp, QERR_QGA_COMMAND_FAILED, gerr->message); + g_error_free(gerr); + goto done; + } +@@ -499,7 +499,7 @@ int ga_parse_whence(GuestFileWhence *whence, Error **errp) + return -1; + } + +-GuestHostName *qmp_guest_get_host_name(Error **err) ++GuestHostName *qmp_guest_get_host_name(Error **errp) + { + GuestHostName *result = NULL; + gchar const *hostname = g_get_host_name(); +-- +2.27.0 + diff --git a/kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch b/kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch new file mode 100755 index 0000000000000000000000000000000000000000..727015e5e0a72586cd5c12fe238d308670e3b4ad --- /dev/null +++ b/kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch @@ -0,0 +1,113 @@ +From ff881d64d3f29825ab093eb2be183658226ccba3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 16 Dec 2020 16:06:15 -0500 +Subject: [PATCH 14/14] qga: update schema for guest-get-disks 'dependents' + field +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201216160615.324213-11-marcandre.lureau@redhat.com> +Patchwork-id: 100480 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 10/10] qga: update schema for guest-get-disks 'dependents' field +Bugzilla: 1859494 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Stefan Hajnoczi + +From: Michael Roth + +The recently-added 'guest-get-disk' command returns a list of +GuestDiskInfo entries, which in turn have a 'dependents' field which +lists devices these entries are dependent upon. Thus, 'dependencies' +is a better name for this field. Address this by renaming the field +accordingly. + +Additionally, 'dependents' is specified as non-optional, even though +it's not implemented for w32. This is misleading, since it gives users +the impression that a particular disk might not have dependencies, +when in reality that information is simply not known to the guest +agent. Address this by making 'dependents' an optional field, and only +marking it as in-use when the facilities to obtain this information are +available to the guest agent. + +Cc: Eric Blake +Cc: Tomáš Golembiovský +Cc: Marc-André Lureau +Reviewed-by: Eric Blake +Reviewed-by: Marc-André Lureau +Signed-off-by: Michael Roth + +(cherry-picked from commit a8aa94b5f8427cc2924d8cdd417c8014db1c86c0) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + qga/commands-posix.c | 10 ++++++---- + qga/qapi-schema.json | 8 ++++---- + 2 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 9a170dee14c..c02373cdf7d 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1287,6 +1287,7 @@ static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) + g_debug("failed to list entries in %s", deps_dir); + return; + } ++ disk->has_dependencies = true; + while ((dep = g_dir_read_name(dp_deps)) != NULL) { + g_autofree char *dep_dir = NULL; + strList *dep_item = NULL; +@@ -1299,8 +1300,8 @@ static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk) + g_debug(" adding dependent device: %s", dev_name); + dep_item = g_new0(strList, 1); + dep_item->value = dev_name; +- dep_item->next = disk->dependents; +- disk->dependents = dep_item; ++ dep_item->next = disk->dependencies; ++ disk->dependencies = dep_item; + } + } + g_dir_close(dp_deps); +@@ -1353,8 +1354,9 @@ static GuestDiskInfoList *get_disk_partitions( + partition->name = dev_name; + partition->partition = true; + /* Add parent disk as dependent for easier tracking of hierarchy */ +- partition->dependents = g_new0(strList, 1); +- partition->dependents->value = g_strdup(disk_dev); ++ partition->dependencies = g_new0(strList, 1); ++ partition->dependencies->value = g_strdup(disk_dev); ++ partition->has_dependencies = true; + + item = g_new0(GuestDiskInfoList, 1); + item->value = partition; +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index 22df375c92f..4222cb92d34 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -857,9 +857,9 @@ + # + # @name: device node (Linux) or device UNC (Windows) + # @partition: whether this is a partition or disk +-# @dependents: list of dependent devices; e.g. for LVs of the LVM this will +-# hold the list of PVs, for LUKS encrypted volume this will +-# contain the disk where the volume is placed. (Linux) ++# @dependencies: list of device dependencies; e.g. for LVs of the LVM this will ++# hold the list of PVs, for LUKS encrypted volume this will ++# contain the disk where the volume is placed. (Linux) + # @address: disk address information (only for non-virtual devices) + # @alias: optional alias assigned to the disk, on Linux this is a name assigned + # by device mapper +@@ -867,7 +867,7 @@ + # Since 5.2 + ## + { 'struct': 'GuestDiskInfo', +- 'data': {'name': 'str', 'partition': 'bool', 'dependents': ['str'], ++ 'data': {'name': 'str', 'partition': 'bool', '*dependencies': ['str'], + '*address': 'GuestDiskAddress', '*alias': 'str'} } + + ## +-- +2.27.0 + diff --git a/kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch b/kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch new file mode 100755 index 0000000000000000000000000000000000000000..5384b5160513f2ed6f612048b6ca1b38e8f4feb1 --- /dev/null +++ b/kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch @@ -0,0 +1,55 @@ +From 5d590d354e42515ea074bf2110a2ab236dbabba1 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 8 Jun 2020 15:01:34 +0100 +Subject: [PATCH 06/17] raw-format: Support BDRV_REQ_ZERO_WRITE for truncate + +RH-Author: Kevin Wolf +Message-id: <20200608150140.38218-6-kwolf@redhat.com> +Patchwork-id: 97447 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 05/11] raw-format: Support BDRV_REQ_ZERO_WRITE for truncate +Bugzilla: 1780574 +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Eric Blake +RH-Acked-by: Max Reitz + +The raw format driver can simply forward the flag and let its bs->file +child take care of actually providing the zeros. + +Signed-off-by: Kevin Wolf +Reviewed-by: Max Reitz +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Message-Id: <20200424125448.63318-6-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 1ddaabaecb7eaeb6d8948a32340af95db44c54a1) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/raw-format.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/block/raw-format.c b/block/raw-format.c +index c3acf9a..bdec466 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -387,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, + + s->size = offset; + offset += s->offset; +- return bdrv_co_truncate(bs->file, offset, exact, prealloc, 0, errp); ++ return bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + } + + static void raw_eject(BlockDriverState *bs, bool eject_flag) +@@ -445,6 +445,8 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & + bs->file->bs->supported_zero_flags); ++ bs->supported_truncate_flags = bs->file->bs->supported_truncate_flags & ++ BDRV_REQ_ZERO_WRITE; + + if (bs->probed && !bdrv_is_read_only(bs)) { + bdrv_refresh_filename(bs->file->bs); +-- +1.8.3.1 + diff --git a/kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch b/kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch new file mode 100755 index 0000000000000000000000000000000000000000..55be349ee7b4697b5c0b9882a5b05798c5c52432 --- /dev/null +++ b/kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch @@ -0,0 +1,72 @@ +From b07219611480dd4a37b2476604a1cec35c812216 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Wed, 23 Dec 2020 12:29:24 -0500 +Subject: [PATCH 1/5] redhat: link /etc/qemu-ga/fsfreeze-hook to /etc/qemu-kvm/ +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201223122924.341944-1-marcandre.lureau@redhat.com> +Patchwork-id: 100496 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH] redhat: link /etc/qemu-ga/fsfreeze-hook to /etc/qemu-kvm/ +Bugzilla: 1910267 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Danilo de Paula + +From: Danilo de Paula + +BZ: 1910267 +BRANCH: rhel-8.4.0 +UPSTREAM: RHEL-only +BREW: 33929331 + +When qemu-ga was introduced to RHEL-8, we used the qemu-guest-agent +from RHEL-7 as base. + +In RHEL-7, qemu-guest-agent is built as standalone package. +It's built as "qemu-ga", hence the "qemu-ga" folders. + +For RHEL-8, that should have been renamed to qemu-kvm, but I missed it. +Renaming those folders to /etc/qemu-kvm is a no go today, because +users might have populated the /etc/qemu-ga/fsfreeze-hook.d folder. + +So, in order to make qemu-ga -F works in RHEL-8, a link is being +created in the expected place, pointing to the real one. + +Also, fsfreeze-hook opens up the fsfreeze-hook.d on the same PATH where +it is stored. However, it doesn't follow symlinks. In order to fix this, +I had to change it to make sure it follows the link. + +An option would be to also link the fsfreeze-hook.d folder, but I choose +not to do so as it creates a permanent/visible change in users +environments. The downside is to keep another downstream-only change. + +Signed-off-by: Danilo C. L. de Paula + +[ cherry-picked from commit 020501879841afb788087f0455df79367c0337a0 ] +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + redhat/qemu-kvm.spec.template | 6 ++++++ + scripts/qemu-guest-agent/fsfreeze-hook | 2 +- + 2 files changed, 7 insertions(+), 1 deletion(-) + + +diff --git a/scripts/qemu-guest-agent/fsfreeze-hook b/scripts/qemu-guest-agent/fsfreeze-hook +index 13aafd48451..e9b84ec0284 100755 +--- a/scripts/qemu-guest-agent/fsfreeze-hook ++++ b/scripts/qemu-guest-agent/fsfreeze-hook +@@ -8,7 +8,7 @@ + # request, it is issued with "thaw" argument after filesystem is thawed. + + LOGFILE=/var/log/qga-fsfreeze-hook.log +-FSFREEZE_D=$(dirname -- "$0")/fsfreeze-hook.d ++FSFREEZE_D=$(dirname -- "$(realpath $0)")/fsfreeze-hook.d + + # Check whether file $1 is a backup or rpm-generated file and should be ignored + is_ignored_file() { +-- +2.27.0 + diff --git a/kvm-replication-assert-we-own-context-before-job_cancel_.patch b/kvm-replication-assert-we-own-context-before-job_cancel_.patch new file mode 100755 index 0000000000000000000000000000000000000000..09ef4de993741d1f3780bb726edb79bc22f82ec5 --- /dev/null +++ b/kvm-replication-assert-we-own-context-before-job_cancel_.patch @@ -0,0 +1,57 @@ +From 46887feac666d0d7633ff3f5af5721fe2a80a8ab Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 8 Apr 2020 17:29:13 +0100 +Subject: [PATCH 2/6] replication: assert we own context before job_cancel_sync + +RH-Author: Kevin Wolf +Message-id: <20200408172917.18712-3-kwolf@redhat.com> +Patchwork-id: 94595 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/6] replication: assert we own context before job_cancel_sync +Bugzilla: 1817621 +RH-Acked-by: Eric Blake +RH-Acked-by: Danilo de Paula +RH-Acked-by: Max Reitz + +From: Stefan Reiter + +job_cancel_sync requires the job's lock to be held, all other callers +already do this (replication_stop, drive_backup_abort, +blockdev_backup_abort, job_cancel_sync_all, cancel_common). + +In this case we're in a BlockDriver handler, so we already have a lock, +just assert that it is the same as the one used for the commit_job. + +Signed-off-by: Stefan Reiter +Message-Id: <20200407115651.69472-3-s.reiter@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 08558e33257ec796594bd411261028a93414a70c) +Signed-off-by: Kevin Wolf +Signed-off-by: Danilo C. L. de Paula +--- + block/replication.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/block/replication.c b/block/replication.c +index 99532ce..0ce27ee 100644 +--- a/block/replication.c ++++ b/block/replication.c +@@ -144,12 +144,15 @@ fail: + static void replication_close(BlockDriverState *bs) + { + BDRVReplicationState *s = bs->opaque; ++ Job *commit_job; + + if (s->stage == BLOCK_REPLICATION_RUNNING) { + replication_stop(s->rs, false, NULL); + } + if (s->stage == BLOCK_REPLICATION_FAILOVER) { +- job_cancel_sync(&s->commit_job->job); ++ commit_job = &s->commit_job->job; ++ assert(commit_job->aio_context == qemu_get_current_aio_context()); ++ job_cancel_sync(commit_job); + } + + if (s->mode == REPLICATION_MODE_SECONDARY) { +-- +1.8.3.1 + diff --git a/kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch b/kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch new file mode 100755 index 0000000000000000000000000000000000000000..917e3ff9ff22dc19c572a55cdc0133ecda3611e3 --- /dev/null +++ b/kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch @@ -0,0 +1,54 @@ +From 4079c4e96f910fe7e57af13feb433f06246f1d79 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:44 -0400 +Subject: [PATCH 6/9] rtl8139: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-7-jmaloy@redhat.com> +Patchwork-id: 101792 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 6/9] rtl8139: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Alexander Bulekov + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Buglink: https://bugs.launchpad.net/qemu/+bug/1910826 +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 5311fb805a4403bba024e83886fa0e7572265de4) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/rtl8139.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c +index 21d80e96cf..ccb04faa4c 100644 +--- a/hw/net/rtl8139.c ++++ b/hw/net/rtl8139.c +@@ -1793,7 +1793,7 @@ static void rtl8139_transfer_frame(RTL8139State *s, uint8_t *buf, int size, + } + + DPRINTF("+++ transmit loopback mode\n"); +- rtl8139_do_receive(qemu_get_queue(s->nic), buf, size, do_interrupt); ++ qemu_receive_packet(qemu_get_queue(s->nic), buf, size); + + if (iov) { + g_free(buf2); +-- +2.27.0 + diff --git a/kvm-s390-guest-support-for-diagnose-0x318.patch b/kvm-s390-guest-support-for-diagnose-0x318.patch new file mode 100755 index 0000000000000000000000000000000000000000..84fc7bc052ce9184d8af5f10f70dc0845e9a6362 --- /dev/null +++ b/kvm-s390-guest-support-for-diagnose-0x318.patch @@ -0,0 +1,282 @@ +From 7ad1c4aaea6cd202449c05fc0034af6b108def4f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:14 -0500 +Subject: [PATCH 14/18] s390: guest support for diagnose 0x318 + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-11-thuth@redhat.com> +Patchwork-id: 99507 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 10/12] s390: guest support for diagnose 0x318 +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +DIAGNOSE 0x318 (diag318) is an s390 instruction that allows the storage +of diagnostic information that is collected by the firmware in the case +of hardware/firmware service events. + +QEMU handles the instruction by storing the info in the CPU state. A +subsequent register sync will communicate the data to the hypervisor. + +QEMU handles the migration via a VM State Description. + +This feature depends on the Extended-Length SCCB (els) feature. If +els is not present, then a warning will be printed and the SCLP bit +that allows the Linux kernel to execute the instruction will not be +set. + +Availability of this instruction is determined by byte 134 (aka fac134) +bit 0 of the SCLP Read Info block. This coincidentally expands into the +space used for CPU entries, which means VMs running with the diag318 +capability may not be able to read information regarding all CPUs +unless the guest kernel supports an extended-length SCCB. + +This feature is not supported in protected virtualization mode. + +Signed-off-by: Collin Walling +Acked-by: Janosch Frank +Acked-by: Thomas Huth +Acked-by: David Hildenbrand +Acked-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-9-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit fabdada9357b9cfd980c7744ddce47e34600bbef) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 5 ++++ + include/hw/s390x/sclp.h | 8 ++++++ + target/s390x/cpu.h | 2 ++ + target/s390x/cpu_features.h | 1 + + target/s390x/cpu_features_def.inc.h | 3 +++ + target/s390x/cpu_models.c | 1 + + target/s390x/gen-features.c | 1 + + target/s390x/kvm.c | 39 +++++++++++++++++++++++++++++ + target/s390x/machine.c | 17 +++++++++++++ + 9 files changed, 77 insertions(+) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 8d111628e04..2931046f456 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -139,6 +139,11 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + s390_get_feat_block(S390_FEAT_TYPE_SCLP_CONF_CHAR_EXT, + read_info->conf_char_ext); + ++ if (s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB)) { ++ s390_get_feat_block(S390_FEAT_TYPE_SCLP_FAC134, ++ &read_info->fac134); ++ } ++ + read_info->facilities = cpu_to_be64(SCLP_HAS_CPU_INFO | + SCLP_HAS_IOA_RECONFIG); + +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index 62e2aa1d9f1..addd904e5f4 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -133,7 +133,15 @@ typedef struct ReadInfo { + uint16_t highest_cpu; + uint8_t _reserved5[124 - 122]; /* 122-123 */ + uint32_t hmfai; ++ uint8_t _reserved7[134 - 128]; /* 128-133 */ ++ uint8_t fac134; ++ uint8_t _reserved8[144 - 135]; /* 135-143 */ + struct CPUEntry entries[]; ++ /* ++ * When the Extended-Length SCCB (ELS) feature is enabled the ++ * start of the entries field begins at an offset denoted by the ++ * offset_cpu field, otherwise it's at an offset of 128. ++ */ + } QEMU_PACKED ReadInfo; + + typedef struct ReadCpuInfo { +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index a48e655c4d4..1dc21cd311d 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -117,6 +117,8 @@ struct CPUS390XState { + uint16_t external_call_addr; + DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS); + ++ uint64_t diag318_info; ++ + /* Fields up to this point are cleared by a CPU reset */ + struct {} end_reset_fields; + +diff --git a/target/s390x/cpu_features.h b/target/s390x/cpu_features.h +index da695a8346e..f74f7fc3a11 100644 +--- a/target/s390x/cpu_features.h ++++ b/target/s390x/cpu_features.h +@@ -23,6 +23,7 @@ typedef enum { + S390_FEAT_TYPE_STFL, + S390_FEAT_TYPE_SCLP_CONF_CHAR, + S390_FEAT_TYPE_SCLP_CONF_CHAR_EXT, ++ S390_FEAT_TYPE_SCLP_FAC134, + S390_FEAT_TYPE_SCLP_CPU, + S390_FEAT_TYPE_MISC, + S390_FEAT_TYPE_PLO, +diff --git a/target/s390x/cpu_features_def.inc.h b/target/s390x/cpu_features_def.inc.h +index 3548d65a69a..cf7e04ee44f 100644 +--- a/target/s390x/cpu_features_def.inc.h ++++ b/target/s390x/cpu_features_def.inc.h +@@ -122,6 +122,9 @@ DEF_FEAT(SIE_CMMA, "cmma", SCLP_CONF_CHAR_EXT, 1, "SIE: Collaborative-memory-man + DEF_FEAT(SIE_PFMFI, "pfmfi", SCLP_CONF_CHAR_EXT, 9, "SIE: PFMF interpretation facility") + DEF_FEAT(SIE_IBS, "ibs", SCLP_CONF_CHAR_EXT, 10, "SIE: Interlock-and-broadcast-suppression facility") + ++/* Features exposed via SCLP SCCB Facilities byte 134 (bit numbers relative to byte-134) */ ++DEF_FEAT(DIAG_318, "diag318", SCLP_FAC134, 0, "Control program name and version codes") ++ + /* Features exposed via SCLP CPU info. */ + DEF_FEAT(SIE_F2, "sief2", SCLP_CPU, 4, "SIE: interception format 2 (Virtual SIE)") + DEF_FEAT(SIE_SKEY, "skey", SCLP_CPU, 5, "SIE: Storage-key facility") +diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c +index be718220d79..bf6a3faba9e 100644 +--- a/target/s390x/cpu_models.c ++++ b/target/s390x/cpu_models.c +@@ -823,6 +823,7 @@ static void check_consistency(const S390CPUModel *model) + { S390_FEAT_PTFF_STOE, S390_FEAT_MULTIPLE_EPOCH }, + { S390_FEAT_PTFF_STOUE, S390_FEAT_MULTIPLE_EPOCH }, + { S390_FEAT_AP_QUEUE_INTERRUPT_CONTROL, S390_FEAT_AP }, ++ { S390_FEAT_DIAG_318, S390_FEAT_EXTENDED_LENGTH_SCCB }, + }; + int i; + +diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c +index 6857f657fba..a1f0a6f3c6f 100644 +--- a/target/s390x/gen-features.c ++++ b/target/s390x/gen-features.c +@@ -523,6 +523,7 @@ static uint16_t full_GEN12_GA1[] = { + S390_FEAT_AP_FACILITIES_TEST, + S390_FEAT_AP, + S390_FEAT_EXTENDED_LENGTH_SCCB, ++ S390_FEAT_DIAG_318, + }; + + static uint16_t full_GEN12_GA2[] = { +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index ef437acb5c1..e5e190d21c9 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -105,6 +105,7 @@ + + #define DIAG_TIMEREVENT 0x288 + #define DIAG_IPL 0x308 ++#define DIAG_SET_CONTROL_PROGRAM_CODES 0x318 + #define DIAG_KVM_HYPERCALL 0x500 + #define DIAG_KVM_BREAKPOINT 0x501 + +@@ -602,6 +603,11 @@ int kvm_arch_put_registers(CPUState *cs, int level) + cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_ETOKEN; + } + ++ if (can_sync_regs(cs, KVM_SYNC_DIAG318)) { ++ cs->kvm_run->s.regs.diag318 = env->diag318_info; ++ cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ } ++ + /* Finally the prefix */ + if (can_sync_regs(cs, KVM_SYNC_PREFIX)) { + cs->kvm_run->s.regs.prefix = env->psa; +@@ -741,6 +747,10 @@ int kvm_arch_get_registers(CPUState *cs) + } + } + ++ if (can_sync_regs(cs, KVM_SYNC_DIAG318)) { ++ env->diag318_info = cs->kvm_run->s.regs.diag318; ++ } ++ + return 0; + } + +@@ -1601,6 +1611,27 @@ static int handle_sw_breakpoint(S390CPU *cpu, struct kvm_run *run) + return -ENOENT; + } + ++static void handle_diag_318(S390CPU *cpu, struct kvm_run *run) ++{ ++ uint64_t reg = (run->s390_sieic.ipa & 0x00f0) >> 4; ++ uint64_t diag318_info = run->s.regs.gprs[reg]; ++ ++ /* ++ * DIAG 318 can only be enabled with KVM support. As such, let's ++ * ensure a guest cannot execute this instruction erroneously. ++ */ ++ if (!s390_has_feat(S390_FEAT_DIAG_318)) { ++ kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION); ++ } ++ ++ cpu->env.diag318_info = diag318_info; ++ ++ if (can_sync_regs(CPU(cpu), KVM_SYNC_DIAG318)) { ++ run->s.regs.diag318 = diag318_info; ++ run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ } ++} ++ + #define DIAG_KVM_CODE_MASK 0x000000000000ffff + + static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb) +@@ -1620,6 +1651,9 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb) + case DIAG_IPL: + kvm_handle_diag_308(cpu, run); + break; ++ case DIAG_SET_CONTROL_PROGRAM_CODES: ++ handle_diag_318(cpu, run); ++ break; + case DIAG_KVM_HYPERCALL: + r = handle_hypercall(cpu, run); + break; +@@ -2449,6 +2483,11 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + */ + set_bit(S390_FEAT_EXTENDED_LENGTH_SCCB, model->features); + ++ /* DIAGNOSE 0x318 is not supported under protected virtualization */ ++ if (!s390_is_pv() && kvm_check_extension(kvm_state, KVM_CAP_S390_DIAG318)) { ++ set_bit(S390_FEAT_DIAG_318, model->features); ++ } ++ + /* strip of features that are not part of the maximum model */ + bitmap_and(model->features, model->features, model->def->full_feat, + S390_FEAT_MAX); +diff --git a/target/s390x/machine.c b/target/s390x/machine.c +index 549bb6c2808..5b4e82f1ab9 100644 +--- a/target/s390x/machine.c ++++ b/target/s390x/machine.c +@@ -234,6 +234,22 @@ const VMStateDescription vmstate_etoken = { + } + }; + ++static bool diag318_needed(void *opaque) ++{ ++ return s390_has_feat(S390_FEAT_DIAG_318); ++} ++ ++const VMStateDescription vmstate_diag318 = { ++ .name = "cpu/diag318", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .needed = diag318_needed, ++ .fields = (VMStateField[]) { ++ VMSTATE_UINT64(env.diag318_info, S390CPU), ++ VMSTATE_END_OF_LIST() ++ } ++}; ++ + const VMStateDescription vmstate_s390_cpu = { + .name = "cpu", + .post_load = cpu_post_load, +@@ -270,6 +286,7 @@ const VMStateDescription vmstate_s390_cpu = { + &vmstate_gscb, + &vmstate_bpbc, + &vmstate_etoken, ++ &vmstate_diag318, + NULL + }, + }; +-- +2.27.0 + diff --git a/kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch b/kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch new file mode 100755 index 0000000000000000000000000000000000000000..c45158a89ff95a075a16ba919818834a55e0d331 --- /dev/null +++ b/kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch @@ -0,0 +1,54 @@ +From 1769600e1e3bd5ca48450de8ce8a118bf0af96f3 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:00 -0400 +Subject: [PATCH 18/42] s390/ipl: fix off-by-one in + update_machine_ipl_properties() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-19-thuth@redhat.com> +Patchwork-id: 97028 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 18/38] s390/ipl: fix off-by-one in update_machine_ipl_properties() +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Halil Pasic + +In update_machine_ipl_properties() the array ascii_loadparm needs to +hold the 8 char loadparm and a string terminating zero char. + +Let's increase the size of ascii_loadparm accordingly. + +Signed-off-by: Halil Pasic +Fixes: 0a01e082a428 ("s390/ipl: sync back loadparm") +Fixes: Coverity CID 1421966 +Reported-by: Peter Maydell +Message-Id: <20200320143101.41764-1-pasic@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 7722837369eb1c7e808021d79da68afa0c01c26f) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index f25339c503..fa0409dc23 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -537,7 +537,7 @@ static void update_machine_ipl_properties(IplParameterBlock *iplb) + /* Sync loadparm */ + if (iplb->flags & DIAG308_FLAGS_LP_VALID) { + uint8_t *ebcdic_loadparm = iplb->loadparm; +- char ascii_loadparm[8]; ++ char ascii_loadparm[9]; + int i; + + for (i = 0; i < 8 && ebcdic_loadparm[i]; i++) { +-- +2.27.0 + diff --git a/kvm-s390-ipl-sync-back-loadparm.patch b/kvm-s390-ipl-sync-back-loadparm.patch new file mode 100755 index 0000000000000000000000000000000000000000..49f4d3fb19662c24255a71e96dc68fba19bbf4ae --- /dev/null +++ b/kvm-s390-ipl-sync-back-loadparm.patch @@ -0,0 +1,91 @@ +From 53053ea2e6c757e5d044655c8b61c485e0aad4ed Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:59 -0400 +Subject: [PATCH 17/42] s390/ipl: sync back loadparm +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-18-thuth@redhat.com> +Patchwork-id: 97039 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 17/38] s390/ipl: sync back loadparm +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Halil Pasic + +We expose loadparm as a r/w machine property, but if loadparm is set by +the guest via DIAG 308, we don't update the property. Having a +disconnect between the guest view and the QEMU property is not nice in +itself, but things get even worse for SCSI, where under certain +circumstances (see 789b5a401b "s390: Ensure IPL from SCSI works as +expected" for details) we call s390_gen_initial_iplb() on resets +effectively overwriting the guest/user supplied loadparm with the stale +value. + +Signed-off-by: Halil Pasic +Fixes: 7104bae9de ("hw/s390x: provide loadparm property for the machine") +Reported-by: Marc Hartmayer +Reviewed-by: Janosch Frank +Reviewed-by: Viktor Mihajlovski +Tested-by: Marc Hartmayer +Reviewed-by: David Hildenbrand +Message-Id: <20200309133223.100491-1-pasic@linux.ibm.com> +[borntraeger@de.ibm.com: use reverse xmas tree] +Signed-off-by: Christian Borntraeger +(cherry picked from commit 0a01e082a428b921e48b5314881b1f23a7b0fe50) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index 0b7548a549..f25339c503 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -529,6 +529,30 @@ static bool is_virtio_scsi_device(IplParameterBlock *iplb) + return is_virtio_ccw_device_of_type(iplb, VIRTIO_ID_SCSI); + } + ++static void update_machine_ipl_properties(IplParameterBlock *iplb) ++{ ++ Object *machine = qdev_get_machine(); ++ Error *err = NULL; ++ ++ /* Sync loadparm */ ++ if (iplb->flags & DIAG308_FLAGS_LP_VALID) { ++ uint8_t *ebcdic_loadparm = iplb->loadparm; ++ char ascii_loadparm[8]; ++ int i; ++ ++ for (i = 0; i < 8 && ebcdic_loadparm[i]; i++) { ++ ascii_loadparm[i] = ebcdic2ascii[(uint8_t) ebcdic_loadparm[i]]; ++ } ++ ascii_loadparm[i] = 0; ++ object_property_set_str(machine, ascii_loadparm, "loadparm", &err); ++ } else { ++ object_property_set_str(machine, "", "loadparm", &err); ++ } ++ if (err) { ++ warn_report_err(err); ++ } ++} ++ + void s390_ipl_update_diag308(IplParameterBlock *iplb) + { + S390IPLState *ipl = get_ipl_device(); +@@ -536,6 +560,7 @@ void s390_ipl_update_diag308(IplParameterBlock *iplb) + ipl->iplb = *iplb; + ipl->iplb_valid = true; + ipl->netboot = is_virtio_net_device(iplb); ++ update_machine_ipl_properties(iplb); + } + + IplParameterBlock *s390_ipl_get_iplb(void) +-- +2.27.0 + diff --git a/kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch b/kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch new file mode 100755 index 0000000000000000000000000000000000000000..f0f25a5aba82ab1357fe6872413ae7208d3f4d8e --- /dev/null +++ b/kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch @@ -0,0 +1,163 @@ +From a0ad4344984c50939be8c99371af0988551fb776 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 20 Nov 2020 11:46:09 -0500 +Subject: [PATCH 17/18] s390/kvm: fix diag318 propagation and reset + functionality + +RH-Author: Thomas Huth +Message-id: <20201120114609.408610-2-thuth@redhat.com> +Patchwork-id: 99787 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] s390/kvm: fix diag318 propagation and reset functionality +Bugzilla: 1659412 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The Control Program Name Code (CPNC) portion of the diag318 +info must be set within the SIE block of each VCPU in the +configuration. The handler will iterate through each VCPU +and dirty the diag318_info reg to be synced with KVM on a +subsequent sync_regs call. + +Additionally, the diag318 info resets must be handled via +userspace. As such, QEMU will reset this value for each +VCPU during a modified clear, load normal, and load clear +reset event. + +Fixes: fabdada9357b ("s390: guest support for diagnose 0x318") +Signed-off-by: Collin Walling +Message-Id: <20201113221022.257054-1-walling@linux.ibm.com> +Reviewed-by: Thomas Huth +Reviewed-by: Janosch Frank +Signed-off-by: Cornelia Huck +(cherry picked from commit e2c6cd567422bfa563be026b9741a1854aecdc06) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 4 ++++ + target/s390x/cpu.c | 7 +++++++ + target/s390x/cpu.h | 1 + + target/s390x/kvm-stub.c | 4 ++++ + target/s390x/kvm.c | 22 +++++++++++++++++----- + target/s390x/kvm_s390x.h | 1 + + 6 files changed, 34 insertions(+), 5 deletions(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index e6ed13b649a..5905d2b7adc 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -489,6 +489,10 @@ static void s390_machine_reset(MachineState *machine) + default: + g_assert_not_reached(); + } ++ ++ CPU_FOREACH(t) { ++ run_on_cpu(t, s390_do_cpu_set_diag318, RUN_ON_CPU_HOST_ULONG(0)); ++ } + s390_ipl_clear_reset_request(); + } + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 371b91b2d72..820cab96e12 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -445,6 +445,13 @@ void s390_enable_css_support(S390CPU *cpu) + kvm_s390_enable_css_support(cpu); + } + } ++ ++void s390_do_cpu_set_diag318(CPUState *cs, run_on_cpu_data arg) ++{ ++ if (kvm_enabled()) { ++ kvm_s390_set_diag318(cs, arg.host_ulong); ++ } ++} + #endif + + static gchar *s390_gdb_arch_name(CPUState *cs) +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 1dc21cd311d..83a23a11b96 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -774,6 +774,7 @@ int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit); + void s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void s390_cmma_reset(void); + void s390_enable_css_support(S390CPU *cpu); ++void s390_do_cpu_set_diag318(CPUState *cs, run_on_cpu_data arg); + int s390_assign_subch_ioeventfd(EventNotifier *notifier, uint32_t sch_id, + int vq, bool assign); + #ifndef CONFIG_USER_ONLY +diff --git a/target/s390x/kvm-stub.c b/target/s390x/kvm-stub.c +index aa185017a2a..9970b5a8c70 100644 +--- a/target/s390x/kvm-stub.c ++++ b/target/s390x/kvm-stub.c +@@ -120,3 +120,7 @@ void kvm_s390_stop_interrupt(S390CPU *cpu) + void kvm_s390_restart_interrupt(S390CPU *cpu) + { + } ++ ++void kvm_s390_set_diag318(CPUState *cs, uint64_t diag318_info) ++{ ++} +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 6edb52f6d25..8d4406124b9 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -1611,10 +1611,23 @@ static int handle_sw_breakpoint(S390CPU *cpu, struct kvm_run *run) + return -ENOENT; + } + ++void kvm_s390_set_diag318(CPUState *cs, uint64_t diag318_info) ++{ ++ CPUS390XState *env = &S390_CPU(cs)->env; ++ ++ /* Feat bit is set only if KVM supports sync for diag318 */ ++ if (s390_has_feat(S390_FEAT_DIAG_318)) { ++ env->diag318_info = diag318_info; ++ cs->kvm_run->s.regs.diag318 = diag318_info; ++ cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ } ++} ++ + static void handle_diag_318(S390CPU *cpu, struct kvm_run *run) + { + uint64_t reg = (run->s390_sieic.ipa & 0x00f0) >> 4; + uint64_t diag318_info = run->s.regs.gprs[reg]; ++ CPUState *t; + + /* + * DIAG 318 can only be enabled with KVM support. As such, let's +@@ -1622,13 +1635,12 @@ static void handle_diag_318(S390CPU *cpu, struct kvm_run *run) + */ + if (!s390_has_feat(S390_FEAT_DIAG_318)) { + kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION); ++ return; + } + +- cpu->env.diag318_info = diag318_info; +- +- if (can_sync_regs(CPU(cpu), KVM_SYNC_DIAG318)) { +- run->s.regs.diag318 = diag318_info; +- run->kvm_dirty_regs |= KVM_SYNC_DIAG318; ++ CPU_FOREACH(t) { ++ run_on_cpu(t, s390_do_cpu_set_diag318, ++ RUN_ON_CPU_HOST_ULONG(diag318_info)); + } + } + +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index 6ab17c81b73..25bbe98b251 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -45,5 +45,6 @@ void kvm_s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void kvm_s390_crypto_reset(void); + void kvm_s390_restart_interrupt(S390CPU *cpu); + void kvm_s390_stop_interrupt(S390CPU *cpu); ++void kvm_s390_set_diag318(CPUState *cs, uint64_t diag318_info); + + #endif /* KVM_S390X_H */ +-- +2.27.0 + diff --git a/kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch b/kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch new file mode 100755 index 0000000000000000000000000000000000000000..c05f50c2750dcfe6e28f8653bee01626b21bb214 --- /dev/null +++ b/kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch @@ -0,0 +1,220 @@ +From e1a3684f9b08fa9db35331b5c5ad11879f512e90 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:11 -0500 +Subject: [PATCH 11/18] s390/sclp: add extended-length sccb support for kvm + guest + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-8-thuth@redhat.com> +Patchwork-id: 99504 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 07/12] s390/sclp: add extended-length sccb support for kvm guest +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +As more features and facilities are added to the Read SCP Info (RSCPI) +response, more space is required to store them. The space used to store +these new features intrudes on the space originally used to store CPU +entries. This means as more features and facilities are added to the +RSCPI response, less space can be used to store CPU entries. + +With the Extended-Length SCCB (ELS) facility, a KVM guest can execute +the RSCPI command and determine if the SCCB is large enough to store a +complete reponse. If it is not large enough, then the required length +will be set in the SCCB header. + +The caller of the SCLP command is responsible for creating a +large-enough SCCB to store a complete response. Proper checking should +be in place, and the caller should execute the command once-more with +the large-enough SCCB. + +This facility also enables an extended SCCB for the Read CPU Info +(RCPUI) command. + +When this facility is enabled, the boundary violation response cannot +be a result from the RSCPI, RSCPI Forced, or RCPUI commands. + +In order to tolerate kernels that do not yet have full support for this +feature, a "fixed" offset to the start of the CPU Entries within the +Read SCP Info struct is set to allow for the original 248 max entries +when this feature is disabled. + +Additionally, this is introduced as a CPU feature to protect the guest +from migrating to a machine that does not support storing an extended +SCCB. This could otherwise hinder the VM from being able to read all +available CPU entries after migration (such as during re-ipl). + +Signed-off-by: Collin Walling +Reviewed-by: Thomas Huth +Acked-by: Cornelia Huck +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-7-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 1ecd6078f587cfadda8edc93d45b5072e35f2d17) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 43 +++++++++++++++++++++++++---- + include/hw/s390x/sclp.h | 1 + + target/s390x/cpu_features_def.inc.h | 1 + + target/s390x/gen-features.c | 1 + + target/s390x/kvm.c | 8 ++++++ + 5 files changed, 48 insertions(+), 6 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 017989b3888..8d111628e04 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -49,13 +49,30 @@ static inline bool sclp_command_code_valid(uint32_t code) + return false; + } + +-static bool sccb_verify_boundary(uint64_t sccb_addr, uint16_t sccb_len) ++static bool sccb_verify_boundary(uint64_t sccb_addr, uint16_t sccb_len, ++ uint32_t code) + { + uint64_t sccb_max_addr = sccb_addr + sccb_len - 1; + uint64_t sccb_boundary = (sccb_addr & PAGE_MASK) + PAGE_SIZE; + +- if (sccb_max_addr < sccb_boundary) { +- return true; ++ switch (code & SCLP_CMD_CODE_MASK) { ++ case SCLP_CMDW_READ_SCP_INFO: ++ case SCLP_CMDW_READ_SCP_INFO_FORCED: ++ case SCLP_CMDW_READ_CPU_INFO: ++ /* ++ * An extended-length SCCB is only allowed for Read SCP/CPU Info and ++ * is allowed to exceed the 4k boundary. The respective commands will ++ * set the length field to the required length if an insufficient ++ * SCCB length is provided. ++ */ ++ if (s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB)) { ++ return true; ++ } ++ /* fallthrough */ ++ default: ++ if (sccb_max_addr < sccb_boundary) { ++ return true; ++ } + } + + return false; +@@ -80,6 +97,12 @@ static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + + #define SCCB_REQ_LEN(s, max_cpus) (sizeof(s) + max_cpus * sizeof(CPUEntry)) + ++static inline bool ext_len_sccb_supported(SCCBHeader header) ++{ ++ return s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB) && ++ header.control_mask[2] & SCLP_VARIABLE_LENGTH_RESPONSE; ++} ++ + /* Provide information about the configuration, CPUs and storage */ + static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + { +@@ -89,10 +112,15 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + int rnsize, rnmax; + IplParameterBlock *ipib = s390_ipl_get_iplb(); + int required_len = SCCB_REQ_LEN(ReadInfo, machine->possible_cpus->len); +- int offset_cpu = offsetof(ReadInfo, entries); ++ int offset_cpu = s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB) ? ++ offsetof(ReadInfo, entries) : ++ SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET; + CPUEntry *entries_start = (void *)sccb + offset_cpu; + + if (be16_to_cpu(sccb->h.length) < required_len) { ++ if (ext_len_sccb_supported(sccb->h)) { ++ sccb->h.length = cpu_to_be16(required_len); ++ } + sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); + return; + } +@@ -153,6 +181,9 @@ static void sclp_read_cpu_info(SCLPDevice *sclp, SCCB *sccb) + int required_len = SCCB_REQ_LEN(ReadCpuInfo, machine->possible_cpus->len); + + if (be16_to_cpu(sccb->h.length) < required_len) { ++ if (ext_len_sccb_supported(sccb->h)) { ++ sccb->h.length = cpu_to_be16(required_len); ++ } + sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); + return; + } +@@ -249,7 +280,7 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length), code)) { + work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } +@@ -302,7 +333,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length), code)) { + work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index 55f53a46540..df2fa4169b0 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -110,6 +110,7 @@ typedef struct CPUEntry { + uint8_t reserved1; + } QEMU_PACKED CPUEntry; + ++#define SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET 128 + typedef struct ReadInfo { + SCCBHeader h; + uint16_t rnmax; +diff --git a/target/s390x/cpu_features_def.inc.h b/target/s390x/cpu_features_def.inc.h +index 60db28351d0..3548d65a69a 100644 +--- a/target/s390x/cpu_features_def.inc.h ++++ b/target/s390x/cpu_features_def.inc.h +@@ -97,6 +97,7 @@ DEF_FEAT(GUARDED_STORAGE, "gs", STFL, 133, "Guarded-storage facility") + DEF_FEAT(VECTOR_PACKED_DECIMAL, "vxpd", STFL, 134, "Vector packed decimal facility") + DEF_FEAT(VECTOR_ENH, "vxeh", STFL, 135, "Vector enhancements facility") + DEF_FEAT(MULTIPLE_EPOCH, "mepoch", STFL, 139, "Multiple-epoch facility") ++DEF_FEAT(EXTENDED_LENGTH_SCCB, "els", STFL, 140, "Extended-length SCCB facility") + DEF_FEAT(TEST_PENDING_EXT_INTERRUPTION, "tpei", STFL, 144, "Test-pending-external-interruption facility") + DEF_FEAT(INSERT_REFERENCE_BITS_MULT, "irbm", STFL, 145, "Insert-reference-bits-multiple facility") + DEF_FEAT(MSA_EXT_8, "msa8-base", STFL, 146, "Message-security-assist-extension-8 facility (excluding subfunctions)") +diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c +index 8ddeebc5441..6857f657fba 100644 +--- a/target/s390x/gen-features.c ++++ b/target/s390x/gen-features.c +@@ -522,6 +522,7 @@ static uint16_t full_GEN12_GA1[] = { + S390_FEAT_AP_QUEUE_INTERRUPT_CONTROL, + S390_FEAT_AP_FACILITIES_TEST, + S390_FEAT_AP, ++ S390_FEAT_EXTENDED_LENGTH_SCCB, + }; + + static uint16_t full_GEN12_GA2[] = { +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 0bbf8f81b09..ef437acb5c1 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2441,6 +2441,14 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + KVM_S390_VM_CRYPTO_ENABLE_APIE)) { + set_bit(S390_FEAT_AP, model->features); + } ++ ++ /* ++ * Extended-Length SCCB is handled entirely within QEMU. ++ * For PV guests this is completely fenced by the Ultravisor, as Service ++ * Call error checking and STFLE interpretation are handled via SIE. ++ */ ++ set_bit(S390_FEAT_EXTENDED_LENGTH_SCCB, model->features); ++ + /* strip of features that are not part of the maximum model */ + bitmap_and(model->features, model->features, model->def->full_feat, + S390_FEAT_MAX); +-- +2.27.0 + diff --git a/kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch b/kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch new file mode 100755 index 0000000000000000000000000000000000000000..6efc35fbc13bffd37023bd5d44752d53a78537c6 --- /dev/null +++ b/kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch @@ -0,0 +1,106 @@ +From 6cc7c8dd7a6fac493c648c607bec4c38c0b275b6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:09 -0500 +Subject: [PATCH 09/18] s390/sclp: check sccb len before filling in data + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-6-thuth@redhat.com> +Patchwork-id: 99502 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 05/12] s390/sclp: check sccb len before filling in data +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The SCCB must be checked for a sufficient length before it is filled +with any data. If the length is insufficient, then the SCLP command +is suppressed and the proper response code is set in the SCCB header. + +While we're at it, let's cleanup the length check by placing the +calculation inside a macro. + +Fixes: 832be0d8a3bb ("s390x: sclp: Report insufficient SCCB length") +Signed-off-by: Collin Walling +Reviewed-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Cornelia Huck +Reviewed-by: Thomas Huth +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-5-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0260b97824495ebfacfa8bbae0be10b0ef986bf6) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 26 ++++++++++++++------------ + 1 file changed, 14 insertions(+), 12 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index cf1292beb22..2b4c6c5cfad 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -78,6 +78,8 @@ static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + } + } + ++#define SCCB_REQ_LEN(s, max_cpus) (sizeof(s) + max_cpus * sizeof(CPUEntry)) ++ + /* Provide information about the configuration, CPUs and storage */ + static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + { +@@ -86,6 +88,12 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + int cpu_count; + int rnsize, rnmax; + IplParameterBlock *ipib = s390_ipl_get_iplb(); ++ int required_len = SCCB_REQ_LEN(ReadInfo, machine->possible_cpus->len); ++ ++ if (be16_to_cpu(sccb->h.length) < required_len) { ++ sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); ++ return; ++ } + + /* CPU information */ + prepare_cpu_entries(machine, read_info->entries, &cpu_count); +@@ -95,12 +103,6 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + + read_info->ibc_val = cpu_to_be32(s390_get_ibc_val()); + +- if (be16_to_cpu(sccb->h.length) < +- (sizeof(ReadInfo) + cpu_count * sizeof(CPUEntry))) { +- sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); +- return; +- } +- + /* Configuration Characteristic (Extension) */ + s390_get_feat_block(S390_FEAT_TYPE_SCLP_CONF_CHAR, + read_info->conf_char); +@@ -146,18 +148,18 @@ static void sclp_read_cpu_info(SCLPDevice *sclp, SCCB *sccb) + MachineState *machine = MACHINE(qdev_get_machine()); + ReadCpuInfo *cpu_info = (ReadCpuInfo *) sccb; + int cpu_count; ++ int required_len = SCCB_REQ_LEN(ReadCpuInfo, machine->possible_cpus->len); ++ ++ if (be16_to_cpu(sccb->h.length) < required_len) { ++ sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); ++ return; ++ } + + prepare_cpu_entries(machine, cpu_info->entries, &cpu_count); + cpu_info->nr_configured = cpu_to_be16(cpu_count); + cpu_info->offset_configured = cpu_to_be16(offsetof(ReadCpuInfo, entries)); + cpu_info->nr_standby = cpu_to_be16(0); + +- if (be16_to_cpu(sccb->h.length) < +- (sizeof(ReadCpuInfo) + cpu_count * sizeof(CPUEntry))) { +- sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); +- return; +- } +- + /* The standby offset is 16-byte for each CPU */ + cpu_info->offset_standby = cpu_to_be16(cpu_info->offset_configured + + cpu_info->nr_configured*sizeof(CPUEntry)); +-- +2.27.0 + diff --git a/kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch b/kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch new file mode 100755 index 0000000000000000000000000000000000000000..09c72b6cd9073c4e2d59683d28d0ff56baadc2c5 --- /dev/null +++ b/kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch @@ -0,0 +1,75 @@ +From 44e8cdba29b932ee6fff7a2d00b09e6e78c3a0ef Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:06 -0500 +Subject: [PATCH 06/18] s390/sclp: get machine once during read scp/cpu info + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-3-thuth@redhat.com> +Patchwork-id: 99499 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 02/12] s390/sclp: get machine once during read scp/cpu info +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +Functions within read scp/cpu info will need access to the machine +state. Let's make a call to retrieve the machine state once and +pass the appropriate data to the respective functions. + +Signed-off-by: Collin Walling +Reviewed-by: David Hildenbrand +Reviewed-by: Thomas Huth +Reviewed-by: Janosch Frank +Reviewed-by: Cornelia Huck +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-2-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 912d70d2755cb9b3144eeed4014580ebc5485ce6) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index d8ae207731f..fe7d0fece80 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -49,9 +49,8 @@ static inline bool sclp_command_code_valid(uint32_t code) + return false; + } + +-static void prepare_cpu_entries(SCLPDevice *sclp, CPUEntry *entry, int *count) ++static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + { +- MachineState *ms = MACHINE(qdev_get_machine()); + uint8_t features[SCCB_CPU_FEATURE_LEN] = { 0 }; + int i; + +@@ -77,7 +76,7 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + IplParameterBlock *ipib = s390_ipl_get_iplb(); + + /* CPU information */ +- prepare_cpu_entries(sclp, read_info->entries, &cpu_count); ++ prepare_cpu_entries(machine, read_info->entries, &cpu_count); + read_info->entries_cpu = cpu_to_be16(cpu_count); + read_info->offset_cpu = cpu_to_be16(offsetof(ReadInfo, entries)); + read_info->highest_cpu = cpu_to_be16(machine->smp.max_cpus - 1); +@@ -132,10 +131,11 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + /* Provide information about the CPU */ + static void sclp_read_cpu_info(SCLPDevice *sclp, SCCB *sccb) + { ++ MachineState *machine = MACHINE(qdev_get_machine()); + ReadCpuInfo *cpu_info = (ReadCpuInfo *) sccb; + int cpu_count; + +- prepare_cpu_entries(sclp, cpu_info->entries, &cpu_count); ++ prepare_cpu_entries(machine, cpu_info->entries, &cpu_count); + cpu_info->nr_configured = cpu_to_be16(cpu_count); + cpu_info->offset_configured = cpu_to_be16(offsetof(ReadCpuInfo, entries)); + cpu_info->nr_standby = cpu_to_be16(0); +-- +2.27.0 + diff --git a/kvm-s390-sclp-improve-special-wait-psw-logic.patch b/kvm-s390-sclp-improve-special-wait-psw-logic.patch new file mode 100755 index 0000000000000000000000000000000000000000..2040d5cac41fcfd226b0efbf13cb4539c8e27b4d --- /dev/null +++ b/kvm-s390-sclp-improve-special-wait-psw-logic.patch @@ -0,0 +1,52 @@ +From cd7da3cf1b19fef0a497fd556562040a85e579a7 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:57 -0400 +Subject: [PATCH 15/42] s390/sclp: improve special wait psw logic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-16-thuth@redhat.com> +Patchwork-id: 97037 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 15/38] s390/sclp: improve special wait psw logic +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +There is a special quiesce PSW that we check for "shutdown". Otherwise disabled +wait is detected as "crashed". Architecturally we must only check PSW bits +116-127. Fix this. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Christian Borntraeger +Message-Id: <1582204582-22995-1-git-send-email-borntraeger@de.ibm.com> +Reviewed-by: David Hildenbrand +Acked-by: Janosch Frank +Signed-off-by: Cornelia Huck +(cherry picked from commit 8b51c0961cc13e55b26bb6665ec3a341abdc7658) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/helper.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/target/s390x/helper.c b/target/s390x/helper.c +index a3a49164e4..6808dfda01 100644 +--- a/target/s390x/helper.c ++++ b/target/s390x/helper.c +@@ -89,7 +89,7 @@ hwaddr s390_cpu_get_phys_addr_debug(CPUState *cs, vaddr vaddr) + static inline bool is_special_wait_psw(uint64_t psw_addr) + { + /* signal quiesce */ +- return psw_addr == 0xfffUL; ++ return (psw_addr & 0xfffUL) == 0xfffUL; + } + + void s390_handle_wait(S390CPU *cpu) +-- +2.27.0 + diff --git a/kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch b/kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch new file mode 100755 index 0000000000000000000000000000000000000000..adb65c70bd5c430592de0ce7b1f45447f9eb8214 --- /dev/null +++ b/kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch @@ -0,0 +1,170 @@ +From 212c129b82f0a53725a4167303de2ee0a865f82d Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:08 -0500 +Subject: [PATCH 08/18] s390/sclp: read sccb from mem based on provided length + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-5-thuth@redhat.com> +Patchwork-id: 99501 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 04/12] s390/sclp: read sccb from mem based on provided length +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The header contained within the SCCB passed to the SCLP service call +contains the actual length of the SCCB. Instead of allocating a static +4K size for the work sccb, let's allow for a variable size determined +by the value in the header. The proper checks are already in place to +ensure the SCCB length is sufficent to store a full response and that +the length does not cross any explicitly-set boundaries. + +Signed-off-by: Collin Walling +Reviewed-by: Thomas Huth +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-4-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit c1db53a5910f988eeb32f031c53a50f3373fd824) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/event-facility.c | 2 +- + hw/s390x/sclp.c | 55 ++++++++++++++++++++++----------------- + include/hw/s390x/sclp.h | 2 +- + 3 files changed, 33 insertions(+), 26 deletions(-) + +diff --git a/hw/s390x/event-facility.c b/hw/s390x/event-facility.c +index 66205697ae7..8aa7017f06b 100644 +--- a/hw/s390x/event-facility.c ++++ b/hw/s390x/event-facility.c +@@ -215,7 +215,7 @@ static uint16_t handle_sccb_read_events(SCLPEventFacility *ef, SCCB *sccb, + + event_buf = &red->ebh; + event_buf->length = 0; +- slen = sizeof(sccb->data); ++ slen = sccb_data_len(sccb); + + rc = SCLP_RC_NO_EVENT_BUFFERS_STORED; + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 38278497319..cf1292beb22 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -231,25 +231,29 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + { + SCLPDevice *sclp = get_sclp_device(); + SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); +- SCCB work_sccb; +- hwaddr sccb_len = sizeof(SCCB); ++ SCCBHeader header; ++ g_autofree SCCB *work_sccb = NULL; + +- s390_cpu_pv_mem_read(env_archcpu(env), 0, &work_sccb, sccb_len); ++ s390_cpu_pv_mem_read(env_archcpu(env), 0, &header, sizeof(SCCBHeader)); ++ ++ work_sccb = g_malloc0(be16_to_cpu(header.length)); ++ s390_cpu_pv_mem_read(env_archcpu(env), 0, work_sccb, ++ be16_to_cpu(header.length)); + + if (!sclp_command_code_valid(code)) { +- work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); ++ work_sccb->h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { +- work_sccb.h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } + +- sclp_c->execute(sclp, &work_sccb, code); ++ sclp_c->execute(sclp, work_sccb, code); + out_write: +- s390_cpu_pv_mem_write(env_archcpu(env), 0, &work_sccb, +- be16_to_cpu(work_sccb.h.length)); ++ s390_cpu_pv_mem_write(env_archcpu(env), 0, work_sccb, ++ be16_to_cpu(work_sccb->h.length)); + sclp_c->service_interrupt(sclp, SCLP_PV_DUMMY_ADDR); + return 0; + } +@@ -258,9 +262,8 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + { + SCLPDevice *sclp = get_sclp_device(); + SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); +- SCCB work_sccb; +- +- hwaddr sccb_len = sizeof(SCCB); ++ SCCBHeader header; ++ g_autofree SCCB *work_sccb = NULL; + + /* first some basic checks on program checks */ + if (env->psw.mask & PSW_MASK_PSTATE) { +@@ -274,32 +277,36 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + return -PGM_SPECIFICATION; + } + ++ /* the header contains the actual length of the sccb */ ++ cpu_physical_memory_read(sccb, &header, sizeof(SCCBHeader)); ++ ++ /* Valid sccb sizes */ ++ if (be16_to_cpu(header.length) < sizeof(SCCBHeader)) { ++ return -PGM_SPECIFICATION; ++ } ++ + /* + * we want to work on a private copy of the sccb, to prevent guests + * from playing dirty tricks by modifying the memory content after + * the host has checked the values + */ +- cpu_physical_memory_read(sccb, &work_sccb, sccb_len); +- +- /* Valid sccb sizes */ +- if (be16_to_cpu(work_sccb.h.length) < sizeof(SCCBHeader)) { +- return -PGM_SPECIFICATION; +- } ++ work_sccb = g_malloc0(be16_to_cpu(header.length)); ++ cpu_physical_memory_read(sccb, work_sccb, be16_to_cpu(header.length)); + + if (!sclp_command_code_valid(code)) { +- work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); ++ work_sccb->h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { +- work_sccb.h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length))) { ++ work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } + +- sclp_c->execute(sclp, &work_sccb, code); ++ sclp_c->execute(sclp, work_sccb, code); + out_write: +- cpu_physical_memory_write(sccb, &work_sccb, +- be16_to_cpu(work_sccb.h.length)); ++ cpu_physical_memory_write(sccb, work_sccb, ++ be16_to_cpu(work_sccb->h.length)); + + sclp_c->service_interrupt(sclp, sccb); + +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index c0a3faa37d7..55f53a46540 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -177,7 +177,7 @@ typedef struct IoaCfgSccb { + + typedef struct SCCB { + SCCBHeader h; +- char data[SCCB_DATA_LEN]; ++ char data[]; + } QEMU_PACKED SCCB; + + #define TYPE_SCLP "sclp" +-- +2.27.0 + diff --git a/kvm-s390-sclp-rework-sclp-boundary-checks.patch b/kvm-s390-sclp-rework-sclp-boundary-checks.patch new file mode 100755 index 0000000000000000000000000000000000000000..9bb3a55edb1afeea64d9d9acc2d2bdb996b33116 --- /dev/null +++ b/kvm-s390-sclp-rework-sclp-boundary-checks.patch @@ -0,0 +1,80 @@ +From bc395a979a00bb3e16f3bd92b5b2006db4a5aee3 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:07 -0500 +Subject: [PATCH 07/18] s390/sclp: rework sclp boundary checks + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-4-thuth@redhat.com> +Patchwork-id: 99500 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 03/12] s390/sclp: rework sclp boundary checks +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +Rework the SCLP boundary check to account for different SCLP commands +(eventually) allowing different boundary sizes. + +Signed-off-by: Collin Walling +Reviewed-by: Cornelia Huck +Reviewed-by: Thomas Huth +Acked-by: Janosch Frank +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-3-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit db13387ca01a69d870cc16dd232375c2603596f2) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index fe7d0fece80..38278497319 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -49,6 +49,18 @@ static inline bool sclp_command_code_valid(uint32_t code) + return false; + } + ++static bool sccb_verify_boundary(uint64_t sccb_addr, uint16_t sccb_len) ++{ ++ uint64_t sccb_max_addr = sccb_addr + sccb_len - 1; ++ uint64_t sccb_boundary = (sccb_addr & PAGE_MASK) + PAGE_SIZE; ++ ++ if (sccb_max_addr < sccb_boundary) { ++ return true; ++ } ++ ++ return false; ++} ++ + static void prepare_cpu_entries(MachineState *ms, CPUEntry *entry, int *count) + { + uint8_t features[SCCB_CPU_FEATURE_LEN] = { 0 }; +@@ -229,6 +241,11 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + goto out_write; + } + ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { ++ work_sccb.h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); ++ goto out_write; ++ } ++ + sclp_c->execute(sclp, &work_sccb, code); + out_write: + s390_cpu_pv_mem_write(env_archcpu(env), 0, &work_sccb, +@@ -274,7 +291,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + goto out_write; + } + +- if ((sccb + be16_to_cpu(work_sccb.h.length)) > ((sccb & PAGE_MASK) + PAGE_SIZE)) { ++ if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb.h.length))) { + work_sccb.h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); + goto out_write; + } +-- +2.27.0 + diff --git a/kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch b/kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch new file mode 100755 index 0000000000000000000000000000000000000000..cb99830f575665b201465a0cf86cf99de3a50a80 --- /dev/null +++ b/kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch @@ -0,0 +1,67 @@ +From adf66c037e60d66f864960b24c746b767efb10b9 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:10 -0500 +Subject: [PATCH 10/18] s390/sclp: use cpu offset to locate cpu entries + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-7-thuth@redhat.com> +Patchwork-id: 99503 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 06/12] s390/sclp: use cpu offset to locate cpu entries +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Collin Walling + +The start of the CPU entry region in the Read SCP Info response data is +denoted by the offset_cpu field. As such, QEMU needs to begin creating +entries at this address. + +This is in preparation for when Read SCP Info inevitably introduces new +bytes that push the start of the CPUEntry field further away. + +Read CPU Info is unlikely to ever change, so let's not bother +accounting for the offset there. + +Signed-off-by: Collin Walling +Reviewed-by: Thomas Huth +Reviewed-by: Cornelia Huck +Reviewed-by: Claudio Imbrenda +Message-Id: <20200915194416.107460-6-walling@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 1a7a568859473b1cda39a015493c5c82bb200281) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 2b4c6c5cfad..017989b3888 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -89,6 +89,8 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + int rnsize, rnmax; + IplParameterBlock *ipib = s390_ipl_get_iplb(); + int required_len = SCCB_REQ_LEN(ReadInfo, machine->possible_cpus->len); ++ int offset_cpu = offsetof(ReadInfo, entries); ++ CPUEntry *entries_start = (void *)sccb + offset_cpu; + + if (be16_to_cpu(sccb->h.length) < required_len) { + sccb->h.response_code = cpu_to_be16(SCLP_RC_INSUFFICIENT_SCCB_LENGTH); +@@ -96,9 +98,9 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + } + + /* CPU information */ +- prepare_cpu_entries(machine, read_info->entries, &cpu_count); ++ prepare_cpu_entries(machine, entries_start, &cpu_count); + read_info->entries_cpu = cpu_to_be16(cpu_count); +- read_info->offset_cpu = cpu_to_be16(offsetof(ReadInfo, entries)); ++ read_info->offset_cpu = cpu_to_be16(offset_cpu); + read_info->highest_cpu = cpu_to_be16(machine->smp.max_cpus - 1); + + read_info->ibc_val = cpu_to_be32(s390_get_ibc_val()); +-- +2.27.0 + diff --git a/kvm-s390x-Add-SIDA-memory-ops.patch b/kvm-s390x-Add-SIDA-memory-ops.patch new file mode 100755 index 0000000000000000000000000000000000000000..1b566d7a6f9906f89a858b793789950b400b6f48 --- /dev/null +++ b/kvm-s390x-Add-SIDA-memory-ops.patch @@ -0,0 +1,150 @@ +From ebcd74c2267d69fe09ca03cb8bfed7bef5ea3a85 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:08 -0400 +Subject: [PATCH 26/42] s390x: Add SIDA memory ops + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-27-thuth@redhat.com> +Patchwork-id: 97033 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 26/38] s390x: Add SIDA memory ops +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Protected guests save the instruction control blocks in the SIDA +instead of QEMU/KVM directly accessing the guest's memory. + +Let's introduce new functions to access the SIDA. + +The memops for doing so are available with KVM_CAP_S390_PROTECTED, so +let's check for that. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-8-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 1cca8265499d394d9ed4bfb75bd6e7265b529f89) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.h | 7 ++++++- + target/s390x/kvm.c | 26 ++++++++++++++++++++++++++ + target/s390x/kvm_s390x.h | 2 ++ + target/s390x/mmu_helper.c | 14 ++++++++++++++ + 4 files changed, 48 insertions(+), 1 deletion(-) + +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 1ff84e6b3a..edf8391504 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -828,7 +828,12 @@ int s390_cpu_virt_mem_rw(S390CPU *cpu, vaddr laddr, uint8_t ar, void *hostbuf, + #define s390_cpu_virt_mem_check_write(cpu, laddr, ar, len) \ + s390_cpu_virt_mem_rw(cpu, laddr, ar, NULL, len, true) + void s390_cpu_virt_mem_handle_exc(S390CPU *cpu, uintptr_t ra); +- ++int s390_cpu_pv_mem_rw(S390CPU *cpu, unsigned int offset, void *hostbuf, ++ int len, bool is_write); ++#define s390_cpu_pv_mem_read(cpu, offset, dest, len) \ ++ s390_cpu_pv_mem_rw(cpu, offset, dest, len, false) ++#define s390_cpu_pv_mem_write(cpu, offset, dest, len) \ ++ s390_cpu_pv_mem_rw(cpu, offset, dest, len, true) + + /* sigp.c */ + int s390_cpu_restart(S390CPU *cpu); +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index af50b2c253..f67bb5ce2c 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -154,6 +154,7 @@ static int cap_ri; + static int cap_gs; + static int cap_hpage_1m; + static int cap_vcpu_resets; ++static int cap_protected; + + static int active_cmma; + +@@ -351,6 +352,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + cap_mem_op = kvm_check_extension(s, KVM_CAP_S390_MEM_OP); + cap_s390_irq = kvm_check_extension(s, KVM_CAP_S390_INJECT_IRQ); + cap_vcpu_resets = kvm_check_extension(s, KVM_CAP_S390_VCPU_RESETS); ++ cap_protected = kvm_check_extension(s, KVM_CAP_S390_PROTECTED); + + if (!kvm_check_extension(s, KVM_CAP_S390_GMAP) + || !kvm_check_extension(s, KVM_CAP_S390_COW)) { +@@ -848,6 +850,30 @@ int kvm_s390_mem_op(S390CPU *cpu, vaddr addr, uint8_t ar, void *hostbuf, + return ret; + } + ++int kvm_s390_mem_op_pv(S390CPU *cpu, uint64_t offset, void *hostbuf, ++ int len, bool is_write) ++{ ++ struct kvm_s390_mem_op mem_op = { ++ .sida_offset = offset, ++ .size = len, ++ .op = is_write ? KVM_S390_MEMOP_SIDA_WRITE ++ : KVM_S390_MEMOP_SIDA_READ, ++ .buf = (uint64_t)hostbuf, ++ }; ++ int ret; ++ ++ if (!cap_mem_op || !cap_protected) { ++ return -ENOSYS; ++ } ++ ++ ret = kvm_vcpu_ioctl(CPU(cpu), KVM_S390_MEM_OP, &mem_op); ++ if (ret < 0) { ++ error_report("KVM_S390_MEM_OP failed: %s", strerror(-ret)); ++ abort(); ++ } ++ return ret; ++} ++ + /* + * Legacy layout for s390: + * Older S390 KVM requires the topmost vma of the RAM to be +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index dea813f450..6ab17c81b7 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -19,6 +19,8 @@ void kvm_s390_vcpu_interrupt(S390CPU *cpu, struct kvm_s390_irq *irq); + void kvm_s390_access_exception(S390CPU *cpu, uint16_t code, uint64_t te_code); + int kvm_s390_mem_op(S390CPU *cpu, vaddr addr, uint8_t ar, void *hostbuf, + int len, bool is_write); ++int kvm_s390_mem_op_pv(S390CPU *cpu, vaddr addr, void *hostbuf, int len, ++ bool is_write); + void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code); + int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state); + void kvm_s390_vcpu_interrupt_pre_save(S390CPU *cpu); +diff --git a/target/s390x/mmu_helper.c b/target/s390x/mmu_helper.c +index c9f3f34750..ec8befbdc8 100644 +--- a/target/s390x/mmu_helper.c ++++ b/target/s390x/mmu_helper.c +@@ -474,6 +474,20 @@ static int translate_pages(S390CPU *cpu, vaddr addr, int nr_pages, + return 0; + } + ++int s390_cpu_pv_mem_rw(S390CPU *cpu, unsigned int offset, void *hostbuf, ++ int len, bool is_write) ++{ ++ int ret; ++ ++ if (kvm_enabled()) { ++ ret = kvm_s390_mem_op_pv(cpu, offset, hostbuf, len, is_write); ++ } else { ++ /* Protected Virtualization is a KVM/Hardware only feature */ ++ g_assert_not_reached(); ++ } ++ return ret; ++} ++ + /** + * s390_cpu_virt_mem_rw: + * @laddr: the logical start address +-- +2.27.0 + diff --git a/kvm-s390x-Add-missing-vcpu-reset-functions.patch b/kvm-s390x-Add-missing-vcpu-reset-functions.patch new file mode 100755 index 0000000000000000000000000000000000000000..9ce071eeb10f2a8c653e2c7dac0b780e19af8dd1 --- /dev/null +++ b/kvm-s390x-Add-missing-vcpu-reset-functions.patch @@ -0,0 +1,176 @@ +From e11643b5363262e9f809762a1f2bb5c4a8f26c2a Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:56 -0400 +Subject: [PATCH 14/42] s390x: Add missing vcpu reset functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-15-thuth@redhat.com> +Patchwork-id: 97023 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 14/38] s390x: Add missing vcpu reset functions +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Up to now we only had an ioctl to reset vcpu data QEMU couldn't reach +for the initial reset, which was also called for the clear reset. To +be architecture compliant, we also need to clear local interrupts on a +normal reset. + +Because of this and the upcoming protvirt support we need to add +ioctls for the missing clear and normal resets. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Acked-by: David Hildenbrand +Message-Id: <20200214151636.8764-3-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b91a03946e0f65ddd22927dd80ca1276bf89c5af) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 14 ++++++++++++-- + target/s390x/kvm-stub.c | 10 +++++++++- + target/s390x/kvm.c | 42 ++++++++++++++++++++++++++++++++-------- + target/s390x/kvm_s390x.h | 4 +++- + 4 files changed, 58 insertions(+), 12 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index e538a4a3e2..c0dd502b84 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -144,8 +144,18 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + } + + /* Reset state inside the kernel that we cannot access yet from QEMU. */ +- if (kvm_enabled() && type != S390_CPU_RESET_NORMAL) { +- kvm_s390_reset_vcpu(cpu); ++ if (kvm_enabled()) { ++ switch (type) { ++ case S390_CPU_RESET_CLEAR: ++ kvm_s390_reset_vcpu_clear(cpu); ++ break; ++ case S390_CPU_RESET_INITIAL: ++ kvm_s390_reset_vcpu_initial(cpu); ++ break; ++ case S390_CPU_RESET_NORMAL: ++ kvm_s390_reset_vcpu_normal(cpu); ++ break; ++ } + } + } + +diff --git a/target/s390x/kvm-stub.c b/target/s390x/kvm-stub.c +index 5152e2bdf1..c4cd497f85 100644 +--- a/target/s390x/kvm-stub.c ++++ b/target/s390x/kvm-stub.c +@@ -83,7 +83,15 @@ void kvm_s390_cmma_reset(void) + { + } + +-void kvm_s390_reset_vcpu(S390CPU *cpu) ++void kvm_s390_reset_vcpu_initial(S390CPU *cpu) ++{ ++} ++ ++void kvm_s390_reset_vcpu_clear(S390CPU *cpu) ++{ ++} ++ ++void kvm_s390_reset_vcpu_normal(S390CPU *cpu) + { + } + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 1c5bc7a2f9..75d82af6fc 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -151,6 +151,7 @@ static int cap_s390_irq; + static int cap_ri; + static int cap_gs; + static int cap_hpage_1m; ++static int cap_vcpu_resets; + + static int active_cmma; + +@@ -342,6 +343,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + cap_async_pf = kvm_check_extension(s, KVM_CAP_ASYNC_PF); + cap_mem_op = kvm_check_extension(s, KVM_CAP_S390_MEM_OP); + cap_s390_irq = kvm_check_extension(s, KVM_CAP_S390_INJECT_IRQ); ++ cap_vcpu_resets = kvm_check_extension(s, KVM_CAP_S390_VCPU_RESETS); + + if (!kvm_check_extension(s, KVM_CAP_S390_GMAP) + || !kvm_check_extension(s, KVM_CAP_S390_COW)) { +@@ -403,17 +405,41 @@ int kvm_arch_destroy_vcpu(CPUState *cs) + return 0; + } + +-void kvm_s390_reset_vcpu(S390CPU *cpu) ++static void kvm_s390_reset_vcpu(S390CPU *cpu, unsigned long type) + { + CPUState *cs = CPU(cpu); + +- /* The initial reset call is needed here to reset in-kernel +- * vcpu data that we can't access directly from QEMU +- * (i.e. with older kernels which don't support sync_regs/ONE_REG). +- * Before this ioctl cpu_synchronize_state() is called in common kvm +- * code (kvm-all) */ +- if (kvm_vcpu_ioctl(cs, KVM_S390_INITIAL_RESET, NULL)) { +- error_report("Initial CPU reset failed on CPU %i", cs->cpu_index); ++ /* ++ * The reset call is needed here to reset in-kernel vcpu data that ++ * we can't access directly from QEMU (i.e. with older kernels ++ * which don't support sync_regs/ONE_REG). Before this ioctl ++ * cpu_synchronize_state() is called in common kvm code ++ * (kvm-all). ++ */ ++ if (kvm_vcpu_ioctl(cs, type)) { ++ error_report("CPU reset failed on CPU %i type %lx", ++ cs->cpu_index, type); ++ } ++} ++ ++void kvm_s390_reset_vcpu_initial(S390CPU *cpu) ++{ ++ kvm_s390_reset_vcpu(cpu, KVM_S390_INITIAL_RESET); ++} ++ ++void kvm_s390_reset_vcpu_clear(S390CPU *cpu) ++{ ++ if (cap_vcpu_resets) { ++ kvm_s390_reset_vcpu(cpu, KVM_S390_CLEAR_RESET); ++ } else { ++ kvm_s390_reset_vcpu(cpu, KVM_S390_INITIAL_RESET); ++ } ++} ++ ++void kvm_s390_reset_vcpu_normal(S390CPU *cpu) ++{ ++ if (cap_vcpu_resets) { ++ kvm_s390_reset_vcpu(cpu, KVM_S390_NORMAL_RESET); + } + } + +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index caf985955b..0b21789796 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -34,7 +34,9 @@ int kvm_s390_assign_subch_ioeventfd(EventNotifier *notifier, uint32_t sch, + int vq, bool assign); + int kvm_s390_cmma_active(void); + void kvm_s390_cmma_reset(void); +-void kvm_s390_reset_vcpu(S390CPU *cpu); ++void kvm_s390_reset_vcpu_clear(S390CPU *cpu); ++void kvm_s390_reset_vcpu_normal(S390CPU *cpu); ++void kvm_s390_reset_vcpu_initial(S390CPU *cpu); + int kvm_s390_set_mem_limit(uint64_t new_limit, uint64_t *hw_limit); + void kvm_s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void kvm_s390_crypto_reset(void); +-- +2.27.0 + diff --git a/kvm-s390x-Add-unpack-facility-feature-to-GA1.patch b/kvm-s390x-Add-unpack-facility-feature-to-GA1.patch new file mode 100755 index 0000000000000000000000000000000000000000..8ffb7b05bc62a020a25e9cff1025f72ddcad07d9 --- /dev/null +++ b/kvm-s390x-Add-unpack-facility-feature-to-GA1.patch @@ -0,0 +1,76 @@ +From ab670456375f0d9b9b2d219fd497d04ec0009e1d Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:16 -0400 +Subject: [PATCH 34/42] s390x: Add unpack facility feature to GA1 + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-35-thuth@redhat.com> +Patchwork-id: 97052 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 34/38] s390x: Add unpack facility feature to GA1 +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +The unpack facility is an indication that diagnose 308 subcodes 8-10 +are available to the guest. That means, that the guest can put itself +into protected mode. + +Once it is in protected mode, the hardware stops any attempt of VM +introspection by the hypervisor. + +Some features are currently not supported in protected mode: + * vfio devices + * Migration + * Huge page backings + +Signed-off-by: Christian Borntraeger +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-17-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 572c0826615737f1c095b1b6d9e381ec40f72eb5) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/gen-features.c | 1 + + target/s390x/kvm.c | 8 ++++++++ + 2 files changed, 9 insertions(+) + +diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c +index 6278845b12..8ddeebc544 100644 +--- a/target/s390x/gen-features.c ++++ b/target/s390x/gen-features.c +@@ -562,6 +562,7 @@ static uint16_t full_GEN15_GA1[] = { + S390_FEAT_GROUP_MSA_EXT_9, + S390_FEAT_GROUP_MSA_EXT_9_PCKMO, + S390_FEAT_ETOKEN, ++ S390_FEAT_UNPACK, + }; + + /* Default features (in order of release) +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 56fe60c49c..84d7cadd09 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2407,6 +2407,14 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + clear_bit(S390_FEAT_BPB, model->features); + } + ++ /* ++ * If we have support for protected virtualization, indicate ++ * the protected virtualization IPL unpack facility. ++ */ ++ if (cap_protected) { ++ set_bit(S390_FEAT_UNPACK, model->features); ++ } ++ + /* We emulate a zPCI bus and AEN, therefore we don't need HW support */ + set_bit(S390_FEAT_ZPCI, model->features); + set_bit(S390_FEAT_ADAPTER_EVENT_NOTIFICATION, model->features); +-- +2.27.0 + diff --git a/kvm-s390x-Beautify-diag308-handling.patch b/kvm-s390x-Beautify-diag308-handling.patch new file mode 100755 index 0000000000000000000000000000000000000000..2ffe6a39f113a42d1e703c364972e90d18bfdb2a --- /dev/null +++ b/kvm-s390x-Beautify-diag308-handling.patch @@ -0,0 +1,130 @@ +From da81f2b579987ea12929f0ec803716bc16a93df7 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:49 -0400 +Subject: [PATCH 07/42] s390x: Beautify diag308 handling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-8-thuth@redhat.com> +Patchwork-id: 97022 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 07/38] s390x: Beautify diag308 handling +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's improve readability by: +* Using constants for the subcodes +* Moving parameter checking into a function +* Removing subcode > 6 check as the default case catches that + +Signed-off-by: Janosch Frank +Reviewed-by: Cornelia Huck +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Message-Id: <20191127175046.4911-6-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0b7fd817e0f383760e37ca9286150d5816cf0594) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/diag.c | 54 +++++++++++++++++++++++++++------------------ + 1 file changed, 32 insertions(+), 22 deletions(-) + +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index 53c2f81f2a..b5aec06d6b 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -53,6 +53,29 @@ int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + #define DIAG_308_RC_NO_CONF 0x0102 + #define DIAG_308_RC_INVALID 0x0402 + ++#define DIAG308_RESET_MOD_CLR 0 ++#define DIAG308_RESET_LOAD_NORM 1 ++#define DIAG308_LOAD_CLEAR 3 ++#define DIAG308_LOAD_NORMAL_DUMP 4 ++#define DIAG308_SET 5 ++#define DIAG308_STORE 6 ++ ++static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, ++ uintptr_t ra, bool write) ++{ ++ if ((r1 & 1) || (addr & ~TARGET_PAGE_MASK)) { ++ s390_program_interrupt(env, PGM_SPECIFICATION, ra); ++ return -1; ++ } ++ if (!address_space_access_valid(&address_space_memory, addr, ++ sizeof(IplParameterBlock), write, ++ MEMTXATTRS_UNSPECIFIED)) { ++ s390_program_interrupt(env, PGM_ADDRESSING, ra); ++ return -1; ++ } ++ return 0; ++} ++ + void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + { + CPUState *cs = env_cpu(env); +@@ -65,30 +88,24 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + return; + } + +- if ((subcode & ~0x0ffffULL) || (subcode > 6)) { ++ if (subcode & ~0x0ffffULL) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; + } + + switch (subcode) { +- case 0: ++ case DIAG308_RESET_MOD_CLR: + s390_ipl_reset_request(cs, S390_RESET_MODIFIED_CLEAR); + break; +- case 1: ++ case DIAG308_RESET_LOAD_NORM: + s390_ipl_reset_request(cs, S390_RESET_LOAD_NORMAL); + break; +- case 3: ++ case DIAG308_LOAD_CLEAR: ++ /* Well we still lack the clearing bit... */ + s390_ipl_reset_request(cs, S390_RESET_REIPL); + break; +- case 5: +- if ((r1 & 1) || (addr & 0x0fffULL)) { +- s390_program_interrupt(env, PGM_SPECIFICATION, ra); +- return; +- } +- if (!address_space_access_valid(&address_space_memory, addr, +- sizeof(IplParameterBlock), false, +- MEMTXATTRS_UNSPECIFIED)) { +- s390_program_interrupt(env, PGM_ADDRESSING, ra); ++ case DIAG308_SET: ++ if (diag308_parm_check(env, r1, addr, ra, false)) { + return; + } + iplb = g_new0(IplParameterBlock, 1); +@@ -110,15 +127,8 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + out: + g_free(iplb); + return; +- case 6: +- if ((r1 & 1) || (addr & 0x0fffULL)) { +- s390_program_interrupt(env, PGM_SPECIFICATION, ra); +- return; +- } +- if (!address_space_access_valid(&address_space_memory, addr, +- sizeof(IplParameterBlock), true, +- MEMTXATTRS_UNSPECIFIED)) { +- s390_program_interrupt(env, PGM_ADDRESSING, ra); ++ case DIAG308_STORE: ++ if (diag308_parm_check(env, r1, addr, ra, true)) { + return; + } + iplb = s390_ipl_get_iplb(); +-- +2.27.0 + diff --git a/kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch b/kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch new file mode 100755 index 0000000000000000000000000000000000000000..dab8acc6ccb554ad48c2a41c31bea5730753aa76 --- /dev/null +++ b/kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch @@ -0,0 +1,52 @@ +From 511638161566d4944a572a31d787eb27bbc0bc8e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:45 -0400 +Subject: [PATCH 03/42] s390x: Don't do a normal reset on the initial cpu +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-4-thuth@redhat.com> +Patchwork-id: 97017 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 03/38] s390x: Don't do a normal reset on the initial cpu +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The initiating cpu needs to be reset with an initial reset. While +doing a normal reset followed by a initial reset is not wrong per se, +the Ultravisor will only allow the correct reset to be performed. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Cornelia Huck +Message-Id: <20191127175046.4911-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit ec9227339fce99412830d44a37eb0bd2fadd5f75) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index c2c83d2fce..4ea01c53c0 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -348,6 +348,9 @@ static void s390_machine_reset(MachineState *machine) + break; + case S390_RESET_LOAD_NORMAL: + CPU_FOREACH(t) { ++ if (t == cs) { ++ continue; ++ } + run_on_cpu(t, s390_do_cpu_reset, RUN_ON_CPU_NULL); + } + subsystem_reset(); +-- +2.27.0 + diff --git a/kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch b/kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch new file mode 100755 index 0000000000000000000000000000000000000000..9b81586b251ceb7bd48bddf4a4b6781ccae265fd --- /dev/null +++ b/kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch @@ -0,0 +1,101 @@ +From bdad28b11e36f657cb8909e7223a7d8fc0948c2e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:51 -0400 +Subject: [PATCH 09/42] s390x: Fix cpu normal reset ri clearing +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-10-thuth@redhat.com> +Patchwork-id: 97029 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 09/38] s390x: Fix cpu normal reset ri clearing +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +As it turns out we need to clear the ri controls and PSW enablement +bit to be architecture compliant. + +Signed-off-by: Janosch Frank +Reviewed-by: Christian Borntraeger +Message-Id: <20191203132813.2734-4-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit e893baee70149896d1e43e341da4d6c614037d5d) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 7 ++++++- + target/s390x/cpu.h | 7 ++++++- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index bd39cb54b7..99ea09085a 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -100,7 +100,7 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + case S390_CPU_RESET_INITIAL: + /* initial reset does not clear everything! */ + memset(&env->start_initial_reset_fields, 0, +- offsetof(CPUS390XState, end_reset_fields) - ++ offsetof(CPUS390XState, start_normal_reset_fields) - + offsetof(CPUS390XState, start_initial_reset_fields)); + + /* architectured initial value for Breaking-Event-Address register */ +@@ -123,6 +123,11 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + &env->fpu_status); + /* fall through */ + case S390_CPU_RESET_NORMAL: ++ env->psw.mask &= ~PSW_MASK_RI; ++ memset(&env->start_normal_reset_fields, 0, ++ offsetof(CPUS390XState, end_reset_fields) - ++ offsetof(CPUS390XState, start_normal_reset_fields)); ++ + env->pfault_token = -1UL; + env->bpbc = false; + break; +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index d2af13b345..7e1c18d596 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -58,7 +58,6 @@ struct CPUS390XState { + */ + uint64_t vregs[32][2] QEMU_ALIGNED(16); /* vector registers */ + uint32_t aregs[16]; /* access registers */ +- uint8_t riccb[64]; /* runtime instrumentation control */ + uint64_t gscb[4]; /* guarded storage control */ + uint64_t etoken; /* etoken */ + uint64_t etoken_extension; /* etoken extension */ +@@ -114,6 +113,10 @@ struct CPUS390XState { + uint64_t gbea; + uint64_t pp; + ++ /* Fields up to this point are not cleared by normal CPU reset */ ++ struct {} start_normal_reset_fields; ++ uint8_t riccb[64]; /* runtime instrumentation control */ ++ + /* Fields up to this point are cleared by a CPU reset */ + struct {} end_reset_fields; + +@@ -252,6 +255,7 @@ extern const VMStateDescription vmstate_s390_cpu; + #undef PSW_SHIFT_ASC + #undef PSW_MASK_CC + #undef PSW_MASK_PM ++#undef PSW_MASK_RI + #undef PSW_SHIFT_MASK_PM + #undef PSW_MASK_64 + #undef PSW_MASK_32 +@@ -273,6 +277,7 @@ extern const VMStateDescription vmstate_s390_cpu; + #define PSW_MASK_CC 0x0000300000000000ULL + #define PSW_MASK_PM 0x00000F0000000000ULL + #define PSW_SHIFT_MASK_PM 40 ++#define PSW_MASK_RI 0x0000008000000000ULL + #define PSW_MASK_64 0x0000000100000000ULL + #define PSW_MASK_32 0x0000000080000000ULL + #define PSW_MASK_ESA_ADDR 0x000000007fffffffULL +-- +2.27.0 + diff --git a/kvm-s390x-Move-clear-reset.patch b/kvm-s390x-Move-clear-reset.patch new file mode 100755 index 0000000000000000000000000000000000000000..7c1614cde2711743084803390abacc302dee4ae6 --- /dev/null +++ b/kvm-s390x-Move-clear-reset.patch @@ -0,0 +1,146 @@ +From f268cc7071ecb4322c03f3183acbcf90421da3c7 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:48 -0400 +Subject: [PATCH 06/42] s390x: Move clear reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-7-thuth@redhat.com> +Patchwork-id: 97019 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 06/38] s390x: Move clear reset +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's also move the clear reset function into the reset handler. + +Signed-off-by: Janosch Frank +Message-Id: <20191127175046.4911-5-frankja@linux.ibm.com> +Reviewed-by: David Hildenbrand +Reviewed-by: Thomas Huth +Signed-off-by: Cornelia Huck +(cherry picked from commit eb8adcc3e9e3b8405c104ede72cf9f3bb2a5e226) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu-qom.h | 1 + + target/s390x/cpu.c | 58 +++++++++++++----------------------------- + 2 files changed, 18 insertions(+), 41 deletions(-) + +diff --git a/target/s390x/cpu-qom.h b/target/s390x/cpu-qom.h +index 6f0a12042e..dbe5346ec9 100644 +--- a/target/s390x/cpu-qom.h ++++ b/target/s390x/cpu-qom.h +@@ -37,6 +37,7 @@ typedef struct S390CPUDef S390CPUDef; + typedef enum cpu_reset_type { + S390_CPU_RESET_NORMAL, + S390_CPU_RESET_INITIAL, ++ S390_CPU_RESET_CLEAR, + } cpu_reset_type; + + /** +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index ca62fe7685..bd39cb54b7 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -94,6 +94,9 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); + + switch (type) { ++ case S390_CPU_RESET_CLEAR: ++ memset(env, 0, offsetof(CPUS390XState, start_initial_reset_fields)); ++ /* fall through */ + case S390_CPU_RESET_INITIAL: + /* initial reset does not clear everything! */ + memset(&env->start_initial_reset_fields, 0, +@@ -107,6 +110,14 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + env->cregs[0] = CR0_RESET; + env->cregs[14] = CR14_RESET; + ++#if defined(CONFIG_USER_ONLY) ++ /* user mode should always be allowed to use the full FPU */ ++ env->cregs[0] |= CR0_AFP; ++ if (s390_has_feat(S390_FEAT_VECTOR)) { ++ env->cregs[0] |= CR0_VECTOR; ++ } ++#endif ++ + /* tininess for underflow is detected before rounding */ + set_float_detect_tininess(float_tininess_before_rounding, + &env->fpu_status); +@@ -125,46 +136,6 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + } + } + +-/* CPUClass:reset() */ +-static void s390_cpu_full_reset(CPUState *s) +-{ +- S390CPU *cpu = S390_CPU(s); +- S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); +- CPUS390XState *env = &cpu->env; +- +- scc->parent_reset(s); +- cpu->env.sigp_order = 0; +- s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); +- +- memset(env, 0, offsetof(CPUS390XState, end_reset_fields)); +- +- /* architectured initial values for CR 0 and 14 */ +- env->cregs[0] = CR0_RESET; +- env->cregs[14] = CR14_RESET; +- +-#if defined(CONFIG_USER_ONLY) +- /* user mode should always be allowed to use the full FPU */ +- env->cregs[0] |= CR0_AFP; +- if (s390_has_feat(S390_FEAT_VECTOR)) { +- env->cregs[0] |= CR0_VECTOR; +- } +-#endif +- +- /* architectured initial value for Breaking-Event-Address register */ +- env->gbea = 1; +- +- env->pfault_token = -1UL; +- +- /* tininess for underflow is detected before rounding */ +- set_float_detect_tininess(float_tininess_before_rounding, +- &env->fpu_status); +- +- /* Reset state inside the kernel that we cannot access yet from QEMU. */ +- if (kvm_enabled()) { +- kvm_s390_reset_vcpu(cpu); +- } +-} +- + #if !defined(CONFIG_USER_ONLY) + static void s390_cpu_machine_reset_cb(void *opaque) + { +@@ -456,6 +427,11 @@ static Property s390x_cpu_properties[] = { + DEFINE_PROP_END_OF_LIST() + }; + ++static void s390_cpu_reset_full(CPUState *s) ++{ ++ return s390_cpu_reset(s, S390_CPU_RESET_CLEAR); ++} ++ + static void s390_cpu_class_init(ObjectClass *oc, void *data) + { + S390CPUClass *scc = S390_CPU_CLASS(oc); +@@ -472,7 +448,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + scc->load_normal = s390_cpu_load_normal; + #endif + scc->reset = s390_cpu_reset; +- cc->reset = s390_cpu_full_reset; ++ cc->reset = s390_cpu_reset_full; + cc->class_by_name = s390_cpu_class_by_name, + cc->has_work = s390_cpu_has_work; + #ifdef CONFIG_TCG +-- +2.27.0 + diff --git a/kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch b/kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch new file mode 100755 index 0000000000000000000000000000000000000000..ac183cfe936036ed7ed99aeaa170e3f71c35491a --- /dev/null +++ b/kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch @@ -0,0 +1,83 @@ +From c9eee8aeed39976293e0d857039fcf729b821e83 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:03 -0400 +Subject: [PATCH 21/42] s390x: Move diagnose 308 subcodes and rcs into ipl.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-22-thuth@redhat.com> +Patchwork-id: 97032 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 21/38] s390x: Move diagnose 308 subcodes and rcs into ipl.h +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +They are part of the IPL process, so let's put them into the ipl +header. + +Signed-off-by: Janosch Frank +Reviewed-by: Cornelia Huck +Reviewed-by: Christian Borntraeger +Reviewed-by: David Hildenbrand +Message-Id: <20200319131921.2367-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 9b39d29470e9dbef24ee842a44ea56bd92b855ea) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.h | 11 +++++++++++ + target/s390x/diag.c | 11 ----------- + 2 files changed, 11 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index 3e44abe1c6..a5665e6bfd 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -159,6 +159,17 @@ struct S390IPLState { + typedef struct S390IPLState S390IPLState; + QEMU_BUILD_BUG_MSG(offsetof(S390IPLState, iplb) & 3, "alignment of iplb wrong"); + ++#define DIAG_308_RC_OK 0x0001 ++#define DIAG_308_RC_NO_CONF 0x0102 ++#define DIAG_308_RC_INVALID 0x0402 ++ ++#define DIAG308_RESET_MOD_CLR 0 ++#define DIAG308_RESET_LOAD_NORM 1 ++#define DIAG308_LOAD_CLEAR 3 ++#define DIAG308_LOAD_NORMAL_DUMP 4 ++#define DIAG308_SET 5 ++#define DIAG308_STORE 6 ++ + #define S390_IPL_TYPE_FCP 0x00 + #define S390_IPL_TYPE_CCW 0x02 + #define S390_IPL_TYPE_QEMU_SCSI 0xff +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index 54e5670b3f..8aba6341f9 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -49,17 +49,6 @@ int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + return diag288_class->handle_timer(diag288, func, timeout); + } + +-#define DIAG_308_RC_OK 0x0001 +-#define DIAG_308_RC_NO_CONF 0x0102 +-#define DIAG_308_RC_INVALID 0x0402 +- +-#define DIAG308_RESET_MOD_CLR 0 +-#define DIAG308_RESET_LOAD_NORM 1 +-#define DIAG308_LOAD_CLEAR 3 +-#define DIAG308_LOAD_NORMAL_DUMP 4 +-#define DIAG308_SET 5 +-#define DIAG308_STORE 6 +- + static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, + uintptr_t ra, bool write) + { +-- +2.27.0 + diff --git a/kvm-s390x-Move-initial-reset.patch b/kvm-s390x-Move-initial-reset.patch new file mode 100755 index 0000000000000000000000000000000000000000..0f2e9abedb28cd6bc98249d7454a3ebd0d0ed99f --- /dev/null +++ b/kvm-s390x-Move-initial-reset.patch @@ -0,0 +1,159 @@ +From 0d1c0adf25a323be0663863ebe44a6aefb5f7baf Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:47 -0400 +Subject: [PATCH 05/42] s390x: Move initial reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-6-thuth@redhat.com> +Patchwork-id: 97024 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 05/38] s390x: Move initial reset +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's move the intial reset into the reset handler and cleanup +afterwards. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20191128083723.11937-1-frankja@linux.ibm.com> +Reviewed-by: Thomas Huth +Signed-off-by: Cornelia Huck +(cherry picked from commit 81b9222358e5c8f666f0d86057c75e40531d804c) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu-qom.h | 2 +- + target/s390x/cpu.c | 46 +++++++++++++++++------------------------- + target/s390x/cpu.h | 2 +- + target/s390x/sigp.c | 2 +- + 4 files changed, 21 insertions(+), 31 deletions(-) + +diff --git a/target/s390x/cpu-qom.h b/target/s390x/cpu-qom.h +index f3b71bac67..6f0a12042e 100644 +--- a/target/s390x/cpu-qom.h ++++ b/target/s390x/cpu-qom.h +@@ -36,6 +36,7 @@ typedef struct S390CPUDef S390CPUDef; + + typedef enum cpu_reset_type { + S390_CPU_RESET_NORMAL, ++ S390_CPU_RESET_INITIAL, + } cpu_reset_type; + + /** +@@ -62,7 +63,6 @@ typedef struct S390CPUClass { + void (*parent_reset)(CPUState *cpu); + void (*load_normal)(CPUState *cpu); + void (*reset)(CPUState *cpu, cpu_reset_type type); +- void (*initial_cpu_reset)(CPUState *cpu); + } S390CPUClass; + + typedef struct S390CPU S390CPU; +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 67d6fbfa44..ca62fe7685 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -94,6 +94,23 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); + + switch (type) { ++ case S390_CPU_RESET_INITIAL: ++ /* initial reset does not clear everything! */ ++ memset(&env->start_initial_reset_fields, 0, ++ offsetof(CPUS390XState, end_reset_fields) - ++ offsetof(CPUS390XState, start_initial_reset_fields)); ++ ++ /* architectured initial value for Breaking-Event-Address register */ ++ env->gbea = 1; ++ ++ /* architectured initial values for CR 0 and 14 */ ++ env->cregs[0] = CR0_RESET; ++ env->cregs[14] = CR14_RESET; ++ ++ /* tininess for underflow is detected before rounding */ ++ set_float_detect_tininess(float_tininess_before_rounding, ++ &env->fpu_status); ++ /* fall through */ + case S390_CPU_RESET_NORMAL: + env->pfault_token = -1UL; + env->bpbc = false; +@@ -101,35 +118,9 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + default: + g_assert_not_reached(); + } +-} +- +-/* S390CPUClass::initial_reset() */ +-static void s390_cpu_initial_reset(CPUState *s) +-{ +- S390CPU *cpu = S390_CPU(s); +- CPUS390XState *env = &cpu->env; +- +- s390_cpu_reset(s, S390_CPU_RESET_NORMAL); +- /* initial reset does not clear everything! */ +- memset(&env->start_initial_reset_fields, 0, +- offsetof(CPUS390XState, end_reset_fields) - +- offsetof(CPUS390XState, start_initial_reset_fields)); +- +- /* architectured initial values for CR 0 and 14 */ +- env->cregs[0] = CR0_RESET; +- env->cregs[14] = CR14_RESET; +- +- /* architectured initial value for Breaking-Event-Address register */ +- env->gbea = 1; +- +- env->pfault_token = -1UL; +- +- /* tininess for underflow is detected before rounding */ +- set_float_detect_tininess(float_tininess_before_rounding, +- &env->fpu_status); + + /* Reset state inside the kernel that we cannot access yet from QEMU. */ +- if (kvm_enabled()) { ++ if (kvm_enabled() && type != S390_CPU_RESET_NORMAL) { + kvm_s390_reset_vcpu(cpu); + } + } +@@ -481,7 +472,6 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + scc->load_normal = s390_cpu_load_normal; + #endif + scc->reset = s390_cpu_reset; +- scc->initial_cpu_reset = s390_cpu_initial_reset; + cc->reset = s390_cpu_full_reset; + cc->class_by_name = s390_cpu_class_by_name, + cc->has_work = s390_cpu_has_work; +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 18123dfd5b..d2af13b345 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -748,7 +748,7 @@ static inline void s390_do_cpu_initial_reset(CPUState *cs, run_on_cpu_data arg) + { + S390CPUClass *scc = S390_CPU_GET_CLASS(cs); + +- scc->initial_cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_INITIAL); + } + + static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg) +diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c +index 850139b9cd..727875bb4a 100644 +--- a/target/s390x/sigp.c ++++ b/target/s390x/sigp.c +@@ -254,7 +254,7 @@ static void sigp_initial_cpu_reset(CPUState *cs, run_on_cpu_data arg) + SigpInfo *si = arg.host_ptr; + + cpu_synchronize_state(cs); +- scc->initial_cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_INITIAL); + cpu_synchronize_post_reset(cs); + si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } +-- +2.27.0 + diff --git a/kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch b/kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch new file mode 100755 index 0000000000000000000000000000000000000000..81a4368c15fee9f5120dad2b622ec6ded6ee57dc --- /dev/null +++ b/kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch @@ -0,0 +1,145 @@ +From 53b5a7f83f3e6b94c66cbbb97ea42bbf02cb96b4 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:46 -0400 +Subject: [PATCH 04/42] s390x: Move reset normal to shared reset handler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-5-thuth@redhat.com> +Patchwork-id: 97018 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 04/38] s390x: Move reset normal to shared reset handler +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's start moving the cpu reset functions into a single function with +a switch/case, so we can later use fallthroughs and share more code +between resets. + +This patch introduces the reset function by renaming cpu_reset(). + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20191127175046.4911-3-frankja@linux.ibm.com> +Reviewed-by: Thomas Huth +Signed-off-by: Cornelia Huck +(cherry picked from commit eac4f82791f1807c423e85670837db103b9d59b3) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu-qom.h | 6 +++++- + target/s390x/cpu.c | 19 +++++++++++++------ + target/s390x/cpu.h | 2 +- + target/s390x/sigp.c | 2 +- + 4 files changed, 20 insertions(+), 9 deletions(-) + +diff --git a/target/s390x/cpu-qom.h b/target/s390x/cpu-qom.h +index b809ec8418..f3b71bac67 100644 +--- a/target/s390x/cpu-qom.h ++++ b/target/s390x/cpu-qom.h +@@ -34,6 +34,10 @@ + typedef struct S390CPUModel S390CPUModel; + typedef struct S390CPUDef S390CPUDef; + ++typedef enum cpu_reset_type { ++ S390_CPU_RESET_NORMAL, ++} cpu_reset_type; ++ + /** + * S390CPUClass: + * @parent_realize: The parent class' realize handler. +@@ -57,7 +61,7 @@ typedef struct S390CPUClass { + DeviceRealize parent_realize; + void (*parent_reset)(CPUState *cpu); + void (*load_normal)(CPUState *cpu); +- void (*cpu_reset)(CPUState *cpu); ++ void (*reset)(CPUState *cpu, cpu_reset_type type); + void (*initial_cpu_reset)(CPUState *cpu); + } S390CPUClass; + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 3abe7e80fd..67d6fbfa44 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -82,18 +82,25 @@ static void s390_cpu_load_normal(CPUState *s) + } + #endif + +-/* S390CPUClass::cpu_reset() */ +-static void s390_cpu_reset(CPUState *s) ++/* S390CPUClass::reset() */ ++static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + { + S390CPU *cpu = S390_CPU(s); + S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); + CPUS390XState *env = &cpu->env; + +- env->pfault_token = -1UL; +- env->bpbc = false; + scc->parent_reset(s); + cpu->env.sigp_order = 0; + s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); ++ ++ switch (type) { ++ case S390_CPU_RESET_NORMAL: ++ env->pfault_token = -1UL; ++ env->bpbc = false; ++ break; ++ default: ++ g_assert_not_reached(); ++ } + } + + /* S390CPUClass::initial_reset() */ +@@ -102,7 +109,7 @@ static void s390_cpu_initial_reset(CPUState *s) + S390CPU *cpu = S390_CPU(s); + CPUS390XState *env = &cpu->env; + +- s390_cpu_reset(s); ++ s390_cpu_reset(s, S390_CPU_RESET_NORMAL); + /* initial reset does not clear everything! */ + memset(&env->start_initial_reset_fields, 0, + offsetof(CPUS390XState, end_reset_fields) - +@@ -473,7 +480,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + #if !defined(CONFIG_USER_ONLY) + scc->load_normal = s390_cpu_load_normal; + #endif +- scc->cpu_reset = s390_cpu_reset; ++ scc->reset = s390_cpu_reset; + scc->initial_cpu_reset = s390_cpu_initial_reset; + cc->reset = s390_cpu_full_reset; + cc->class_by_name = s390_cpu_class_by_name, +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 17460ed7b3..18123dfd5b 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -741,7 +741,7 @@ static inline void s390_do_cpu_reset(CPUState *cs, run_on_cpu_data arg) + { + S390CPUClass *scc = S390_CPU_GET_CLASS(cs); + +- scc->cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_NORMAL); + } + + static inline void s390_do_cpu_initial_reset(CPUState *cs, run_on_cpu_data arg) +diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c +index 2ce22d4dc1..850139b9cd 100644 +--- a/target/s390x/sigp.c ++++ b/target/s390x/sigp.c +@@ -266,7 +266,7 @@ static void sigp_cpu_reset(CPUState *cs, run_on_cpu_data arg) + SigpInfo *si = arg.host_ptr; + + cpu_synchronize_state(cs); +- scc->cpu_reset(cs); ++ scc->reset(cs, S390_CPU_RESET_NORMAL); + cpu_synchronize_post_reset(cs); + si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } +-- +2.27.0 + diff --git a/kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch b/kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch new file mode 100755 index 0000000000000000000000000000000000000000..9447240f0a2e13ac3787a6cd7d0050e3cd43cd1d --- /dev/null +++ b/kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch @@ -0,0 +1,70 @@ +From 7171a794e8a7d91805516174187addc3b8e6b423 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:54 -0400 +Subject: [PATCH 12/42] s390x: Properly fetch and test the short psw on diag308 + subc 0/1 + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-13-thuth@redhat.com> +Patchwork-id: 97025 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 12/38] s390x: Properly fetch and test the short psw on diag308 subc 0/1 +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +We need to actually fetch the cpu mask and set it. As we invert the +short psw indication in the mask, SIE will report a specification +exception, if it wasn't present in the reset psw. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20191129142025.21453-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 104130cb7c106378dab944397c6a455c4a6d552f) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 12 ++++++++++-- + target/s390x/cpu.h | 1 + + 2 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 99ea09085a..625daeedd1 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -76,8 +76,16 @@ static bool s390_cpu_has_work(CPUState *cs) + static void s390_cpu_load_normal(CPUState *s) + { + S390CPU *cpu = S390_CPU(s); +- cpu->env.psw.addr = ldl_phys(s->as, 4) & PSW_MASK_ESA_ADDR; +- cpu->env.psw.mask = PSW_MASK_32 | PSW_MASK_64; ++ uint64_t spsw = ldq_phys(s->as, 0); ++ ++ cpu->env.psw.mask = spsw & 0xffffffff80000000ULL; ++ /* ++ * Invert short psw indication, so SIE will report a specification ++ * exception if it was not set. ++ */ ++ cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; ++ cpu->env.psw.addr = spsw & 0x7fffffffULL; ++ + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } + #endif +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 7e1c18d596..7f5fa1d35b 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -269,6 +269,7 @@ extern const VMStateDescription vmstate_s390_cpu; + #define PSW_MASK_EXT 0x0100000000000000ULL + #define PSW_MASK_KEY 0x00F0000000000000ULL + #define PSW_SHIFT_KEY 52 ++#define PSW_MASK_SHORTPSW 0x0008000000000000ULL + #define PSW_MASK_MCHECK 0x0004000000000000ULL + #define PSW_MASK_WAIT 0x0002000000000000ULL + #define PSW_MASK_PSTATE 0x0001000000000000ULL +-- +2.27.0 + diff --git a/kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch b/kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch new file mode 100755 index 0000000000000000000000000000000000000000..b1c7e01a99ef8b3d915c839a83ebcf77b5ac5b66 --- /dev/null +++ b/kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch @@ -0,0 +1,87 @@ +From 4bd5ae889376816238ecad1bce054b0e198cde2b Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:55 -0400 +Subject: [PATCH 13/42] s390x: Rename and use constants for short PSW address + and mask + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-14-thuth@redhat.com> +Patchwork-id: 97050 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 13/38] s390x: Rename and use constants for short PSW address and mask +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Let's rename PSW_MASK_ESA_ADDR to PSW_MASK_SHORT_ADDR because we're +not working with a ESA PSW which would not support the extended +addressing bit. Also let's actually use it. + +Additionally we introduce PSW_MASK_SHORT_CTRL and use it throughout +the codebase. + +Signed-off-by: Janosch Frank +Reviewed-by: Christian Borntraeger +Reviewed-by: David Hildenbrand +Message-Id: <20200227092341.38558-1-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b6c2dbd7214b0b2396e1dcf9668c8b48ab571115) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.c | 2 +- + target/s390x/cpu.c | 4 ++-- + target/s390x/cpu.h | 3 ++- + 3 files changed, 5 insertions(+), 4 deletions(-) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index ca544d64c5..0b7548a549 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -179,7 +179,7 @@ static void s390_ipl_realize(DeviceState *dev, Error **errp) + /* if not Linux load the address of the (short) IPL PSW */ + ipl_psw = rom_ptr(4, 4); + if (ipl_psw) { +- pentry = be32_to_cpu(*ipl_psw) & 0x7fffffffUL; ++ pentry = be32_to_cpu(*ipl_psw) & PSW_MASK_SHORT_ADDR; + } else { + error_setg(&err, "Could not get IPL PSW"); + goto error; +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 625daeedd1..e538a4a3e2 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -78,13 +78,13 @@ static void s390_cpu_load_normal(CPUState *s) + S390CPU *cpu = S390_CPU(s); + uint64_t spsw = ldq_phys(s->as, 0); + +- cpu->env.psw.mask = spsw & 0xffffffff80000000ULL; ++ cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL; + /* + * Invert short psw indication, so SIE will report a specification + * exception if it was not set. + */ + cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; +- cpu->env.psw.addr = spsw & 0x7fffffffULL; ++ cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR; + + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 7f5fa1d35b..1ff84e6b3a 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -281,7 +281,8 @@ extern const VMStateDescription vmstate_s390_cpu; + #define PSW_MASK_RI 0x0000008000000000ULL + #define PSW_MASK_64 0x0000000100000000ULL + #define PSW_MASK_32 0x0000000080000000ULL +-#define PSW_MASK_ESA_ADDR 0x000000007fffffffULL ++#define PSW_MASK_SHORT_ADDR 0x000000007fffffffULL ++#define PSW_MASK_SHORT_CTRL 0xffffffff80000000ULL + + #undef PSW_ASC_PRIMARY + #undef PSW_ASC_ACCREG +-- +2.27.0 + diff --git a/kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch b/kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch new file mode 100755 index 0000000000000000000000000000000000000000..8ce762511dfefa092c4941626991ef84a776372c --- /dev/null +++ b/kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch @@ -0,0 +1,119 @@ +From 04d4e7eda95316b64ea9dc0f4ca8801d531652e7 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:41 -0400 +Subject: [PATCH 07/12] s390x/css: Refactor the css_queue_crw() routine + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-8-cohuck@redhat.com> +Patchwork-id: 97700 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 7/9] s390x/css: Refactor the css_queue_crw() routine +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Eric Farman + +We have a use case (vfio-ccw) where a CRW is already built and +ready to use. Rather than teasing out the components just to +reassemble it later, let's rework this code so we can queue a +fully-qualified CRW directly. + +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-6-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f6dde1b012e678aa64339520ef7519ec04026cf1) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/css.c | 44 ++++++++++++++++++++++++++++-------------- + include/hw/s390x/css.h | 1 + + 2 files changed, 30 insertions(+), 15 deletions(-) + +diff --git a/hw/s390x/css.c b/hw/s390x/css.c +index 71fd3f9a00..a8de8a0c84 100644 +--- a/hw/s390x/css.c ++++ b/hw/s390x/css.c +@@ -2170,30 +2170,23 @@ void css_subch_assign(uint8_t cssid, uint8_t ssid, uint16_t schid, + } + } + +-void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, +- int chain, uint16_t rsid) ++void css_crw_add_to_queue(CRW crw) + { + CrwContainer *crw_cont; + +- trace_css_crw(rsc, erc, rsid, chain ? "(chained)" : ""); ++ trace_css_crw((crw.flags & CRW_FLAGS_MASK_RSC) >> 8, ++ crw.flags & CRW_FLAGS_MASK_ERC, ++ crw.rsid, ++ (crw.flags & CRW_FLAGS_MASK_C) ? "(chained)" : ""); ++ + /* TODO: Maybe use a static crw pool? */ + crw_cont = g_try_new0(CrwContainer, 1); + if (!crw_cont) { + channel_subsys.crws_lost = true; + return; + } +- crw_cont->crw.flags = (rsc << 8) | erc; +- if (solicited) { +- crw_cont->crw.flags |= CRW_FLAGS_MASK_S; +- } +- if (chain) { +- crw_cont->crw.flags |= CRW_FLAGS_MASK_C; +- } +- crw_cont->crw.rsid = rsid; +- if (channel_subsys.crws_lost) { +- crw_cont->crw.flags |= CRW_FLAGS_MASK_R; +- channel_subsys.crws_lost = false; +- } ++ ++ crw_cont->crw = crw; + + QTAILQ_INSERT_TAIL(&channel_subsys.pending_crws, crw_cont, sibling); + +@@ -2204,6 +2197,27 @@ void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, + } + } + ++void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, ++ int chain, uint16_t rsid) ++{ ++ CRW crw; ++ ++ crw.flags = (rsc << 8) | erc; ++ if (solicited) { ++ crw.flags |= CRW_FLAGS_MASK_S; ++ } ++ if (chain) { ++ crw.flags |= CRW_FLAGS_MASK_C; ++ } ++ crw.rsid = rsid; ++ if (channel_subsys.crws_lost) { ++ crw.flags |= CRW_FLAGS_MASK_R; ++ channel_subsys.crws_lost = false; ++ } ++ ++ css_crw_add_to_queue(crw); ++} ++ + void css_generate_sch_crws(uint8_t cssid, uint8_t ssid, uint16_t schid, + int hotplugged, int add) + { +diff --git a/include/hw/s390x/css.h b/include/hw/s390x/css.h +index 7e3a5e7433..08c869ab0a 100644 +--- a/include/hw/s390x/css.h ++++ b/include/hw/s390x/css.h +@@ -205,6 +205,7 @@ void copy_scsw_to_guest(SCSW *dest, const SCSW *src); + void css_inject_io_interrupt(SubchDev *sch); + void css_reset(void); + void css_reset_sch(SubchDev *sch); ++void css_crw_add_to_queue(CRW crw); + void css_queue_crw(uint8_t rsc, uint8_t erc, int solicited, + int chain, uint16_t rsid); + void css_generate_sch_crws(uint8_t cssid, uint8_t ssid, uint16_t schid, +-- +2.27.0 + diff --git a/kvm-s390x-fix-build-for-without-default-devices.patch b/kvm-s390x-fix-build-for-without-default-devices.patch new file mode 100755 index 0000000000000000000000000000000000000000..6567c0463dec96454a15aa652f76c573d54291c8 --- /dev/null +++ b/kvm-s390x-fix-build-for-without-default-devices.patch @@ -0,0 +1,74 @@ +From d86158eeb752242791e3f94172ed020204040250 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:46 -0500 +Subject: [PATCH 7/7] s390x: fix build for --without-default-devices +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-8-cohuck@redhat.com> +Patchwork-id: 100681 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 7/7] s390x: fix build for --without-default-devices +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +s390-pci-vfio.c calls into the vfio code, so we need it to be +built conditionally on vfio (which implies CONFIG_LINUX). + +Fixes: cd7498d07fbb ("s390x/pci: Add routine to get the vfio dma available count") +Reported-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Matthew Rosato +Message-Id: <20201103123237.718242-1-cohuck@redhat.com> +Acked-by: Greg Kurz +Tested-by: Greg Kurz +Signed-off-by: Cornelia Huck +(cherry picked from commit 77280d33bc9cfdbfb5b5d462259d644f5aefe9b3) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/meson.build + include/hw/s390x/s390-pci-vfio.h + --> adaptions due to missing Meson rework + +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/Makefile.objs | 2 +- + include/hw/s390x/s390-pci-vfio.h | 3 ++- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index 43756c9437d..dbef4b8906c 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -7,7 +7,7 @@ obj-y += ipl.o + obj-y += css.o + obj-$(CONFIG_S390_CCW_VIRTIO) += s390-virtio-ccw.o + obj-$(CONFIG_TERMINAL3270) += 3270-ccw.o +-obj-$(CONFIG_LINUX) += s390-pci-vfio.o ++obj-$(CONFIG_VFIO) += s390-pci-vfio.o + ifeq ($(CONFIG_VIRTIO_CCW),y) + obj-y += virtio-ccw.o + obj-$(CONFIG_VIRTIO_SERIAL) += virtio-ccw-serial.o +diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h +index 539bcf04eb5..685b136d46b 100644 +--- a/include/hw/s390x/s390-pci-vfio.h ++++ b/include/hw/s390x/s390-pci-vfio.h +@@ -13,8 +13,9 @@ + #define HW_S390_PCI_VFIO_H + + #include "hw/s390x/s390-pci-bus.h" ++#include "config-devices.h" + +-#ifdef CONFIG_LINUX ++#ifdef CONFIG_VFIO + bool s390_pci_update_dma_avail(int fd, unsigned int *avail); + S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, + S390PCIBusDevice *pbdev); +-- +2.27.0 + diff --git a/kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch b/kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch new file mode 100755 index 0000000000000000000000000000000000000000..8b9294e643ab50a5a25a0f045678523fe2655d47 --- /dev/null +++ b/kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch @@ -0,0 +1,82 @@ +From 536b6081c0739bebbb33583370f62116d0cb42da Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:01 -0400 +Subject: [PATCH 19/42] s390x: ipl: Consolidate iplb validity check into one + function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-20-thuth@redhat.com> +Patchwork-id: 97038 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 19/38] s390x: ipl: Consolidate iplb validity check into one function +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +It's nicer to just call one function than calling a function for each +possible iplb type. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20200310090950.61172-1-frankja@linux.ibm.com> +Reviewed-by: Christian Borntraeger +Signed-off-by: Christian Borntraeger +(cherry picked from commit 94c21436e5a89143f8b9cb4d089d1a2f3f4fd377) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.h | 18 +++++++++--------- + target/s390x/diag.c | 2 +- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index d4813105db..3e44abe1c6 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -173,16 +173,16 @@ static inline bool iplb_valid_len(IplParameterBlock *iplb) + return be32_to_cpu(iplb->len) <= sizeof(IplParameterBlock); + } + +-static inline bool iplb_valid_ccw(IplParameterBlock *iplb) ++static inline bool iplb_valid(IplParameterBlock *iplb) + { +- return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_CCW_LEN && +- iplb->pbt == S390_IPL_TYPE_CCW; +-} +- +-static inline bool iplb_valid_fcp(IplParameterBlock *iplb) +-{ +- return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_FCP_LEN && +- iplb->pbt == S390_IPL_TYPE_FCP; ++ switch (iplb->pbt) { ++ case S390_IPL_TYPE_FCP: ++ return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_FCP_LEN; ++ case S390_IPL_TYPE_CCW: ++ return be32_to_cpu(iplb->len) >= S390_IPLB_MIN_CCW_LEN; ++ default: ++ return false; ++ } + } + + #endif +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index b5aec06d6b..54e5670b3f 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -117,7 +117,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + + cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len)); + +- if (!iplb_valid_ccw(iplb) && !iplb_valid_fcp(iplb)) { ++ if (!iplb_valid(iplb)) { + env->regs[r1 + 1] = DIAG_308_RC_INVALID; + goto out; + } +-- +2.27.0 + diff --git a/kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch b/kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch new file mode 100755 index 0000000000000000000000000000000000000000..9882324a823a9eeecb4fdd83b34f4ed3ff81723c --- /dev/null +++ b/kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch @@ -0,0 +1,83 @@ +From 999cf62d870ff9aa8e9609fcbbcefef9ae1aceb6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:50 -0400 +Subject: [PATCH 08/42] s390x: kvm: Make kvm_sclp_service_call void +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-9-thuth@redhat.com> +Patchwork-id: 97030 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 08/38] s390x: kvm: Make kvm_sclp_service_call void +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +It defaults to returning 0 anyway and that return value is not +necessary, as 0 is also the default rc that the caller would return. + +While doing that we can simplify the logic a bit and return early if +we inject a PGM exception. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Message-Id: <20191129091713.4582-1-frankja@linux.ibm.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Cornelia Huck +(cherry picked from commit 15b6c0370c3e2774fd9ffda5c10c6e36952e8eb6) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/kvm.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index a02d569537..1c5bc7a2f9 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -1159,13 +1159,13 @@ void kvm_s390_access_exception(S390CPU *cpu, uint16_t code, uint64_t te_code) + kvm_s390_vcpu_interrupt(cpu, &irq); + } + +-static int kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, ++static void kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, + uint16_t ipbh0) + { + CPUS390XState *env = &cpu->env; + uint64_t sccb; + uint32_t code; +- int r = 0; ++ int r; + + sccb = env->regs[ipbh0 & 0xf]; + code = env->regs[(ipbh0 & 0xf0) >> 4]; +@@ -1173,11 +1173,9 @@ static int kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, + r = sclp_service_call(env, sccb, code); + if (r < 0) { + kvm_s390_program_interrupt(cpu, -r); +- } else { +- setcc(cpu, r); ++ return; + } +- +- return 0; ++ setcc(cpu, r); + } + + static int handle_b2(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1) +@@ -1240,7 +1238,7 @@ static int handle_b2(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1) + setcc(cpu, 3); + break; + case PRIV_B2_SCLP_CALL: +- rc = kvm_sclp_service_call(cpu, run, ipbh0); ++ kvm_sclp_service_call(cpu, run, ipbh0); + break; + default: + rc = -1; +-- +2.27.0 + diff --git a/kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch b/kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch new file mode 100755 index 0000000000000000000000000000000000000000..5e48efb6ccdb824dce24932003282ac015eefd1f --- /dev/null +++ b/kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch @@ -0,0 +1,150 @@ +From 3927f54a56e29003b84e0e3726d3a0170681128b Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:44 -0500 +Subject: [PATCH 5/7] s390x/pci: Add routine to get the vfio dma available + count + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-6-cohuck@redhat.com> +Patchwork-id: 100679 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 5/7] s390x/pci: Add routine to get the vfio dma available count +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +Create new files for separating out vfio-specific work for s390 +pci. Add the first such routine, which issues VFIO_IOMMU_GET_INFO +ioctl to collect the current dma available count. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +[aw: Fix non-Linux build with CONFIG_LINUX] +Signed-off-by: Alex Williamson +(cherry picked from commit cd7498d07fbb20fa04790ff7ee168a8a8d01cb30) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/meson.build + --> added the file in hw/s390x/Makefile.objs instead, + since we do not use Meson yet + hw/s390x/s390-pci-vfio.c + --> NULL-initialize "info" to avoid a downstream-only + compiler warning + +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/Makefile.objs | 1 + + hw/s390x/s390-pci-vfio.c | 54 ++++++++++++++++++++++++++++++++ + include/hw/s390x/s390-pci-vfio.h | 24 ++++++++++++++ + 3 files changed, 79 insertions(+) + create mode 100644 hw/s390x/s390-pci-vfio.c + create mode 100644 include/hw/s390x/s390-pci-vfio.h + +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index c4086ec3171..43756c9437d 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -7,6 +7,7 @@ obj-y += ipl.o + obj-y += css.o + obj-$(CONFIG_S390_CCW_VIRTIO) += s390-virtio-ccw.o + obj-$(CONFIG_TERMINAL3270) += 3270-ccw.o ++obj-$(CONFIG_LINUX) += s390-pci-vfio.o + ifeq ($(CONFIG_VIRTIO_CCW),y) + obj-y += virtio-ccw.o + obj-$(CONFIG_VIRTIO_SERIAL) += virtio-ccw-serial.o +diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c +new file mode 100644 +index 00000000000..0eb22ffec4c +--- /dev/null ++++ b/hw/s390x/s390-pci-vfio.c +@@ -0,0 +1,54 @@ ++/* ++ * s390 vfio-pci interfaces ++ * ++ * Copyright 2020 IBM Corp. ++ * Author(s): Matthew Rosato ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. ++ */ ++ ++#include ++ ++#include "qemu/osdep.h" ++#include "hw/s390x/s390-pci-vfio.h" ++#include "hw/vfio/vfio-common.h" ++ ++/* ++ * Get the current DMA available count from vfio. Returns true if vfio is ++ * limiting DMA requests, false otherwise. The current available count read ++ * from vfio is returned in avail. ++ */ ++bool s390_pci_update_dma_avail(int fd, unsigned int *avail) ++{ ++ g_autofree struct vfio_iommu_type1_info *info = NULL; ++ uint32_t argsz; ++ ++ assert(avail); ++ ++ argsz = sizeof(struct vfio_iommu_type1_info); ++ info = g_malloc0(argsz); ++ ++ /* ++ * If the specified argsz is not large enough to contain all capabilities ++ * it will be updated upon return from the ioctl. Retry until we have ++ * a big enough buffer to hold the entire capability chain. ++ */ ++retry: ++ info->argsz = argsz; ++ ++ if (ioctl(fd, VFIO_IOMMU_GET_INFO, info)) { ++ return false; ++ } ++ ++ if (info->argsz > argsz) { ++ argsz = info->argsz; ++ info = g_realloc(info, argsz); ++ goto retry; ++ } ++ ++ /* If the capability exists, update with the current value */ ++ return vfio_get_info_dma_avail(info, avail); ++} ++ +diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h +new file mode 100644 +index 00000000000..1727292e9b5 +--- /dev/null ++++ b/include/hw/s390x/s390-pci-vfio.h +@@ -0,0 +1,24 @@ ++/* ++ * s390 vfio-pci interfaces ++ * ++ * Copyright 2020 IBM Corp. ++ * Author(s): Matthew Rosato ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. ++ */ ++ ++#ifndef HW_S390_PCI_VFIO_H ++#define HW_S390_PCI_VFIO_H ++ ++#ifdef CONFIG_LINUX ++bool s390_pci_update_dma_avail(int fd, unsigned int *avail); ++#else ++static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) ++{ ++ return false; ++} ++#endif ++ ++#endif +-- +2.27.0 + diff --git a/kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch b/kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch new file mode 100755 index 0000000000000000000000000000000000000000..13fd6b7dedbf5795036b25af41cbceed80a86b27 --- /dev/null +++ b/kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch @@ -0,0 +1,357 @@ +From 7ef9b9c593da98ad32ad20c28d17bb2700a35c29 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:45 -0500 +Subject: [PATCH 6/7] s390x/pci: Honor DMA limits set by vfio + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-7-cohuck@redhat.com> +Patchwork-id: 100680 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 6/7] s390x/pci: Honor DMA limits set by vfio +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +When an s390 guest is using lazy unmapping, it can result in a very +large number of oustanding DMA requests, far beyond the default +limit configured for vfio. Let's track DMA usage similar to vfio +in the host, and trigger the guest to flush their DMA mappings +before vfio runs out. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +[aw: non-Linux build fixes] +Signed-off-by: Alex Williamson +(cherry picked from commit 37fa32de707340f3a93959ad5a1ebc41ba1520ee) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/s390-pci-bus.c + --> adapt to missing 981c3dcd9489 ("qdev: Convert to + qdev_unrealize() with Coccinelle") + hw/s390x/s390-pci-inst.c + --> adapt to out of order inclusion of 5039caf3c449 ("memory: + Add IOMMUTLBEvent") + include/hw/s390x/s390-pci-bus.h + --> adapt to missing db1015e92e04 ("Move QOM typedefs and + add missing includes") + +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-pci-bus.c | 16 ++++++++---- + hw/s390x/s390-pci-inst.c | 45 +++++++++++++++++++++++++++----- + hw/s390x/s390-pci-vfio.c | 42 +++++++++++++++++++++++++++++ + include/hw/s390x/s390-pci-bus.h | 9 +++++++ + include/hw/s390x/s390-pci-inst.h | 3 +++ + include/hw/s390x/s390-pci-vfio.h | 12 +++++++++ + 6 files changed, 116 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c +index 6daef2b6d57..a9f6f550472 100644 +--- a/hw/s390x/s390-pci-bus.c ++++ b/hw/s390x/s390-pci-bus.c +@@ -17,6 +17,7 @@ + #include "cpu.h" + #include "hw/s390x/s390-pci-bus.h" + #include "hw/s390x/s390-pci-inst.h" ++#include "hw/s390x/s390-pci-vfio.h" + #include "hw/pci/pci_bus.h" + #include "hw/qdev-properties.h" + #include "hw/pci/pci_bridge.h" +@@ -771,6 +772,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) + s->bus_no = 0; + QTAILQ_INIT(&s->pending_sei); + QTAILQ_INIT(&s->zpci_devs); ++ QTAILQ_INIT(&s->zpci_dma_limit); + + css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, + S390_ADAPTER_SUPPRESSIBLE, &local_err); +@@ -951,17 +953,18 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + } + } + ++ pbdev->pdev = pdev; ++ pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); ++ pbdev->iommu->pbdev = pbdev; ++ pbdev->state = ZPCI_FS_DISABLED; ++ + if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { + pbdev->fh |= FH_SHM_VFIO; ++ pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); + } else { + pbdev->fh |= FH_SHM_EMUL; + } + +- pbdev->pdev = pdev; +- pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); +- pbdev->iommu->pbdev = pbdev; +- pbdev->state = ZPCI_FS_DISABLED; +- + if (s390_pci_msix_init(pbdev)) { + error_setg(errp, "MSI-X support is mandatory " + "in the S390 architecture"); +@@ -1014,6 +1017,9 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, + pbdev->fid = 0; + QTAILQ_REMOVE(&s->zpci_devs, pbdev, link); + g_hash_table_remove(s->zpci_table, &pbdev->idx); ++ if (pbdev->iommu->dma_limit) { ++ s390_pci_end_dma_count(s, pbdev->iommu->dma_limit); ++ } + object_property_set_bool(OBJECT(dev), false, "realized", NULL); + } + } +diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c +index b1885344f18..edbdf727984 100644 +--- a/hw/s390x/s390-pci-inst.c ++++ b/hw/s390x/s390-pci-inst.c +@@ -32,6 +32,20 @@ + } \ + } while (0) + ++static inline void inc_dma_avail(S390PCIIOMMU *iommu) ++{ ++ if (iommu->dma_limit) { ++ iommu->dma_limit->avail++; ++ } ++} ++ ++static inline void dec_dma_avail(S390PCIIOMMU *iommu) ++{ ++ if (iommu->dma_limit) { ++ iommu->dma_limit->avail--; ++ } ++} ++ + static void s390_set_status_code(CPUS390XState *env, + uint8_t r, uint64_t status_code) + { +@@ -572,7 +586,8 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + return 0; + } + +-static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) ++static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu, ++ S390IOTLBEntry *entry) + { + S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); + IOMMUTLBEvent event = { +@@ -588,14 +603,15 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + + if (event.type == IOMMU_NOTIFIER_UNMAP) { + if (!cache) { +- return; ++ goto out; + } + g_hash_table_remove(iommu->iotlb, &entry->iova); ++ inc_dma_avail(iommu); + } else { + if (cache) { + if (cache->perm == entry->perm && + cache->translated_addr == entry->translated_addr) { +- return; ++ goto out; + } + + event.type = IOMMU_NOTIFIER_UNMAP; +@@ -611,9 +627,13 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) + cache->len = PAGE_SIZE; + cache->perm = entry->perm; + g_hash_table_replace(iommu->iotlb, &cache->iova, cache); ++ dec_dma_avail(iommu); + } + + memory_region_notify_iommu(&iommu->iommu_mr, 0, event); ++ ++out: ++ return iommu->dma_limit ? iommu->dma_limit->avail : 1; + } + + int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) +@@ -625,6 +645,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + S390PCIIOMMU *iommu; + S390IOTLBEntry entry; + hwaddr start, end; ++ uint32_t dma_avail; + + if (env->psw.mask & PSW_MASK_PSTATE) { + s390_program_interrupt(env, PGM_PRIVILEGED, ra); +@@ -663,6 +684,11 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + } + + iommu = pbdev->iommu; ++ if (iommu->dma_limit) { ++ dma_avail = iommu->dma_limit->avail; ++ } else { ++ dma_avail = 1; ++ } + if (!iommu->g_iota) { + error = ERR_EVENT_INVALAS; + goto err; +@@ -680,8 +706,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) + } + + start += entry.len; +- while (entry.iova < start && entry.iova < end) { +- s390_pci_update_iotlb(iommu, &entry); ++ while (entry.iova < start && entry.iova < end && ++ (dma_avail > 0 || entry.perm == IOMMU_NONE)) { ++ dma_avail = s390_pci_update_iotlb(iommu, &entry); + entry.iova += PAGE_SIZE; + entry.translated_addr += PAGE_SIZE; + } +@@ -694,7 +721,13 @@ err: + s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0); + } else { + pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++; +- setcc(cpu, ZPCI_PCI_LS_OK); ++ if (dma_avail > 0) { ++ setcc(cpu, ZPCI_PCI_LS_OK); ++ } else { ++ /* vfio DMA mappings are exhausted, trigger a RPCIT */ ++ setcc(cpu, ZPCI_PCI_LS_ERR); ++ s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES); ++ } + } + return 0; + } +diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c +index 0eb22ffec4c..01c1e8ac89a 100644 +--- a/hw/s390x/s390-pci-vfio.c ++++ b/hw/s390x/s390-pci-vfio.c +@@ -12,7 +12,9 @@ + #include + + #include "qemu/osdep.h" ++#include "hw/s390x/s390-pci-bus.h" + #include "hw/s390x/s390-pci-vfio.h" ++#include "hw/vfio/pci.h" + #include "hw/vfio/vfio-common.h" + + /* +@@ -52,3 +54,43 @@ retry: + return vfio_get_info_dma_avail(info, avail); + } + ++S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, ++ S390PCIBusDevice *pbdev) ++{ ++ S390PCIDMACount *cnt; ++ uint32_t avail; ++ VFIOPCIDevice *vpdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); ++ int id; ++ ++ assert(vpdev); ++ ++ id = vpdev->vbasedev.group->container->fd; ++ ++ if (!s390_pci_update_dma_avail(id, &avail)) { ++ return NULL; ++ } ++ ++ QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) { ++ if (cnt->id == id) { ++ cnt->users++; ++ return cnt; ++ } ++ } ++ ++ cnt = g_new0(S390PCIDMACount, 1); ++ cnt->id = id; ++ cnt->users = 1; ++ cnt->avail = avail; ++ QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link); ++ return cnt; ++} ++ ++void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) ++{ ++ assert(cnt); ++ ++ cnt->users--; ++ if (cnt->users == 0) { ++ QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link); ++ } ++} +diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h +index 550f3cc5e92..2f2edbd0bf3 100644 +--- a/include/hw/s390x/s390-pci-bus.h ++++ b/include/hw/s390x/s390-pci-bus.h +@@ -266,6 +266,13 @@ typedef struct S390IOTLBEntry { + } S390IOTLBEntry; + + typedef struct S390PCIBusDevice S390PCIBusDevice; ++typedef struct S390PCIDMACount { ++ int id; ++ int users; ++ uint32_t avail; ++ QTAILQ_ENTRY(S390PCIDMACount) link; ++} S390PCIDMACount; ++ + typedef struct S390PCIIOMMU { + Object parent_obj; + S390PCIBusDevice *pbdev; +@@ -277,6 +284,7 @@ typedef struct S390PCIIOMMU { + uint64_t pba; + uint64_t pal; + GHashTable *iotlb; ++ S390PCIDMACount *dma_limit; + } S390PCIIOMMU; + + typedef struct S390PCIIOMMUTable { +@@ -352,6 +360,7 @@ typedef struct S390pciState { + GHashTable *zpci_table; + QTAILQ_HEAD(, SeiContainer) pending_sei; + QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs; ++ QTAILQ_HEAD(, S390PCIDMACount) zpci_dma_limit; + } S390pciState; + + S390pciState *s390_get_phb(void); +diff --git a/include/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h +index fa3bf8b5aad..8ee3a3c2375 100644 +--- a/include/hw/s390x/s390-pci-inst.h ++++ b/include/hw/s390x/s390-pci-inst.h +@@ -254,6 +254,9 @@ typedef struct ClpReqRspQueryPciGrp { + #define ZPCI_STPCIFC_ST_INVAL_DMAAS 28 + #define ZPCI_STPCIFC_ST_ERROR_RECOVER 40 + ++/* Refresh PCI Translations status codes */ ++#define ZPCI_RPCIT_ST_INSUFF_RES 16 ++ + /* FIB function controls */ + #define ZPCI_FIB_FC_ENABLED 0x80 + #define ZPCI_FIB_FC_ERROR 0x40 +diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h +index 1727292e9b5..539bcf04eb5 100644 +--- a/include/hw/s390x/s390-pci-vfio.h ++++ b/include/hw/s390x/s390-pci-vfio.h +@@ -12,13 +12,25 @@ + #ifndef HW_S390_PCI_VFIO_H + #define HW_S390_PCI_VFIO_H + ++#include "hw/s390x/s390-pci-bus.h" ++ + #ifdef CONFIG_LINUX + bool s390_pci_update_dma_avail(int fd, unsigned int *avail); ++S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, ++ S390PCIBusDevice *pbdev); ++void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt); + #else + static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) + { + return false; + } ++static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, ++ S390PCIBusDevice *pbdev) ++{ ++ return NULL; ++} ++static inline void s390_pci_end_dma_count(S390pciState *s, ++ S390PCIDMACount *cnt) { } + #endif + + #endif +-- +2.27.0 + diff --git a/kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch b/kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch new file mode 100755 index 0000000000000000000000000000000000000000..27e5fa2919a6cf5d0e547b692d6c8f65897b4d4d --- /dev/null +++ b/kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch @@ -0,0 +1,110 @@ +From 73fb2438518ef2073f2486fcf1dd8cddffb29228 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:41 -0500 +Subject: [PATCH 2/7] s390x/pci: Move header files to include/hw/s390x + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-3-cohuck@redhat.com> +Patchwork-id: 100676 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/7] s390x/pci: Move header files to include/hw/s390x +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +Seems a more appropriate location for them. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +Signed-off-by: Alex Williamson +(cherry picked from commit 408b55db8be3e3edae041d46ef8786fabc1476aa) +Signed-off-by: Cornelia Huck + + Conflicts: + hw/s390x/s390-virtio-ccw.c + --> context diff + +Signed-off-by: Danilo C. L. de Paula +--- + MAINTAINERS | 1 + + hw/s390x/s390-pci-bus.c | 4 ++-- + hw/s390x/s390-pci-inst.c | 4 ++-- + hw/s390x/s390-virtio-ccw.c | 2 +- + {hw => include/hw}/s390x/s390-pci-bus.h | 0 + {hw => include/hw}/s390x/s390-pci-inst.h | 0 + 6 files changed, 6 insertions(+), 5 deletions(-) + rename {hw => include/hw}/s390x/s390-pci-bus.h (100%) + rename {hw => include/hw}/s390x/s390-pci-inst.h (100%) + +diff --git a/MAINTAINERS b/MAINTAINERS +index 2742c955754..56ca8193d86 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1225,6 +1225,7 @@ S390 PCI + M: Matthew Rosato + S: Supported + F: hw/s390x/s390-pci* ++F: include/hw/s390x/s390-pci* + L: qemu-s390x@nongnu.org + + UniCore32 Machines +diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c +index 2d2f4a7c419..6daef2b6d57 100644 +--- a/hw/s390x/s390-pci-bus.c ++++ b/hw/s390x/s390-pci-bus.c +@@ -15,8 +15,8 @@ + #include "qapi/error.h" + #include "qapi/visitor.h" + #include "cpu.h" +-#include "s390-pci-bus.h" +-#include "s390-pci-inst.h" ++#include "hw/s390x/s390-pci-bus.h" ++#include "hw/s390x/s390-pci-inst.h" + #include "hw/pci/pci_bus.h" + #include "hw/qdev-properties.h" + #include "hw/pci/pci_bridge.h" +diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c +index 27b189e6d75..b1885344f18 100644 +--- a/hw/s390x/s390-pci-inst.c ++++ b/hw/s390x/s390-pci-inst.c +@@ -13,12 +13,12 @@ + + #include "qemu/osdep.h" + #include "cpu.h" +-#include "s390-pci-inst.h" +-#include "s390-pci-bus.h" + #include "exec/memop.h" + #include "exec/memory-internal.h" + #include "qemu/error-report.h" + #include "sysemu/hw_accel.h" ++#include "hw/s390x/s390-pci-inst.h" ++#include "hw/s390x/s390-pci-bus.h" + #include "hw/s390x/tod.h" + + #ifndef DEBUG_S390PCI_INST +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 5b3d07f55c4..101f3b7c6e1 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -27,7 +27,7 @@ + #include "qemu/ctype.h" + #include "qemu/error-report.h" + #include "qemu/option.h" +-#include "s390-pci-bus.h" ++#include "hw/s390x/s390-pci-bus.h" + #include "sysemu/reset.h" + #include "hw/s390x/storage-keys.h" + #include "hw/s390x/storage-attributes.h" +diff --git a/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h +similarity index 100% +rename from hw/s390x/s390-pci-bus.h +rename to include/hw/s390x/s390-pci-bus.h +diff --git a/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h +similarity index 100% +rename from hw/s390x/s390-pci-inst.h +rename to include/hw/s390x/s390-pci-inst.h +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Add-migration-blocker.patch b/kvm-s390x-protvirt-Add-migration-blocker.patch new file mode 100755 index 0000000000000000000000000000000000000000..056f8d5958a45c3bb8caa7f8b243ece050ddf0cc --- /dev/null +++ b/kvm-s390x-protvirt-Add-migration-blocker.patch @@ -0,0 +1,79 @@ +From 0ba8d4ea1cc34230356cc446dfa8d1cb52cbd2f3 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:05 -0400 +Subject: [PATCH 23/42] s390x: protvirt: Add migration blocker + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-24-thuth@redhat.com> +Patchwork-id: 97043 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 23/38] s390x: protvirt: Add migration blocker +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Migration is not yet supported. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-5-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0141e1b47707d90f5bd9d252da064ebdaca698a6) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 82da1d9ab5..dbd5125232 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -44,6 +44,9 @@ + #include "sysemu/sysemu.h" + #include "hw/s390x/pv.h" + #include ++#include "migration/blocker.h" ++ ++static Error *pv_mig_blocker; + + S390CPU *s390_cpu_addr2state(uint16_t cpu_addr) + { +@@ -325,15 +328,30 @@ static void s390_machine_unprotect(S390CcwMachineState *ms) + { + s390_pv_vm_disable(); + ms->pv = false; ++ migrate_del_blocker(pv_mig_blocker); ++ error_free_or_abort(&pv_mig_blocker); + } + + static int s390_machine_protect(S390CcwMachineState *ms) + { ++ Error *local_err = NULL; + int rc; + ++ error_setg(&pv_mig_blocker, ++ "protected VMs are currently not migrateable."); ++ rc = migrate_add_blocker(pv_mig_blocker, &local_err); ++ if (rc) { ++ error_report_err(local_err); ++ error_free_or_abort(&pv_mig_blocker); ++ return rc; ++ } ++ + /* Create SE VM */ + rc = s390_pv_vm_enable(); + if (rc) { ++ error_report_err(local_err); ++ migrate_del_blocker(pv_mig_blocker); ++ error_free_or_abort(&pv_mig_blocker); + return rc; + } + +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch b/kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch new file mode 100755 index 0000000000000000000000000000000000000000..0cf75b052fed4db52d6ecc971dff1b7d4b4be964 --- /dev/null +++ b/kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch @@ -0,0 +1,135 @@ +From 1cfcff169f392179258e4535e60d4ef9cabae3c6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:13 -0400 +Subject: [PATCH 31/42] s390x: protvirt: Disable address checks for PV guest IO + emulation + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-32-thuth@redhat.com> +Patchwork-id: 97044 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 31/38] s390x: protvirt: Disable address checks for PV guest IO emulation +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +IO instruction data is routed through SIDAD for protected guests, so +adresses do not need to be checked, as this is kernel memory which is +always available. + +Also the instruction data always starts at offset 0 of the SIDAD. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-13-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit c10b708752e5264a85b5c3afa0a0ccfcf6503ddf) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/ioinst.c | 35 ++++++++++++++++++++++++++++------- + 1 file changed, 28 insertions(+), 7 deletions(-) + +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index c437a1d8c6..bbcccf6be2 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -16,6 +16,25 @@ + #include "hw/s390x/ioinst.h" + #include "trace.h" + #include "hw/s390x/s390-pci-bus.h" ++#include "hw/s390x/pv.h" ++ ++/* All I/O instructions but chsc use the s format */ ++static uint64_t get_address_from_regs(CPUS390XState *env, uint32_t ipb, ++ uint8_t *ar) ++{ ++ /* ++ * Addresses for protected guests are all offsets into the ++ * satellite block which holds the IO control structures. Those ++ * control structures are always starting at offset 0 and are ++ * always aligned and accessible. So we can return 0 here which ++ * will pass the following address checks. ++ */ ++ if (s390_is_pv()) { ++ *ar = 0; ++ return 0; ++ } ++ return decode_basedisp_s(env, ipb, ar); ++} + + int ioinst_disassemble_sch_ident(uint32_t value, int *m, int *cssid, int *ssid, + int *schid) +@@ -114,7 +133,7 @@ void ioinst_handle_msch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -171,7 +190,7 @@ void ioinst_handle_ssch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -203,7 +222,7 @@ void ioinst_handle_stcrw(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -234,7 +253,7 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + CPUS390XState *env = &cpu->env; + uint8_t ar; + +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; +@@ -303,7 +322,7 @@ int ioinst_handle_tsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + return -EIO; + } + trace_ioinst_sch_id("tsch", cssid, ssid, schid); +- addr = decode_basedisp_s(env, ipb, &ar); ++ addr = get_address_from_regs(env, ipb, &ar); + if (addr & 3) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return -EIO; +@@ -601,7 +620,7 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + { + ChscReq *req; + ChscResp *res; +- uint64_t addr; ++ uint64_t addr = 0; + int reg; + uint16_t len; + uint16_t command; +@@ -610,7 +629,9 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + + trace_ioinst("chsc"); + reg = (ipb >> 20) & 0x00f; +- addr = env->regs[reg]; ++ if (!s390_is_pv()) { ++ addr = env->regs[reg]; ++ } + /* Page boundary? */ + if (addr & 0xfff) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch b/kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch new file mode 100755 index 0000000000000000000000000000000000000000..9857f2886648ddf01116b90a496ac502a7f3bf3c --- /dev/null +++ b/kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch @@ -0,0 +1,55 @@ +From b54e5e6df5d5bbe4dc0a206be9f6b6d971ce6f43 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:17 -0400 +Subject: [PATCH 35/42] s390x: protvirt: Fix stray error_report_err in + s390_machine_protect +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-36-thuth@redhat.com> +Patchwork-id: 97042 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 35/38] s390x: protvirt: Fix stray error_report_err in s390_machine_protect +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +In case the protection of the machine fails at s390_pv_vm_enable(), +we'll currently report the local_error variable. Problem is that +there's no migration blocker error that we can report at this point so +the pointer is always NULL which leads to a SEGFAULT. + +Let's remove the error report. + +Signed-off-by: Janosch Frank +Reported-by: Marc Hartmayer +Fixes: 0141e1b47707 ("s390x: protvirt: Add migration blocker") +Message-Id: <20200326140505.2432-1-frankja@linux.ibm.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Cornelia Huck +(cherry picked from commit 7152c9ecc6530ea145c122b0a58cc28802f630c6) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index b4ebe83766..c08e42bda1 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -360,7 +360,6 @@ static int s390_machine_protect(S390CcwMachineState *ms) + rc = s390_pv_vm_enable(); + if (rc) { + qemu_balloon_inhibit(false); +- error_report_err(local_err); + migrate_del_blocker(pv_mig_blocker); + error_free_or_abort(&pv_mig_blocker); + return rc; +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch b/kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch new file mode 100755 index 0000000000000000000000000000000000000000..4d6a44b2dad67843242f689a8d8f74ef1d379438 --- /dev/null +++ b/kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch @@ -0,0 +1,61 @@ +From 680154545d1f9d75fb33615b1900661e7d09be4e Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:15 -0400 +Subject: [PATCH 33/42] s390x: protvirt: Handle SIGP store status correctly + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-34-thuth@redhat.com> +Patchwork-id: 97054 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 33/38] s390x: protvirt: Handle SIGP store status correctly +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected VMs status storing is not done by QEMU anymore. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-15-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f2a2d9a2bae8f6fdc5e9a40c1241e9428f15b4df) +[thuth: fixed contextual conflict due to missing commit 44eaccd091a7365fd37) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/helper.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/target/s390x/helper.c b/target/s390x/helper.c +index 6808dfda01..36b6d3d9d1 100644 +--- a/target/s390x/helper.c ++++ b/target/s390x/helper.c +@@ -25,6 +25,7 @@ + #include "qemu/timer.h" + #include "qemu/qemu-print.h" + #include "hw/s390x/ioinst.h" ++#include "hw/s390x/pv.h" + #include "sysemu/hw_accel.h" + #include "sysemu/runstate.h" + #ifndef CONFIG_USER_ONLY +@@ -246,6 +247,11 @@ int s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch) + hwaddr len = sizeof(*sa); + int i; + ++ /* For PVMs storing will occur when this cpu enters SIE again */ ++ if (s390_is_pv()) { ++ return 0; ++ } ++ + sa = cpu_physical_memory_map(addr, &len, 1); + if (!sa) { + return -EFAULT; +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch b/kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch new file mode 100755 index 0000000000000000000000000000000000000000..a843d036ca70158149cb854754b1364e855abe23 --- /dev/null +++ b/kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch @@ -0,0 +1,104 @@ +From 095553f9dd1fec02869bf974e8cc07614d6587e5 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:06 -0400 +Subject: [PATCH 24/42] s390x: protvirt: Inhibit balloon when switching to + protected mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-25-thuth@redhat.com> +Patchwork-id: 97036 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 24/38] s390x: protvirt: Inhibit balloon when switching to protected mode +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Ballooning in protected VMs can only be done when the guest shares the +pages it gives to the host. If pages are not shared, the integrity +checks will fail once those pages have been altered and are given back +to the guest. + +As we currently do not yet have a solution for this we will continue +like this: + +1. We block ballooning now in QEMU (with this patch). + +2. Later we will provide a change to virtio that removes the blocker +and adds VIRTIO_F_IOMMU_PLATFORM automatically by QEMU when doing the +protvirt switch. This is OK, as the balloon driver in Linux (the only +supported guest) will refuse to work with the IOMMU_PLATFORM feature +bit set. + +3. Later, we can fix the guest balloon driver to accept the IOMMU +feature bit and correctly exercise sharing and unsharing of balloon +pages. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-6-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b1697f63fd8f8201b1447bb55f595830b9cbde31) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index dbd5125232..b4ebe83766 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -42,6 +42,7 @@ + #include "hw/qdev-properties.h" + #include "hw/s390x/tod.h" + #include "sysemu/sysemu.h" ++#include "sysemu/balloon.h" + #include "hw/s390x/pv.h" + #include + #include "migration/blocker.h" +@@ -330,6 +331,7 @@ static void s390_machine_unprotect(S390CcwMachineState *ms) + ms->pv = false; + migrate_del_blocker(pv_mig_blocker); + error_free_or_abort(&pv_mig_blocker); ++ qemu_balloon_inhibit(false); + } + + static int s390_machine_protect(S390CcwMachineState *ms) +@@ -337,10 +339,18 @@ static int s390_machine_protect(S390CcwMachineState *ms) + Error *local_err = NULL; + int rc; + ++ /* ++ * Ballooning on protected VMs needs support in the guest for ++ * sharing and unsharing balloon pages. Block ballooning for ++ * now, until we have a solution to make at least Linux guests ++ * either support it or fail gracefully. ++ */ ++ qemu_balloon_inhibit(true); + error_setg(&pv_mig_blocker, + "protected VMs are currently not migrateable."); + rc = migrate_add_blocker(pv_mig_blocker, &local_err); + if (rc) { ++ qemu_balloon_inhibit(false); + error_report_err(local_err); + error_free_or_abort(&pv_mig_blocker); + return rc; +@@ -349,6 +359,7 @@ static int s390_machine_protect(S390CcwMachineState *ms) + /* Create SE VM */ + rc = s390_pv_vm_enable(); + if (rc) { ++ qemu_balloon_inhibit(false); + error_report_err(local_err); + migrate_del_blocker(pv_mig_blocker); + error_free_or_abort(&pv_mig_blocker); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-KVM-intercept-changes.patch b/kvm-s390x-protvirt-KVM-intercept-changes.patch new file mode 100755 index 0000000000000000000000000000000000000000..2ac3d036e8c78ec9d7cccb881c1006c34de21272 --- /dev/null +++ b/kvm-s390x-protvirt-KVM-intercept-changes.patch @@ -0,0 +1,75 @@ +From 10ed4f6ad687d98f0bfe06d75775e8c541da80a0 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:07 -0400 +Subject: [PATCH 25/42] s390x: protvirt: KVM intercept changes + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-26-thuth@redhat.com> +Patchwork-id: 97035 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 25/38] s390x: protvirt: KVM intercept changes +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Protected VMs no longer intercept with code 4 for an instruction +interception. Instead they have codes 104 and 108 for protected +instruction interception and protected instruction notification +respectively. + +The 104 mirrors the 4 interception. + +The 108 is a notification interception to let KVM and QEMU know that +something changed and we need to update tracking information or +perform specific tasks. It's currently taken for the following +instructions: + +* spx (To inform about the changed prefix location) +* sclp (On incorrect SCCB values, so we can inject a IRQ) +* sigp (All but "stop and store status") +* diag308 (Subcodes 0/1) + +Of these exits only sclp errors, state changing sigps and diag308 will +reach QEMU. QEMU will do its parts of the job, while the ultravisor +has done the instruction part of the job. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-7-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 2585e507ffa1da01b57dbea26b1e1fe507d27198) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/kvm.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 9a0be13959..af50b2c253 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -115,6 +115,8 @@ + #define ICPT_CPU_STOP 0x28 + #define ICPT_OPEREXC 0x2c + #define ICPT_IO 0x40 ++#define ICPT_PV_INSTR 0x68 ++#define ICPT_PV_INSTR_NOTIFICATION 0x6c + + #define NR_LOCAL_IRQS 32 + /* +@@ -1695,6 +1697,8 @@ static int handle_intercept(S390CPU *cpu) + (long)cs->kvm_run->psw_addr); + switch (icpt_code) { + case ICPT_INSTRUCTION: ++ case ICPT_PV_INSTR: ++ case ICPT_PV_INSTR_NOTIFICATION: + r = handle_instruction(cpu, run); + break; + case ICPT_PROGRAM: +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch b/kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch new file mode 100755 index 0000000000000000000000000000000000000000..0609546addd11ea5fca1b90169c7aaf82b5dcd39 --- /dev/null +++ b/kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch @@ -0,0 +1,171 @@ +From 8345b90f43b14435938fbbe0f3a510a60f5d0ded Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:14 -0400 +Subject: [PATCH 32/42] s390x: protvirt: Move IO control structures over SIDA + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-33-thuth@redhat.com> +Patchwork-id: 97040 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 32/38] s390x: protvirt: Move IO control structures over SIDA +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected guests, we need to put the IO emulation results into the +SIDA, so SIE will write them into the guest at the next entry. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-14-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit fcc10c1470d6e9460ebcf4c30f5bbd37b921a041) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/ioinst.c | 61 +++++++++++++++++++++++++++++++------------ + 1 file changed, 45 insertions(+), 16 deletions(-) + +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index bbcccf6be2..f40c35c6ff 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -138,7 +138,9 @@ void ioinst_handle_msch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; + } +- if (s390_cpu_virt_mem_read(cpu, addr, ar, &schib, sizeof(schib))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, addr, &schib, sizeof(schib)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, ar, &schib, sizeof(schib))) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -195,7 +197,9 @@ void ioinst_handle_ssch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return; + } +- if (s390_cpu_virt_mem_read(cpu, addr, ar, &orig_orb, sizeof(orb))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, addr, &orig_orb, sizeof(orb)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, ar, &orig_orb, sizeof(orb))) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -231,14 +235,19 @@ void ioinst_handle_stcrw(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + cc = css_do_stcrw(&crw); + /* 0 - crw stored, 1 - zeroes stored */ + +- if (s390_cpu_virt_mem_write(cpu, addr, ar, &crw, sizeof(crw)) == 0) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr, &crw, sizeof(crw)); + setcc(cpu, cc); + } else { +- if (cc == 0) { +- /* Write failed: requeue CRW since STCRW is suppressing */ +- css_undo_stcrw(&crw); ++ if (s390_cpu_virt_mem_write(cpu, addr, ar, &crw, sizeof(crw)) == 0) { ++ setcc(cpu, cc); ++ } else { ++ if (cc == 0) { ++ /* Write failed: requeue CRW since STCRW is suppressing */ ++ css_undo_stcrw(&crw); ++ } ++ s390_cpu_virt_mem_handle_exc(cpu, ra); + } +- s390_cpu_virt_mem_handle_exc(cpu, ra); + } + } + +@@ -260,6 +269,13 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + } + + if (ioinst_disassemble_sch_ident(reg1, &m, &cssid, &ssid, &schid)) { ++ /* ++ * The Ultravisor checks schid bit 16 to be one and bits 0-12 ++ * to be 0 and injects a operand exception itself. ++ * ++ * Hence we should never end up here. ++ */ ++ g_assert(!s390_is_pv()); + /* + * As operand exceptions have a lower priority than access exceptions, + * we check whether the memory area is writeable (injecting the +@@ -292,14 +308,17 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + } + } + if (cc != 3) { +- if (s390_cpu_virt_mem_write(cpu, addr, ar, &schib, +- sizeof(schib)) != 0) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr, &schib, sizeof(schib)); ++ } else if (s390_cpu_virt_mem_write(cpu, addr, ar, &schib, ++ sizeof(schib)) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } + } else { + /* Access exceptions have a higher priority than cc3 */ +- if (s390_cpu_virt_mem_check_write(cpu, addr, ar, sizeof(schib)) != 0) { ++ if (!s390_is_pv() && ++ s390_cpu_virt_mem_check_write(cpu, addr, ar, sizeof(schib)) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -336,7 +355,9 @@ int ioinst_handle_tsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + } + /* 0 - status pending, 1 - not status pending, 3 - not operational */ + if (cc != 3) { +- if (s390_cpu_virt_mem_write(cpu, addr, ar, &irb, irb_len) != 0) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr, &irb, irb_len); ++ } else if (s390_cpu_virt_mem_write(cpu, addr, ar, &irb, irb_len) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return -EFAULT; + } +@@ -344,7 +365,8 @@ int ioinst_handle_tsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, uintptr_t ra) + } else { + irb_len = sizeof(irb) - sizeof(irb.emw); + /* Access exceptions have a higher priority than cc3 */ +- if (s390_cpu_virt_mem_check_write(cpu, addr, ar, irb_len) != 0) { ++ if (!s390_is_pv() && ++ s390_cpu_virt_mem_check_write(cpu, addr, ar, irb_len) != 0) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return -EFAULT; + } +@@ -642,7 +664,9 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + * present CHSC sub-handlers ... if we ever need more, we should take + * care of req->len here first. + */ +- if (s390_cpu_virt_mem_read(cpu, addr, reg, buf, sizeof(ChscReq))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, addr, buf, sizeof(ChscReq)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, reg, buf, sizeof(ChscReq))) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + return; + } +@@ -675,11 +699,16 @@ void ioinst_handle_chsc(S390CPU *cpu, uint32_t ipb, uintptr_t ra) + break; + } + +- if (!s390_cpu_virt_mem_write(cpu, addr + len, reg, res, +- be16_to_cpu(res->len))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, addr + len, res, be16_to_cpu(res->len)); + setcc(cpu, 0); /* Command execution complete */ + } else { +- s390_cpu_virt_mem_handle_exc(cpu, ra); ++ if (!s390_cpu_virt_mem_write(cpu, addr + len, reg, res, ++ be16_to_cpu(res->len))) { ++ setcc(cpu, 0); /* Command execution complete */ ++ } else { ++ s390_cpu_virt_mem_handle_exc(cpu, ra); ++ } + } + } + +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch b/kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch new file mode 100755 index 0000000000000000000000000000000000000000..1d60070afb5167e9e370231712ef90108e9697ec --- /dev/null +++ b/kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch @@ -0,0 +1,70 @@ +From 27f5d8a3af2863e39b7c46a3128009988d772f15 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:09 -0400 +Subject: [PATCH 27/42] s390x: protvirt: Move STSI data over SIDAD + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-28-thuth@redhat.com> +Patchwork-id: 97046 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 27/38] s390x: protvirt: Move STSI data over SIDAD +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected guests, we need to put the STSI emulation results into +the SIDA, so SIE will write them into the guest at the next entry. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-9-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 7c713b8acb70fb61f9650f8a7702dec546752bb6) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/kvm.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index f67bb5ce2c..6809a5ac40 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -50,6 +50,7 @@ + #include "exec/memattrs.h" + #include "hw/s390x/s390-virtio-ccw.h" + #include "hw/s390x/s390-virtio-hcall.h" ++#include "hw/s390x/pv.h" + + #ifndef DEBUG_KVM + #define DEBUG_KVM 0 +@@ -1803,7 +1804,9 @@ static void insert_stsi_3_2_2(S390CPU *cpu, __u64 addr, uint8_t ar) + SysIB_322 sysib; + int del; + +- if (s390_cpu_virt_mem_read(cpu, addr, ar, &sysib, sizeof(sysib))) { ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_read(cpu, 0, &sysib, sizeof(sysib)); ++ } else if (s390_cpu_virt_mem_read(cpu, addr, ar, &sysib, sizeof(sysib))) { + return; + } + /* Shift the stack of Extended Names to prepare for our own data */ +@@ -1843,7 +1846,11 @@ static void insert_stsi_3_2_2(S390CPU *cpu, __u64 addr, uint8_t ar) + /* Insert UUID */ + memcpy(sysib.vm[0].uuid, &qemu_uuid, sizeof(sysib.vm[0].uuid)); + +- s390_cpu_virt_mem_write(cpu, addr, ar, &sysib, sizeof(sysib)); ++ if (s390_is_pv()) { ++ s390_cpu_pv_mem_write(cpu, 0, &sysib, sizeof(sysib)); ++ } else { ++ s390_cpu_virt_mem_write(cpu, addr, ar, &sysib, sizeof(sysib)); ++ } + } + + static int handle_stsi(S390CPU *cpu) +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch b/kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch new file mode 100755 index 0000000000000000000000000000000000000000..1b227199a03c9ea3033726bba7147c539697798f --- /dev/null +++ b/kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch @@ -0,0 +1,93 @@ +From 33d4e21cfd236aecd9e4dbe8228d058fd1f22400 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:12 -0400 +Subject: [PATCH 30/42] s390x: protvirt: Move diag 308 data over SIDA + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-31-thuth@redhat.com> +Patchwork-id: 97048 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 30/38] s390x: protvirt: Move diag 308 data over SIDA +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +For protected guests the IPIB is written/read to/from the SIDA, so we +need those accesses to go through s390_cpu_pv_mem_read/write(). + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-12-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 9c61e11238cfa8f70e3eb90aac5d3e5646e5432f) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/diag.c | 25 ++++++++++++++++++++----- + 1 file changed, 20 insertions(+), 5 deletions(-) + +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index b2cbefb8cf..1a48429564 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -75,6 +75,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + { + bool valid; + CPUState *cs = env_cpu(env); ++ S390CPU *cpu = S390_CPU(cs); + uint64_t addr = env->regs[r1]; + uint64_t subcode = env->regs[r3]; + IplParameterBlock *iplb; +@@ -111,13 +112,22 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + return; + } + iplb = g_new0(IplParameterBlock, 1); +- cpu_physical_memory_read(addr, iplb, sizeof(iplb->len)); ++ if (!s390_is_pv()) { ++ cpu_physical_memory_read(addr, iplb, sizeof(iplb->len)); ++ } else { ++ s390_cpu_pv_mem_read(cpu, 0, iplb, sizeof(iplb->len)); ++ } ++ + if (!iplb_valid_len(iplb)) { + env->regs[r1 + 1] = DIAG_308_RC_INVALID; + goto out; + } + +- cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len)); ++ if (!s390_is_pv()) { ++ cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len)); ++ } else { ++ s390_cpu_pv_mem_read(cpu, 0, iplb, be32_to_cpu(iplb->len)); ++ } + + valid = subcode == DIAG308_PV_SET ? iplb_valid_pv(iplb) : iplb_valid(iplb); + if (!valid) { +@@ -140,12 +150,17 @@ out: + } else { + iplb = s390_ipl_get_iplb(); + } +- if (iplb) { ++ if (!iplb) { ++ env->regs[r1 + 1] = DIAG_308_RC_NO_CONF; ++ return; ++ } ++ ++ if (!s390_is_pv()) { + cpu_physical_memory_write(addr, iplb, be32_to_cpu(iplb->len)); +- env->regs[r1 + 1] = DIAG_308_RC_OK; + } else { +- env->regs[r1 + 1] = DIAG_308_RC_NO_CONF; ++ s390_cpu_pv_mem_write(cpu, 0, iplb, be32_to_cpu(iplb->len)); + } ++ env->regs[r1 + 1] = DIAG_308_RC_OK; + return; + case DIAG308_PV_START: + iplb = s390_ipl_get_iplb_pv(); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-SCLP-interpretation.patch b/kvm-s390x-protvirt-SCLP-interpretation.patch new file mode 100755 index 0000000000000000000000000000000000000000..10f19308c2103699f8b58c2a4b1c30789c5798cd --- /dev/null +++ b/kvm-s390x-protvirt-SCLP-interpretation.patch @@ -0,0 +1,172 @@ +From 5a8b40c3fdafeb49072f8643210bea00ce1478c4 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:10 -0400 +Subject: [PATCH 28/42] s390x: protvirt: SCLP interpretation + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-29-thuth@redhat.com> +Patchwork-id: 97053 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 28/38] s390x: protvirt: SCLP interpretation +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +SCLP for a protected guest is done over the SIDAD, so we need to use +the s390_cpu_pv_mem_* functions to access the SIDAD instead of guest +memory when reading/writing SCBs. + +To not confuse the sclp emulation, we set 0x4000 as the SCCB address, +since the function that injects the sclp external interrupt would +reject a zero sccb address. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Reviewed-by: Christian Borntraeger +Message-Id: <20200319131921.2367-10-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 0f73c5b30b8ba6c0828608be496d2f59a5427539) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 56 +++++++++++++++++++++++++++++++++-------- + include/hw/s390x/sclp.h | 2 ++ + target/s390x/kvm.c | 25 ++++++++++++++---- + 3 files changed, 67 insertions(+), 16 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index f57ce7b739..1c380a49cc 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -33,6 +33,22 @@ static inline SCLPDevice *get_sclp_device(void) + return sclp; + } + ++static inline bool sclp_command_code_valid(uint32_t code) ++{ ++ switch (code & SCLP_CMD_CODE_MASK) { ++ case SCLP_CMDW_READ_SCP_INFO: ++ case SCLP_CMDW_READ_SCP_INFO_FORCED: ++ case SCLP_CMDW_READ_CPU_INFO: ++ case SCLP_CMDW_CONFIGURE_IOA: ++ case SCLP_CMDW_DECONFIGURE_IOA: ++ case SCLP_CMD_READ_EVENT_DATA: ++ case SCLP_CMD_WRITE_EVENT_DATA: ++ case SCLP_CMD_WRITE_EVENT_MASK: ++ return true; ++ } ++ return false; ++} ++ + static void prepare_cpu_entries(SCLPDevice *sclp, CPUEntry *entry, int *count) + { + MachineState *ms = MACHINE(qdev_get_machine()); +@@ -193,6 +209,34 @@ static void sclp_execute(SCLPDevice *sclp, SCCB *sccb, uint32_t code) + } + } + ++/* ++ * We only need the address to have something valid for the ++ * service_interrupt call. ++ */ ++#define SCLP_PV_DUMMY_ADDR 0x4000 ++int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, ++ uint32_t code) ++{ ++ SCLPDevice *sclp = get_sclp_device(); ++ SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); ++ SCCB work_sccb; ++ hwaddr sccb_len = sizeof(SCCB); ++ ++ s390_cpu_pv_mem_read(env_archcpu(env), 0, &work_sccb, sccb_len); ++ ++ if (!sclp_command_code_valid(code)) { ++ work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); ++ goto out_write; ++ } ++ ++ sclp_c->execute(sclp, &work_sccb, code); ++out_write: ++ s390_cpu_pv_mem_write(env_archcpu(env), 0, &work_sccb, ++ be16_to_cpu(work_sccb.h.length)); ++ sclp_c->service_interrupt(sclp, SCLP_PV_DUMMY_ADDR); ++ return 0; ++} ++ + int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + { + SCLPDevice *sclp = get_sclp_device(); +@@ -230,17 +274,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + goto out; + } + +- switch (code & SCLP_CMD_CODE_MASK) { +- case SCLP_CMDW_READ_SCP_INFO: +- case SCLP_CMDW_READ_SCP_INFO_FORCED: +- case SCLP_CMDW_READ_CPU_INFO: +- case SCLP_CMDW_CONFIGURE_IOA: +- case SCLP_CMDW_DECONFIGURE_IOA: +- case SCLP_CMD_READ_EVENT_DATA: +- case SCLP_CMD_WRITE_EVENT_DATA: +- case SCLP_CMD_WRITE_EVENT_MASK: +- break; +- default: ++ if (!sclp_command_code_valid(code)) { + work_sccb.h.response_code = cpu_to_be16(SCLP_RC_INVALID_SCLP_COMMAND); + goto out_write; + } +diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h +index c54413b78c..c0a3faa37d 100644 +--- a/include/hw/s390x/sclp.h ++++ b/include/hw/s390x/sclp.h +@@ -217,5 +217,7 @@ void s390_sclp_init(void); + void sclp_service_interrupt(uint32_t sccb); + void raise_irq_cpu_hotplug(void); + int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code); ++int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, ++ uint32_t code); + + #endif +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 6809a5ac40..56fe60c49c 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -1230,12 +1230,27 @@ static void kvm_sclp_service_call(S390CPU *cpu, struct kvm_run *run, + sccb = env->regs[ipbh0 & 0xf]; + code = env->regs[(ipbh0 & 0xf0) >> 4]; + +- r = sclp_service_call(env, sccb, code); +- if (r < 0) { +- kvm_s390_program_interrupt(cpu, -r); +- return; ++ switch (run->s390_sieic.icptcode) { ++ case ICPT_PV_INSTR_NOTIFICATION: ++ g_assert(s390_is_pv()); ++ /* The notification intercepts are currently handled by KVM */ ++ error_report("unexpected SCLP PV notification"); ++ exit(1); ++ break; ++ case ICPT_PV_INSTR: ++ g_assert(s390_is_pv()); ++ sclp_service_call_protected(env, sccb, code); ++ /* Setting the CC is done by the Ultravisor. */ ++ break; ++ case ICPT_INSTRUCTION: ++ g_assert(!s390_is_pv()); ++ r = sclp_service_call(env, sccb, code); ++ if (r < 0) { ++ kvm_s390_program_interrupt(cpu, -r); ++ return; ++ } ++ setcc(cpu, r); + } +- setcc(cpu, r); + } + + static int handle_b2(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1) +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Set-guest-IPL-PSW.patch b/kvm-s390x-protvirt-Set-guest-IPL-PSW.patch new file mode 100755 index 0000000000000000000000000000000000000000..ef246c7a4d1401178dc893d5d438575ca19276fb --- /dev/null +++ b/kvm-s390x-protvirt-Set-guest-IPL-PSW.patch @@ -0,0 +1,75 @@ +From d738b4336c79be68b6040f73427e089f46957728 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:11 -0400 +Subject: [PATCH 29/42] s390x: protvirt: Set guest IPL PSW + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-30-thuth@redhat.com> +Patchwork-id: 97049 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 29/38] s390x: protvirt: Set guest IPL PSW +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Handling of CPU reset and setting of the IPL psw from guest storage at +offset 0 is done by a Ultravisor call. Let's only fetch it if +necessary. + +Signed-off-by: Janosch Frank +Reviewed-by: Thomas Huth +Reviewed-by: David Hildenbrand +Reviewed-by: Christian Borntraeger +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200319131921.2367-11-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 59181010a2ff82c3a97e9b5768ee87c38e4815f1) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.c | 26 +++++++++++++++++--------- + 1 file changed, 17 insertions(+), 9 deletions(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 8f38cd8e6f..371b91b2d7 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -77,16 +77,24 @@ static bool s390_cpu_has_work(CPUState *cs) + static void s390_cpu_load_normal(CPUState *s) + { + S390CPU *cpu = S390_CPU(s); +- uint64_t spsw = ldq_phys(s->as, 0); +- +- cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL; +- /* +- * Invert short psw indication, so SIE will report a specification +- * exception if it was not set. +- */ +- cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; +- cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR; ++ uint64_t spsw; + ++ if (!s390_is_pv()) { ++ spsw = ldq_phys(s->as, 0); ++ cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL; ++ /* ++ * Invert short psw indication, so SIE will report a specification ++ * exception if it was not set. ++ */ ++ cpu->env.psw.mask ^= PSW_MASK_SHORTPSW; ++ cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR; ++ } else { ++ /* ++ * Firmware requires us to set the load state before we set ++ * the cpu to operating on protected guests. ++ */ ++ s390_cpu_set_state(S390_CPU_STATE_LOAD, cpu); ++ } + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } + #endif +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-Support-unpack-facility.patch b/kvm-s390x-protvirt-Support-unpack-facility.patch new file mode 100755 index 0000000000000000000000000000000000000000..204de2aa4864a17e638d4f0322817727e5e22c23 --- /dev/null +++ b/kvm-s390x-protvirt-Support-unpack-facility.patch @@ -0,0 +1,886 @@ +From e6474080e3816e82e87c545a3d22db77c55ab053 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:04 -0400 +Subject: [PATCH 22/42] s390x: protvirt: Support unpack facility + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-23-thuth@redhat.com> +Patchwork-id: 97045 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 22/38] s390x: protvirt: Support unpack facility +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The unpack facility provides the means to setup a protected guest. A +protected guest cannot be introspected by the hypervisor or any +user/administrator of the machine it is running on. + +Protected guests are encrypted at rest and need a special boot +mechanism via diag308 subcode 8 and 10. + +Code 8 sets the PV specific IPLB which is retained separately from +those set via code 5. + +Code 10 is used to unpack the VM into protected memory, verify its +integrity and start it. + +Signed-off-by: Janosch Frank +Co-developed-by: Christian Borntraeger [Changes +to machine] +Reviewed-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Reviewed-by: Cornelia Huck +Message-Id: <20200323083606.24520-1-frankja@linux.ibm.com> +[CH: fixed up KVM_PV_VM_ -> KVM_PV_] +Signed-off-by: Cornelia Huck +(cherry picked from commit c3347ed0d2ee42a7dcf7bfe7f9c3884a9596727a) +Signed-off-by: Danilo C. L. de Paula +--- + MAINTAINERS | 2 + + hw/s390x/Makefile.objs | 1 + + hw/s390x/ipl.c | 59 +++++++++++++- + hw/s390x/ipl.h | 91 ++++++++++++++++++++- + hw/s390x/pv.c | 98 +++++++++++++++++++++++ + hw/s390x/s390-virtio-ccw.c | 119 +++++++++++++++++++++++++++- + include/hw/s390x/pv.h | 55 +++++++++++++ + include/hw/s390x/s390-virtio-ccw.h | 1 + + target/s390x/cpu.c | 1 + + target/s390x/cpu_features_def.inc.h | 1 + + target/s390x/diag.c | 39 ++++++++- + target/s390x/kvm-stub.c | 5 ++ + target/s390x/kvm.c | 5 ++ + target/s390x/kvm_s390x.h | 1 + + 14 files changed, 468 insertions(+), 10 deletions(-) + create mode 100644 hw/s390x/pv.c + create mode 100644 include/hw/s390x/pv.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index 49d5d44edc..2742c95575 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -385,6 +385,8 @@ F: target/s390x/machine.c + F: target/s390x/sigp.c + F: target/s390x/cpu_features*.[ch] + F: target/s390x/cpu_models.[ch] ++F: hw/s390x/pv.c ++F: include/hw/s390x/pv.h + F: hw/intc/s390_flic.c + F: hw/intc/s390_flic_kvm.c + F: include/hw/s390x/s390_flic.h +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index e02ed80b68..a46a1c7894 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -31,6 +31,7 @@ obj-y += tod-qemu.o + obj-$(CONFIG_KVM) += tod-kvm.o + obj-$(CONFIG_KVM) += s390-skeys-kvm.o + obj-$(CONFIG_KVM) += s390-stattrib-kvm.o ++obj-$(CONFIG_KVM) += pv.o + obj-y += s390-ccw.o + obj-y += ap-device.o + obj-y += ap-bridge.o +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index fa0409dc23..586d95b5b6 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -1,10 +1,11 @@ + /* + * bootloader support + * +- * Copyright IBM, Corp. 2012 ++ * Copyright IBM, Corp. 2012, 2020 + * + * Authors: + * Christian Borntraeger ++ * Janosch Frank + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at your + * option) any later version. See the COPYING file in the top-level directory. +@@ -27,6 +28,7 @@ + #include "hw/s390x/vfio-ccw.h" + #include "hw/s390x/css.h" + #include "hw/s390x/ebcdic.h" ++#include "hw/s390x/pv.h" + #include "ipl.h" + #include "qemu/error-report.h" + #include "qemu/config-file.h" +@@ -557,12 +559,31 @@ void s390_ipl_update_diag308(IplParameterBlock *iplb) + { + S390IPLState *ipl = get_ipl_device(); + +- ipl->iplb = *iplb; +- ipl->iplb_valid = true; ++ /* ++ * The IPLB set and retrieved by subcodes 8/9 is completely ++ * separate from the one managed via subcodes 5/6. ++ */ ++ if (iplb->pbt == S390_IPL_TYPE_PV) { ++ ipl->iplb_pv = *iplb; ++ ipl->iplb_valid_pv = true; ++ } else { ++ ipl->iplb = *iplb; ++ ipl->iplb_valid = true; ++ } + ipl->netboot = is_virtio_net_device(iplb); + update_machine_ipl_properties(iplb); + } + ++IplParameterBlock *s390_ipl_get_iplb_pv(void) ++{ ++ S390IPLState *ipl = get_ipl_device(); ++ ++ if (!ipl->iplb_valid_pv) { ++ return NULL; ++ } ++ return &ipl->iplb_pv; ++} ++ + IplParameterBlock *s390_ipl_get_iplb(void) + { + S390IPLState *ipl = get_ipl_device(); +@@ -651,6 +672,38 @@ static void s390_ipl_prepare_qipl(S390CPU *cpu) + cpu_physical_memory_unmap(addr, len, 1, len); + } + ++int s390_ipl_prepare_pv_header(void) ++{ ++ IplParameterBlock *ipib = s390_ipl_get_iplb_pv(); ++ IPLBlockPV *ipib_pv = &ipib->pv; ++ void *hdr = g_malloc(ipib_pv->pv_header_len); ++ int rc; ++ ++ cpu_physical_memory_read(ipib_pv->pv_header_addr, hdr, ++ ipib_pv->pv_header_len); ++ rc = s390_pv_set_sec_parms((uintptr_t)hdr, ++ ipib_pv->pv_header_len); ++ g_free(hdr); ++ return rc; ++} ++ ++int s390_ipl_pv_unpack(void) ++{ ++ IplParameterBlock *ipib = s390_ipl_get_iplb_pv(); ++ IPLBlockPV *ipib_pv = &ipib->pv; ++ int i, rc = 0; ++ ++ for (i = 0; i < ipib_pv->num_comp; i++) { ++ rc = s390_pv_unpack(ipib_pv->components[i].addr, ++ TARGET_PAGE_ALIGN(ipib_pv->components[i].size), ++ ipib_pv->components[i].tweak_pref); ++ if (rc) { ++ break; ++ } ++ } ++ return rc; ++} ++ + void s390_ipl_prepare_cpu(S390CPU *cpu) + { + S390IPLState *ipl = get_ipl_device(); +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index a5665e6bfd..89b3044d7a 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -1,8 +1,9 @@ + /* + * s390 IPL device + * +- * Copyright 2015 IBM Corp. ++ * Copyright 2015, 2020 IBM Corp. + * Author(s): Zhang Fan ++ * Janosch Frank + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level +@@ -15,6 +16,24 @@ + #include "cpu.h" + #include "hw/qdev-core.h" + ++struct IPLBlockPVComp { ++ uint64_t tweak_pref; ++ uint64_t addr; ++ uint64_t size; ++} QEMU_PACKED; ++typedef struct IPLBlockPVComp IPLBlockPVComp; ++ ++struct IPLBlockPV { ++ uint8_t reserved18[87]; /* 0x18 */ ++ uint8_t version; /* 0x6f */ ++ uint32_t reserved70; /* 0x70 */ ++ uint32_t num_comp; /* 0x74 */ ++ uint64_t pv_header_addr; /* 0x78 */ ++ uint64_t pv_header_len; /* 0x80 */ ++ struct IPLBlockPVComp components[]; ++} QEMU_PACKED; ++typedef struct IPLBlockPV IPLBlockPV; ++ + struct IplBlockCcw { + uint8_t reserved0[85]; + uint8_t ssid; +@@ -71,6 +90,7 @@ union IplParameterBlock { + union { + IplBlockCcw ccw; + IplBlockFcp fcp; ++ IPLBlockPV pv; + IplBlockQemuScsi scsi; + }; + } QEMU_PACKED; +@@ -85,8 +105,11 @@ typedef union IplParameterBlock IplParameterBlock; + + int s390_ipl_set_loadparm(uint8_t *loadparm); + void s390_ipl_update_diag308(IplParameterBlock *iplb); ++int s390_ipl_prepare_pv_header(void); ++int s390_ipl_pv_unpack(void); + void s390_ipl_prepare_cpu(S390CPU *cpu); + IplParameterBlock *s390_ipl_get_iplb(void); ++IplParameterBlock *s390_ipl_get_iplb_pv(void); + + enum s390_reset { + /* default is a reset not triggered by a CPU e.g. issued by QMP */ +@@ -94,6 +117,7 @@ enum s390_reset { + S390_RESET_REIPL, + S390_RESET_MODIFIED_CLEAR, + S390_RESET_LOAD_NORMAL, ++ S390_RESET_PV, + }; + void s390_ipl_reset_request(CPUState *cs, enum s390_reset reset_type); + void s390_ipl_get_reset_request(CPUState **cs, enum s390_reset *reset_type); +@@ -133,6 +157,7 @@ struct S390IPLState { + /*< private >*/ + DeviceState parent_obj; + IplParameterBlock iplb; ++ IplParameterBlock iplb_pv; + QemuIplParameters qipl; + uint64_t start_addr; + uint64_t compat_start_addr; +@@ -140,6 +165,7 @@ struct S390IPLState { + uint64_t compat_bios_start_addr; + bool enforce_bios; + bool iplb_valid; ++ bool iplb_valid_pv; + bool netboot; + /* reset related properties don't have to be migrated or reset */ + enum s390_reset reset_type; +@@ -162,6 +188,8 @@ QEMU_BUILD_BUG_MSG(offsetof(S390IPLState, iplb) & 3, "alignment of iplb wrong"); + #define DIAG_308_RC_OK 0x0001 + #define DIAG_308_RC_NO_CONF 0x0102 + #define DIAG_308_RC_INVALID 0x0402 ++#define DIAG_308_RC_NO_PV_CONF 0x0902 ++#define DIAG_308_RC_INVAL_FOR_PV 0x0a02 + + #define DIAG308_RESET_MOD_CLR 0 + #define DIAG308_RESET_LOAD_NORM 1 +@@ -169,12 +197,17 @@ QEMU_BUILD_BUG_MSG(offsetof(S390IPLState, iplb) & 3, "alignment of iplb wrong"); + #define DIAG308_LOAD_NORMAL_DUMP 4 + #define DIAG308_SET 5 + #define DIAG308_STORE 6 ++#define DIAG308_PV_SET 8 ++#define DIAG308_PV_STORE 9 ++#define DIAG308_PV_START 10 + + #define S390_IPL_TYPE_FCP 0x00 + #define S390_IPL_TYPE_CCW 0x02 ++#define S390_IPL_TYPE_PV 0x05 + #define S390_IPL_TYPE_QEMU_SCSI 0xff + + #define S390_IPLB_HEADER_LEN 8 ++#define S390_IPLB_MIN_PV_LEN 148 + #define S390_IPLB_MIN_CCW_LEN 200 + #define S390_IPLB_MIN_FCP_LEN 384 + #define S390_IPLB_MIN_QEMU_SCSI_LEN 200 +@@ -184,6 +217,62 @@ static inline bool iplb_valid_len(IplParameterBlock *iplb) + return be32_to_cpu(iplb->len) <= sizeof(IplParameterBlock); + } + ++static inline bool ipl_valid_pv_components(IplParameterBlock *iplb) ++{ ++ IPLBlockPV *ipib_pv = &iplb->pv; ++ int i; ++ ++ if (ipib_pv->num_comp == 0) { ++ return false; ++ } ++ ++ for (i = 0; i < ipib_pv->num_comp; i++) { ++ /* Addr must be 4k aligned */ ++ if (ipib_pv->components[i].addr & ~TARGET_PAGE_MASK) { ++ return false; ++ } ++ ++ /* Tweak prefix is monotonically increasing with each component */ ++ if (i < ipib_pv->num_comp - 1 && ++ ipib_pv->components[i].tweak_pref >= ++ ipib_pv->components[i + 1].tweak_pref) { ++ return false; ++ } ++ } ++ return true; ++} ++ ++static inline bool ipl_valid_pv_header(IplParameterBlock *iplb) ++{ ++ IPLBlockPV *ipib_pv = &iplb->pv; ++ ++ if (ipib_pv->pv_header_len > 2 * TARGET_PAGE_SIZE) { ++ return false; ++ } ++ ++ if (!address_space_access_valid(&address_space_memory, ++ ipib_pv->pv_header_addr, ++ ipib_pv->pv_header_len, ++ false, ++ MEMTXATTRS_UNSPECIFIED)) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static inline bool iplb_valid_pv(IplParameterBlock *iplb) ++{ ++ if (iplb->pbt != S390_IPL_TYPE_PV || ++ be32_to_cpu(iplb->len) < S390_IPLB_MIN_PV_LEN) { ++ return false; ++ } ++ if (!ipl_valid_pv_header(iplb)) { ++ return false; ++ } ++ return ipl_valid_pv_components(iplb); ++} ++ + static inline bool iplb_valid(IplParameterBlock *iplb) + { + switch (iplb->pbt) { +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +new file mode 100644 +index 0000000000..a40a844806 +--- /dev/null ++++ b/hw/s390x/pv.c +@@ -0,0 +1,98 @@ ++/* ++ * Protected Virtualization functions ++ * ++ * Copyright IBM Corp. 2020 ++ * Author(s): ++ * Janosch Frank ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. ++ */ ++#include "qemu/osdep.h" ++ ++#include ++ ++#include "qemu/error-report.h" ++#include "sysemu/kvm.h" ++#include "hw/s390x/pv.h" ++ ++static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) ++{ ++ struct kvm_pv_cmd pv_cmd = { ++ .cmd = cmd, ++ .data = (uint64_t)data, ++ }; ++ int rc = kvm_vm_ioctl(kvm_state, KVM_S390_PV_COMMAND, &pv_cmd); ++ ++ if (rc) { ++ error_report("KVM PV command %d (%s) failed: header rc %x rrc %x " ++ "IOCTL rc: %d", cmd, cmdname, pv_cmd.rc, pv_cmd.rrc, ++ rc); ++ } ++ return rc; ++} ++ ++/* ++ * This macro lets us pass the command as a string to the function so ++ * we can print it on an error. ++ */ ++#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data); ++#define s390_pv_cmd_exit(cmd, data) \ ++{ \ ++ int rc; \ ++ \ ++ rc = __s390_pv_cmd(cmd, #cmd, data);\ ++ if (rc) { \ ++ exit(1); \ ++ } \ ++} ++ ++int s390_pv_vm_enable(void) ++{ ++ return s390_pv_cmd(KVM_PV_ENABLE, NULL); ++} ++ ++void s390_pv_vm_disable(void) ++{ ++ s390_pv_cmd_exit(KVM_PV_DISABLE, NULL); ++} ++ ++int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) ++{ ++ struct kvm_s390_pv_sec_parm args = { ++ .origin = origin, ++ .length = length, ++ }; ++ ++ return s390_pv_cmd(KVM_PV_SET_SEC_PARMS, &args); ++} ++ ++/* ++ * Called for each component in the SE type IPL parameter block 0. ++ */ ++int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) ++{ ++ struct kvm_s390_pv_unp args = { ++ .addr = addr, ++ .size = size, ++ .tweak = tweak, ++ }; ++ ++ return s390_pv_cmd(KVM_PV_UNPACK, &args); ++} ++ ++void s390_pv_perf_clear_reset(void) ++{ ++ s390_pv_cmd_exit(KVM_PV_PREP_RESET, NULL); ++} ++ ++int s390_pv_verify(void) ++{ ++ return s390_pv_cmd(KVM_PV_VERIFY, NULL); ++} ++ ++void s390_pv_unshare(void) ++{ ++ s390_pv_cmd_exit(KVM_PV_UNSHARE_ALL, NULL); ++} +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 4ea01c53c0..82da1d9ab5 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -1,9 +1,10 @@ + /* + * virtio ccw machine + * +- * Copyright 2012 IBM Corp. ++ * Copyright 2012, 2020 IBM Corp. + * Copyright (c) 2009 Alexander Graf + * Author(s): Cornelia Huck ++ * Janosch Frank + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level +@@ -41,6 +42,8 @@ + #include "hw/qdev-properties.h" + #include "hw/s390x/tod.h" + #include "sysemu/sysemu.h" ++#include "hw/s390x/pv.h" ++#include + + S390CPU *s390_cpu_addr2state(uint16_t cpu_addr) + { +@@ -318,10 +321,78 @@ static inline void s390_do_cpu_ipl(CPUState *cs, run_on_cpu_data arg) + s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); + } + ++static void s390_machine_unprotect(S390CcwMachineState *ms) ++{ ++ s390_pv_vm_disable(); ++ ms->pv = false; ++} ++ ++static int s390_machine_protect(S390CcwMachineState *ms) ++{ ++ int rc; ++ ++ /* Create SE VM */ ++ rc = s390_pv_vm_enable(); ++ if (rc) { ++ return rc; ++ } ++ ++ ms->pv = true; ++ ++ /* Set SE header and unpack */ ++ rc = s390_ipl_prepare_pv_header(); ++ if (rc) { ++ goto out_err; ++ } ++ ++ /* Decrypt image */ ++ rc = s390_ipl_pv_unpack(); ++ if (rc) { ++ goto out_err; ++ } ++ ++ /* Verify integrity */ ++ rc = s390_pv_verify(); ++ if (rc) { ++ goto out_err; ++ } ++ return rc; ++ ++out_err: ++ s390_machine_unprotect(ms); ++ return rc; ++} ++ ++static void s390_machine_inject_pv_error(CPUState *cs) ++{ ++ int r1 = (cs->kvm_run->s390_sieic.ipa & 0x00f0) >> 4; ++ CPUS390XState *env = &S390_CPU(cs)->env; ++ ++ /* Report that we are unable to enter protected mode */ ++ env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; ++} ++ ++static void s390_pv_prepare_reset(S390CcwMachineState *ms) ++{ ++ CPUState *cs; ++ ++ if (!s390_is_pv()) { ++ return; ++ } ++ /* Unsharing requires all cpus to be stopped */ ++ CPU_FOREACH(cs) { ++ s390_cpu_set_state(S390_CPU_STATE_STOPPED, S390_CPU(cs)); ++ } ++ s390_pv_unshare(); ++ s390_pv_perf_clear_reset(); ++} ++ + static void s390_machine_reset(MachineState *machine) + { ++ S390CcwMachineState *ms = S390_CCW_MACHINE(machine); + enum s390_reset reset_type; + CPUState *cs, *t; ++ S390CPU *cpu; + + /* get the reset parameters, reset them once done */ + s390_ipl_get_reset_request(&cs, &reset_type); +@@ -329,9 +400,15 @@ static void s390_machine_reset(MachineState *machine) + /* all CPUs are paused and synchronized at this point */ + s390_cmma_reset(); + ++ cpu = S390_CPU(cs); ++ + switch (reset_type) { + case S390_RESET_EXTERNAL: + case S390_RESET_REIPL: ++ if (s390_is_pv()) { ++ s390_machine_unprotect(ms); ++ } ++ + qemu_devices_reset(); + s390_crypto_reset(); + +@@ -339,22 +416,56 @@ static void s390_machine_reset(MachineState *machine) + run_on_cpu(cs, s390_do_cpu_ipl, RUN_ON_CPU_NULL); + break; + case S390_RESET_MODIFIED_CLEAR: ++ /* ++ * Susbsystem reset needs to be done before we unshare memory ++ * and lose access to VIRTIO structures in guest memory. ++ */ ++ subsystem_reset(); ++ s390_crypto_reset(); ++ s390_pv_prepare_reset(ms); + CPU_FOREACH(t) { + run_on_cpu(t, s390_do_cpu_full_reset, RUN_ON_CPU_NULL); + } +- subsystem_reset(); +- s390_crypto_reset(); + run_on_cpu(cs, s390_do_cpu_load_normal, RUN_ON_CPU_NULL); + break; + case S390_RESET_LOAD_NORMAL: ++ /* ++ * Susbsystem reset needs to be done before we unshare memory ++ * and lose access to VIRTIO structures in guest memory. ++ */ ++ subsystem_reset(); ++ s390_pv_prepare_reset(ms); + CPU_FOREACH(t) { + if (t == cs) { + continue; + } + run_on_cpu(t, s390_do_cpu_reset, RUN_ON_CPU_NULL); + } +- subsystem_reset(); + run_on_cpu(cs, s390_do_cpu_initial_reset, RUN_ON_CPU_NULL); ++ run_on_cpu(cs, s390_do_cpu_load_normal, RUN_ON_CPU_NULL); ++ break; ++ case S390_RESET_PV: /* Subcode 10 */ ++ subsystem_reset(); ++ s390_crypto_reset(); ++ ++ CPU_FOREACH(t) { ++ if (t == cs) { ++ continue; ++ } ++ run_on_cpu(t, s390_do_cpu_full_reset, RUN_ON_CPU_NULL); ++ } ++ run_on_cpu(cs, s390_do_cpu_reset, RUN_ON_CPU_NULL); ++ ++ if (s390_machine_protect(ms)) { ++ s390_machine_inject_pv_error(cs); ++ /* ++ * Continue after the diag308 so the guest knows something ++ * went wrong. ++ */ ++ s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu); ++ return; ++ } ++ + run_on_cpu(cs, s390_do_cpu_load_normal, RUN_ON_CPU_NULL); + break; + default: +diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h +new file mode 100644 +index 0000000000..c6cb360f2f +--- /dev/null ++++ b/include/hw/s390x/pv.h +@@ -0,0 +1,55 @@ ++/* ++ * Protected Virtualization header ++ * ++ * Copyright IBM Corp. 2020 ++ * Author(s): ++ * Janosch Frank ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. ++ */ ++#ifndef HW_S390_PV_H ++#define HW_S390_PV_H ++ ++#ifdef CONFIG_KVM ++#include "hw/s390x/s390-virtio-ccw.h" ++ ++static inline bool s390_is_pv(void) ++{ ++ static S390CcwMachineState *ccw; ++ Object *obj; ++ ++ if (ccw) { ++ return ccw->pv; ++ } ++ ++ /* we have to bail out for the "none" machine */ ++ obj = object_dynamic_cast(qdev_get_machine(), ++ TYPE_S390_CCW_MACHINE); ++ if (!obj) { ++ return false; ++ } ++ ccw = S390_CCW_MACHINE(obj); ++ return ccw->pv; ++} ++ ++int s390_pv_vm_enable(void); ++void s390_pv_vm_disable(void); ++int s390_pv_set_sec_parms(uint64_t origin, uint64_t length); ++int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak); ++void s390_pv_perf_clear_reset(void); ++int s390_pv_verify(void); ++void s390_pv_unshare(void); ++#else /* CONFIG_KVM */ ++static inline bool s390_is_pv(void) { return false; } ++static inline int s390_pv_vm_enable(void) { return 0; } ++static inline void s390_pv_vm_disable(void) {} ++static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { return 0; } ++static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { return 0; } ++static inline void s390_pv_perf_clear_reset(void) {} ++static inline int s390_pv_verify(void) { return 0; } ++static inline void s390_pv_unshare(void) {} ++#endif /* CONFIG_KVM */ ++ ++#endif /* HW_S390_PV_H */ +diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h +index 8aa27199c9..cd1dccc6e3 100644 +--- a/include/hw/s390x/s390-virtio-ccw.h ++++ b/include/hw/s390x/s390-virtio-ccw.h +@@ -28,6 +28,7 @@ typedef struct S390CcwMachineState { + /*< public >*/ + bool aes_key_wrap; + bool dea_key_wrap; ++ bool pv; + uint8_t loadparm[8]; + } S390CcwMachineState; + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index c0dd502b84..8f38cd8e6f 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -37,6 +37,7 @@ + #include "sysemu/hw_accel.h" + #include "hw/qdev-properties.h" + #ifndef CONFIG_USER_ONLY ++#include "hw/s390x/pv.h" + #include "hw/boards.h" + #include "sysemu/arch_init.h" + #include "sysemu/sysemu.h" +diff --git a/target/s390x/cpu_features_def.inc.h b/target/s390x/cpu_features_def.inc.h +index 31dff0d84e..60db28351d 100644 +--- a/target/s390x/cpu_features_def.inc.h ++++ b/target/s390x/cpu_features_def.inc.h +@@ -107,6 +107,7 @@ DEF_FEAT(DEFLATE_BASE, "deflate-base", STFL, 151, "Deflate-conversion facility ( + DEF_FEAT(VECTOR_PACKED_DECIMAL_ENH, "vxpdeh", STFL, 152, "Vector-Packed-Decimal-Enhancement Facility") + DEF_FEAT(MSA_EXT_9, "msa9-base", STFL, 155, "Message-security-assist-extension-9 facility (excluding subfunctions)") + DEF_FEAT(ETOKEN, "etoken", STFL, 156, "Etoken facility") ++DEF_FEAT(UNPACK, "unpack", STFL, 161, "Unpack facility") + + /* Features exposed via SCLP SCCB Byte 80 - 98 (bit numbers relative to byte-80) */ + DEF_FEAT(SIE_GSLS, "gsls", SCLP_CONF_CHAR, 40, "SIE: Guest-storage-limit-suppression facility") +diff --git a/target/s390x/diag.c b/target/s390x/diag.c +index 8aba6341f9..b2cbefb8cf 100644 +--- a/target/s390x/diag.c ++++ b/target/s390x/diag.c +@@ -20,6 +20,8 @@ + #include "sysemu/cpus.h" + #include "hw/s390x/ipl.h" + #include "hw/s390x/s390-virtio-ccw.h" ++#include "hw/s390x/pv.h" ++#include "kvm_s390x.h" + + int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + { +@@ -52,6 +54,10 @@ int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3) + static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, + uintptr_t ra, bool write) + { ++ /* Handled by the Ultravisor */ ++ if (s390_is_pv()) { ++ return 0; ++ } + if ((r1 & 1) || (addr & ~TARGET_PAGE_MASK)) { + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + return -1; +@@ -67,6 +73,7 @@ static int diag308_parm_check(CPUS390XState *env, uint64_t r1, uint64_t addr, + + void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + { ++ bool valid; + CPUState *cs = env_cpu(env); + uint64_t addr = env->regs[r1]; + uint64_t subcode = env->regs[r3]; +@@ -82,6 +89,11 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + return; + } + ++ if (subcode >= DIAG308_PV_SET && !s390_has_feat(S390_FEAT_UNPACK)) { ++ s390_program_interrupt(env, PGM_SPECIFICATION, ra); ++ return; ++ } ++ + switch (subcode) { + case DIAG308_RESET_MOD_CLR: + s390_ipl_reset_request(cs, S390_RESET_MODIFIED_CLEAR); +@@ -94,6 +106,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + s390_ipl_reset_request(cs, S390_RESET_REIPL); + break; + case DIAG308_SET: ++ case DIAG308_PV_SET: + if (diag308_parm_check(env, r1, addr, ra, false)) { + return; + } +@@ -106,7 +119,8 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3, uintptr_t ra) + + cpu_physical_memory_read(addr, iplb, be32_to_cpu(iplb->len)); + +- if (!iplb_valid(iplb)) { ++ valid = subcode == DIAG308_PV_SET ? iplb_valid_pv(iplb) : iplb_valid(iplb); ++ if (!valid) { + env->regs[r1 + 1] = DIAG_308_RC_INVALID; + goto out; + } +@@ -117,10 +131,15 @@ out: + g_free(iplb); + return; + case DIAG308_STORE: ++ case DIAG308_PV_STORE: + if (diag308_parm_check(env, r1, addr, ra, true)) { + return; + } +- iplb = s390_ipl_get_iplb(); ++ if (subcode == DIAG308_PV_STORE) { ++ iplb = s390_ipl_get_iplb_pv(); ++ } else { ++ iplb = s390_ipl_get_iplb(); ++ } + if (iplb) { + cpu_physical_memory_write(addr, iplb, be32_to_cpu(iplb->len)); + env->regs[r1 + 1] = DIAG_308_RC_OK; +@@ -128,6 +147,22 @@ out: + env->regs[r1 + 1] = DIAG_308_RC_NO_CONF; + } + return; ++ case DIAG308_PV_START: ++ iplb = s390_ipl_get_iplb_pv(); ++ if (!iplb) { ++ env->regs[r1 + 1] = DIAG_308_RC_NO_PV_CONF; ++ return; ++ } ++ ++ if (kvm_s390_get_hpage_1m()) { ++ error_report("Protected VMs can currently not be backed with " ++ "huge pages"); ++ env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; ++ return; ++ } ++ ++ s390_ipl_reset_request(cs, S390_RESET_PV); ++ break; + default: + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + break; +diff --git a/target/s390x/kvm-stub.c b/target/s390x/kvm-stub.c +index c4cd497f85..aa185017a2 100644 +--- a/target/s390x/kvm-stub.c ++++ b/target/s390x/kvm-stub.c +@@ -39,6 +39,11 @@ int kvm_s390_vcpu_interrupt_post_load(S390CPU *cpu) + return 0; + } + ++int kvm_s390_get_hpage_1m(void) ++{ ++ return 0; ++} ++ + int kvm_s390_get_ri(void) + { + return 0; +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index 75d82af6fc..9a0be13959 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -321,6 +321,11 @@ void kvm_s390_set_max_pagesize(uint64_t pagesize, Error **errp) + cap_hpage_1m = 1; + } + ++int kvm_s390_get_hpage_1m(void) ++{ ++ return cap_hpage_1m; ++} ++ + static void ccw_machine_class_foreach(ObjectClass *oc, void *opaque) + { + MachineClass *mc = MACHINE_CLASS(oc); +diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h +index 0b21789796..dea813f450 100644 +--- a/target/s390x/kvm_s390x.h ++++ b/target/s390x/kvm_s390x.h +@@ -23,6 +23,7 @@ void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code); + int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state); + void kvm_s390_vcpu_interrupt_pre_save(S390CPU *cpu); + int kvm_s390_vcpu_interrupt_post_load(S390CPU *cpu); ++int kvm_s390_get_hpage_1m(void); + int kvm_s390_get_ri(void); + int kvm_s390_get_gs(void); + int kvm_s390_get_clock(uint8_t *tod_high, uint64_t *tod_clock); +-- +2.27.0 + diff --git a/kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch b/kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch new file mode 100755 index 0000000000000000000000000000000000000000..b12b458e5cf39d95ddf07f66d62846034a43eae8 --- /dev/null +++ b/kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch @@ -0,0 +1,61 @@ +From 8b994757136780998e0dd1d41613d2006c0dbcf6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 4 Aug 2020 10:16:04 -0400 +Subject: [PATCH 4/4] s390x/protvirt: allow to IPL secure guests with + -no-reboot + +RH-Author: Thomas Huth +Message-id: <20200804101604.6259-2-thuth@redhat.com> +Patchwork-id: 98126 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] s390x/protvirt: allow to IPL secure guests with -no-reboot +Bugzilla: 1863034 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +Right now, -no-reboot prevents secure guests from running. This is +correct from an implementation point of view, as we have modeled the +transition from non-secure to secure as a program directed IPL. From +a user perspective, this is not the behavior of least surprise. + +We should implement the IPL into protected mode similar to the +functions that we use for kdump/kexec. In other words, we do not stop +here when -no-reboot is specified on the command line. Like function 0 +or function 1, function 10 is not a classic reboot. For example, it +can only be called once. Before calling it a second time, a real +reboot/reset must happen in-between. So function code 10 is more or +less a state transition reset, but not a "standard" reset or reboot. + +Fixes: 4d226deafc44 ("s390x: protvirt: Support unpack facility") +Signed-off-by: Christian Borntraeger +Reviewed-by: Janosch Frank +Reviewed-by: David Hildenbrand +Acked-by: Viktor Mihajlovski +Message-Id: <20200721103202.30610-1-borntraeger@de.ibm.com> +[CH: tweaked description] +Signed-off-by: Cornelia Huck +(cherry picked from commit d1bb69db4ceb6897ef6a17bf263146b53a123632) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c +index 586d95b5b6..5b3ea990af 100644 +--- a/hw/s390x/ipl.c ++++ b/hw/s390x/ipl.c +@@ -624,7 +624,8 @@ void s390_ipl_reset_request(CPUState *cs, enum s390_reset reset_type) + } + } + if (reset_type == S390_RESET_MODIFIED_CLEAR || +- reset_type == S390_RESET_LOAD_NORMAL) { ++ reset_type == S390_RESET_LOAD_NORMAL || ++ reset_type == S390_RESET_PV) { + /* ignore -no-reboot, send no event */ + qemu_system_reset_request(SHUTDOWN_CAUSE_SUBSYSTEM_RESET); + } else { +-- +2.27.0 + diff --git a/kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch b/kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch new file mode 100755 index 0000000000000000000000000000000000000000..764ceb1b1975079634bcd1a7ad704bf49bea5e4d --- /dev/null +++ b/kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch @@ -0,0 +1,92 @@ +From f3594f3d84a7442c194b1b9fd288e7414540ec0f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:20 -0400 +Subject: [PATCH 38/42] s390x: pv: Fix KVM_PV_PREP_RESET command wrapper name +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-39-thuth@redhat.com> +Patchwork-id: 97051 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 38/38] s390x: pv: Fix KVM_PV_PREP_RESET command wrapper name +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Upstream: Merged in https://github.com/cohuck/qemu/tree/s390-next + +s390_pv_perf_clear_reset() is not a very helpful name since that +function needs to be called for a normal and a clear reset via +diag308. + +Let's instead name it s390_pv_prep_reset() which reflects the purpose +of the function a bit better. + +Signed-off-by: Janosch Frank +Reviewed-by: David Hildenbrand +Message-Id: <20200505124159.24099-1-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f9628f3f6db341751002dac3be18610fa77c01ad) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/pv.c | 2 +- + hw/s390x/s390-virtio-ccw.c | 2 +- + include/hw/s390x/pv.h | 4 ++-- + 3 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +index f11868e865..ab3a2482aa 100644 +--- a/hw/s390x/pv.c ++++ b/hw/s390x/pv.c +@@ -88,7 +88,7 @@ int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) + return s390_pv_cmd(KVM_PV_UNPACK, &args); + } + +-void s390_pv_perf_clear_reset(void) ++void s390_pv_prep_reset(void) + { + s390_pv_cmd_exit(KVM_PV_PREP_RESET, NULL); + } +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 07773a12b2..e6ed13b649 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -402,7 +402,7 @@ static void s390_pv_prepare_reset(S390CcwMachineState *ms) + s390_cpu_set_state(S390_CPU_STATE_STOPPED, S390_CPU(cs)); + } + s390_pv_unshare(); +- s390_pv_perf_clear_reset(); ++ s390_pv_prep_reset(); + } + + static void s390_machine_reset(MachineState *machine) +diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h +index 522ca6a04e..aee758bc2d 100644 +--- a/include/hw/s390x/pv.h ++++ b/include/hw/s390x/pv.h +@@ -39,7 +39,7 @@ int s390_pv_vm_enable(void); + void s390_pv_vm_disable(void); + int s390_pv_set_sec_parms(uint64_t origin, uint64_t length); + int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak); +-void s390_pv_perf_clear_reset(void); ++void s390_pv_prep_reset(void); + int s390_pv_verify(void); + void s390_pv_unshare(void); + void s390_pv_inject_reset_error(CPUState *cs); +@@ -49,7 +49,7 @@ static inline int s390_pv_vm_enable(void) { return 0; } + static inline void s390_pv_vm_disable(void) {} + static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { return 0; } + static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { return 0; } +-static inline void s390_pv_perf_clear_reset(void) {} ++static inline void s390_pv_prep_reset(void) {} + static inline int s390_pv_verify(void) { return 0; } + static inline void s390_pv_unshare(void) {} + static inline void s390_pv_inject_reset_error(CPUState *cs) {}; +-- +2.27.0 + diff --git a/kvm-s390x-pv-Fix-diag318-PV-fencing.patch b/kvm-s390x-pv-Fix-diag318-PV-fencing.patch new file mode 100755 index 0000000000000000000000000000000000000000..4dcb862fe77ace75fe17d7dff44b697fa8608bc0 --- /dev/null +++ b/kvm-s390x-pv-Fix-diag318-PV-fencing.patch @@ -0,0 +1,114 @@ +From 722078f9fdb766c2f0990145de6732f0c36a63b7 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:16 -0500 +Subject: [PATCH 16/18] s390x: pv: Fix diag318 PV fencing + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-13-thuth@redhat.com> +Patchwork-id: 99509 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 12/12] s390x: pv: Fix diag318 PV fencing +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +Diag318 fencing needs to be determined on the current VM PV state and +not on the state that the VM has when we create the CPU model. + +Fixes: fabdada935 ("s390: guest support for diagnose 0x318") +Reported-by: Marc Hartmayer +Signed-off-by: Janosch Frank +Tested-by: Marc Hartmayer +Reviewed-by: Christian Borntraeger +Reviewed-by: Collin Walling +Acked-by: David Hildenbrand +Message-Id: <20201022103135.126033-3-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 3ded270a2697852a71961b45291519ae044f25e3) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu_features.c | 5 +++++ + target/s390x/cpu_features.h | 4 ++++ + target/s390x/cpu_models.c | 4 ++++ + target/s390x/kvm.c | 3 +-- + 4 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/cpu_features.c b/target/s390x/cpu_features.c +index 9f817e3cfa7..e5cdf232607 100644 +--- a/target/s390x/cpu_features.c ++++ b/target/s390x/cpu_features.c +@@ -14,6 +14,7 @@ + #include "qemu/osdep.h" + #include "qemu/module.h" + #include "cpu_features.h" ++#include "hw/s390x/pv.h" + + #define DEF_FEAT(_FEAT, _NAME, _TYPE, _BIT, _DESC) \ + [S390_FEAT_##_FEAT] = { \ +@@ -105,6 +106,10 @@ void s390_fill_feat_block(const S390FeatBitmap features, S390FeatType type, + } + feat = find_next_bit(features, S390_FEAT_MAX, feat + 1); + } ++ ++ if (type == S390_FEAT_TYPE_SCLP_FAC134 && s390_is_pv()) { ++ clear_be_bit(s390_feat_def(S390_FEAT_DIAG_318)->bit, data); ++ } + } + + void s390_add_from_feat_block(S390FeatBitmap features, S390FeatType type, +diff --git a/target/s390x/cpu_features.h b/target/s390x/cpu_features.h +index f74f7fc3a11..d3c685a04c8 100644 +--- a/target/s390x/cpu_features.h ++++ b/target/s390x/cpu_features.h +@@ -81,6 +81,10 @@ const S390FeatGroupDef *s390_feat_group_def(S390FeatGroup group); + + #define BE_BIT_NR(BIT) (BIT ^ (BITS_PER_LONG - 1)) + ++static inline void clear_be_bit(unsigned int bit_nr, uint8_t *array) ++{ ++ array[bit_nr / 8] &= ~(0x80 >> (bit_nr % 8)); ++} + static inline void set_be_bit(unsigned int bit_nr, uint8_t *array) + { + array[bit_nr / 8] |= 0x80 >> (bit_nr % 8); +diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c +index bf6a3faba9e..d489923cb8a 100644 +--- a/target/s390x/cpu_models.c ++++ b/target/s390x/cpu_models.c +@@ -29,6 +29,7 @@ + #include "hw/pci/pci.h" + #endif + #include "qapi/qapi-commands-machine-target.h" ++#include "hw/s390x/pv.h" + + #define CPUDEF_INIT(_type, _gen, _ec_ga, _mha_pow, _hmfai, _name, _desc) \ + { \ +@@ -238,6 +239,9 @@ bool s390_has_feat(S390Feat feat) + } + return 0; + } ++ if (feat == S390_FEAT_DIAG_318 && s390_is_pv()) { ++ return false; ++ } + return test_bit(feat, cpu->model->features); + } + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index e5e190d21c9..6edb52f6d25 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -2483,8 +2483,7 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) + */ + set_bit(S390_FEAT_EXTENDED_LENGTH_SCCB, model->features); + +- /* DIAGNOSE 0x318 is not supported under protected virtualization */ +- if (!s390_is_pv() && kvm_check_extension(kvm_state, KVM_CAP_S390_DIAG318)) { ++ if (kvm_check_extension(kvm_state, KVM_CAP_S390_DIAG318)) { + set_bit(S390_FEAT_DIAG_318, model->features); + } + +-- +2.27.0 + diff --git a/kvm-s390x-pv-Remove-sclp-boundary-checks.patch b/kvm-s390x-pv-Remove-sclp-boundary-checks.patch new file mode 100755 index 0000000000000000000000000000000000000000..51ceb48b75c3fad2eb68467b84c9e6db4c7fe0aa --- /dev/null +++ b/kvm-s390x-pv-Remove-sclp-boundary-checks.patch @@ -0,0 +1,57 @@ +From cf3d958b14e21fde929e67262b6e192592d95359 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:15 -0500 +Subject: [PATCH 15/18] s390x: pv: Remove sclp boundary checks + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-12-thuth@redhat.com> +Patchwork-id: 99508 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 11/12] s390x: pv: Remove sclp boundary checks +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The SCLP boundary cross check is done by the Ultravisor for a +protected guest, hence we don't need to do it. As QEMU doesn't get a +valid SCCB address in protected mode this is even problematic and can +lead to QEMU reporting a false boundary cross error. + +Fixes: db13387ca0 ("s390/sclp: rework sclp boundary checks") +Reported-by: Marc Hartmayer +Signed-off-by: Janosch Frank +Tested-by: Marc Hartmayer +Reviewed-by: Christian Borntraeger +Reviewed-by: Thomas Huth +Reviewed-by: Collin Walling +Acked-by: Halil Pasic +Acked-by: David Hildenbrand +Message-Id: <20201022103135.126033-2-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 3df4843d0e612a3c838e8d94c3e9c24520f2e680) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 2931046f456..03f847b2c8a 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -285,11 +285,6 @@ int sclp_service_call_protected(CPUS390XState *env, uint64_t sccb, + goto out_write; + } + +- if (!sccb_verify_boundary(sccb, be16_to_cpu(work_sccb->h.length), code)) { +- work_sccb->h.response_code = cpu_to_be16(SCLP_RC_SCCB_BOUNDARY_VIOLATION); +- goto out_write; +- } +- + sclp_c->execute(sclp, work_sccb, code); + out_write: + s390_cpu_pv_mem_write(env_archcpu(env), 0, work_sccb, +-- +2.27.0 + diff --git a/kvm-s390x-pv-Retry-ioctls-on-EINTR.patch b/kvm-s390x-pv-Retry-ioctls-on-EINTR.patch new file mode 100755 index 0000000000000000000000000000000000000000..65208c75b01dfe030981275f2b660a473b278d50 --- /dev/null +++ b/kvm-s390x-pv-Retry-ioctls-on-EINTR.patch @@ -0,0 +1,57 @@ +From 1678288d945906d83d7adae109b842080aebaf19 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:18 -0400 +Subject: [PATCH 36/42] s390x/pv: Retry ioctls on -EINTR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-37-thuth@redhat.com> +Patchwork-id: 97055 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 36/38] s390x/pv: Retry ioctls on -EINTR +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +PV_ENABLE (and maybe others) might return -EINTR when a signal is +pending. See the Linux kernel patch "s390/gmap: return proper error code +on ksm unsharing" for details. Let us retry the ioctl in that case. + +Fixes: c3347ed0d2ee ("s390x: protvirt: Support unpack facility") +Reported-by: Marc Hartmayer +Acked-by: Janosch Frank +Tested-by: Marc Hartmayer +Signed-off-by: Christian Borntraeger +Message-Id: <20200327124616.34866-1-borntraeger@de.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit e8d12a55f6d3e577455b02f15907c460578c689b) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/pv.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +index a40a844806..cb0dce4a4f 100644 +--- a/hw/s390x/pv.c ++++ b/hw/s390x/pv.c +@@ -23,7 +23,11 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) + .cmd = cmd, + .data = (uint64_t)data, + }; +- int rc = kvm_vm_ioctl(kvm_state, KVM_S390_PV_COMMAND, &pv_cmd); ++ int rc; ++ ++ do { ++ rc = kvm_vm_ioctl(kvm_state, KVM_S390_PV_COMMAND, &pv_cmd); ++ } while (rc == -EINTR); + + if (rc) { + error_report("KVM PV command %d (%s) failed: header rc %x rrc %x " +-- +2.27.0 + diff --git a/kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch b/kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch new file mode 100755 index 0000000000000000000000000000000000000000..e78f4da17721df8c0be12e22dc28427ca8afa481 --- /dev/null +++ b/kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch @@ -0,0 +1,150 @@ +From 0db8d909a2f3c53d12b0ae12307965f9a8193dbc Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:19 -0400 +Subject: [PATCH 37/42] s390x/s390-virtio-ccw: Fix build on systems without KVM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-38-thuth@redhat.com> +Patchwork-id: 97047 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 37/38] s390x/s390-virtio-ccw: Fix build on systems without KVM +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Christian Borntraeger + +linux/kvm.h is not available on all platforms. Let us move +s390_machine_inject_pv_error into pv.c as it uses KVM structures. +Also rename the function to s390_pv_inject_reset_error. + +While at it, ipl.h needs an include for "exec/address-spaces.h" +as it uses address_space_memory. + +Fixes: c3347ed0d2ee ("s390x: protvirt: Support unpack facility") +Reported-by: Bruce Rogers +Signed-off-by: Christian Borntraeger +Message-Id: <20200406100158.5940-2-borntraeger@de.ibm.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Cornelia Huck +(cherry picked from commit fbc1384ccd48fa7c0c38f950adf7992a4fb6042e) +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/ipl.h | 1 + + hw/s390x/pv.c | 11 +++++++++++ + hw/s390x/s390-virtio-ccw.c | 12 +----------- + include/hw/s390x/pv.h | 3 +++ + 4 files changed, 16 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h +index 89b3044d7a..53cc9eb5ac 100644 +--- a/hw/s390x/ipl.h ++++ b/hw/s390x/ipl.h +@@ -14,6 +14,7 @@ + #define HW_S390_IPL_H + + #include "cpu.h" ++#include "exec/address-spaces.h" + #include "hw/qdev-core.h" + + struct IPLBlockPVComp { +diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c +index cb0dce4a4f..f11868e865 100644 +--- a/hw/s390x/pv.c ++++ b/hw/s390x/pv.c +@@ -13,8 +13,10 @@ + + #include + ++#include "cpu.h" + #include "qemu/error-report.h" + #include "sysemu/kvm.h" ++#include "hw/s390x/ipl.h" + #include "hw/s390x/pv.h" + + static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data) +@@ -100,3 +102,12 @@ void s390_pv_unshare(void) + { + s390_pv_cmd_exit(KVM_PV_UNSHARE_ALL, NULL); + } ++ ++void s390_pv_inject_reset_error(CPUState *cs) ++{ ++ int r1 = (cs->kvm_run->s390_sieic.ipa & 0x00f0) >> 4; ++ CPUS390XState *env = &S390_CPU(cs)->env; ++ ++ /* Report that we are unable to enter protected mode */ ++ env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; ++} +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index c08e42bda1..07773a12b2 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -44,7 +44,6 @@ + #include "sysemu/sysemu.h" + #include "sysemu/balloon.h" + #include "hw/s390x/pv.h" +-#include + #include "migration/blocker.h" + + static Error *pv_mig_blocker; +@@ -391,15 +390,6 @@ out_err: + return rc; + } + +-static void s390_machine_inject_pv_error(CPUState *cs) +-{ +- int r1 = (cs->kvm_run->s390_sieic.ipa & 0x00f0) >> 4; +- CPUS390XState *env = &S390_CPU(cs)->env; +- +- /* Report that we are unable to enter protected mode */ +- env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; +-} +- + static void s390_pv_prepare_reset(S390CcwMachineState *ms) + { + CPUState *cs; +@@ -485,7 +475,7 @@ static void s390_machine_reset(MachineState *machine) + run_on_cpu(cs, s390_do_cpu_reset, RUN_ON_CPU_NULL); + + if (s390_machine_protect(ms)) { +- s390_machine_inject_pv_error(cs); ++ s390_pv_inject_reset_error(cs); + /* + * Continue after the diag308 so the guest knows something + * went wrong. +diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h +index c6cb360f2f..522ca6a04e 100644 +--- a/include/hw/s390x/pv.h ++++ b/include/hw/s390x/pv.h +@@ -13,6 +13,7 @@ + #define HW_S390_PV_H + + #ifdef CONFIG_KVM ++#include "cpu.h" + #include "hw/s390x/s390-virtio-ccw.h" + + static inline bool s390_is_pv(void) +@@ -41,6 +42,7 @@ int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak); + void s390_pv_perf_clear_reset(void); + int s390_pv_verify(void); + void s390_pv_unshare(void); ++void s390_pv_inject_reset_error(CPUState *cs); + #else /* CONFIG_KVM */ + static inline bool s390_is_pv(void) { return false; } + static inline int s390_pv_vm_enable(void) { return 0; } +@@ -50,6 +52,7 @@ static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { + static inline void s390_pv_perf_clear_reset(void) {} + static inline int s390_pv_verify(void) { return 0; } + static inline void s390_pv_unshare(void) {} ++static inline void s390_pv_inject_reset_error(CPUState *cs) {}; + #endif /* CONFIG_KVM */ + + #endif /* HW_S390_PV_H */ +-- +2.27.0 + diff --git a/kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch b/kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch new file mode 100755 index 0000000000000000000000000000000000000000..f90dc30411bb12a051156affad7adf6bd4dd33fc --- /dev/null +++ b/kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch @@ -0,0 +1,52 @@ +From fa4e13a01ecc316cc43c1f39490330b94c910bc1 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Mon, 14 Dec 2020 18:29:49 -0500 +Subject: [PATCH 04/14] s390x/s390-virtio-ccw: Reset PCI devices during + subsystem reset + +RH-Author: Thomas Huth +Message-id: <20201214182949.35712-2-thuth@redhat.com> +Patchwork-id: 100440 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] s390x/s390-virtio-ccw: Reset PCI devices during subsystem reset +Bugzilla: 1905386 +RH-Acked-by: Danilo de Paula +RH-Acked-by: David Hildenbrand +RH-Acked-by: Cornelia Huck + +From: Matthew Rosato + +Currently, a subsystem reset event leaves PCI devices enabled, causing +issues post-reset in the guest (an example would be after a kexec). These +devices need to be reset during a subsystem reset, allowing them to be +properly re-enabled afterwards. Add the S390 PCI host bridge to the list +of qdevs to be reset during subsystem reset. + +Signed-off-by: Matthew Rosato +Reviewed-by: Eric Farman +Acked-by: Halil Pasic +Acked-by: Christian Borntraeger +Cc: qemu-stable@nongnu.org +Message-Id: <1602767767-32713-1-git-send-email-mjrosato@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit db08244a3a7ec312dfed3fd9b88e114281215458) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/s390-virtio-ccw.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 5905d2b7adc..5b3d07f55c4 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -103,6 +103,7 @@ static const char *const reset_dev_types[] = { + "s390-sclp-event-facility", + "s390-flic", + "diag288", ++ TYPE_S390_PCI_HOST_BRIDGE, + }; + + static void subsystem_reset(void) +-- +2.27.0 + diff --git a/kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch b/kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch new file mode 100755 index 0000000000000000000000000000000000000000..5a38a88ec609a3014f6fb9b37087dbdc83ae59d8 --- /dev/null +++ b/kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch @@ -0,0 +1,90 @@ +From 8b06cba98e37b9c50e2a9deb1567d8cf4e1ba2b6 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Wed, 11 Nov 2020 12:03:05 -0500 +Subject: [PATCH 05/18] s390x/sclp.c: remove unneeded label in + sclp_service_call() + +RH-Author: Thomas Huth +Message-id: <20201111120316.707489-2-thuth@redhat.com> +Patchwork-id: 99497 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 01/12] s390x/sclp.c: remove unneeded label in sclp_service_call() +Bugzilla: 1798506 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Daniel Henrique Barboza + +'out' label can be replaced by 'return' with the appropriate +value. The 'r' integer, which is used solely to set the +return value for this label, can also be removed. + +CC: Cornelia Huck +CC: Halil Pasic +CC: Christian Borntraeger +Signed-off-by: Daniel Henrique Barboza +Reviewed-by: Thomas Huth +Message-Id: <20200106182425.20312-39-danielhb413@gmail.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit e6de76fca48012348d8c81b1399c861f444bd4a4) +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/sclp.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 1c380a49cc7..d8ae207731f 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -241,24 +241,20 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + { + SCLPDevice *sclp = get_sclp_device(); + SCLPDeviceClass *sclp_c = SCLP_GET_CLASS(sclp); +- int r = 0; + SCCB work_sccb; + + hwaddr sccb_len = sizeof(SCCB); + + /* first some basic checks on program checks */ + if (env->psw.mask & PSW_MASK_PSTATE) { +- r = -PGM_PRIVILEGED; +- goto out; ++ return -PGM_PRIVILEGED; + } + if (cpu_physical_memory_is_io(sccb)) { +- r = -PGM_ADDRESSING; +- goto out; ++ return -PGM_ADDRESSING; + } + if ((sccb & ~0x1fffUL) == 0 || (sccb & ~0x1fffUL) == env->psa + || (sccb & ~0x7ffffff8UL) != 0) { +- r = -PGM_SPECIFICATION; +- goto out; ++ return -PGM_SPECIFICATION; + } + + /* +@@ -270,8 +266,7 @@ int sclp_service_call(CPUS390XState *env, uint64_t sccb, uint32_t code) + + /* Valid sccb sizes */ + if (be16_to_cpu(work_sccb.h.length) < sizeof(SCCBHeader)) { +- r = -PGM_SPECIFICATION; +- goto out; ++ return -PGM_SPECIFICATION; + } + + if (!sclp_command_code_valid(code)) { +@@ -291,8 +286,7 @@ out_write: + + sclp_c->service_interrupt(sclp, sccb); + +-out: +- return r; ++ return 0; + } + + static void service_interrupt(SCLPDevice *sclp, uint32_t sccb) +-- +2.27.0 + diff --git a/kvm-s390x-sigp-Fix-sense-running-reporting.patch b/kvm-s390x-sigp-Fix-sense-running-reporting.patch new file mode 100755 index 0000000000000000000000000000000000000000..71439644c26425c9afef50784e789f50b328fee6 --- /dev/null +++ b/kvm-s390x-sigp-Fix-sense-running-reporting.patch @@ -0,0 +1,49 @@ +From a2befb24c10f58ce6c27d242f3b88afee1f77ec8 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 7 Jul 2020 09:35:31 -0400 +Subject: [PATCH 2/4] s390x: sigp: Fix sense running reporting + +RH-Author: Thomas Huth +Message-id: <20200707093532.22456-2-thuth@redhat.com> +Patchwork-id: 97920 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/2] s390x: sigp: Fix sense running reporting +Bugzilla: 1854092 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Janosch Frank + +The logic was inverted and reported running if the cpu was stopped. +Let's fix that. + +Signed-off-by: Janosch Frank +Fixes: d1b468bc8869 ("s390x/tcg: implement SIGP SENSE RUNNING STATUS") +Reviewed-by: David Hildenbrand +Message-Id: <20200124134818.9981-1-frankja@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 4103500e2fa934a6995e4cedab37423e606715bf) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/sigp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c +index 727875bb4a..c604f17710 100644 +--- a/target/s390x/sigp.c ++++ b/target/s390x/sigp.c +@@ -348,9 +348,9 @@ static void sigp_sense_running(S390CPU *dst_cpu, SigpInfo *si) + + /* If halted (which includes also STOPPED), it is not running */ + if (CPU(dst_cpu)->halted) { +- si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; +- } else { + set_sigp_status(si, SIGP_STAT_NOT_RUNNING); ++ } else { ++ si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } + } + +-- +2.27.0 + diff --git a/kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch b/kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch new file mode 100755 index 0000000000000000000000000000000000000000..b6ac31461d312f5d2a1ebfd7f5ae1f281df65cbd --- /dev/null +++ b/kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch @@ -0,0 +1,57 @@ +From 0c85e86077b42547034ec6e8330a3e61d79b97ee Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 7 Jul 2020 09:35:32 -0400 +Subject: [PATCH 3/4] s390x/tcg: clear local interrupts on reset normal + +RH-Author: Thomas Huth +Message-id: <20200707093532.22456-3-thuth@redhat.com> +Patchwork-id: 97919 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/2] s390x/tcg: clear local interrupts on reset normal +Bugzilla: 1854092 +RH-Acked-by: Jens Freimann +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Cornelia Huck + +We neglected to clean up pending interrupts and emergency signals; +fix that. + +Message-Id: <20191206135404.16051-1-cohuck@redhat.com> +Signed-off-by: Cornelia Huck +Reviewed-by: David Hildenbrand +(cherry picked from commit bcf88d56efec4ffc153bbe98d11b689a5ebe1a91) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/cpu.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index edf8391504..a48e655c4d 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -98,10 +98,6 @@ struct CPUS390XState { + + uint64_t cregs[16]; /* control registers */ + +- int pending_int; +- uint16_t external_call_addr; +- DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS); +- + uint64_t ckc; + uint64_t cputm; + uint32_t todpr; +@@ -117,6 +113,10 @@ struct CPUS390XState { + struct {} start_normal_reset_fields; + uint8_t riccb[64]; /* runtime instrumentation control */ + ++ int pending_int; ++ uint16_t external_call_addr; ++ DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS); ++ + /* Fields up to this point are cleared by a CPU reset */ + struct {} end_reset_fields; + +-- +2.27.0 + diff --git a/kvm-s390x.conf b/kvm-s390x.conf new file mode 100755 index 0000000000000000000000000000000000000000..d82b8186f4e2e75bfaa0b4a57908665da9c57f09 --- /dev/null +++ b/kvm-s390x.conf @@ -0,0 +1,19 @@ +# User changes in this file are preserved across upgrades. +# +# Setting "modprobe kvm nested=1" only enables Nested Virtualization until +# the next reboot or module reload. Uncomment the option below to enable +# the feature permanently. +# +#options kvm nested=1 +# +# +# Setting "modprobe kvm hpage=1" only enables Huge Page Backing (1MB) +# support until the next reboot or module reload. Uncomment the option +# below to enable the feature permanently. +# +# Note: - Incompatible with "nested=1". Loading the module will fail. +# - Dirty page logging will be performed on a 1MB (not 4KB) basis, +# which can result in a lot of data having to be transferred during +# migration, and therefore taking very long to converge. +# +#options kvm hpage=1 diff --git a/kvm-scsi-make-io_timeout-configurable.patch b/kvm-scsi-make-io_timeout-configurable.patch new file mode 100755 index 0000000000000000000000000000000000000000..a073728b2194f56907c6f2e202e2faec363d76a3 --- /dev/null +++ b/kvm-scsi-make-io_timeout-configurable.patch @@ -0,0 +1,177 @@ +From 2903ab6d961b2165f3cbfb786cd3be59a407a6b4 Mon Sep 17 00:00:00 2001 +From: Hannes Reinecke +Date: Mon, 16 Nov 2020 19:31:13 +0100 +Subject: [PATCH] scsi: make io_timeout configurable + +RH-Author: Paolo Bonzini +RH-MergeRequest: 40: scsi: make io_timeout configurable +RH-Commit: [1/1] 147c4b7bac867c708c1905e98c4f9329ab4ef838 +RH-Bugzilla: 1994041 +RH-Acked-by: Jon Maloy +RH-Acked-by: Thomas Huth +RH-Acked-by: Laszlo Ersek + +The current code sets an infinite timeout on SG_IO requests, +causing the guest to stall if the host experiences a frame +loss. +This patch adds an 'io_timeout' parameter for SCSIDevice to +make the SG_IO timeout configurable, and also shortens the +default timeout to 30 seconds to avoid infinite stalls. + +Signed-off-by: Hannes Reinecke +Message-Id: <20201116183114.55703-3-hare@suse.de> +Signed-off-by: Paolo Bonzini +(cherry picked from commit c9b6609b69facad0cc5425d4fa7934c33d7f2e91) +--- + hw/scsi/scsi-disk.c | 6 ++++-- + hw/scsi/scsi-generic.c | 17 +++++++++++------ + include/hw/scsi/scsi.h | 4 +++- + 3 files changed, 18 insertions(+), 9 deletions(-) + +diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c +index e44c61eeb4..5cb5fd35bd 100644 +--- a/hw/scsi/scsi-disk.c ++++ b/hw/scsi/scsi-disk.c +@@ -2610,7 +2610,7 @@ static int get_device_type(SCSIDiskState *s) + cmd[4] = sizeof(buf); + + ret = scsi_SG_IO_FROM_DEV(s->qdev.conf.blk, cmd, sizeof(cmd), +- buf, sizeof(buf)); ++ buf, sizeof(buf), s->qdev.io_timeout); + if (ret < 0) { + return -1; + } +@@ -2771,7 +2771,7 @@ static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req, + /* The rest is as in scsi-generic.c. */ + io_header->mx_sb_len = sizeof(r->req.sense); + io_header->sbp = r->req.sense; +- io_header->timeout = UINT_MAX; ++ io_header->timeout = s->qdev.io_timeout * 1000; + io_header->usr_ptr = r; + io_header->flags |= SG_FLAG_DIRECT_IO; + +@@ -3089,6 +3089,8 @@ static Property scsi_block_properties[] = { + DEFAULT_MAX_IO_SIZE), + DEFINE_PROP_INT32("scsi_version", SCSIDiskState, qdev.default_scsi_version, + -1), ++ DEFINE_PROP_UINT32("io_timeout", SCSIDiskState, qdev.io_timeout, ++ DEFAULT_IO_TIMEOUT), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c +index e7798ebcd0..3a383fe4d6 100644 +--- a/hw/scsi/scsi-generic.c ++++ b/hw/scsi/scsi-generic.c +@@ -114,6 +114,8 @@ static int execute_command(BlockBackend *blk, + SCSIGenericReq *r, int direction, + BlockCompletionFunc *complete) + { ++ SCSIDevice *s = r->req.dev; ++ + r->io_header.interface_id = 'S'; + r->io_header.dxfer_direction = direction; + r->io_header.dxferp = r->buf; +@@ -122,7 +124,7 @@ static int execute_command(BlockBackend *blk, + r->io_header.cmd_len = r->req.cmd.len; + r->io_header.mx_sb_len = sizeof(r->req.sense); + r->io_header.sbp = r->req.sense; +- r->io_header.timeout = MAX_UINT; ++ r->io_header.timeout = s->io_timeout * 1000; + r->io_header.usr_ptr = r; + r->io_header.flags |= SG_FLAG_DIRECT_IO; + +@@ -503,7 +505,7 @@ static int read_naa_id(const uint8_t *p, uint64_t *p_wwn) + } + + int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size, +- uint8_t *buf, uint8_t buf_size) ++ uint8_t *buf, uint8_t buf_size, uint32_t timeout) + { + sg_io_hdr_t io_header; + uint8_t sensebuf[8]; +@@ -518,7 +520,7 @@ int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size, + io_header.cmd_len = cmd_size; + io_header.mx_sb_len = sizeof(sensebuf); + io_header.sbp = sensebuf; +- io_header.timeout = 6000; /* XXX */ ++ io_header.timeout = timeout * 1000; + + ret = blk_ioctl(blk, SG_IO, &io_header); + if (ret < 0 || io_header.driver_status || io_header.host_status) { +@@ -548,7 +550,7 @@ static void scsi_generic_set_vpd_bl_emulation(SCSIDevice *s) + cmd[4] = sizeof(buf); + + ret = scsi_SG_IO_FROM_DEV(s->conf.blk, cmd, sizeof(cmd), +- buf, sizeof(buf)); ++ buf, sizeof(buf), s->io_timeout); + if (ret < 0) { + /* + * Do not assume anything if we can't retrieve the +@@ -584,7 +586,7 @@ static void scsi_generic_read_device_identification(SCSIDevice *s) + cmd[4] = sizeof(buf); + + ret = scsi_SG_IO_FROM_DEV(s->conf.blk, cmd, sizeof(cmd), +- buf, sizeof(buf)); ++ buf, sizeof(buf), s->io_timeout); + if (ret < 0) { + return; + } +@@ -635,7 +637,7 @@ static int get_stream_blocksize(BlockBackend *blk) + cmd[0] = MODE_SENSE; + cmd[4] = sizeof(buf); + +- ret = scsi_SG_IO_FROM_DEV(blk, cmd, sizeof(cmd), buf, sizeof(buf)); ++ ret = scsi_SG_IO_FROM_DEV(blk, cmd, sizeof(cmd), buf, sizeof(buf), 6); + if (ret < 0) { + return -1; + } +@@ -725,6 +727,7 @@ static void scsi_generic_realize(SCSIDevice *s, Error **errp) + + /* Only used by scsi-block, but initialize it nevertheless to be clean. */ + s->default_scsi_version = -1; ++ s->io_timeout = DEFAULT_IO_TIMEOUT; + scsi_generic_read_device_inquiry(s); + } + +@@ -748,6 +751,8 @@ static SCSIRequest *scsi_new_request(SCSIDevice *d, uint32_t tag, uint32_t lun, + static Property scsi_generic_properties[] = { + DEFINE_PROP_DRIVE("drive", SCSIDevice, conf.blk), + DEFINE_PROP_BOOL("share-rw", SCSIDevice, conf.share_rw, false), ++ DEFINE_PROP_UINT32("io_timeout", SCSIDevice, io_timeout, ++ DEFAULT_IO_TIMEOUT), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h +index 332ef602f4..ae0724a32d 100644 +--- a/include/hw/scsi/scsi.h ++++ b/include/hw/scsi/scsi.h +@@ -17,6 +17,7 @@ typedef struct SCSIReqOps SCSIReqOps; + + #define SCSI_SENSE_BUF_SIZE_OLD 96 + #define SCSI_SENSE_BUF_SIZE 252 ++#define DEFAULT_IO_TIMEOUT 30 + + struct SCSIRequest { + SCSIBus *bus; +@@ -88,6 +89,7 @@ struct SCSIDevice + uint64_t port_wwn; + int scsi_version; + int default_scsi_version; ++ uint32_t io_timeout; + bool needs_vpd_bl_emulation; + bool hba_supports_iothread; + }; +@@ -192,7 +194,7 @@ void scsi_device_unit_attention_reported(SCSIDevice *dev); + void scsi_generic_read_device_inquiry(SCSIDevice *dev); + int scsi_device_get_sense(SCSIDevice *dev, uint8_t *buf, int len, bool fixed); + int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size, +- uint8_t *buf, uint8_t buf_size); ++ uint8_t *buf, uint8_t buf_size, uint32_t timeout); + SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int target, int lun); + + /* scsi-generic.c. */ +-- +2.18.2 + diff --git a/kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch b/kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch new file mode 100755 index 0000000000000000000000000000000000000000..189be7ea3294bc2952e407895bebca89978d1567 --- /dev/null +++ b/kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch @@ -0,0 +1,79 @@ +From 08dc2a4dc481916fae9597220ad0faf3f6ed70c1 Mon Sep 17 00:00:00 2001 +From: Eduardo Otubo +Date: Mon, 16 Nov 2020 15:15:38 -0500 +Subject: [PATCH 1/5] seccomp: fix killing of whole process instead of thread +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eduardo Otubo +Message-id: <20201116151538.22254-1-otubo@redhat.com> +Patchwork-id: 99654 +O-Subject: [RHEL-8.3.0/RHEL-8.4.0 qemu-kvm PATCH] seccomp: fix killing of whole process instead of thread +Bugzilla: 1880546 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Danilo de Paula +RH-Acked-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1890885 +BRANCH: rhel-8.3.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=1890885 + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1880546 +BRANCH: rhel-8.4.0 +UPSTREAM: Merged +BREW: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=33125023 + +From: Daniel P. Berrangé + +Back in 2018 we introduced support for killing the whole QEMU process +instead of just one thread, when a seccomp rule is violated: + + commit bda08a5764d470f101fa38635d30b41179a313e1 + Author: Marc-André Lureau + Date: Wed Aug 22 19:02:48 2018 +0200 + + seccomp: prefer SCMP_ACT_KILL_PROCESS if available + +Fast forward a year and we introduced a patch to avoid killing the +process for resource control syscalls tickled by Mesa. + + commit 9a1565a03b79d80b236bc7cc2dbce52a2ef3a1b8 + Author: Daniel P. Berrangé + Date: Wed Mar 13 09:49:03 2019 +0000 + + seccomp: don't kill process for resource control syscalls + +Unfortunately a logic bug effectively reverted the first commit +mentioned so that we go back to only killing the thread, not the whole +process. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Acked-by: Eduardo Otubo +(cherry picked from commit e474e3aacf4276eb0781d11c45e2fab996f9dc56) +Signed-off-by: Eduardo Otubo +Signed-off-by: Danilo C. L. de Paula +--- + qemu-seccomp.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/qemu-seccomp.c b/qemu-seccomp.c +index e0a1829b3dd..8325ecb766e 100644 +--- a/qemu-seccomp.c ++++ b/qemu-seccomp.c +@@ -136,8 +136,9 @@ static uint32_t qemu_seccomp_get_action(int set) + + if (qemu_seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0) { + kill_process = 1; ++ } else { ++ kill_process = 0; + } +- kill_process = 0; + } + if (kill_process == 1) { + return SCMP_ACT_KILL_PROCESS; +-- +2.27.0 + diff --git a/kvm-setup b/kvm-setup new file mode 100755 index 0000000000000000000000000000000000000000..3bfedf675703e8488ed99b322e218103ee351578 --- /dev/null +++ b/kvm-setup @@ -0,0 +1,49 @@ +#! /bin/bash + +kvm_setup_powerpc () { + if grep '^platform[[:space:]]*:[[:space:]]*PowerNV' /proc/cpuinfo > /dev/null; then + # PowerNV platform, which is KVM HV capable + + if [ -z "$SUBCORES" ]; then + SUBCORES=1 + fi + + # Step 1. Load the KVM HVmodule + if ! modprobe -b kvm_hv; then + return + fi + + # On POWER8 a host core can only run threads of a single + # guest, meaning that SMT must be disabled on the host in + # order to run KVM guests. (Also applieds to POWER7, but we + # don't support that). + # + # POWER9 doesn't have this limitation (though it will for hash + # guests on radix host when that's implemented). So, only set + # up subcores and disable SMT for POWER*. + if grep '^cpu[[:space:]]*:[[:space:]]*POWER8' /proc/cpuinfo > /dev/null; then + # Step 2. Configure subcore mode + /usr/sbin/ppc64_cpu --subcores-per-core=$SUBCORES + + # Step 3. Disable SMT (multithreading) + /usr/sbin/ppc64_cpu --smt=off + fi + fi +} + +kvm_setup_s390x () { + if grep -q "^features.*sie" /proc/cpuinfo; then + modprobe kvm + fi +} + +case $(uname -m) in + ppc64|ppc64le) + kvm_setup_powerpc + ;; + s390x) + kvm_setup_s390x + ;; +esac + +exit 0 diff --git a/kvm-setup.service b/kvm-setup.service new file mode 100755 index 0000000000000000000000000000000000000000..9c4bf97231b7f7fdd71b40e24942fda1669d1bce --- /dev/null +++ b/kvm-setup.service @@ -0,0 +1,14 @@ +[Unit] +Description=Perform system configuration to prepare system to run KVM guests +# Offlining CPUs can cause irqbalance to throw warnings if it's running +Before=irqbalance.service +# libvirtd reads CPU topology at startup, so change it before +Before=libvirtd.service + +[Service] +Type=oneshot +EnvironmentFile=-/etc/sysconfig/kvm +ExecStart=/usr/lib/systemd/kvm-setup + +[Install] +WantedBy=multi-user.target diff --git a/kvm-slirp-check-pkt_len-before-reading-protocol-header.patch b/kvm-slirp-check-pkt_len-before-reading-protocol-header.patch new file mode 100755 index 0000000000000000000000000000000000000000..43c44eac369cd06b65067435a4418940a6b8fbb1 --- /dev/null +++ b/kvm-slirp-check-pkt_len-before-reading-protocol-header.patch @@ -0,0 +1,72 @@ +From 2bfa25e55c0a49bc079e5769db2199989eda7745 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Fri, 11 Dec 2020 00:59:26 -0500 +Subject: [PATCH 03/14] slirp: check pkt_len before reading protocol header +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20201211005926.618830-2-jmaloy@redhat.com> +Patchwork-id: 100398 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] slirp: check pkt_len before reading protocol header +Bugzilla: 1902237 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Marc-André Lureau + +While processing ARP/NCSI packets in 'arp_input' or 'ncsi_input' +routines, ensure that pkt_len is large enough to accommodate the +respective protocol headers, lest it should do an OOB access. +Add check to avoid it. + +CVE-2020-29129 CVE-2020-29130 + QEMU: slirp: out-of-bounds access while processing ARP/NCSI packets + -> https://www.openwall.com/lists/oss-security/2020/11/27/1 + +Reported-by: Qiuhao Li +Signed-off-by: Prasad J Pandit +Message-Id: <20201126135706.273950-1-ppandit@redhat.com> +Reviewed-by: Marc-André Lureau + +(cherry picked from libslirp commit 2e1dcbc0c2af64fcb17009eaf2ceedd81be2b27f) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/ncsi.c | 4 ++++ + slirp/src/slirp.c | 4 ++++ + 2 files changed, 8 insertions(+) + +diff --git a/slirp/src/ncsi.c b/slirp/src/ncsi.c +index 6864b735db4..251c0d2bfbb 100644 +--- a/slirp/src/ncsi.c ++++ b/slirp/src/ncsi.c +@@ -147,6 +147,10 @@ void ncsi_input(Slirp *slirp, const uint8_t *pkt, int pkt_len) + uint32_t checksum; + uint32_t *pchecksum; + ++ if (pkt_len < ETH_HLEN + sizeof(struct ncsi_pkt_hdr)) { ++ return; /* packet too short */ ++ } ++ + memset(ncsi_reply, 0, sizeof(ncsi_reply)); + + memset(reh->h_dest, 0xff, ETH_ALEN); +diff --git a/slirp/src/slirp.c b/slirp/src/slirp.c +index b0194cb32bb..86b0f52d923 100644 +--- a/slirp/src/slirp.c ++++ b/slirp/src/slirp.c +@@ -700,6 +700,10 @@ static void arp_input(Slirp *slirp, const uint8_t *pkt, int pkt_len) + return; + } + ++ if (pkt_len < ETH_HLEN + sizeof(struct slirp_arphdr)) { ++ return; /* packet too short */ ++ } ++ + ar_op = ntohs(ah->ar_op); + switch (ar_op) { + case ARPOP_REQUEST: +-- +2.27.0 + diff --git a/kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch b/kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch new file mode 100755 index 0000000000000000000000000000000000000000..6d8dfe15964ff7e8132f2beb9458f40c7202a6dd --- /dev/null +++ b/kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch @@ -0,0 +1,77 @@ +From 0f659af4870f151e25a7d2184b9a383bff58e3ba Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 17 Jan 2020 12:07:57 +0100 +Subject: [PATCH 2/4] slirp: use correct size while emulating IRC commands +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200117120758.1076549-3-marcandre.lureau@redhat.com> +Patchwork-id: 93400 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm + RHEL-AV-8.2.0 qemu-kvm PATCH 2/3] slirp: use correct size while emulating IRC commands +Bugzilla: 1791568 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +From: Prasad J Pandit + +While emulating IRC DCC commands, tcp_emu() uses 'mbuf' size +'m->m_size' to write DCC commands via snprintf(3). This may +lead to OOB write access, because 'bptr' points somewhere in +the middle of 'mbuf' buffer, not at the start. Use M_FREEROOM(m) +size to avoid OOB access. + +Reported-by: Vishnu Dev TJ +Signed-off-by: Prasad J Pandit +Reviewed-by: Samuel Thibault +Message-Id: <20200109094228.79764-2-ppandit@redhat.com> + +(cherry picked from libslirp commit ce131029d6d4a405cb7d3ac6716d03e58fb4a5d9) +Signed-off-by: Marc-André Lureau + +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tcp_subr.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index cbecd64..cedbfb2 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -778,7 +778,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, m->m_size, "DCC CHAT chat %lu %u%c\n", ++ m->m_len += snprintf(bptr, M_FREEROOM(m), ++ "DCC CHAT chat %lu %u%c\n", + (unsigned long)ntohl(so->so_faddr.s_addr), + ntohs(so->so_fport), 1); + } else if (sscanf(bptr, "DCC SEND %256s %u %u %u", buff, &laddr, &lport, +@@ -788,8 +789,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += +- snprintf(bptr, m->m_size, "DCC SEND %s %lu %u %u%c\n", buff, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), ++ "DCC SEND %s %lu %u %u%c\n", buff, + (unsigned long)ntohl(so->so_faddr.s_addr), + ntohs(so->so_fport), n1, 1); + } else if (sscanf(bptr, "DCC MOVE %256s %u %u %u", buff, &laddr, &lport, +@@ -799,8 +800,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += +- snprintf(bptr, m->m_size, "DCC MOVE %s %lu %u %u%c\n", buff, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), ++ "DCC MOVE %s %lu %u %u%c\n", buff, + (unsigned long)ntohl(so->so_faddr.s_addr), + ntohs(so->so_fport), n1, 1); + } +-- +1.8.3.1 + diff --git a/kvm-slirp-use-correct-size-while-emulating-commands.patch b/kvm-slirp-use-correct-size-while-emulating-commands.patch new file mode 100755 index 0000000000000000000000000000000000000000..fe42f4fe816668e6264e9ec76d7eaf573a458a97 --- /dev/null +++ b/kvm-slirp-use-correct-size-while-emulating-commands.patch @@ -0,0 +1,71 @@ +From dfbfcf02738640ab83f7970e636b72b78f166675 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 17 Jan 2020 12:07:58 +0100 +Subject: [PATCH 3/4] slirp: use correct size while emulating commands +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200117120758.1076549-4-marcandre.lureau@redhat.com> +Patchwork-id: 93401 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm + RHEL-AV-8.2.0 qemu-kvm PATCH 3/3] slirp: use correct size while emulating commands +Bugzilla: 1791568 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +From: Prasad J Pandit + +While emulating services in tcp_emu(), it uses 'mbuf' size +'m->m_size' to write commands via snprintf(3). Use M_FREEROOM(m) +size to avoid possible OOB access. + +Signed-off-by: Prasad J Pandit +Signed-off-by: Samuel Thibault +Message-Id: <20200109094228.79764-3-ppandit@redhat.com> + +(cherry picked from commit 82ebe9c370a0e2970fb5695aa19aa5214a6a1c80) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tcp_subr.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index cedbfb2..954d1a6 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -696,7 +696,7 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, m->m_size - m->m_len, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), + "ORT %d,%d,%d,%d,%d,%d\r\n%s", n1, n2, n3, n4, + n5, n6, x == 7 ? buff : ""); + return 1; +@@ -731,8 +731,7 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += +- snprintf(bptr, m->m_size - m->m_len, ++ m->m_len += snprintf(bptr, M_FREEROOM(m), + "27 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n%s", + n1, n2, n3, n4, n5, n6, x == 7 ? buff : ""); + +@@ -758,8 +757,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + if (m->m_data[m->m_len - 1] == '\0' && lport != 0 && + (so = tcp_listen(slirp, INADDR_ANY, 0, so->so_laddr.s_addr, + htons(lport), SS_FACCEPTONCE)) != NULL) +- m->m_len = +- snprintf(m->m_data, m->m_size, "%d", ntohs(so->so_fport)) + 1; ++ m->m_len = snprintf(m->m_data, M_ROOM(m), ++ "%d", ntohs(so->so_fport)) + 1; + return 1; + + case EMU_IRC: +-- +1.8.3.1 + diff --git a/kvm-softmmu-memory-Log-invalid-memory-accesses.patch b/kvm-softmmu-memory-Log-invalid-memory-accesses.patch new file mode 100755 index 0000000000000000000000000000000000000000..e4e1bc44b0850db2c07e0224b439f97dd8ceffbc --- /dev/null +++ b/kvm-softmmu-memory-Log-invalid-memory-accesses.patch @@ -0,0 +1,84 @@ +From be0a190e3c5c4ff84f7c53630ed5a55644d18acc Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:06 -0400 +Subject: [PATCH 7/7] softmmu/memory: Log invalid memory accesses +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-7-jmaloy@redhat.com> +Patchwork-id: 101481 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 6/6] softmmu/memory: Log invalid memory accesses +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Philippe Mathieu-Daudé + +Log invalid memory accesses with as GUEST_ERROR. + +This is particularly useful since commit 5d971f9e67 which reverted +("memory: accept mismatching sizes in memory_region_access_valid"). + +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Michael S. Tsirkin +Message-Id: <20201005152725.2143444-1-philmd@redhat.com> +Signed-off-by: Laurent Vivier + +(cherry picked from commit 21786c7e59847b1612406ff394958f22e5b323f8) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + memory.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/memory.c b/memory.c +index 0cfcb72a5a..660df8159a 100644 +--- a/memory.c ++++ b/memory.c +@@ -14,6 +14,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/log.h" + #include "qapi/error.h" + #include "cpu.h" + #include "exec/memory.h" +@@ -1353,10 +1354,18 @@ bool memory_region_access_valid(MemoryRegion *mr, + { + if (mr->ops->valid.accepts + && !mr->ops->valid.accepts(mr->opaque, addr, size, is_write, attrs)) { ++ qemu_log_mask(LOG_GUEST_ERROR, "Invalid access at addr " ++ "0x%" HWADDR_PRIX ", size %u, " ++ "region '%s', reason: rejected\n", ++ addr, size, memory_region_name(mr)); + return false; + } + + if (!mr->ops->valid.unaligned && (addr & (size - 1))) { ++ qemu_log_mask(LOG_GUEST_ERROR, "Invalid access at addr " ++ "0x%" HWADDR_PRIX ", size %u, " ++ "region '%s', reason: unaligned\n", ++ addr, size, memory_region_name(mr)); + return false; + } + +@@ -1367,6 +1376,13 @@ bool memory_region_access_valid(MemoryRegion *mr, + + if (size > mr->ops->valid.max_access_size + || size < mr->ops->valid.min_access_size) { ++ qemu_log_mask(LOG_GUEST_ERROR, "Invalid access at addr " ++ "0x%" HWADDR_PRIX ", size %u, " ++ "region '%s', reason: invalid size " ++ "(min:%u max:%u)\n", ++ addr, size, memory_region_name(mr), ++ mr->ops->valid.min_access_size, ++ mr->ops->valid.max_access_size); + return false; + } + return true; +-- +2.27.0 + diff --git a/kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch b/kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch new file mode 100755 index 0000000000000000000000000000000000000000..7aaa98287fb64471cad45e7ad4771010edb943cd --- /dev/null +++ b/kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch @@ -0,0 +1,205 @@ +From dfdf950e893c23e77c9dc0be18fca66ad195d260 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Wed, 10 Feb 2021 15:56:45 +0000 +Subject: [PATCH 2/2] spapr: Adjust firmware path of PCI devices +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Greg Kurz +Message-id: <20210210165645.470195-2-gkurz@redhat.com> +Patchwork-id: 101038 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] spapr: Adjust firmware path of PCI devices +Bugzilla: 1912891 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: David Gibson +RH-Acked-by: Laszlo Ersek + +From: Greg Kurz + +It is currently not possible to perform a strict boot from USB storage: + +$ qemu-system-ppc64 -accel kvm -nodefaults -nographic -serial stdio \ + -boot strict=on \ + -device qemu-xhci \ + -device usb-storage,drive=disk,bootindex=0 \ + -blockdev driver=file,node-name=disk,filename=fedora-ppc64le.qcow2 + +SLOF ********************************************************************** +QEMU Starting + Build Date = Jul 17 2020 11:15:24 + FW Version = git-e18ddad8516ff2cf + Press "s" to enter Open Firmware. + +Populating /vdevice methods +Populating /vdevice/vty@71000000 +Populating /vdevice/nvram@71000001 +Populating /pci@800000020000000 + 00 0000 (D) : 1b36 000d serial bus [ usb-xhci ] +No NVRAM common partition, re-initializing... +Scanning USB + XHCI: Initializing + USB Storage + SCSI: Looking for devices + 101000000000000 DISK : "QEMU QEMU HARDDISK 2.5+" +Using default console: /vdevice/vty@71000000 + + Welcome to Open Firmware + + Copyright (c) 2004, 2017 IBM Corporation All rights reserved. + This program and the accompanying materials are made available + under the terms of the BSD License available at + http://www.opensource.org/licenses/bsd-license.php + +Trying to load: from: /pci@800000020000000/usb@0/storage@1/disk@101000000000000 ... +E3405: No such device + +E3407: Load failed + + Type 'boot' and press return to continue booting the system. + Type 'reset-all' and press return to reboot the system. + +Ready! +0 > + +The device tree handed over by QEMU to SLOF indeed contains: + +qemu,boot-list = + "/pci@800000020000000/usb@0/storage@1/disk@101000000000000 HALT"; + +but the device node is named usb-xhci@0, not usb@0. + +This happens because the firmware names of PCI devices returned +by get_boot_devices_list() come from pcibus_get_fw_dev_path(), +while the sPAPR PHB code uses a different naming scheme for +device nodes. This inconsistency has always been there but it was +hidden for a long time because SLOF used to rename USB device +nodes, until this commit, merged in QEMU 4.2.0 : + +commit 85164ad4ed9960cac842fa4cc067c6b6699b0994 +Author: Alexey Kardashevskiy +Date: Wed Sep 11 16:24:32 2019 +1000 + + pseries: Update SLOF firmware image + + This fixes USB host bus adapter name in the device tree to match QEMU's + one. + + Signed-off-by: Alexey Kardashevskiy + Signed-off-by: David Gibson + +Fortunately, sPAPR implements the firmware path provider interface. +This provides a way to override the default firmware paths. + +Just factor out the sPAPR PHB naming logic from spapr_dt_pci_device() +to a helper, and use it in the sPAPR firmware path provider hook. + +Fixes: 85164ad4ed99 ("pseries: Update SLOF firmware image") +Signed-off-by: Greg Kurz +Message-Id: <20210122170157.246374-1-groug@kaod.org> +Reviewed-by: Daniel Henrique Barboza +Signed-off-by: David Gibson +(cherry picked from commit 040bdafce12f750816d879442014df2999a995c4) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr.c | 5 +++++ + hw/ppc/spapr_pci.c | 33 ++++++++++++++++++--------------- + include/hw/pci-host/spapr.h | 2 ++ + 3 files changed, 25 insertions(+), 15 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 00b1ef075e..bee2299199 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -3013,6 +3013,7 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, + SCSIDevice *d = CAST(SCSIDevice, dev, TYPE_SCSI_DEVICE); + SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE); + VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON); ++ PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE); + + if (d) { + void *spapr = CAST(void, bus->parent, "spapr-vscsi"); +@@ -3086,6 +3087,10 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, + return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn)); + } + ++ if (pcidev) { ++ return spapr_pci_fw_dev_name(pcidev); ++ } ++ + return NULL; + } + +diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c +index f6fbcf99ed..befa570aa8 100644 +--- a/hw/ppc/spapr_pci.c ++++ b/hw/ppc/spapr_pci.c +@@ -1348,15 +1348,29 @@ static int spapr_dt_pci_bus(SpaprPhbState *sphb, PCIBus *bus, + return offset; + } + ++char *spapr_pci_fw_dev_name(PCIDevice *dev) ++{ ++ const gchar *basename; ++ int slot = PCI_SLOT(dev->devfn); ++ int func = PCI_FUNC(dev->devfn); ++ uint32_t ccode = pci_default_read_config(dev, PCI_CLASS_PROG, 3); ++ ++ basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff, ++ ccode & 0xff); ++ ++ if (func != 0) { ++ return g_strdup_printf("%s@%x,%x", basename, slot, func); ++ } else { ++ return g_strdup_printf("%s@%x", basename, slot); ++ } ++} ++ + /* create OF node for pci device and required OF DT properties */ + static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev, + void *fdt, int parent_offset) + { + int offset; +- const gchar *basename; +- gchar *nodename; +- int slot = PCI_SLOT(dev->devfn); +- int func = PCI_FUNC(dev->devfn); ++ g_autofree gchar *nodename = spapr_pci_fw_dev_name(dev); + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); + ResourceProps rp; + SpaprDrc *drc = drc_from_dev(sphb, dev); +@@ -1373,19 +1387,8 @@ static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev, + uint32_t pci_status = pci_default_read_config(dev, PCI_STATUS, 2); + gchar *loc_code; + +- basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff, +- ccode & 0xff); +- +- if (func != 0) { +- nodename = g_strdup_printf("%s@%x,%x", basename, slot, func); +- } else { +- nodename = g_strdup_printf("%s@%x", basename, slot); +- } +- + _FDT(offset = fdt_add_subnode(fdt, parent_offset, nodename)); + +- g_free(nodename); +- + /* in accordance with PAPR+ v2.7 13.6.3, Table 181 */ + _FDT(fdt_setprop_cell(fdt, offset, "vendor-id", vendor_id)); + _FDT(fdt_setprop_cell(fdt, offset, "device-id", device_id)); +diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h +index 8877ff51fb..9522db9047 100644 +--- a/include/hw/pci-host/spapr.h ++++ b/include/hw/pci-host/spapr.h +@@ -212,4 +212,6 @@ static inline unsigned spapr_phb_windows_supported(SpaprPhbState *sphb) + return sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1; + } + ++char *spapr_pci_fw_dev_name(PCIDevice *dev); ++ + #endif /* PCI_HOST_SPAPR_H */ +-- +2.27.0 + diff --git a/kvm-spapr-Allow-memory-unplug-to-always-succeed.patch b/kvm-spapr-Allow-memory-unplug-to-always-succeed.patch new file mode 100755 index 0000000000000000000000000000000000000000..2968267dc054fbc4d7ab82673775d417633e4897 --- /dev/null +++ b/kvm-spapr-Allow-memory-unplug-to-always-succeed.patch @@ -0,0 +1,101 @@ +From 1fc9b693c54c93736c6f902f3df8b94440e8cc5d Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:53 -0500 +Subject: [PATCH 5/9] spapr: Allow memory unplug to always succeed + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-6-gkurz@redhat.com> +Patchwork-id: 100686 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 5/6] spapr: Allow memory unplug to always succeed +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: Greg Kurz + +It is currently impossible to hot-unplug a memory device between +machine reset and CAS. + +(qemu) device_del dimm1 +Error: Memory hot unplug not supported for this guest + +This limitation was introduced in order to provide an explicit +error path for older guests that didn't support hot-plug event +sources (and thus memory hot-unplug). + +The linux kernel has been supporting these since 4.11. All recent +enough guests are thus capable of handling the removal of a memory +device at all time, including during early boot. + +Lift the limitation for the latest machine type. This means that +trying to unplug memory from a guest that doesn't support it will +likely just do nothing and the memory will only get removed at +next reboot. Such older guests can still get the existing behavior +by using an older machine type. + +Signed-off-by: Greg Kurz +Message-Id: <160794035064.23292.17560963281911312439.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 1e8b5b1aa16b7d73ba8ba52c95d0b52329d5c9d0) +Signed-off-by: Greg Kurz + +Conflicts: + hw/ppc/spapr.c + include/hw/ppc/spapr.h + +Conflicts around the addition of pre_6_0_memory_unplug. Ignore the +change that sets pre_6_0_memory_unplug for older machine types. +This is ok because pre_6_0_memory_unplug is removed in a subsequent +patch anyway. + +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 3 ++- + hw/ppc/spapr_events.c | 3 ++- + include/hw/ppc/spapr.h | 1 + + 3 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 992bd08aaa..f8de33e3e5 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -4001,7 +4001,8 @@ static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev, + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { +- if (spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) { ++ if (!smc->pre_6_0_memory_unplug || ++ spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) { + spapr_memory_unplug_request(hotplug_dev, dev, errp); + } else { + /* NOTE: this means there is a window after guest reset, prior to +diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c +index 15b92b63ad..6e284aa4bc 100644 +--- a/hw/ppc/spapr_events.c ++++ b/hw/ppc/spapr_events.c +@@ -547,7 +547,8 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action, + /* we should not be using count_indexed value unless the guest + * supports dedicated hotplug event source + */ +- g_assert(spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT)); ++ g_assert(!SPAPR_MACHINE_GET_CLASS(spapr)->pre_6_0_memory_unplug || ++ spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT)); + hp->drc_id.count_indexed.count = + cpu_to_be32(drc_id->count_indexed.count); + hp->drc_id.count_indexed.index = +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index e5e2a99046..ac6961ed16 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -124,6 +124,7 @@ struct SpaprMachineClass { + bool pre_4_1_migration; /* don't migrate hpt-max-page-size */ + bool linux_pci_probe; + bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ ++ bool pre_6_0_memory_unplug; + + bool has_power9_support; + void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, +-- +2.18.2 + diff --git a/kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch b/kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch new file mode 100755 index 0000000000000000000000000000000000000000..d934712fd7b01bf60999095e68c6415ad94e2562 --- /dev/null +++ b/kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch @@ -0,0 +1,113 @@ +From f2aeed761d2dad14920fa08c977dc45564886d9b Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Fri, 3 Jan 2020 01:15:12 +0000 +Subject: [PATCH 1/5] spapr: Don't trigger a CAS reboot for XICS/XIVE mode + changeover +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200103011512.49129-2-dgibson@redhat.com> +Patchwork-id: 93261 +O-Subject: [RHEL-AV-4.2 qemu-kvm PATCH 1/1] spapr: Don't trigger a CAS reboot for XICS/XIVE mode changeover +Bugzilla: 1733893 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: David Gibson + +PAPR allows the interrupt controller used on a POWER9 machine (XICS or +XIVE) to be selected by the guest operating system, by using the +ibm,client-architecture-support (CAS) feature negotiation call. + +Currently, if the guest selects an interrupt controller different from the +one selected at initial boot, this causes the system to be reset with the +new model and the boot starts again. This means we run through the SLOF +boot process twice, as well as any other bootloader (e.g. grub) in use +before the OS calls CAS. This can be confusing and/or inconvenient for +users. + +Thanks to two fairly recent changes, we no longer need this reboot. 1) we +now completely regenerate the device tree when CAS is called (meaning we +don't need special case updates for all the device tree changes caused by +the interrupt controller mode change), 2) we now have explicit code paths +to activate and deactivate the different interrupt controllers, rather than +just implicitly calling those at machine reset time. + +We can therefore eliminate the reboot for changing irq mode, simply by +putting a call to spapr_irq_update_active_intc() before we call +spapr_h_cas_compose_response() (which gives the updated device tree to +the guest firmware and OS). + +Signed-off-by: David Gibson +Reviewed-by: Cedric Le Goater +Reviewed-by: Greg Kurz +(cherry picked from commit 8deb8019d696c75e6ecaee7545026b62aba2f1bb) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1733893 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_hcall.c | 33 +++++++++++++-------------------- + 1 file changed, 13 insertions(+), 20 deletions(-) + +diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c +index 140f05c..05a7ca2 100644 +--- a/hw/ppc/spapr_hcall.c ++++ b/hw/ppc/spapr_hcall.c +@@ -1767,21 +1767,10 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + } + spapr->cas_pre_isa3_guest = !spapr_ovec_test(ov1_guest, OV1_PPC_3_00); + spapr_ovec_cleanup(ov1_guest); +- if (!spapr->cas_reboot) { +- /* If spapr_machine_reset() did not set up a HPT but one is necessary +- * (because the guest isn't going to use radix) then set it up here. */ +- if ((spapr->patb_entry & PATE1_GR) && !guest_radix) { +- /* legacy hash or new hash: */ +- spapr_setup_hpt_and_vrma(spapr); +- } +- spapr->cas_reboot = +- (spapr_h_cas_compose_response(spapr, args[1], args[2], +- ov5_updates) != 0); +- } + + /* +- * Ensure the guest asks for an interrupt mode we support; otherwise +- * terminate the boot. ++ * Ensure the guest asks for an interrupt mode we support; ++ * otherwise terminate the boot. + */ + if (guest_xive) { + if (!spapr->irq->xive) { +@@ -1797,14 +1786,18 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + } + } + +- /* +- * Generate a machine reset when we have an update of the +- * interrupt mode. Only required when the machine supports both +- * modes. +- */ ++ spapr_irq_update_active_intc(spapr); ++ + if (!spapr->cas_reboot) { +- spapr->cas_reboot = spapr_ovec_test(ov5_updates, OV5_XIVE_EXPLOIT) +- && spapr->irq->xics && spapr->irq->xive; ++ /* If spapr_machine_reset() did not set up a HPT but one is necessary ++ * (because the guest isn't going to use radix) then set it up here. */ ++ if ((spapr->patb_entry & PATE1_GR) && !guest_radix) { ++ /* legacy hash or new hash: */ ++ spapr_setup_hpt_and_vrma(spapr); ++ } ++ spapr->cas_reboot = ++ (spapr_h_cas_compose_response(spapr, args[1], args[2], ++ ov5_updates) != 0); + } + + spapr_ovec_cleanup(ov5_updates); +-- +1.8.3.1 + diff --git a/kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch b/kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch new file mode 100755 index 0000000000000000000000000000000000000000..1462d5230f029538683965eca8c15762add44a84 --- /dev/null +++ b/kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch @@ -0,0 +1,145 @@ +From ad7aaf34400b1bbd41bbec182fd5895eaad50932 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:51 -0500 +Subject: [PATCH 3/9] spapr: Don't use spapr_drc_needed() in CAS code + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-4-gkurz@redhat.com> +Patchwork-id: 100683 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 3/6] spapr: Don't use spapr_drc_needed() in CAS code +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: Greg Kurz + +We currently don't support hotplug of devices between boot and CAS. If +this happens a CAS reboot is triggered. We detect this during CAS using +the spapr_drc_needed() function which is essentially a VMStateDescription +.needed callback. Even if the condition for CAS reboot happens to be the +same as for DRC migration, it looks wrong to piggyback a migration helper +for this. + +Introduce a helper with slightly more explicit name and use it in both CAS +and DRC migration code. Since a subsequent patch will enhance this helper +to cover the case of hot unplug, let's go for spapr_drc_transient(). While +here convert spapr_hotplugged_dev_before_cas() to the "transient" wording as +well. + +This doesn't change any behaviour. + +Signed-off-by: Greg Kurz +Message-Id: <158169248180.3465937.9531405453362718771.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 4b63db1289a9e597bc151fa5e4d72f882cb6de1e) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr_drc.c | 20 ++++++++++++++------ + hw/ppc/spapr_hcall.c | 14 +++++++++----- + include/hw/ppc/spapr_drc.h | 4 +++- + 3 files changed, 26 insertions(+), 12 deletions(-) + +diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c +index 62f1a42592..9b498d429e 100644 +--- a/hw/ppc/spapr_drc.c ++++ b/hw/ppc/spapr_drc.c +@@ -455,23 +455,31 @@ void spapr_drc_reset(SpaprDrc *drc) + } + } + +-bool spapr_drc_needed(void *opaque) ++bool spapr_drc_transient(SpaprDrc *drc) + { +- SpaprDrc *drc = (SpaprDrc *)opaque; + SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); + +- /* If no dev is plugged in there is no need to migrate the DRC state */ ++ /* ++ * If no dev is plugged in there is no need to migrate the DRC state ++ * nor to reset the DRC at CAS. ++ */ + if (!drc->dev) { + return false; + } + + /* +- * We need to migrate the state if it's not equal to the expected +- * long-term state, which is the same as the coldplugged initial +- * state */ ++ * We need to reset the DRC at CAS or to migrate the DRC state if it's ++ * not equal to the expected long-term state, which is the same as the ++ * coldplugged initial state. ++ */ + return (drc->state != drck->ready_state); + } + ++static bool spapr_drc_needed(void *opaque) ++{ ++ return spapr_drc_transient(opaque); ++} ++ + static const VMStateDescription vmstate_spapr_drc = { + .name = "spapr_drc", + .version_id = 1, +diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c +index 0f19be794c..d70e643752 100644 +--- a/hw/ppc/spapr_hcall.c ++++ b/hw/ppc/spapr_hcall.c +@@ -1640,20 +1640,24 @@ static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, + return best_compat; + } + +-static bool spapr_hotplugged_dev_before_cas(void) ++static bool spapr_transient_dev_before_cas(void) + { +- Object *drc_container, *obj; ++ Object *drc_container; + ObjectProperty *prop; + ObjectPropertyIterator iter; + + drc_container = container_get(object_get_root(), "/dr-connector"); + object_property_iter_init(&iter, drc_container); + while ((prop = object_property_iter_next(&iter))) { ++ SpaprDrc *drc; ++ + if (!strstart(prop->type, "link<", NULL)) { + continue; + } +- obj = object_property_get_link(drc_container, prop->name, NULL); +- if (spapr_drc_needed(obj)) { ++ drc = SPAPR_DR_CONNECTOR(object_property_get_link(drc_container, ++ prop->name, NULL)); ++ ++ if (spapr_drc_transient(drc)) { + return true; + } + } +@@ -1812,7 +1816,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + + spapr_irq_update_active_intc(spapr); + +- if (spapr_hotplugged_dev_before_cas()) { ++ if (spapr_transient_dev_before_cas()) { + spapr->cas_reboot = true; + } + +diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h +index 83f03cc577..7e09d57114 100644 +--- a/include/hw/ppc/spapr_drc.h ++++ b/include/hw/ppc/spapr_drc.h +@@ -269,7 +269,9 @@ int spapr_dt_drc(void *fdt, int offset, Object *owner, uint32_t drc_type_mask); + + void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp); + void spapr_drc_detach(SpaprDrc *drc); +-bool spapr_drc_needed(void *opaque); ++ ++/* Returns true if a hot plug/unplug request is pending */ ++bool spapr_drc_transient(SpaprDrc *drc); + + static inline bool spapr_drc_unplug_requested(SpaprDrc *drc) + { +-- +2.18.2 + diff --git a/kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch b/kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch new file mode 100755 index 0000000000000000000000000000000000000000..0aa782bfc9442a8cb9a02bce66b12fc5d2d30059 --- /dev/null +++ b/kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch @@ -0,0 +1,135 @@ +From eb121ffa97c1c25d7853d51b4c8209c0bb521deb Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Fri, 7 Feb 2020 00:57:04 +0000 +Subject: [PATCH 1/7] spapr: Enable DD2.3 accelerated count cache flush in + pseries-5.0 machine + +RH-Author: David Gibson +Message-id: <20200207005704.194428-1-dgibson@redhat.com> +Patchwork-id: 93737 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCHv2] spapr: Enable DD2.3 accelerated count cache flush in pseries-5.0 machine +Bugzilla: 1796240 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: David Gibson + +For POWER9 DD2.2 cpus, the best current Spectre v2 indirect branch +mitigation is "count cache disabled", which is configured with: + -machine cap-ibs=fixed-ccd +However, this option isn't available on DD2.3 CPUs with KVM, because they +don't have the count cache disabled. + +For POWER9 DD2.3 cpus, it is "count cache flush with assist", configured +with: + -machine cap-ibs=workaround,cap-ccf-assist=on +However this option isn't available on DD2.2 CPUs with KVM, because they +don't have the special CCF assist instruction this relies on. + +On current machine types, we default to "count cache flush w/o assist", +that is: + -machine cap-ibs=workaround,cap-ccf-assist=off +This runs, with mitigation on both DD2.2 and DD2.3 host cpus, but has a +fairly significant performance impact. + +It turns out we can do better. The special instruction that CCF assist +uses to trigger a count cache flush is a no-op on earlier CPUs, rather than +trapping or causing other badness. It doesn't, of itself, implement the +mitigation, but *if* we have count-cache-disabled, then the count cache +flush is unnecessary, and so using the count cache flush mitigation is +harmless. + +Therefore for the new pseries-5.0 machine type, enable cap-ccf-assist by +default. Along with that, suppress throwing an error if cap-ccf-assist +is selected but KVM doesn't support it, as long as KVM *is* giving us +count-cache-disabled. To allow TCG to work out of the box, even though it +doesn't implement the ccf flush assist, downgrade the error in that case to +a warning. This matches several Spectre mitigations where we allow TCG +to operate for debugging, since we don't really make guarantees about TCG +security properties anyway. + +While we're there, make the TCG warning for this case match that for other +mitigations. + +Signed-off-by: David Gibson +Tested-by: Michael Ellerman +(cherry picked from commit 37965dfe4dffa3ac49438337417608e7f346b58a) +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + hw/ppc/spapr.c + +Adjusted machine version compatibility code to the RHEL machine types +rather than the upstream machine types. + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1796240 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=26285002 +Branch: rhel-av-8.2.0 +Upstream: Merged for qemu-5.0 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr.c | 4 +++- + hw/ppc/spapr_caps.c | 21 +++++++++++++++++---- + 2 files changed, 20 insertions(+), 5 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index c12862d..a330f03 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -4440,7 +4440,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) + smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */ + smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF; + smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON; +- smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF; ++ smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON; + spapr_caps_add_properties(smc, &error_abort); + smc->irq = &spapr_irq_dual; + smc->dr_phb_enabled = true; +@@ -4904,6 +4904,8 @@ static void spapr_machine_rhel810_class_options(MachineClass *mc) + hw_compat_rhel_8_1_len); + compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); + ++ /* from pseries-4.2 */ ++ smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF; + } + + DEFINE_SPAPR_MACHINE(rhel810, "rhel8.1.0", false); +diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c +index 805f385..6e6fb28 100644 +--- a/hw/ppc/spapr_caps.c ++++ b/hw/ppc/spapr_caps.c +@@ -492,11 +492,24 @@ static void cap_ccf_assist_apply(SpaprMachineState *spapr, uint8_t val, + uint8_t kvm_val = kvmppc_get_cap_count_cache_flush_assist(); + + if (tcg_enabled() && val) { +- /* TODO - for now only allow broken for TCG */ +- error_setg(errp, +-"Requested count cache flush assist capability level not supported by tcg," +- " try appending -machine cap-ccf-assist=off"); ++ /* TCG doesn't implement anything here, but allow with a warning */ ++ warn_report("TCG doesn't support requested feature, cap-ccf-assist=on"); + } else if (kvm_enabled() && (val > kvm_val)) { ++ uint8_t kvm_ibs = kvmppc_get_cap_safe_indirect_branch(); ++ ++ if (kvm_ibs == SPAPR_CAP_FIXED_CCD) { ++ /* ++ * If we don't have CCF assist on the host, the assist ++ * instruction is a harmless no-op. It won't correctly ++ * implement the cache count flush *but* if we have ++ * count-cache-disabled in the host, that flush is ++ * unnnecessary. So, specifically allow this case. This ++ * allows us to have better performance on POWER9 DD2.3, ++ * while still working on POWER9 DD2.2 and POWER8 host ++ * cpus. ++ */ ++ return; ++ } + error_setg(errp, + "Requested count cache flush assist capability level not supported by kvm," + " try appending -machine cap-ccf-assist=off"); +-- +1.8.3.1 + diff --git a/kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch b/kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch new file mode 100755 index 0000000000000000000000000000000000000000..8d30406eebef918d7f0091877ab8aa60c80d0fc7 --- /dev/null +++ b/kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch @@ -0,0 +1,165 @@ +From f9d332b1280cd3f6009b59323719548a36a7c52b Mon Sep 17 00:00:00 2001 +From: Daniel Henrique Barboza +Date: Mon, 21 Jun 2021 14:40:24 -0400 +Subject: [PATCH 2/4] spapr: Fix EEH capability issue on KVM guest for PCI + passthru + +RH-Author: Daniel Henrique Barboza +Message-id: <20210621144024.199732-2-dbarboza@redhat.com> +Patchwork-id: 101740 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] spapr: Fix EEH capability issue on KVM guest for PCI passthru +Bugzilla: 1957866 +RH-Acked-by: Laurent Vivier +RH-Acked-by: Greg Kurz +RH-Acked-by: David Gibson + +From: Mahesh Salgaonkar + +With upstream kernel, especially after commit 98ba956f6a389 +("powerpc/pseries/eeh: Rework device EEH PE determination") we see that KVM +guest isn't able to enable EEH option for PCI pass-through devices anymore. + +[root@atest-guest ~]# dmesg | grep EEH +[ 0.032337] EEH: pSeries platform initialized +[ 0.298207] EEH: No capable adapters found: recovery disabled. +[root@atest-guest ~]# + +So far the linux kernel was assuming pe_config_addr equal to device's +config_addr and using it to enable EEH on the PE through ibm,set-eeh-option +RTAS call. Which wasn't the correct way as per PAPR. The linux kernel +commit 98ba956f6a389 fixed this flow. With that fixed, linux now uses PE +config address returned by ibm,get-config-addr-info2 RTAS call to enable +EEH option per-PE basis instead of per-device basis. However this has +uncovered a bug in qemu where ibm,set-eeh-option is treating PE config +address as per-device config address. + +Hence in qemu guest with recent kernel the ibm,set-eeh-option RTAS call +fails with -3 return value indicating that there is no PCI device exist for +the specified PE config address. The rtas_ibm_set_eeh_option call uses +pci_find_device() to get the PC device that matches specific bus and devfn +extracted from PE config address passed as argument. Thus it tries to map +the PE config address to a single specific PCI device 'bus->devices[devfn]' +which always results into checking device on slot 0 'bus->devices[0]'. +This succeeds when there is a pass-through device (vfio-pci) present on +slot 0. But in cases where there is no pass-through device present in slot +0, but present in non-zero slots, ibm,set-eeh-option call fails to enable +the EEH capability. + +hw/ppc/spapr_pci_vfio.c: spapr_phb_vfio_eeh_set_option() + case RTAS_EEH_ENABLE: { + PCIHostState *phb; + PCIDevice *pdev; + + /* + * The EEH functionality is enabled on basis of PCI device, + * instead of PE. We need check the validity of the PCI + * device address. + */ + phb = PCI_HOST_BRIDGE(sphb); + pdev = pci_find_device(phb->bus, + (addr >> 16) & 0xFF, (addr >> 8) & 0xFF); + if (!pdev || !object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + return RTAS_OUT_PARAM_ERROR; + } + +hw/pci/pci.c:pci_find_device() + +PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn) +{ + bus = pci_find_bus_nr(bus, bus_num); + + if (!bus) + return NULL; + + return bus->devices[devfn]; +} + +This patch fixes ibm,set-eeh-option to check for presence of any PCI device +(vfio-pci) under specified bus and enable the EEH if found. The current +code already makes sure that all the devices on that bus are from same +iommu group (within same PE) and fail very early if it does not. + +After this fix guest is able to find EEH capable devices and enable EEH +recovery on it. + +[root@atest-guest ~]# dmesg | grep EEH +[ 0.048139] EEH: pSeries platform initialized +[ 0.405115] EEH: Capable adapter found: recovery enabled. +[root@atest-guest ~]# + +Reviewed-by: Daniel Henrique Barboza +Signed-off-by: Mahesh Salgaonkar +Message-Id: <162158429107.145117.5843504911924013125.stgit@jupiter> +Signed-off-by: David Gibson +(cherry picked from commit ac9ef668321ebb6eb871a0c4dd380fa7d7891b4e) +Signed-off-by: Daniel Henrique Barboza +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_pci_vfio.c | 40 +++++++++++++++++++++++++++++++++------- + 1 file changed, 33 insertions(+), 7 deletions(-) + +diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c +index ecb34aaade..a411b08d60 100644 +--- a/hw/ppc/spapr_pci_vfio.c ++++ b/hw/ppc/spapr_pci_vfio.c +@@ -48,6 +48,16 @@ void spapr_phb_vfio_reset(DeviceState *qdev) + spapr_phb_vfio_eeh_reenable(SPAPR_PCI_HOST_BRIDGE(qdev)); + } + ++static void spapr_eeh_pci_find_device(PCIBus *bus, PCIDevice *pdev, ++ void *opaque) ++{ ++ bool *found = opaque; ++ ++ if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { ++ *found = true; ++ } ++} ++ + int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb, + unsigned int addr, int option) + { +@@ -60,17 +70,33 @@ int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb, + break; + case RTAS_EEH_ENABLE: { + PCIHostState *phb; +- PCIDevice *pdev; ++ bool found = false; + + /* +- * The EEH functionality is enabled on basis of PCI device, +- * instead of PE. We need check the validity of the PCI +- * device address. ++ * The EEH functionality is enabled per sphb level instead of ++ * per PCI device. We have already identified this specific sphb ++ * based on buid passed as argument to ibm,set-eeh-option rtas ++ * call. Now we just need to check the validity of the PCI ++ * pass-through devices (vfio-pci) under this sphb bus. ++ * We have already validated that all the devices under this sphb ++ * are from same iommu group (within same PE) before comming here. ++ * ++ * Prior to linux commit 98ba956f6a389 ("powerpc/pseries/eeh: ++ * Rework device EEH PE determination") kernel would call ++ * eeh-set-option for each device in the PE using the device's ++ * config_address as the argument rather than the PE address. ++ * Hence if we check validity of supplied config_addr whether ++ * it matches to this PHB will cause issues with older kernel ++ * versions v5.9 and older. If we return an error from ++ * eeh-set-option when the argument isn't a valid PE address ++ * then older kernels (v5.9 and older) will interpret that as ++ * EEH not being supported. + */ + phb = PCI_HOST_BRIDGE(sphb); +- pdev = pci_find_device(phb->bus, +- (addr >> 16) & 0xFF, (addr >> 8) & 0xFF); +- if (!pdev || !object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { ++ pci_for_each_device(phb->bus, (addr >> 16) & 0xFF, ++ spapr_eeh_pci_find_device, &found); ++ ++ if (!found) { + return RTAS_OUT_PARAM_ERROR; + } + +-- +2.27.0 + diff --git a/kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch b/kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch new file mode 100755 index 0000000000000000000000000000000000000000..c14aa7d61fd478aa5951232e4884542f6a15881e --- /dev/null +++ b/kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch @@ -0,0 +1,105 @@ +From 9ebed8090b88282f9b7432258df9182b9d3944ee Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:52 -0500 +Subject: [PATCH 4/9] spapr: Fix handling of unplugged devices during CAS and + migration + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-5-gkurz@redhat.com> +Patchwork-id: 100685 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 4/6] spapr: Fix handling of unplugged devices during CAS and migration +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: Greg Kurz + +We already detect if a device is being hot plugged before CAS to trigger +a CAS reboot and during migration to migrate the state of the associated +DRC. But hot unplugging a device is also an asynchronous operation that +requires the guest to take action. This means that if the guest is migrated +after the hot unplug event was sent but before it could release the device +with RTAS, the destination QEMU doesn't know about the pending unplug +operation and doesn't actually remove the device when the guest finally +releases it. + +Similarly, if the unplug request is fired before CAS, the guest isn't +notified of the change, just like with hotplug. It ends up booting with +the device still present in the DT and configures it, just like it was +never removed. Even weirder, since the event is still queued, it will +be eventually processed when some other unrelated event is posted to +the guest. + +Enhance spapr_drc_transient() to also return true if an unplug request is +pending. This fixes the issue at CAS with a CAS reboot request and +causes the DRC state to be migrated. Some extra care is still needed to +inform the destination that an unplug request is pending : migrate the +unplug_requested field of the DRC in an optional subsection. This might +break backwards migration, but this is still better than ending with +an inconsistent guest. + +Signed-off-by: Greg Kurz +Message-Id: <158169248798.3465937.1108351365840514270.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit ab8584349c476f9818dc6403359c85f9ab0ad5eb) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr_drc.c | 25 +++++++++++++++++++++++-- + 1 file changed, 23 insertions(+), 2 deletions(-) + +diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c +index 9b498d429e..897bb7aae0 100644 +--- a/hw/ppc/spapr_drc.c ++++ b/hw/ppc/spapr_drc.c +@@ -455,6 +455,22 @@ void spapr_drc_reset(SpaprDrc *drc) + } + } + ++static bool spapr_drc_unplug_requested_needed(void *opaque) ++{ ++ return spapr_drc_unplug_requested(opaque); ++} ++ ++static const VMStateDescription vmstate_spapr_drc_unplug_requested = { ++ .name = "spapr_drc/unplug_requested", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .needed = spapr_drc_unplug_requested_needed, ++ .fields = (VMStateField []) { ++ VMSTATE_BOOL(unplug_requested, SpaprDrc), ++ VMSTATE_END_OF_LIST() ++ } ++}; ++ + bool spapr_drc_transient(SpaprDrc *drc) + { + SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); +@@ -470,9 +486,10 @@ bool spapr_drc_transient(SpaprDrc *drc) + /* + * We need to reset the DRC at CAS or to migrate the DRC state if it's + * not equal to the expected long-term state, which is the same as the +- * coldplugged initial state. ++ * coldplugged initial state, or if an unplug request is pending. + */ +- return (drc->state != drck->ready_state); ++ return drc->state != drck->ready_state || ++ spapr_drc_unplug_requested(drc); + } + + static bool spapr_drc_needed(void *opaque) +@@ -488,6 +505,10 @@ static const VMStateDescription vmstate_spapr_drc = { + .fields = (VMStateField []) { + VMSTATE_UINT32(state, SpaprDrc), + VMSTATE_END_OF_LIST() ++ }, ++ .subsections = (const VMStateDescription * []) { ++ &vmstate_spapr_drc_unplug_requested, ++ NULL + } + }; + +-- +2.18.2 + diff --git a/kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch b/kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch new file mode 100755 index 0000000000000000000000000000000000000000..b0ca288afa9e8a280b2205c3222fb5e92dba57cc --- /dev/null +++ b/kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch @@ -0,0 +1,246 @@ +From cb9d5380b1376b2a44d91d84eaf09f948ef1e165 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:50 -0500 +Subject: [PATCH 2/9] spapr: Fold h_cas_compose_response() into + h_client_architecture_support() + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-3-gkurz@redhat.com> +Patchwork-id: 100687 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 2/6] spapr: Fold h_cas_compose_response() into h_client_architecture_support() +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: David Gibson + +spapr_h_cas_compose_response() handles the last piece of the PAPR feature +negotiation process invoked via the ibm,client-architecture-support OF +call. Its only caller is h_client_architecture_support() which handles +most of the rest of that process. + +I believe it was placed in a separate file originally to handle some +fiddly dependencies between functions, but mostly it's just confusing +to have the CAS process split into two pieces like this. Now that +compose response is simplified (by just generating the whole device +tree anew), it's cleaner to just fold it into +h_client_architecture_support(). + +Signed-off-by: David Gibson +Reviewed-by: Cedric Le Goater +Reviewed-by: Greg Kurz +(cherry picked from commit 0c21e073541cc093b4cb8744640e24f130e6f8ba) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 61 +----------------------------------------- + hw/ppc/spapr_hcall.c | 55 ++++++++++++++++++++++++++++++++++--- + include/hw/ppc/spapr.h | 4 +-- + 3 files changed, 54 insertions(+), 66 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 92f63ad035..992bd08aaa 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -76,7 +76,6 @@ + #include "hw/nmi.h" + #include "hw/intc/intc.h" + +-#include "qemu/cutils.h" + #include "hw/ppc/spapr_cpu_core.h" + #include "hw/mem/memory-device.h" + #include "hw/ppc/spapr_tpm_proxy.h" +@@ -898,63 +897,6 @@ out: + return ret; + } + +-static bool spapr_hotplugged_dev_before_cas(void) +-{ +- Object *drc_container, *obj; +- ObjectProperty *prop; +- ObjectPropertyIterator iter; +- +- drc_container = container_get(object_get_root(), "/dr-connector"); +- object_property_iter_init(&iter, drc_container); +- while ((prop = object_property_iter_next(&iter))) { +- if (!strstart(prop->type, "link<", NULL)) { +- continue; +- } +- obj = object_property_get_link(drc_container, prop->name, NULL); +- if (spapr_drc_needed(obj)) { +- return true; +- } +- } +- return false; +-} +- +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, +- size_t space); +- +-int spapr_h_cas_compose_response(SpaprMachineState *spapr, +- target_ulong addr, target_ulong size, +- SpaprOptionVector *ov5_updates) +-{ +- void *fdt; +- SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 }; +- +- if (spapr_hotplugged_dev_before_cas()) { +- return 1; +- } +- +- if (size < sizeof(hdr)) { +- error_report("SLOF provided insufficient CAS buffer " +- TARGET_FMT_lu " (min: %zu)", size, sizeof(hdr)); +- exit(EXIT_FAILURE); +- } +- +- size -= sizeof(hdr); +- +- fdt = spapr_build_fdt(spapr, false, size); +- _FDT((fdt_pack(fdt))); +- +- cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); +- cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt)); +- trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); +- +- g_free(spapr->fdt_blob); +- spapr->fdt_size = fdt_totalsize(fdt); +- spapr->fdt_initial_size = spapr->fdt_size; +- spapr->fdt_blob = fdt; +- +- return 0; +-} +- + static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) + { + MachineState *ms = MACHINE(spapr); +@@ -1192,8 +1134,7 @@ static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt) + } + } + +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, +- size_t space) ++void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space) + { + MachineState *machine = MACHINE(spapr); + MachineClass *mc = MACHINE_GET_CLASS(machine); +diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c +index 05a7ca275b..0f19be794c 100644 +--- a/hw/ppc/spapr_hcall.c ++++ b/hw/ppc/spapr_hcall.c +@@ -1,4 +1,5 @@ + #include "qemu/osdep.h" ++#include "qemu/cutils.h" + #include "qapi/error.h" + #include "sysemu/hw_accel.h" + #include "sysemu/runstate.h" +@@ -15,6 +16,7 @@ + #include "cpu-models.h" + #include "trace.h" + #include "kvm_ppc.h" ++#include "hw/ppc/fdt.h" + #include "hw/ppc/spapr_ovec.h" + #include "mmu-book3s-v3.h" + #include "hw/mem/memory-device.h" +@@ -1638,6 +1640,26 @@ static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, + return best_compat; + } + ++static bool spapr_hotplugged_dev_before_cas(void) ++{ ++ Object *drc_container, *obj; ++ ObjectProperty *prop; ++ ObjectPropertyIterator iter; ++ ++ drc_container = container_get(object_get_root(), "/dr-connector"); ++ object_property_iter_init(&iter, drc_container); ++ while ((prop = object_property_iter_next(&iter))) { ++ if (!strstart(prop->type, "link<", NULL)) { ++ continue; ++ } ++ obj = object_property_get_link(drc_container, prop->name, NULL); ++ if (spapr_drc_needed(obj)) { ++ return true; ++ } ++ } ++ return false; ++} ++ + static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, +@@ -1645,6 +1667,8 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + { + /* Working address in data buffer */ + target_ulong addr = ppc64_phys_to_real(args[0]); ++ target_ulong fdt_buf = args[1]; ++ target_ulong fdt_bufsize = args[2]; + target_ulong ov_table; + uint32_t cas_pvr; + SpaprOptionVector *ov1_guest, *ov5_guest, *ov5_cas_old, *ov5_updates; +@@ -1788,16 +1812,41 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, + + spapr_irq_update_active_intc(spapr); + ++ if (spapr_hotplugged_dev_before_cas()) { ++ spapr->cas_reboot = true; ++ } ++ + if (!spapr->cas_reboot) { ++ void *fdt; ++ SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 }; ++ + /* If spapr_machine_reset() did not set up a HPT but one is necessary + * (because the guest isn't going to use radix) then set it up here. */ + if ((spapr->patb_entry & PATE1_GR) && !guest_radix) { + /* legacy hash or new hash: */ + spapr_setup_hpt_and_vrma(spapr); + } +- spapr->cas_reboot = +- (spapr_h_cas_compose_response(spapr, args[1], args[2], +- ov5_updates) != 0); ++ ++ if (fdt_bufsize < sizeof(hdr)) { ++ error_report("SLOF provided insufficient CAS buffer " ++ TARGET_FMT_lu " (min: %zu)", fdt_bufsize, sizeof(hdr)); ++ exit(EXIT_FAILURE); ++ } ++ ++ fdt_bufsize -= sizeof(hdr); ++ ++ fdt = spapr_build_fdt(spapr, false, fdt_bufsize); ++ _FDT((fdt_pack(fdt))); ++ ++ cpu_physical_memory_write(fdt_buf, &hdr, sizeof(hdr)); ++ cpu_physical_memory_write(fdt_buf + sizeof(hdr), fdt, ++ fdt_totalsize(fdt)); ++ trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); ++ ++ g_free(spapr->fdt_blob); ++ spapr->fdt_size = fdt_totalsize(fdt); ++ spapr->fdt_initial_size = spapr->fdt_size; ++ spapr->fdt_blob = fdt; + } + + spapr_ovec_cleanup(ov5_updates); +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index e047dabf30..e5e2a99046 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -767,11 +767,9 @@ struct SpaprEventLogEntry { + QTAILQ_ENTRY(SpaprEventLogEntry) next; + }; + ++void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space); + void spapr_events_init(SpaprMachineState *sm); + void spapr_dt_events(SpaprMachineState *sm, void *fdt); +-int spapr_h_cas_compose_response(SpaprMachineState *sm, +- target_ulong addr, target_ulong size, +- SpaprOptionVector *ov5_updates); + void close_htab_fd(SpaprMachineState *spapr); + void spapr_setup_hpt_and_vrma(SpaprMachineState *spapr); + void spapr_free_hpt(SpaprMachineState *spapr); +-- +2.18.2 + diff --git a/kvm-spapr-Improve-handling-of-fdt-buffer-size.patch b/kvm-spapr-Improve-handling-of-fdt-buffer-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..2f57cded43a03cbf852718a6cb4c98e67d016dbe --- /dev/null +++ b/kvm-spapr-Improve-handling-of-fdt-buffer-size.patch @@ -0,0 +1,125 @@ +From 04f7fe2423a4de8d2fea7068b3fb316e15e76eaa Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:49 -0500 +Subject: [PATCH 1/9] spapr: Improve handling of fdt buffer size + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-2-gkurz@redhat.com> +Patchwork-id: 100682 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 1/6] spapr: Improve handling of fdt buffer size +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: David Gibson + +Previously, spapr_build_fdt() constructed the device tree in a fixed +buffer of size FDT_MAX_SIZE. This is a bit inflexible, but more +importantly it's awkward for the case where we use it during CAS. In +that case the guest firmware supplies a buffer and we have to +awkwardly check that what we generated fits into it afterwards, after +doing a lot of size checks during spapr_build_fdt(). + +Simplify this by having spapr_build_fdt() take a 'space' parameter. +For the CAS case, we pass in the buffer size provided by SLOF, for the +machine init case, we continue to pass FDT_MAX_SIZE. + +Signed-off-by: David Gibson +Reviewed-by: Cedric Le Goater +Reviewed-by: Greg Kurz +(cherry picked from commit 97b32a6afa78ae68fb16344b9a144b6f433f42a2) +Signed-off-by: Greg Kurz +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 33 +++++++++++---------------------- + 1 file changed, 11 insertions(+), 22 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index c74079702d..92f63ad035 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -918,7 +918,8 @@ static bool spapr_hotplugged_dev_before_cas(void) + return false; + } + +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset); ++static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, ++ size_t space); + + int spapr_h_cas_compose_response(SpaprMachineState *spapr, + target_ulong addr, target_ulong size, +@@ -931,24 +932,17 @@ int spapr_h_cas_compose_response(SpaprMachineState *spapr, + return 1; + } + +- if (size < sizeof(hdr) || size > FW_MAX_SIZE) { +- error_report("SLOF provided an unexpected CAS buffer size " +- TARGET_FMT_lu " (min: %zu, max: %u)", +- size, sizeof(hdr), FW_MAX_SIZE); ++ if (size < sizeof(hdr)) { ++ error_report("SLOF provided insufficient CAS buffer " ++ TARGET_FMT_lu " (min: %zu)", size, sizeof(hdr)); + exit(EXIT_FAILURE); + } + + size -= sizeof(hdr); + +- fdt = spapr_build_fdt(spapr, false); ++ fdt = spapr_build_fdt(spapr, false, size); + _FDT((fdt_pack(fdt))); + +- if (fdt_totalsize(fdt) + sizeof(hdr) > size) { +- g_free(fdt); +- trace_spapr_cas_failed(size); +- return -1; +- } +- + cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); + cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt)); + trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); +@@ -1198,7 +1192,8 @@ static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt) + } + } + +-static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset) ++static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, ++ size_t space) + { + MachineState *machine = MACHINE(spapr); + MachineClass *mc = MACHINE_GET_CLASS(machine); +@@ -1208,8 +1203,8 @@ static void *spapr_build_fdt(SpaprMachineState *spapr, bool reset) + SpaprPhbState *phb; + char *buf; + +- fdt = g_malloc0(FDT_MAX_SIZE); +- _FDT((fdt_create_empty_tree(fdt, FDT_MAX_SIZE))); ++ fdt = g_malloc0(space); ++ _FDT((fdt_create_empty_tree(fdt, space))); + + /* Root node */ + _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp")); +@@ -1724,19 +1719,13 @@ static void spapr_machine_reset(MachineState *machine) + */ + fdt_addr = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FDT_MAX_SIZE; + +- fdt = spapr_build_fdt(spapr, true); ++ fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE); + + rc = fdt_pack(fdt); + + /* Should only fail if we've built a corrupted tree */ + assert(rc == 0); + +- if (fdt_totalsize(fdt) > FDT_MAX_SIZE) { +- error_report("FDT too big ! 0x%x bytes (max is 0x%x)", +- fdt_totalsize(fdt), FDT_MAX_SIZE); +- exit(1); +- } +- + /* Load the fdt */ + qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt)); + cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt)); +-- +2.18.2 + diff --git a/kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch b/kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch new file mode 100755 index 0000000000000000000000000000000000000000..b4b2b5fe7d360af64cd0cb609cc36daa4440f6f7 --- /dev/null +++ b/kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch @@ -0,0 +1,170 @@ +From f94b3a4eb9d709f1f6a14ad9ad6ebcc1b67b6923 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Tue, 19 Jan 2021 15:09:54 -0500 +Subject: [PATCH 6/9] spapr: Improve handling of memory unplug with old guests + +RH-Author: Greg Kurz +Message-id: <20210119150954.1017058-7-gkurz@redhat.com> +Patchwork-id: 100684 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH v2 6/6] spapr: Improve handling of memory unplug with old guests +Bugzilla: 1901837 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Laurent Vivier +RH-Acked-by: David Gibson + +From: Greg Kurz + +Since commit 1e8b5b1aa16b ("spapr: Allow memory unplug to always succeed") +trying to unplug memory from a guest that doesn't support it (eg. rhel6) +no longer generates an error like it used to. Instead, it leaves the +memory around : only a subsequent reboot or manual use of drmgr within +the guest can complete the hot-unplug sequence. A flag was added to +SpaprMachineClass so that this new behavior only applies to the default +machine type. + +We can do better. CAS processes all pending hot-unplug requests. This +means that we don't really care about what the guest supports if +the hot-unplug request happens before CAS. + +All guests that we care for, even old ones, set enough bits in OV5 +that lead to a non-empty bitmap in spapr->ov5_cas. Use that as a +heuristic to decide if CAS has already occured or not. + +Always accept unplug requests that happen before CAS since CAS will +process them. Restore the previous behavior of rejecting them after +CAS when we know that the guest doesn't support memory hot-unplug. + +This behavior is suitable for all machine types : this allows to +drop the pre_6_0_memory_unplug flag. + +Fixes: 1e8b5b1aa16b ("spapr: Allow memory unplug to always succeed") +Signed-off-by: Greg Kurz +Message-Id: <161012708715.801107.11418801796987916516.stgit@bahia.lan> +Reviewed-by: Daniel Henrique Barboza +Signed-off-by: David Gibson +(cherry picked from commit 73598c75df0585e039825e642adede21912dabc7) +Signed-off-by: Greg Kurz + +Conflicts: + hw/ppc/spapr.c + include/hw/ppc/spapr.h + +Contextual conflicts around the removal of pre_6_0_memory_unplug, +which was only partially backported from upstream 1e8b5b1aa16b, and +the addition of spapr_memory_hot_unplug_supported(). + +Signed-off-by: Jon Maloy +--- + hw/ppc/spapr.c | 21 +++++++++++++-------- + hw/ppc/spapr_events.c | 3 +-- + hw/ppc/spapr_ovec.c | 7 +++++++ + include/hw/ppc/spapr.h | 2 +- + include/hw/ppc/spapr_ovec.h | 1 + + 5 files changed, 23 insertions(+), 11 deletions(-) + +diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index f8de33e3e5..00b1ef075e 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -3993,6 +3993,18 @@ static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev, + } + } + ++bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr) ++{ ++ return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) || ++ /* ++ * CAS will process all pending unplug requests. ++ * ++ * HACK: a guest could theoretically have cleared all bits in OV5, ++ * but none of the guests we care for do. ++ */ ++ spapr_ovec_empty(spapr->ov5_cas); ++} ++ + static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) + { +@@ -4001,16 +4013,9 @@ static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev, + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { +- if (!smc->pre_6_0_memory_unplug || +- spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) { ++ if (spapr_memory_hot_unplug_supported(sms)) { + spapr_memory_unplug_request(hotplug_dev, dev, errp); + } else { +- /* NOTE: this means there is a window after guest reset, prior to +- * CAS negotiation, where unplug requests will fail due to the +- * capability not being detected yet. This is a bit different than +- * the case with PCI unplug, where the events will be queued and +- * eventually handled by the guest after boot +- */ + error_setg(errp, "Memory hot unplug not supported for this guest"); + } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) { +diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c +index 6e284aa4bc..08168acd65 100644 +--- a/hw/ppc/spapr_events.c ++++ b/hw/ppc/spapr_events.c +@@ -547,8 +547,7 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action, + /* we should not be using count_indexed value unless the guest + * supports dedicated hotplug event source + */ +- g_assert(!SPAPR_MACHINE_GET_CLASS(spapr)->pre_6_0_memory_unplug || +- spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT)); ++ g_assert(spapr_memory_hot_unplug_supported(spapr)); + hp->drc_id.count_indexed.count = + cpu_to_be32(drc_id->count_indexed.count); + hp->drc_id.count_indexed.index = +diff --git a/hw/ppc/spapr_ovec.c b/hw/ppc/spapr_ovec.c +index 811fadf143..f858afc7d5 100644 +--- a/hw/ppc/spapr_ovec.c ++++ b/hw/ppc/spapr_ovec.c +@@ -135,6 +135,13 @@ bool spapr_ovec_test(SpaprOptionVector *ov, long bitnr) + return test_bit(bitnr, ov->bitmap) ? true : false; + } + ++bool spapr_ovec_empty(SpaprOptionVector *ov) ++{ ++ g_assert(ov); ++ ++ return bitmap_empty(ov->bitmap, OV_MAXBITS); ++} ++ + static void guest_byte_to_bitmap(uint8_t entry, unsigned long *bitmap, + long bitmap_offset) + { +diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h +index ac6961ed16..7aaf5d9996 100644 +--- a/include/hw/ppc/spapr.h ++++ b/include/hw/ppc/spapr.h +@@ -124,7 +124,6 @@ struct SpaprMachineClass { + bool pre_4_1_migration; /* don't migrate hpt-max-page-size */ + bool linux_pci_probe; + bool smp_threads_vsmt; /* set VSMT to smp_threads by default */ +- bool pre_6_0_memory_unplug; + + bool has_power9_support; + void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, +@@ -894,4 +893,5 @@ void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, + #define SPAPR_OV5_XIVE_BOTH 0x80 /* Only to advertise on the platform */ + + void spapr_set_all_lpcrs(target_ulong value, target_ulong mask); ++bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr); + #endif /* HW_SPAPR_H */ +diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h +index 7891e9caac..98c73bf601 100644 +--- a/include/hw/ppc/spapr_ovec.h ++++ b/include/hw/ppc/spapr_ovec.h +@@ -73,6 +73,7 @@ void spapr_ovec_cleanup(SpaprOptionVector *ov); + void spapr_ovec_set(SpaprOptionVector *ov, long bitnr); + void spapr_ovec_clear(SpaprOptionVector *ov, long bitnr); + bool spapr_ovec_test(SpaprOptionVector *ov, long bitnr); ++bool spapr_ovec_empty(SpaprOptionVector *ov); + SpaprOptionVector *spapr_ovec_parse_vector(target_ulong table_addr, int vector); + int spapr_ovec_populate_dt(void *fdt, int fdt_offset, + SpaprOptionVector *ov, const char *name); +-- +2.18.2 + diff --git a/kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch b/kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch new file mode 100755 index 0000000000000000000000000000000000000000..7c4871810bf18c7be9cfda3d8e46f066abb6856a --- /dev/null +++ b/kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch @@ -0,0 +1,213 @@ +From 5aea41b56f07f586e0f56a5c8b3e8443e485cd77 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 5 Jun 2020 07:41:09 -0400 +Subject: [PATCH 39/42] spapr: Pass the maximum number of vCPUs to the KVM + interrupt controller +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200605074111.2185-2-thuth@redhat.com> +Patchwork-id: 97368 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/3] spapr: Pass the maximum number of vCPUs to the KVM interrupt controller +Bugzilla: 1756946 +RH-Acked-by: Greg Kurz +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Greg Kurz + +The XIVE and XICS-on-XIVE KVM devices on POWER9 hosts can greatly reduce +their consumption of some scarce HW resources, namely Virtual Presenter +identifiers, if they know the maximum number of vCPUs that may run in the +VM. + +Prepare ground for this by passing the value down to xics_kvm_connect() +and kvmppc_xive_connect(). This is purely mechanical, no functional +change. + +Signed-off-by: Greg Kurz +Message-Id: <157478678301.67101.2717368060417156338.stgit@bahia.tlslab.ibm.com> +Reviewed-by: Cédric Le Goater +Signed-off-by: David Gibson +(cherry picked from commit 4ffb7496881ec361deaf1f51c41a933bde3cbf7b) +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/spapr_xive.c | 6 ++++-- + hw/intc/spapr_xive_kvm.c | 3 ++- + hw/intc/xics_kvm.c | 3 ++- + hw/intc/xics_spapr.c | 5 +++-- + hw/ppc/spapr_irq.c | 8 +++++--- + include/hw/ppc/spapr_irq.h | 10 ++++++++-- + include/hw/ppc/spapr_xive.h | 3 ++- + include/hw/ppc/xics_spapr.h | 3 ++- + 8 files changed, 28 insertions(+), 13 deletions(-) + +diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c +index 9cb8d38a3b..a570e6e90a 100644 +--- a/hw/intc/spapr_xive.c ++++ b/hw/intc/spapr_xive.c +@@ -651,12 +651,14 @@ static void spapr_xive_dt(SpaprInterruptController *intc, uint32_t nr_servers, + plat_res_int_priorities, sizeof(plat_res_int_priorities))); + } + +-static int spapr_xive_activate(SpaprInterruptController *intc, Error **errp) ++static int spapr_xive_activate(SpaprInterruptController *intc, ++ uint32_t nr_servers, Error **errp) + { + SpaprXive *xive = SPAPR_XIVE(intc); + + if (kvm_enabled()) { +- int rc = spapr_irq_init_kvm(kvmppc_xive_connect, intc, errp); ++ int rc = spapr_irq_init_kvm(kvmppc_xive_connect, intc, nr_servers, ++ errp); + if (rc < 0) { + return rc; + } +diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c +index 08012ac7cd..c1c837a764 100644 +--- a/hw/intc/spapr_xive_kvm.c ++++ b/hw/intc/spapr_xive_kvm.c +@@ -740,7 +740,8 @@ static void *kvmppc_xive_mmap(SpaprXive *xive, int pgoff, size_t len, + * All the XIVE memory regions are now backed by mappings from the KVM + * XIVE device. + */ +-int kvmppc_xive_connect(SpaprInterruptController *intc, Error **errp) ++int kvmppc_xive_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp) + { + SpaprXive *xive = SPAPR_XIVE(intc); + XiveSource *xsrc = &xive->source; +diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c +index 954c424b36..a1f1b7b0d3 100644 +--- a/hw/intc/xics_kvm.c ++++ b/hw/intc/xics_kvm.c +@@ -342,7 +342,8 @@ void ics_kvm_set_irq(ICSState *ics, int srcno, int val) + } + } + +-int xics_kvm_connect(SpaprInterruptController *intc, Error **errp) ++int xics_kvm_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp) + { + ICSState *ics = ICS_SPAPR(intc); + int rc; +diff --git a/hw/intc/xics_spapr.c b/hw/intc/xics_spapr.c +index b3705dab0e..8ae4f41459 100644 +--- a/hw/intc/xics_spapr.c ++++ b/hw/intc/xics_spapr.c +@@ -422,10 +422,11 @@ static int xics_spapr_post_load(SpaprInterruptController *intc, int version_id) + return 0; + } + +-static int xics_spapr_activate(SpaprInterruptController *intc, Error **errp) ++static int xics_spapr_activate(SpaprInterruptController *intc, ++ uint32_t nr_servers, Error **errp) + { + if (kvm_enabled()) { +- return spapr_irq_init_kvm(xics_kvm_connect, intc, errp); ++ return spapr_irq_init_kvm(xics_kvm_connect, intc, nr_servers, errp); + } + return 0; + } +diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c +index d6bb7fd2d6..9da423658a 100644 +--- a/hw/ppc/spapr_irq.c ++++ b/hw/ppc/spapr_irq.c +@@ -70,15 +70,16 @@ void spapr_irq_msi_free(SpaprMachineState *spapr, int irq, uint32_t num) + bitmap_clear(spapr->irq_map, irq - SPAPR_IRQ_MSI, num); + } + +-int spapr_irq_init_kvm(int (*fn)(SpaprInterruptController *, Error **), ++int spapr_irq_init_kvm(SpaprInterruptControllerInitKvm fn, + SpaprInterruptController *intc, ++ uint32_t nr_servers, + Error **errp) + { + MachineState *machine = MACHINE(qdev_get_machine()); + Error *local_err = NULL; + + if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { +- if (fn(intc, &local_err) < 0) { ++ if (fn(intc, nr_servers, &local_err) < 0) { + if (machine_kernel_irqchip_required(machine)) { + error_prepend(&local_err, + "kernel_irqchip requested but unavailable: "); +@@ -495,6 +496,7 @@ static void set_active_intc(SpaprMachineState *spapr, + SpaprInterruptController *new_intc) + { + SpaprInterruptControllerClass *sicc; ++ uint32_t nr_servers = spapr_max_server_number(spapr); + + assert(new_intc); + +@@ -512,7 +514,7 @@ static void set_active_intc(SpaprMachineState *spapr, + + sicc = SPAPR_INTC_GET_CLASS(new_intc); + if (sicc->activate) { +- sicc->activate(new_intc, &error_fatal); ++ sicc->activate(new_intc, nr_servers, &error_fatal); + } + + spapr->active_intc = new_intc; +diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h +index ff814d13de..ca8cb44213 100644 +--- a/include/hw/ppc/spapr_irq.h ++++ b/include/hw/ppc/spapr_irq.h +@@ -43,7 +43,8 @@ typedef struct SpaprInterruptController SpaprInterruptController; + typedef struct SpaprInterruptControllerClass { + InterfaceClass parent; + +- int (*activate)(SpaprInterruptController *intc, Error **errp); ++ int (*activate)(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp); + void (*deactivate)(SpaprInterruptController *intc); + + /* +@@ -98,8 +99,13 @@ qemu_irq spapr_qirq(SpaprMachineState *spapr, int irq); + int spapr_irq_post_load(SpaprMachineState *spapr, int version_id); + void spapr_irq_reset(SpaprMachineState *spapr, Error **errp); + int spapr_irq_get_phandle(SpaprMachineState *spapr, void *fdt, Error **errp); +-int spapr_irq_init_kvm(int (*fn)(SpaprInterruptController *, Error **), ++ ++typedef int (*SpaprInterruptControllerInitKvm)(SpaprInterruptController *, ++ uint32_t, Error **); ++ ++int spapr_irq_init_kvm(SpaprInterruptControllerInitKvm fn, + SpaprInterruptController *intc, ++ uint32_t nr_servers, + Error **errp); + + /* +diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h +index 742b7e834f..3a103c224d 100644 +--- a/include/hw/ppc/spapr_xive.h ++++ b/include/hw/ppc/spapr_xive.h +@@ -66,7 +66,8 @@ int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx, + /* + * KVM XIVE device helpers + */ +-int kvmppc_xive_connect(SpaprInterruptController *intc, Error **errp); ++int kvmppc_xive_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp); + void kvmppc_xive_disconnect(SpaprInterruptController *intc); + void kvmppc_xive_reset(SpaprXive *xive, Error **errp); + void kvmppc_xive_set_source_config(SpaprXive *xive, uint32_t lisn, XiveEAS *eas, +diff --git a/include/hw/ppc/xics_spapr.h b/include/hw/ppc/xics_spapr.h +index 28b87038c8..1c65c96e3c 100644 +--- a/include/hw/ppc/xics_spapr.h ++++ b/include/hw/ppc/xics_spapr.h +@@ -32,7 +32,8 @@ + #define TYPE_ICS_SPAPR "ics-spapr" + #define ICS_SPAPR(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS_SPAPR) + +-int xics_kvm_connect(SpaprInterruptController *intc, Error **errp); ++int xics_kvm_connect(SpaprInterruptController *intc, uint32_t nr_servers, ++ Error **errp); + void xics_kvm_disconnect(SpaprInterruptController *intc); + bool xics_kvm_has_broken_disconnect(SpaprMachineState *spapr); + +-- +2.27.0 + diff --git a/kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch b/kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch new file mode 100755 index 0000000000000000000000000000000000000000..4f15509a7dcafd3aa52227cf036128c1ee008144 --- /dev/null +++ b/kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch @@ -0,0 +1,50 @@ +From b46fdf56b1a7938468565838bdadf260870e4f9b Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Wed, 9 Jun 2021 10:05:00 -0400 +Subject: [PATCH 3/4] spapr: Remove stale comment about power-saving LPCR bits +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210609100501.427096-2-lvivier@redhat.com> +Patchwork-id: 101682 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] spapr: Remove stale comment about power-saving LPCR bits +Bugzilla: 1969768 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: David Gibson +RH-Acked-by: Greg Kurz + +From: Nicholas Piggin + +Commit 47a9b551547 ("spapr: Clean up handling of LPCR power-saving exit +bits") moved this logic but did not remove the comment from the +previous location. + +Signed-off-by: Nicholas Piggin +Message-Id: <20210526091626.3388262-2-npiggin@gmail.com> +Reviewed-by: Cédric Le Goater +Reviewed-by: Greg Kurz +Signed-off-by: David Gibson +(cherry picked from commit 7be3bf6c8429969f97728bb712d9a99997835607) +Signed-off-by: Laurent Vivier +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_rtas.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c +index 8d8d8cdfcb..295eac986e 100644 +--- a/hw/ppc/spapr_rtas.c ++++ b/hw/ppc/spapr_rtas.c +@@ -163,7 +163,6 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + + env->msr = (1ULL << MSR_SF) | (1ULL << MSR_ME); + +- /* Enable Power-saving mode Exit Cause exceptions for the new CPU */ + lpcr = env->spr[SPR_LPCR]; + if (!pcc->interrupts_big_endian(callcpu)) { + lpcr |= LPCR_ILE; +-- +2.27.0 + diff --git a/kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch b/kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch new file mode 100755 index 0000000000000000000000000000000000000000..84abc74f1f0ea28d32781e20f886436e41e259a4 --- /dev/null +++ b/kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch @@ -0,0 +1,89 @@ +From 28794dca79a94d01c8732b84fe6ac6ba2986ce45 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Wed, 9 Jun 2021 10:05:01 -0400 +Subject: [PATCH 4/4] spapr: Set LPCR to current AIL mode when starting a new + CPU +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +Message-id: <20210609100501.427096-3-lvivier@redhat.com> +Patchwork-id: 101683 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] spapr: Set LPCR to current AIL mode when starting a new CPU +Bugzilla: 1969768 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: David Gibson +RH-Acked-by: Greg Kurz + +From: Nicholas Piggin + +TCG does not keep track of AIL mode in a central place, it's based on +the current LPCR[AIL] bits. Synchronize the new CPU's LPCR to the +current LPCR in rtas_start_cpu(), similarly to the way the ILE bit is +synchronized. + +Open-code the ILE setting as well now that the caller's LPCR is +available directly, there is no need for the indirection. + +Without this, under both TCG and KVM, adding a POWER8/9/10 class CPU +with a new core ID after a modern Linux has booted results in the new +CPU's LPCR missing the LPCR[AIL]=0b11 setting that the other CPUs have. +This can cause crashes and unexpected behaviour. + +Signed-off-by: Nicholas Piggin +Message-Id: <20210526091626.3388262-3-npiggin@gmail.com> +Reviewed-by: Cédric Le Goater +Reviewed-by: Greg Kurz +Signed-off-by: David Gibson +(cherry picked from commit ac559ecbea2649819e7b3fdd09f4e0243e0128db) +Signed-off-by: Laurent Vivier +Signed-off-by: Danilo C. L. de Paula +--- + hw/ppc/spapr_rtas.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c +index 295eac986e..5acb7c1f10 100644 +--- a/hw/ppc/spapr_rtas.c ++++ b/hw/ppc/spapr_rtas.c +@@ -132,8 +132,8 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + target_ulong id, start, r3; + PowerPCCPU *newcpu; + CPUPPCState *env; +- PowerPCCPUClass *pcc; + target_ulong lpcr; ++ target_ulong caller_lpcr; + + if (nargs != 3 || nret != 1) { + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +@@ -152,7 +152,6 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + } + + env = &newcpu->env; +- pcc = POWERPC_CPU_GET_CLASS(newcpu); + + if (!CPU(newcpu)->halted) { + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); +@@ -163,10 +162,15 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, + + env->msr = (1ULL << MSR_SF) | (1ULL << MSR_ME); + ++ caller_lpcr = callcpu->env.spr[SPR_LPCR]; + lpcr = env->spr[SPR_LPCR]; +- if (!pcc->interrupts_big_endian(callcpu)) { +- lpcr |= LPCR_ILE; +- } ++ ++ /* Set ILE the same way */ ++ lpcr = (lpcr & ~LPCR_ILE) | (caller_lpcr & LPCR_ILE); ++ ++ /* Set AIL the same way */ ++ lpcr = (lpcr & ~LPCR_AIL) | (caller_lpcr & LPCR_AIL); ++ + if (env->mmu_model == POWERPC_MMU_3_00) { + /* + * New cpus are expected to start in the same radix/hash mode +-- +2.27.0 + diff --git a/kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch b/kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch new file mode 100755 index 0000000000000000000000000000000000000000..e8c9f8b452f94b8d92f14afa2c31dfabe2934c67 --- /dev/null +++ b/kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch @@ -0,0 +1,54 @@ +From 07df0f52c26a3819bc02b4f2970b6735bcf15c5b Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:42 -0400 +Subject: [PATCH 4/9] sungem: switch to use qemu_receive_packet() for loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-5-jmaloy@redhat.com> +Patchwork-id: 101786 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 4/9] sungem: switch to use qemu_receive_packet() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_packet() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Mark Cave-Ayland +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Alistair Francis +Signed-off-by: Jason Wang + +(cherry picked from commit 8c92060d3c0248bd4d515719a35922cd2391b9b4) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/sungem.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/sungem.c b/hw/net/sungem.c +index f31d41ac5b..8b202b5c15 100644 +--- a/hw/net/sungem.c ++++ b/hw/net/sungem.c +@@ -305,7 +305,7 @@ static void sungem_send_packet(SunGEMState *s, const uint8_t *buf, + NetClientState *nc = qemu_get_queue(s->nic); + + if (s->macregs[MAC_XIFCFG >> 2] & MAC_XIFCFG_LBCK) { +- nc->info->receive(nc, buf, size); ++ qemu_receive_packet(nc, buf, size); + } else { + qemu_send_packet(nc, buf, size); + } +-- +2.27.0 + diff --git a/kvm-target-arm-Fix-PAuth-sbox-functions.patch b/kvm-target-arm-Fix-PAuth-sbox-functions.patch new file mode 100755 index 0000000000000000000000000000000000000000..0e081845581991e6ec388f4e96c977508f95f0c6 --- /dev/null +++ b/kvm-target-arm-Fix-PAuth-sbox-functions.patch @@ -0,0 +1,65 @@ +From b8c8288a65146952cdfe7d5f0cd96734c9de8ee1 Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Thu, 7 May 2020 17:57:08 +0100 +Subject: [PATCH 1/7] target/arm: Fix PAuth sbox functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200507175708.1165177-2-jmaloy@redhat.com> +Patchwork-id: 96341 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] target/arm: Fix PAuth sbox functions +Bugzilla: 1813940 +RH-Acked-by: Andrew Jones +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella + +From: Vincent Dehors + +In the PAC computation, sbox was applied over wrong bits. +As this is a 4-bit sbox, bit index should be incremented by 4 instead of 16. + +Test vector from QARMA paper (https://eprint.iacr.org/2016/444.pdf) was +used to verify one computation of the pauth_computepac() function which +uses sbox2. + +Launchpad: https://bugs.launchpad.net/bugs/1859713 +Reviewed-by: Richard Henderson +Signed-off-by: Vincent DEHORS +Signed-off-by: Adrien GRASSEIN +Message-id: 20200116230809.19078-2-richard.henderson@linaro.org +Reviewed-by: Peter Maydell +Signed-off-by: Peter Maydell +(cherry picked from commit de0b1bae6461f67243282555475f88b2384a1eb9) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/pauth_helper.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c +index d3194f2..0a5f41e 100644 +--- a/target/arm/pauth_helper.c ++++ b/target/arm/pauth_helper.c +@@ -89,7 +89,7 @@ static uint64_t pac_sub(uint64_t i) + uint64_t o = 0; + int b; + +- for (b = 0; b < 64; b += 16) { ++ for (b = 0; b < 64; b += 4) { + o |= (uint64_t)sub[(i >> b) & 0xf] << b; + } + return o; +@@ -104,7 +104,7 @@ static uint64_t pac_inv_sub(uint64_t i) + uint64_t o = 0; + int b; + +- for (b = 0; b < 64; b += 16) { ++ for (b = 0; b < 64; b += 4) { + o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b; + } + return o; +-- +1.8.3.1 + diff --git a/kvm-target-arm-arch_dump-Add-SVE-notes.patch b/kvm-target-arm-arch_dump-Add-SVE-notes.patch new file mode 100755 index 0000000000000000000000000000000000000000..febea1084fde5644a40b7e87cccd7b1e7c5b7458 --- /dev/null +++ b/kvm-target-arm-arch_dump-Add-SVE-notes.patch @@ -0,0 +1,298 @@ +From d8871ae2842531130c9b333e7c06a6a5d1561286 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 24 Jan 2020 09:14:34 +0100 +Subject: [PATCH 001/116] target/arm/arch_dump: Add SVE notes + +RH-Author: Andrew Jones +Message-id: <20200124091434.15021-2-drjones@redhat.com> +Patchwork-id: 93443 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] target/arm/arch_dump: Add SVE notes +Bugzilla: 1725084 +RH-Acked-by: Auger Eric +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1725084 + +Author: Andrew Jones +Date: Thu, 23 Jan 2020 15:22:40 +0000 + + target/arm/arch_dump: Add SVE notes + + When dumping a guest with dump-guest-memory also dump the SVE + registers if they are in use. + + Signed-off-by: Andrew Jones + Reviewed-by: Richard Henderson + Message-id: 20200120101832.18781-1-drjones@redhat.com + [PMM: fixed checkpatch nits] + Signed-off-by: Peter Maydell + +(cherry picked from commit 538baab245ca881e6a6ff720b5133f3ad1fcaafc) +Signed-off-by: Miroslav Rezanina +--- + include/elf.h | 1 + + target/arm/arch_dump.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++- + target/arm/cpu.h | 25 ++++++++++ + target/arm/kvm64.c | 24 ---------- + 4 files changed, 148 insertions(+), 26 deletions(-) + +diff --git a/include/elf.h b/include/elf.h +index 3501e0c..8fbfe60 100644 +--- a/include/elf.h ++++ b/include/elf.h +@@ -1650,6 +1650,7 @@ typedef struct elf64_shdr { + #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ + #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ + #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ ++#define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension regs */ + + /* + * Physical entry point into the kernel. +diff --git a/target/arm/arch_dump.c b/target/arm/arch_dump.c +index 26a2c09..2345dec 100644 +--- a/target/arm/arch_dump.c ++++ b/target/arm/arch_dump.c +@@ -62,12 +62,23 @@ struct aarch64_user_vfp_state { + + QEMU_BUILD_BUG_ON(sizeof(struct aarch64_user_vfp_state) != 528); + ++/* struct user_sve_header from arch/arm64/include/uapi/asm/ptrace.h */ ++struct aarch64_user_sve_header { ++ uint32_t size; ++ uint32_t max_size; ++ uint16_t vl; ++ uint16_t max_vl; ++ uint16_t flags; ++ uint16_t reserved; ++} QEMU_PACKED; ++ + struct aarch64_note { + Elf64_Nhdr hdr; + char name[8]; /* align_up(sizeof("CORE"), 4) */ + union { + struct aarch64_elf_prstatus prstatus; + struct aarch64_user_vfp_state vfp; ++ struct aarch64_user_sve_header sve; + }; + } QEMU_PACKED; + +@@ -76,6 +87,8 @@ struct aarch64_note { + (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_elf_prstatus)) + #define AARCH64_PRFPREG_NOTE_SIZE \ + (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_user_vfp_state)) ++#define AARCH64_SVE_NOTE_SIZE(env) \ ++ (AARCH64_NOTE_HEADER_SIZE + sve_size(env)) + + static void aarch64_note_init(struct aarch64_note *note, DumpState *s, + const char *name, Elf64_Word namesz, +@@ -128,11 +141,102 @@ static int aarch64_write_elf64_prfpreg(WriteCoreDumpFunction f, + return 0; + } + ++#ifdef TARGET_AARCH64 ++static off_t sve_zreg_offset(uint32_t vq, int n) ++{ ++ off_t off = sizeof(struct aarch64_user_sve_header); ++ return ROUND_UP(off, 16) + vq * 16 * n; ++} ++ ++static off_t sve_preg_offset(uint32_t vq, int n) ++{ ++ return sve_zreg_offset(vq, 32) + vq * 16 / 8 * n; ++} ++ ++static off_t sve_fpsr_offset(uint32_t vq) ++{ ++ off_t off = sve_preg_offset(vq, 17); ++ return ROUND_UP(off, 16); ++} ++ ++static off_t sve_fpcr_offset(uint32_t vq) ++{ ++ return sve_fpsr_offset(vq) + sizeof(uint32_t); ++} ++ ++static uint32_t sve_current_vq(CPUARMState *env) ++{ ++ return sve_zcr_len_for_el(env, arm_current_el(env)) + 1; ++} ++ ++static size_t sve_size_vq(uint32_t vq) ++{ ++ off_t off = sve_fpcr_offset(vq) + sizeof(uint32_t); ++ return ROUND_UP(off, 16); ++} ++ ++static size_t sve_size(CPUARMState *env) ++{ ++ return sve_size_vq(sve_current_vq(env)); ++} ++ ++static int aarch64_write_elf64_sve(WriteCoreDumpFunction f, ++ CPUARMState *env, int cpuid, ++ DumpState *s) ++{ ++ struct aarch64_note *note; ++ ARMCPU *cpu = env_archcpu(env); ++ uint32_t vq = sve_current_vq(env); ++ uint64_t tmp[ARM_MAX_VQ * 2], *r; ++ uint32_t fpr; ++ uint8_t *buf; ++ int ret, i; ++ ++ note = g_malloc0(AARCH64_SVE_NOTE_SIZE(env)); ++ buf = (uint8_t *)¬e->sve; ++ ++ aarch64_note_init(note, s, "LINUX", 6, NT_ARM_SVE, sve_size_vq(vq)); ++ ++ note->sve.size = cpu_to_dump32(s, sve_size_vq(vq)); ++ note->sve.max_size = cpu_to_dump32(s, sve_size_vq(cpu->sve_max_vq)); ++ note->sve.vl = cpu_to_dump16(s, vq * 16); ++ note->sve.max_vl = cpu_to_dump16(s, cpu->sve_max_vq * 16); ++ note->sve.flags = cpu_to_dump16(s, 1); ++ ++ for (i = 0; i < 32; ++i) { ++ r = sve_bswap64(tmp, &env->vfp.zregs[i].d[0], vq * 2); ++ memcpy(&buf[sve_zreg_offset(vq, i)], r, vq * 16); ++ } ++ ++ for (i = 0; i < 17; ++i) { ++ r = sve_bswap64(tmp, r = &env->vfp.pregs[i].p[0], ++ DIV_ROUND_UP(vq * 2, 8)); ++ memcpy(&buf[sve_preg_offset(vq, i)], r, vq * 16 / 8); ++ } ++ ++ fpr = cpu_to_dump32(s, vfp_get_fpsr(env)); ++ memcpy(&buf[sve_fpsr_offset(vq)], &fpr, sizeof(uint32_t)); ++ ++ fpr = cpu_to_dump32(s, vfp_get_fpcr(env)); ++ memcpy(&buf[sve_fpcr_offset(vq)], &fpr, sizeof(uint32_t)); ++ ++ ret = f(note, AARCH64_SVE_NOTE_SIZE(env), s); ++ g_free(note); ++ ++ if (ret < 0) { ++ return -1; ++ } ++ ++ return 0; ++} ++#endif ++ + int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, + int cpuid, void *opaque) + { + struct aarch64_note note; +- CPUARMState *env = &ARM_CPU(cs)->env; ++ ARMCPU *cpu = ARM_CPU(cs); ++ CPUARMState *env = &cpu->env; + DumpState *s = opaque; + uint64_t pstate, sp; + int ret, i; +@@ -163,7 +267,18 @@ int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, + return -1; + } + +- return aarch64_write_elf64_prfpreg(f, env, cpuid, s); ++ ret = aarch64_write_elf64_prfpreg(f, env, cpuid, s); ++ if (ret) { ++ return ret; ++ } ++ ++#ifdef TARGET_AARCH64 ++ if (cpu_isar_feature(aa64_sve, cpu)) { ++ ret = aarch64_write_elf64_sve(f, env, cpuid, s); ++ } ++#endif ++ ++ return ret; + } + + /* struct pt_regs from arch/arm/include/asm/ptrace.h */ +@@ -335,6 +450,11 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus) + if (class == ELFCLASS64) { + note_size = AARCH64_PRSTATUS_NOTE_SIZE; + note_size += AARCH64_PRFPREG_NOTE_SIZE; ++#ifdef TARGET_AARCH64 ++ if (cpu_isar_feature(aa64_sve, cpu)) { ++ note_size += AARCH64_SVE_NOTE_SIZE(env); ++ } ++#endif + } else { + note_size = ARM_PRSTATUS_NOTE_SIZE; + if (arm_feature(env, ARM_FEATURE_VFP)) { +diff --git a/target/arm/cpu.h b/target/arm/cpu.h +index 83a809d..82dd3cc 100644 +--- a/target/arm/cpu.h ++++ b/target/arm/cpu.h +@@ -975,6 +975,31 @@ void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq); + void aarch64_sve_change_el(CPUARMState *env, int old_el, + int new_el, bool el0_a64); + void aarch64_add_sve_properties(Object *obj); ++ ++/* ++ * SVE registers are encoded in KVM's memory in an endianness-invariant format. ++ * The byte at offset i from the start of the in-memory representation contains ++ * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the ++ * lowest offsets are stored in the lowest memory addresses, then that nearly ++ * matches QEMU's representation, which is to use an array of host-endian ++ * uint64_t's, where the lower offsets are at the lower indices. To complete ++ * the translation we just need to byte swap the uint64_t's on big-endian hosts. ++ */ ++static inline uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) ++{ ++#ifdef HOST_WORDS_BIGENDIAN ++ int i; ++ ++ for (i = 0; i < nr; ++i) { ++ dst[i] = bswap64(src[i]); ++ } ++ ++ return dst; ++#else ++ return src; ++#endif ++} ++ + #else + static inline void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq) { } + static inline void aarch64_sve_change_el(CPUARMState *env, int o, +diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c +index 876184b..e2da756 100644 +--- a/target/arm/kvm64.c ++++ b/target/arm/kvm64.c +@@ -877,30 +877,6 @@ static int kvm_arch_put_fpsimd(CPUState *cs) + } + + /* +- * SVE registers are encoded in KVM's memory in an endianness-invariant format. +- * The byte at offset i from the start of the in-memory representation contains +- * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the +- * lowest offsets are stored in the lowest memory addresses, then that nearly +- * matches QEMU's representation, which is to use an array of host-endian +- * uint64_t's, where the lower offsets are at the lower indices. To complete +- * the translation we just need to byte swap the uint64_t's on big-endian hosts. +- */ +-static uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) +-{ +-#ifdef HOST_WORDS_BIGENDIAN +- int i; +- +- for (i = 0; i < nr; ++i) { +- dst[i] = bswap64(src[i]); +- } +- +- return dst; +-#else +- return src; +-#endif +-} +- +-/* + * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits + * and PREGS and the FFR have a slice size of 256 bits. However we simply hard + * code the slice index to zero for now as it's unlikely we'll need more than +-- +1.8.3.1 + diff --git a/kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch b/kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch new file mode 100755 index 0000000000000000000000000000000000000000..601b8c4dc40d84f6b4afb2bd2c477fecee9fa94d --- /dev/null +++ b/kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch @@ -0,0 +1,281 @@ +From 730f72105b478553c4f22555c29b0f64224ff914 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:14 +0000 +Subject: [PATCH 12/15] target/arm/cpu: Add the kvm-no-adjvtime CPU property +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-6-drjones@redhat.com> +Patchwork-id: 93623 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/5] target/arm/cpu: Add the kvm-no-adjvtime CPU property +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + target/arm/cpu: Add the kvm-no-adjvtime CPU property + + kvm-no-adjvtime is a KVM specific CPU property and a first of its + kind. To accommodate it we also add kvm_arm_add_vcpu_properties() + and a KVM specific CPU properties description to the CPU features + document. + + Signed-off-by: Andrew Jones + Message-id: 20200120101023.16030-7-drjones@redhat.com + Reviewed-by: Peter Maydell + Signed-off-by: Peter Maydell + +(cherry picked from commit dea101a1ae9968c9fec6ab0291489dad7c49f36f) +Signed-off-by: Danilo C. L. de Paula + +Conflicts: + Dropped the second hunk of the hw/arm/virt.c changes + as they would patch dead code. + +Signed-off-by: Danilo C. L. de Paula +--- + docs/arm-cpu-features.rst | 37 ++++++++++++++++++++++++++++++++++++- + hw/arm/virt.c | 5 +++++ + include/hw/arm/virt.h | 1 + + target/arm/cpu.c | 2 ++ + target/arm/cpu64.c | 1 + + target/arm/kvm.c | 28 ++++++++++++++++++++++++++++ + target/arm/kvm_arm.h | 11 +++++++++++ + target/arm/monitor.c | 1 + + tests/arm-cpu-features.c | 4 ++++ + 9 files changed, 89 insertions(+), 1 deletion(-) + +diff --git a/docs/arm-cpu-features.rst b/docs/arm-cpu-features.rst +index 1b367e2..45d1eb6 100644 +--- a/docs/arm-cpu-features.rst ++++ b/docs/arm-cpu-features.rst +@@ -31,7 +31,9 @@ supporting the feature or only supporting the feature under certain + configurations. For example, the `aarch64` CPU feature, which, when + disabled, enables the optional AArch32 CPU feature, is only supported + when using the KVM accelerator and when running on a host CPU type that +-supports the feature. ++supports the feature. While `aarch64` currently only works with KVM, ++it could work with TCG. CPU features that are specific to KVM are ++prefixed with "kvm-" and are described in "KVM VCPU Features". + + CPU Feature Probing + =================== +@@ -171,6 +173,39 @@ disabling many SVE vector lengths would be quite verbose, the `sve` CPU + properties have special semantics (see "SVE CPU Property Parsing + Semantics"). + ++KVM VCPU Features ++================= ++ ++KVM VCPU features are CPU features that are specific to KVM, such as ++paravirt features or features that enable CPU virtualization extensions. ++The features' CPU properties are only available when KVM is enabled and ++are named with the prefix "kvm-". KVM VCPU features may be probed, ++enabled, and disabled in the same way as other CPU features. Below is ++the list of KVM VCPU features and their descriptions. ++ ++ kvm-no-adjvtime By default kvm-no-adjvtime is disabled. This ++ means that by default the virtual time ++ adjustment is enabled (vtime is *not not* ++ adjusted). ++ ++ When virtual time adjustment is enabled each ++ time the VM transitions back to running state ++ the VCPU's virtual counter is updated to ensure ++ stopped time is not counted. This avoids time ++ jumps surprising guest OSes and applications, ++ as long as they use the virtual counter for ++ timekeeping. However it has the side effect of ++ the virtual and physical counters diverging. ++ All timekeeping based on the virtual counter ++ will appear to lag behind any timekeeping that ++ does not subtract VM stopped time. The guest ++ may resynchronize its virtual counter with ++ other time sources as needed. ++ ++ Enable kvm-no-adjvtime to disable virtual time ++ adjustment, also restoring the legacy (pre-5.0) ++ behavior. ++ + SVE CPU Properties + ================== + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index e108391..d30d38c 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -1707,6 +1707,11 @@ static void machvirt_init(MachineState *machine) + } + } + ++ if (vmc->kvm_no_adjvtime && ++ object_property_find(cpuobj, "kvm-no-adjvtime", NULL)) { ++ object_property_set_bool(cpuobj, true, "kvm-no-adjvtime", NULL); ++ } ++ + if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) { + object_property_set_bool(cpuobj, false, "pmu", NULL); + } +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index 53fdf16..77828ce 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -109,6 +109,7 @@ typedef struct { + bool smbios_old_sys_ver; + bool no_highmem_ecam; + bool no_ged; /* Machines < 4.2 has no support for ACPI GED device */ ++ bool kvm_no_adjvtime; + } VirtMachineClass; + + typedef struct { +diff --git a/target/arm/cpu.c b/target/arm/cpu.c +index 3788fc3..e46efe9 100644 +--- a/target/arm/cpu.c ++++ b/target/arm/cpu.c +@@ -2482,6 +2482,7 @@ static void arm_max_initfn(Object *obj) + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); ++ kvm_arm_add_vcpu_properties(obj); + } else { + cortex_a15_initfn(obj); + +@@ -2673,6 +2674,7 @@ static void arm_host_initfn(Object *obj) + if (arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) { + aarch64_add_sve_properties(obj); + } ++ kvm_arm_add_vcpu_properties(obj); + arm_cpu_post_init(obj); + } + +diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c +index a39d6fc..3cd416d 100644 +--- a/target/arm/cpu64.c ++++ b/target/arm/cpu64.c +@@ -605,6 +605,7 @@ static void aarch64_max_initfn(Object *obj) + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); ++ kvm_arm_add_vcpu_properties(obj); + } else { + uint64_t t; + uint32_t u; +diff --git a/target/arm/kvm.c b/target/arm/kvm.c +index 26d7f8b..4be9497 100644 +--- a/target/arm/kvm.c ++++ b/target/arm/kvm.c +@@ -17,6 +17,8 @@ + #include "qemu/timer.h" + #include "qemu/error-report.h" + #include "qemu/main-loop.h" ++#include "qom/object.h" ++#include "qapi/error.h" + #include "sysemu/sysemu.h" + #include "sysemu/kvm.h" + #include "sysemu/kvm_int.h" +@@ -179,6 +181,32 @@ void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu) + env->features = arm_host_cpu_features.features; + } + ++static bool kvm_no_adjvtime_get(Object *obj, Error **errp) ++{ ++ return !ARM_CPU(obj)->kvm_adjvtime; ++} ++ ++static void kvm_no_adjvtime_set(Object *obj, bool value, Error **errp) ++{ ++ ARM_CPU(obj)->kvm_adjvtime = !value; ++} ++ ++/* KVM VCPU properties should be prefixed with "kvm-". */ ++void kvm_arm_add_vcpu_properties(Object *obj) ++{ ++ if (!kvm_enabled()) { ++ return; ++ } ++ ++ ARM_CPU(obj)->kvm_adjvtime = true; ++ object_property_add_bool(obj, "kvm-no-adjvtime", kvm_no_adjvtime_get, ++ kvm_no_adjvtime_set, &error_abort); ++ object_property_set_description(obj, "kvm-no-adjvtime", ++ "Set on to disable the adjustment of " ++ "the virtual counter. VM stopped time " ++ "will be counted.", &error_abort); ++} ++ + bool kvm_arm_pmu_supported(CPUState *cpu) + { + KVMState *s = KVM_STATE(current_machine->accelerator); +diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h +index 01a9a18..ae9e075 100644 +--- a/target/arm/kvm_arm.h ++++ b/target/arm/kvm_arm.h +@@ -256,6 +256,15 @@ void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map); + void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu); + + /** ++ * kvm_arm_add_vcpu_properties: ++ * @obj: The CPU object to add the properties to ++ * ++ * Add all KVM specific CPU properties to the CPU object. These ++ * are the CPU properties with "kvm-" prefixed names. ++ */ ++void kvm_arm_add_vcpu_properties(Object *obj); ++ ++/** + * kvm_arm_aarch32_supported: + * @cs: CPUState + * +@@ -345,6 +354,8 @@ static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu) + cpu->host_cpu_probe_failed = true; + } + ++static inline void kvm_arm_add_vcpu_properties(Object *obj) {} ++ + static inline bool kvm_arm_aarch32_supported(CPUState *cs) + { + return false; +diff --git a/target/arm/monitor.c b/target/arm/monitor.c +index fa054f8..9725dff 100644 +--- a/target/arm/monitor.c ++++ b/target/arm/monitor.c +@@ -103,6 +103,7 @@ static const char *cpu_model_advertised_features[] = { + "sve128", "sve256", "sve384", "sve512", + "sve640", "sve768", "sve896", "sve1024", "sve1152", "sve1280", + "sve1408", "sve1536", "sve1664", "sve1792", "sve1920", "sve2048", ++ "kvm-no-adjvtime", + NULL + }; + +diff --git a/tests/arm-cpu-features.c b/tests/arm-cpu-features.c +index 89285ca..ba1a6fe 100644 +--- a/tests/arm-cpu-features.c ++++ b/tests/arm-cpu-features.c +@@ -428,6 +428,8 @@ static void test_query_cpu_model_expansion(const void *data) + assert_has_feature_enabled(qts, "cortex-a15", "pmu"); + assert_has_not_feature(qts, "cortex-a15", "aarch64"); + ++ assert_has_not_feature(qts, "max", "kvm-no-adjvtime"); ++ + if (g_str_equal(qtest_get_arch(), "aarch64")) { + assert_has_feature_enabled(qts, "max", "aarch64"); + assert_has_feature_enabled(qts, "max", "sve"); +@@ -462,6 +464,8 @@ static void test_query_cpu_model_expansion_kvm(const void *data) + return; + } + ++ assert_has_feature_disabled(qts, "host", "kvm-no-adjvtime"); ++ + if (g_str_equal(qtest_get_arch(), "aarch64")) { + bool kvm_supports_sve; + char max_name[8], name[8]; +-- +1.8.3.1 + diff --git a/kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch b/kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch new file mode 100755 index 0000000000000000000000000000000000000000..3396a32c9cece453516b6cdfd39a4eee1d06f2ad --- /dev/null +++ b/kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch @@ -0,0 +1,330 @@ +From 5388ea3fc0737d1a659256ff3663057bef484c19 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:13 +0000 +Subject: [PATCH 11/15] target/arm/kvm: Implement virtual time adjustment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-5-drjones@redhat.com> +Patchwork-id: 93622 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/5] target/arm/kvm: Implement virtual time adjustment +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + target/arm/kvm: Implement virtual time adjustment + + When a VM is stopped (such as when it's paused) guest virtual time + should stop counting. Otherwise, when the VM is resumed it will + experience time jumps and its kernel may report soft lockups. Not + counting virtual time while the VM is stopped has the side effect + of making the guest's time appear to lag when compared with real + time, and even with time derived from the physical counter. For + this reason, this change, which is enabled by default, comes with + a KVM CPU feature allowing it to be disabled, restoring legacy + behavior. + + This patch only provides the implementation of the virtual time + adjustment. A subsequent patch will provide the CPU property + allowing the change to be enabled and disabled. + + Reported-by: Bijan Mottahedeh + Signed-off-by: Andrew Jones + Message-id: 20200120101023.16030-6-drjones@redhat.com + Reviewed-by: Peter Maydell + Signed-off-by: Peter Maydell + +(cherry picked from commit e5ac4200b4cddf44df9adbef677af0d1f1c579c6) +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/cpu.h | 7 ++++ + target/arm/kvm.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + target/arm/kvm32.c | 3 ++ + target/arm/kvm64.c | 3 ++ + target/arm/kvm_arm.h | 38 ++++++++++++++++++++++ + target/arm/machine.c | 7 ++++ + 6 files changed, 150 insertions(+) + +diff --git a/target/arm/cpu.h b/target/arm/cpu.h +index 82dd3cc..fbd8ea0 100644 +--- a/target/arm/cpu.h ++++ b/target/arm/cpu.h +@@ -821,6 +821,13 @@ struct ARMCPU { + /* KVM init features for this CPU */ + uint32_t kvm_init_features[7]; + ++ /* KVM CPU state */ ++ ++ /* KVM virtual time adjustment */ ++ bool kvm_adjvtime; ++ bool kvm_vtime_dirty; ++ uint64_t kvm_vtime; ++ + /* Uniprocessor system with MP extensions */ + bool mp_is_up; + +diff --git a/target/arm/kvm.c b/target/arm/kvm.c +index 5b82cef..26d7f8b 100644 +--- a/target/arm/kvm.c ++++ b/target/arm/kvm.c +@@ -359,6 +359,22 @@ static int compare_u64(const void *a, const void *b) + return 0; + } + ++/* ++ * cpreg_values are sorted in ascending order by KVM register ID ++ * (see kvm_arm_init_cpreg_list). This allows us to cheaply find ++ * the storage for a KVM register by ID with a binary search. ++ */ ++static uint64_t *kvm_arm_get_cpreg_ptr(ARMCPU *cpu, uint64_t regidx) ++{ ++ uint64_t *res; ++ ++ res = bsearch(®idx, cpu->cpreg_indexes, cpu->cpreg_array_len, ++ sizeof(uint64_t), compare_u64); ++ assert(res); ++ ++ return &cpu->cpreg_values[res - cpu->cpreg_indexes]; ++} ++ + /* Initialize the ARMCPU cpreg list according to the kernel's + * definition of what CPU registers it knows about (and throw away + * the previous TCG-created cpreg list). +@@ -512,6 +528,23 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level) + return ok; + } + ++void kvm_arm_cpu_pre_save(ARMCPU *cpu) ++{ ++ /* KVM virtual time adjustment */ ++ if (cpu->kvm_vtime_dirty) { ++ *kvm_arm_get_cpreg_ptr(cpu, KVM_REG_ARM_TIMER_CNT) = cpu->kvm_vtime; ++ } ++} ++ ++void kvm_arm_cpu_post_load(ARMCPU *cpu) ++{ ++ /* KVM virtual time adjustment */ ++ if (cpu->kvm_adjvtime) { ++ cpu->kvm_vtime = *kvm_arm_get_cpreg_ptr(cpu, KVM_REG_ARM_TIMER_CNT); ++ cpu->kvm_vtime_dirty = true; ++ } ++} ++ + void kvm_arm_reset_vcpu(ARMCPU *cpu) + { + int ret; +@@ -579,6 +612,50 @@ int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu) + return 0; + } + ++void kvm_arm_get_virtual_time(CPUState *cs) ++{ ++ ARMCPU *cpu = ARM_CPU(cs); ++ struct kvm_one_reg reg = { ++ .id = KVM_REG_ARM_TIMER_CNT, ++ .addr = (uintptr_t)&cpu->kvm_vtime, ++ }; ++ int ret; ++ ++ if (cpu->kvm_vtime_dirty) { ++ return; ++ } ++ ++ ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); ++ if (ret) { ++ error_report("Failed to get KVM_REG_ARM_TIMER_CNT"); ++ abort(); ++ } ++ ++ cpu->kvm_vtime_dirty = true; ++} ++ ++void kvm_arm_put_virtual_time(CPUState *cs) ++{ ++ ARMCPU *cpu = ARM_CPU(cs); ++ struct kvm_one_reg reg = { ++ .id = KVM_REG_ARM_TIMER_CNT, ++ .addr = (uintptr_t)&cpu->kvm_vtime, ++ }; ++ int ret; ++ ++ if (!cpu->kvm_vtime_dirty) { ++ return; ++ } ++ ++ ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); ++ if (ret) { ++ error_report("Failed to set KVM_REG_ARM_TIMER_CNT"); ++ abort(); ++ } ++ ++ cpu->kvm_vtime_dirty = false; ++} ++ + int kvm_put_vcpu_events(ARMCPU *cpu) + { + CPUARMState *env = &cpu->env; +@@ -690,6 +767,21 @@ MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) + return MEMTXATTRS_UNSPECIFIED; + } + ++void kvm_arm_vm_state_change(void *opaque, int running, RunState state) ++{ ++ CPUState *cs = opaque; ++ ARMCPU *cpu = ARM_CPU(cs); ++ ++ if (running) { ++ if (cpu->kvm_adjvtime) { ++ kvm_arm_put_virtual_time(cs); ++ } ++ } else { ++ if (cpu->kvm_adjvtime) { ++ kvm_arm_get_virtual_time(cs); ++ } ++ } ++} + + int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) + { +diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c +index 32bf8d6..3a8b437 100644 +--- a/target/arm/kvm32.c ++++ b/target/arm/kvm32.c +@@ -16,6 +16,7 @@ + #include "qemu-common.h" + #include "cpu.h" + #include "qemu/timer.h" ++#include "sysemu/runstate.h" + #include "sysemu/kvm.h" + #include "kvm_arm.h" + #include "internals.h" +@@ -198,6 +199,8 @@ int kvm_arch_init_vcpu(CPUState *cs) + return -EINVAL; + } + ++ qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs); ++ + /* Determine init features for this CPU */ + memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); + if (cpu->start_powered_off) { +diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c +index 666a81a..d368189 100644 +--- a/target/arm/kvm64.c ++++ b/target/arm/kvm64.c +@@ -23,6 +23,7 @@ + #include "qemu/host-utils.h" + #include "qemu/main-loop.h" + #include "exec/gdbstub.h" ++#include "sysemu/runstate.h" + #include "sysemu/kvm.h" + #include "sysemu/kvm_int.h" + #include "kvm_arm.h" +@@ -735,6 +736,8 @@ int kvm_arch_init_vcpu(CPUState *cs) + return -EINVAL; + } + ++ qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs); ++ + /* Determine init features for this CPU */ + memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); + if (cpu->start_powered_off) { +diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h +index b48a9c9..01a9a18 100644 +--- a/target/arm/kvm_arm.h ++++ b/target/arm/kvm_arm.h +@@ -128,6 +128,23 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level); + bool write_kvmstate_to_list(ARMCPU *cpu); + + /** ++ * kvm_arm_cpu_pre_save: ++ * @cpu: ARMCPU ++ * ++ * Called after write_kvmstate_to_list() from cpu_pre_save() to update ++ * the cpreg list with KVM CPU state. ++ */ ++void kvm_arm_cpu_pre_save(ARMCPU *cpu); ++ ++/** ++ * kvm_arm_cpu_post_load: ++ * @cpu: ARMCPU ++ * ++ * Called from cpu_post_load() to update KVM CPU state from the cpreg list. ++ */ ++void kvm_arm_cpu_post_load(ARMCPU *cpu); ++ ++/** + * kvm_arm_reset_vcpu: + * @cpu: ARMCPU + * +@@ -292,6 +309,24 @@ int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu); + */ + int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu); + ++/** ++ * kvm_arm_get_virtual_time: ++ * @cs: CPUState ++ * ++ * Gets the VCPU's virtual counter and stores it in the KVM CPU state. ++ */ ++void kvm_arm_get_virtual_time(CPUState *cs); ++ ++/** ++ * kvm_arm_put_virtual_time: ++ * @cs: CPUState ++ * ++ * Sets the VCPU's virtual counter to the value stored in the KVM CPU state. ++ */ ++void kvm_arm_put_virtual_time(CPUState *cs); ++ ++void kvm_arm_vm_state_change(void *opaque, int running, RunState state); ++ + int kvm_arm_vgic_probe(void); + + void kvm_arm_pmu_set_irq(CPUState *cs, int irq); +@@ -339,6 +374,9 @@ static inline void kvm_arm_pmu_set_irq(CPUState *cs, int irq) {} + static inline void kvm_arm_pmu_init(CPUState *cs) {} + + static inline void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map) {} ++ ++static inline void kvm_arm_get_virtual_time(CPUState *cs) {} ++static inline void kvm_arm_put_virtual_time(CPUState *cs) {} + #endif + + static inline const char *gic_class_name(void) +diff --git a/target/arm/machine.c b/target/arm/machine.c +index eb28b23..241890a 100644 +--- a/target/arm/machine.c ++++ b/target/arm/machine.c +@@ -642,6 +642,12 @@ static int cpu_pre_save(void *opaque) + /* This should never fail */ + abort(); + } ++ ++ /* ++ * kvm_arm_cpu_pre_save() must be called after ++ * write_kvmstate_to_list() ++ */ ++ kvm_arm_cpu_pre_save(cpu); + } else { + if (!write_cpustate_to_list(cpu, false)) { + /* This should never fail. */ +@@ -744,6 +750,7 @@ static int cpu_post_load(void *opaque, int version_id) + * we're using it. + */ + write_list_to_cpustate(cpu); ++ kvm_arm_cpu_post_load(cpu); + } else { + if (!write_list_to_cpustate(cpu)) { + return -1; +-- +1.8.3.1 + diff --git a/kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch b/kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch new file mode 100755 index 0000000000000000000000000000000000000000..8cdc867e708d492718655c0693b33bb98048ffcd --- /dev/null +++ b/kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch @@ -0,0 +1,197 @@ +From 11cb9cb7b1b56d5c9723e9c50bc2903281893bcc Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:10 +0000 +Subject: [PATCH 08/15] target/arm/kvm: trivial: Clean up header documentation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-2-drjones@redhat.com> +Patchwork-id: 93625 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/5] target/arm/kvm: trivial: Clean up header documentation +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:05 +0000 + + target/arm/kvm: trivial: Clean up header documentation + + Signed-off-by: Andrew Jones + Message-id: 20200120101023.16030-2-drjones@redhat.com + Reviewed-by: Peter Maydell + Signed-off-by: Peter Maydell + +(cherry picked from commit d1ebbc9d16297b54b153ee33abe05eb4f1df0c66) +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/kvm_arm.h | 46 +++++++++++++++++++++++++++------------------- + 1 file changed, 27 insertions(+), 19 deletions(-) + +diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h +index 8e14d40..b48a9c9 100644 +--- a/target/arm/kvm_arm.h ++++ b/target/arm/kvm_arm.h +@@ -28,9 +28,9 @@ + int kvm_arm_vcpu_init(CPUState *cs); + + /** +- * kvm_arm_vcpu_finalize ++ * kvm_arm_vcpu_finalize: + * @cs: CPUState +- * @feature: int ++ * @feature: feature to finalize + * + * Finalizes the configuration of the specified VCPU feature by + * invoking the KVM_ARM_VCPU_FINALIZE ioctl. Features requiring +@@ -75,8 +75,8 @@ void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group, + int kvm_arm_init_cpreg_list(ARMCPU *cpu); + + /** +- * kvm_arm_reg_syncs_via_cpreg_list +- * regidx: KVM register index ++ * kvm_arm_reg_syncs_via_cpreg_list: ++ * @regidx: KVM register index + * + * Return true if this KVM register should be synchronized via the + * cpreg list of arbitrary system registers, false if it is synchronized +@@ -85,8 +85,8 @@ int kvm_arm_init_cpreg_list(ARMCPU *cpu); + bool kvm_arm_reg_syncs_via_cpreg_list(uint64_t regidx); + + /** +- * kvm_arm_cpreg_level +- * regidx: KVM register index ++ * kvm_arm_cpreg_level: ++ * @regidx: KVM register index + * + * Return the level of this coprocessor/system register. Return value is + * either KVM_PUT_RUNTIME_STATE, KVM_PUT_RESET_STATE, or KVM_PUT_FULL_STATE. +@@ -148,6 +148,8 @@ void kvm_arm_init_serror_injection(CPUState *cs); + * @cpu: ARMCPU + * + * Get VCPU related state from kvm. ++ * ++ * Returns: 0 if success else < 0 error code + */ + int kvm_get_vcpu_events(ARMCPU *cpu); + +@@ -156,6 +158,8 @@ int kvm_get_vcpu_events(ARMCPU *cpu); + * @cpu: ARMCPU + * + * Put VCPU related state to kvm. ++ * ++ * Returns: 0 if success else < 0 error code + */ + int kvm_put_vcpu_events(ARMCPU *cpu); + +@@ -205,10 +209,12 @@ typedef struct ARMHostCPUFeatures { + + /** + * kvm_arm_get_host_cpu_features: +- * @ahcc: ARMHostCPUClass to fill in ++ * @ahcf: ARMHostCPUClass to fill in + * + * Probe the capabilities of the host kernel's preferred CPU and fill + * in the ARMHostCPUClass struct accordingly. ++ * ++ * Returns true on success and false otherwise. + */ + bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf); + +@@ -242,7 +248,7 @@ void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu); + bool kvm_arm_aarch32_supported(CPUState *cs); + + /** +- * bool kvm_arm_pmu_supported: ++ * kvm_arm_pmu_supported: + * @cs: CPUState + * + * Returns: true if the KVM VCPU can enable its PMU +@@ -251,7 +257,7 @@ bool kvm_arm_aarch32_supported(CPUState *cs); + bool kvm_arm_pmu_supported(CPUState *cs); + + /** +- * bool kvm_arm_sve_supported: ++ * kvm_arm_sve_supported: + * @cs: CPUState + * + * Returns true if the KVM VCPU can enable SVE and false otherwise. +@@ -259,26 +265,30 @@ bool kvm_arm_pmu_supported(CPUState *cs); + bool kvm_arm_sve_supported(CPUState *cs); + + /** +- * kvm_arm_get_max_vm_ipa_size - Returns the number of bits in the +- * IPA address space supported by KVM +- * ++ * kvm_arm_get_max_vm_ipa_size: + * @ms: Machine state handle ++ * ++ * Returns the number of bits in the IPA address space supported by KVM + */ + int kvm_arm_get_max_vm_ipa_size(MachineState *ms); + + /** +- * kvm_arm_sync_mpstate_to_kvm ++ * kvm_arm_sync_mpstate_to_kvm: + * @cpu: ARMCPU + * + * If supported set the KVM MP_STATE based on QEMU's model. ++ * ++ * Returns 0 on success and -1 on failure. + */ + int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu); + + /** +- * kvm_arm_sync_mpstate_to_qemu ++ * kvm_arm_sync_mpstate_to_qemu: + * @cpu: ARMCPU + * + * If supported get the MP_STATE from KVM and store in QEMU's model. ++ * ++ * Returns 0 on success and aborts on failure. + */ + int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu); + +@@ -292,7 +302,8 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); + + static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu) + { +- /* This should never actually be called in the "not KVM" case, ++ /* ++ * This should never actually be called in the "not KVM" case, + * but set up the fields to indicate an error anyway. + */ + cpu->kvm_target = QEMU_KVM_ARM_TARGET_NONE; +@@ -377,23 +388,20 @@ bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit); + * + * Return: TRUE if any hardware breakpoints in use. + */ +- + bool kvm_arm_hw_debug_active(CPUState *cs); + + /** + * kvm_arm_copy_hw_debug_data: +- * + * @ptr: kvm_guest_debug_arch structure + * + * Copy the architecture specific debug registers into the + * kvm_guest_debug ioctl structure. + */ + struct kvm_guest_debug_arch; +- + void kvm_arm_copy_hw_debug_data(struct kvm_guest_debug_arch *ptr); + + /** +- * its_class_name ++ * its_class_name: + * + * Return the ITS class name to use depending on whether KVM acceleration + * and KVM CAP_SIGNAL_MSI are supported +-- +1.8.3.1 + diff --git a/kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch b/kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch new file mode 100755 index 0000000000000000000000000000000000000000..36c0f1aa2ae8b82a5de9da3098113994ea8ee1df --- /dev/null +++ b/kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch @@ -0,0 +1,60 @@ +From 2740a84fe798ade5c1ce725d65cdaffb255da47c Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:11 +0000 +Subject: [PATCH 09/15] target/arm/kvm64: kvm64 cpus have timer registers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-3-drjones@redhat.com> +Patchwork-id: 93621 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/5] target/arm/kvm64: kvm64 cpus have timer registers +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + target/arm/kvm64: kvm64 cpus have timer registers + + Add the missing GENERIC_TIMER feature to kvm64 cpus. + + We don't currently use these registers when KVM is enabled, but it's + probably best we add the feature flag for consistency and potential + future use. There's also precedent, as we add the PMU feature flag to + KVM enabled guests, even though we don't use those registers either. + + This change was originally posted as a hunk of a different, never + merged patch from Bijan Mottahedeh. + + Signed-off-by: Andrew Jones + Reviewed-by: Richard Henderson + Message-id: 20200120101023.16030-4-drjones@redhat.com + Signed-off-by: Peter Maydell + +(cherry picked from commit 65caa415487f4a6e265105446c6ef8f56bb0aa70) +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/kvm64.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c +index e2da756..666a81a 100644 +--- a/target/arm/kvm64.c ++++ b/target/arm/kvm64.c +@@ -605,6 +605,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) + set_feature(&features, ARM_FEATURE_NEON); + set_feature(&features, ARM_FEATURE_AARCH64); + set_feature(&features, ARM_FEATURE_PMU); ++ set_feature(&features, ARM_FEATURE_GENERIC_TIMER); + + ahcf->features = features; + +-- +1.8.3.1 + diff --git a/kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch b/kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch new file mode 100755 index 0000000000000000000000000000000000000000..55f328dcf6b5dfb25a72de098f3ce3afb8e657fc --- /dev/null +++ b/kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch @@ -0,0 +1,81 @@ +From c82cf5c08617c947b34eb490d1714729103e3379 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Mon, 10 Feb 2020 17:33:57 +0000 +Subject: [PATCH 17/18] target/arm/monitor: query-cpu-model-expansion crashed + qemu when using machine type none +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200210173358.16896-2-drjones@redhat.com> +Patchwork-id: 93773 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] target/arm/monitor: query-cpu-model-expansion crashed qemu when using machine type none +Bugzilla: 1801320 +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan +RH-Acked-by: Philippe Mathieu-Daudé + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1801320 + +Author: Liang Yan +Date: Fri, 07 Feb 2020 14:04:21 +0000 + + target/arm/monitor: query-cpu-model-expansion crashed qemu when using machine type none + + Commit e19afd566781 mentioned that target-arm only supports queryable + cpu models 'max', 'host', and the current type when KVM is in use. + The logic works well until using machine type none. + + For machine type none, cpu_type will be null if cpu option is not + set by command line, strlen(cpu_type) will terminate process. + So We add a check above it. + + This won't affect i386 and s390x since they do not use current_cpu. + + Signed-off-by: Liang Yan + Message-id: 20200203134251.12986-1-lyan@suse.com + Reviewed-by: Andrew Jones + Tested-by: Andrew Jones + Signed-off-by: Peter Maydell + +(cherry picked from commit 0999a4ba8718aa96105b978d3567fc7e90244c7e) +Signed-off-by: Danilo C. L. de Paula +--- + target/arm/monitor.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +diff --git a/target/arm/monitor.c b/target/arm/monitor.c +index 9725dff..c2dc790 100644 +--- a/target/arm/monitor.c ++++ b/target/arm/monitor.c +@@ -137,17 +137,20 @@ CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type, + } + + if (kvm_enabled()) { +- const char *cpu_type = current_machine->cpu_type; +- int len = strlen(cpu_type) - strlen(ARM_CPU_TYPE_SUFFIX); + bool supported = false; + + if (!strcmp(model->name, "host") || !strcmp(model->name, "max")) { + /* These are kvmarm's recommended cpu types */ + supported = true; +- } else if (strlen(model->name) == len && +- !strncmp(model->name, cpu_type, len)) { +- /* KVM is enabled and we're using this type, so it works. */ +- supported = true; ++ } else if (current_machine->cpu_type) { ++ const char *cpu_type = current_machine->cpu_type; ++ int len = strlen(cpu_type) - strlen(ARM_CPU_TYPE_SUFFIX); ++ ++ if (strlen(model->name) == len && ++ !strncmp(model->name, cpu_type, len)) { ++ /* KVM is enabled and we're using this type, so it works. */ ++ supported = true; ++ } + } + if (!supported) { + error_setg(errp, "We cannot guarantee the CPU type '%s' works " +-- +1.8.3.1 + diff --git a/kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch b/kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch new file mode 100755 index 0000000000000000000000000000000000000000..ffb6ab7e1c7b763f6da9631eb1406e26be70e7a0 --- /dev/null +++ b/kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch @@ -0,0 +1,83 @@ +From 4c9201a83e3ff48d2a55e45a34eb27966a1e4ab0 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 5 Jun 2020 18:37:33 -0400 +Subject: [PATCH 3/3] target/i386: Add ARCH_CAPABILITIES related bits into + Icelake-Server CPU model +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: plai@redhat.com +Message-id: <20200605183733.8269-1-plai@redhat.com> +Patchwork-id: 97380 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH] target/i386: Add ARCH_CAPABILITIES related bits into Icelake-Server CPU model +Bugzilla: 1840342 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Bandan Das +RH-Acked-by: Danilo de Paula +RH-Acked-by: Eduardo Habkost + +From: Xiaoyao Li + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1840342 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=28983822 +Branch: rhel-av-8.2.1 + +Tested on HOST: intel-whitley-09.khw1.lab.eng.bos.redhat.com + +1. qemu-kvm -cpu host … + VM guest does have arch_capabilities in cpuinfo/flags. + [Expected success] + +2. qemu-kvm -cpu Icelake-Server … + VM guest does NOT have arch_capabilities in cpuinfo/flags. + [Expected failure] + +3. qemu-kvm -cpu Icelake-Server-v3 … + VM guest does have arch_capabilities in cpuinfo/flags. + [Expected success] + +--- + +Current Icelake-Server CPU model lacks all the features enumerated by +MSR_IA32_ARCH_CAPABILITIES. + +Add them, so that guest of "Icelake-Server" can see all of them. + +Signed-off-by: Xiaoyao Li +Message-Id: <20200316095605.12318-1-xiaoyao.li@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit d965dc35592d24c0c1519f1c566223c6277cb80e) +Signed-off-by: Paul Lai +Signed-off-by: Eduardo Lima (Etrunko) +--- + target/i386/cpu.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index b763adcdc5..7d7b016bb7 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3496,6 +3496,19 @@ static X86CPUDefinition builtin_x86_defs[] = { + { /* end of list */ } + }, + }, ++ { ++ .version = 3, ++ .props = (PropValue[]) { ++ { "arch-capabilities", "on" }, ++ { "rdctl-no", "on" }, ++ { "ibrs-all", "on" }, ++ { "skip-l1dfl-vmentry", "on" }, ++ { "mds-no", "on" }, ++ { "pschange-mc-no", "on" }, ++ { "taa-no", "on" }, ++ { /* end of list */ } ++ }, ++ }, + { /* end of list */ } + } + }, +-- +2.27.0 + diff --git a/kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch b/kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch new file mode 100755 index 0000000000000000000000000000000000000000..ef95ccfa080d81bc2f84041a999e39a6cd06fe00 --- /dev/null +++ b/kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch @@ -0,0 +1,103 @@ +From 1ffeb321151b3878bcbb2229639456c0677305f5 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Fri, 15 May 2020 18:02:43 +0100 +Subject: [PATCH 17/17] target/i386: Add missed features to Cooperlake CPU + model + +RH-Author: plai@redhat.com +Message-id: <20200515180243.17488-5-plai@redhat.com> +Patchwork-id: 96611 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 4/4] target/i386: Add missed features to Cooperlake CPU model +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Xiaoyao Li + +It lacks VMX features and two security feature bits (disclosed recently) in +MSR_IA32_ARCH_CAPABILITIES in current Cooperlake CPU model, so add them. + +Fixes: 22a866b6166d ("i386: Add new CPU model Cooperlake") +Signed-off-by: Xiaoyao Li +Message-Id: <20191225063018.20038-3-xiaoyao.li@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 2dea9d9ca4ea7e9afe83d0b4153b21a16987e866) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 50 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 996a74f..b763adc 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3202,7 +3202,8 @@ static X86CPUDefinition builtin_x86_defs[] = { + CPUID_7_0_EDX_SPEC_CTRL_SSBD | CPUID_7_0_EDX_ARCH_CAPABILITIES, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | +- MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO, ++ MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | ++ MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_TAA_NO, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_AVX512_BF16, + /* +@@ -3217,6 +3218,54 @@ static X86CPUDefinition builtin_x86_defs[] = { + CPUID_XSAVE_XGETBV1, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, ++ /* Missing: Mode-based execute control (XS/XU), processor tracing, TSC scaling */ ++ .features[FEAT_VMX_BASIC] = MSR_VMX_BASIC_INS_OUTS | ++ MSR_VMX_BASIC_TRUE_CTLS, ++ .features[FEAT_VMX_ENTRY_CTLS] = VMX_VM_ENTRY_IA32E_MODE | ++ VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VMX_VM_ENTRY_LOAD_IA32_PAT | ++ VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_LOAD_IA32_EFER, ++ .features[FEAT_VMX_EPT_VPID_CAPS] = MSR_VMX_EPT_EXECONLY | ++ MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | ++ MSR_VMX_EPT_1GB | MSR_VMX_EPT_INVEPT | ++ MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | ++ MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | ++ MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | MSR_VMX_EPT_INVVPID_ALL_CONTEXT | ++ MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS | MSR_VMX_EPT_AD_BITS, ++ .features[FEAT_VMX_EXIT_CTLS] = ++ VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | ++ VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | ++ VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_LOAD_IA32_EFER | ++ VMX_VM_EXIT_SAVE_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | ++ VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, ++ .features[FEAT_VMX_MISC] = MSR_VMX_MISC_ACTIVITY_HLT | ++ MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_VMWRITE_VMEXIT, ++ .features[FEAT_VMX_PINBASED_CTLS] = VMX_PIN_BASED_EXT_INTR_MASK | ++ VMX_PIN_BASED_NMI_EXITING | VMX_PIN_BASED_VIRTUAL_NMIS | ++ VMX_PIN_BASED_VMX_PREEMPTION_TIMER | VMX_PIN_BASED_POSTED_INTR, ++ .features[FEAT_VMX_PROCBASED_CTLS] = VMX_CPU_BASED_VIRTUAL_INTR_PENDING | ++ VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | ++ VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | ++ VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | ++ VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | ++ VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_MOV_DR_EXITING | ++ VMX_CPU_BASED_UNCOND_IO_EXITING | VMX_CPU_BASED_USE_IO_BITMAPS | ++ VMX_CPU_BASED_MONITOR_EXITING | VMX_CPU_BASED_PAUSE_EXITING | ++ VMX_CPU_BASED_VIRTUAL_NMI_PENDING | VMX_CPU_BASED_USE_MSR_BITMAPS | ++ VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | ++ VMX_CPU_BASED_MONITOR_TRAP_FLAG | ++ VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, ++ .features[FEAT_VMX_SECONDARY_CTLS] = ++ VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | ++ VMX_SECONDARY_EXEC_WBINVD_EXITING | VMX_SECONDARY_EXEC_ENABLE_EPT | ++ VMX_SECONDARY_EXEC_DESC | VMX_SECONDARY_EXEC_RDTSCP | ++ VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | ++ VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | ++ VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | ++ VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | ++ VMX_SECONDARY_EXEC_RDRAND_EXITING | VMX_SECONDARY_EXEC_ENABLE_INVPCID | ++ VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | ++ VMX_SECONDARY_EXEC_RDSEED_EXITING | VMX_SECONDARY_EXEC_ENABLE_PML, ++ .features[FEAT_VMX_VMFUNC] = MSR_VMX_VMFUNC_EPT_SWITCHING, + .xlevel = 0x80000008, + .model_id = "Intel Xeon Processor (Cooperlake)", + }, +-- +1.8.3.1 + diff --git a/kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch b/kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch new file mode 100755 index 0000000000000000000000000000000000000000..ad2dd77472cdbcc8acce8731ac900e59b3306374 --- /dev/null +++ b/kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch @@ -0,0 +1,62 @@ +From 6f0630299a3edbb8f5e5ac41eb9e1f1c363f1e3e Mon Sep 17 00:00:00 2001 +From: Danilo de Paula +Date: Tue, 9 Jun 2020 18:46:51 +0100 +Subject: [PATCH 15/17] target/i386: Add new bit definitions of + MSR_IA32_ARCH_CAPABILITIES + +RH-Author: Danilo de Paula +Message-id: <20200609184651.1328372-1-ddepaula@redhat.com> +Patchwork-id: 97489 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 5/4] target/i386: Add new bit definitions of MSR_IA32_ARCH_CAPABILITIES +Bugzilla: 1769912 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Eduardo Habkost + +From: Danilo de Paula + +redhat: builds with that series were failing. It complains about a undefined +MSR_ARCH_CAP_TAA_NO. + +The bit 6, 7 and 8 of MSR_IA32_ARCH_CAPABILITIES are recently disclosed +for some security issues. Add the definitions for them to be used by named +CPU models. + +Signed-off-by: Xiaoyao Li +Message-Id: <20191225063018.20038-2-xiaoyao.li@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 6c997b4adb300788d61d72e2b8bc67c03a584956) + +Signed-off-by: Paolo Bonzini +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.h | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index e77d101..7bfbf2a 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -836,12 +836,15 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_TOPOLOGY_LEVEL_DIE (5U << 8) + + /* MSR Feature Bits */ +-#define MSR_ARCH_CAP_RDCL_NO (1U << 0) +-#define MSR_ARCH_CAP_IBRS_ALL (1U << 1) +-#define MSR_ARCH_CAP_RSBA (1U << 2) ++#define MSR_ARCH_CAP_RDCL_NO (1U << 0) ++#define MSR_ARCH_CAP_IBRS_ALL (1U << 1) ++#define MSR_ARCH_CAP_RSBA (1U << 2) + #define MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY (1U << 3) +-#define MSR_ARCH_CAP_SSB_NO (1U << 4) +-#define MSR_ARCH_CAP_MDS_NO (1U << 5) ++#define MSR_ARCH_CAP_SSB_NO (1U << 4) ++#define MSR_ARCH_CAP_MDS_NO (1U << 5) ++#define MSR_ARCH_CAP_PSCHANGE_MC_NO (1U << 6) ++#define MSR_ARCH_CAP_TSX_CTRL_MSR (1U << 7) ++#define MSR_ARCH_CAP_TAA_NO (1U << 8) + + #define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5) + +-- +1.8.3.1 + diff --git a/kvm-target-i386-add-a-ucode-rev-property.patch b/kvm-target-i386-add-a-ucode-rev-property.patch new file mode 100755 index 0000000000000000000000000000000000000000..5c3c770e5ce0af1169f7aeb5d74799932236890e --- /dev/null +++ b/kvm-target-i386-add-a-ucode-rev-property.patch @@ -0,0 +1,125 @@ +From 4009f0bcc8004ce481015d088fe335a16b8d7ce1 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:12 +0000 +Subject: [PATCH 2/9] target/i386: add a ucode-rev property + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-3-pbonzini@redhat.com> +Patchwork-id: 93909 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/6] target/i386: add a ucode-rev property +Bugzilla: 1791648 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +Add the property and plumb it in TCG and HVF (the latter of which +tried to support returning a constant value but used the wrong MSR). + +Signed-off-by: Paolo Bonzini +Message-Id: <1579544504-3616-3-git-send-email-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 4e45aff398cd1542c2a384a2a3b8600f23337d86) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 10 ++++++++++ + target/i386/cpu.h | 3 +++ + target/i386/hvf/x86_emu.c | 4 +--- + target/i386/misc_helper.c | 4 ++++ + 4 files changed, 18 insertions(+), 3 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 863192c..e505d3e 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6325,6 +6325,15 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) + } + } + ++ if (cpu->ucode_rev == 0) { ++ /* The default is the same as KVM's. */ ++ if (IS_AMD_CPU(env)) { ++ cpu->ucode_rev = 0x01000065; ++ } else { ++ cpu->ucode_rev = 0x100000000ULL; ++ } ++ } ++ + /* mwait extended info: needed for Core compatibility */ + /* We always wake on interrupt even if host does not have the capability */ + cpu->mwait.ecx |= CPUID_MWAIT_EMX | CPUID_MWAIT_IBE; +@@ -7008,6 +7017,7 @@ static Property x86_cpu_properties[] = { + DEFINE_PROP_UINT32("min-level", X86CPU, env.cpuid_min_level, 0), + DEFINE_PROP_UINT32("min-xlevel", X86CPU, env.cpuid_min_xlevel, 0), + DEFINE_PROP_UINT32("min-xlevel2", X86CPU, env.cpuid_min_xlevel2, 0), ++ DEFINE_PROP_UINT64("ucode-rev", X86CPU, ucode_rev, 0), + DEFINE_PROP_BOOL("full-cpuid-auto-level", X86CPU, full_cpuid_auto_level, true), + DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor_id), + DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true), +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index cde2a16..4441061 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -348,6 +348,7 @@ typedef enum X86Seg { + #define MSR_IA32_SPEC_CTRL 0x48 + #define MSR_VIRT_SSBD 0xc001011f + #define MSR_IA32_PRED_CMD 0x49 ++#define MSR_IA32_UCODE_REV 0x8b + #define MSR_IA32_CORE_CAPABILITY 0xcf + + #define MSR_IA32_ARCH_CAPABILITIES 0x10a +@@ -1621,6 +1622,8 @@ struct X86CPU { + CPUNegativeOffsetState neg; + CPUX86State env; + ++ uint64_t ucode_rev; ++ + uint32_t hyperv_spinlock_attempts; + char *hyperv_vendor_id; + bool hyperv_synic_kvm_only; +diff --git a/target/i386/hvf/x86_emu.c b/target/i386/hvf/x86_emu.c +index 3df7672..92ab815 100644 +--- a/target/i386/hvf/x86_emu.c ++++ b/target/i386/hvf/x86_emu.c +@@ -664,8 +664,6 @@ static void exec_lods(struct CPUX86State *env, struct x86_decode *decode) + RIP(env) += decode->len; + } + +-#define MSR_IA32_UCODE_REV 0x00000017 +- + void simulate_rdmsr(struct CPUState *cpu) + { + X86CPU *x86_cpu = X86_CPU(cpu); +@@ -681,7 +679,7 @@ void simulate_rdmsr(struct CPUState *cpu) + val = cpu_get_apic_base(X86_CPU(cpu)->apic_state); + break; + case MSR_IA32_UCODE_REV: +- val = (0x100000000ULL << 32) | 0x100000000ULL; ++ val = x86_cpu->ucode_rev; + break; + case MSR_EFER: + val = rvmcs(cpu->hvf_fd, VMCS_GUEST_IA32_EFER); +diff --git a/target/i386/misc_helper.c b/target/i386/misc_helper.c +index 3eff688..aed16fe 100644 +--- a/target/i386/misc_helper.c ++++ b/target/i386/misc_helper.c +@@ -229,6 +229,7 @@ void helper_rdmsr(CPUX86State *env) + #else + void helper_wrmsr(CPUX86State *env) + { ++ X86CPU *x86_cpu = env_archcpu(env); + uint64_t val; + + cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 1, GETPC()); +@@ -371,6 +372,9 @@ void helper_wrmsr(CPUX86State *env) + env->msr_bndcfgs = val; + cpu_sync_bndcs_hflags(env); + break; ++ case MSR_IA32_UCODE_REV: ++ val = x86_cpu->ucode_rev; ++ break; + default: + if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL + && (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL + +-- +1.8.3.1 + diff --git a/kvm-target-i386-add-fast-short-REP-MOV-support.patch b/kvm-target-i386-add-fast-short-REP-MOV-support.patch new file mode 100755 index 0000000000000000000000000000000000000000..51af7e7e3fc861a5171104f6b5dbaf3d586e4efe --- /dev/null +++ b/kvm-target-i386-add-fast-short-REP-MOV-support.patch @@ -0,0 +1,59 @@ +From f33880c5f7a4e2cad25c22112da073273c6e2cfb Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:35 -0500 +Subject: [PATCH 2/4] target/i386: add fast short REP MOV support + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-3-dgilbert@redhat.com> +Patchwork-id: 101201 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/4] target/i386: add fast short REP MOV support +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Chenyi Qiang + +For CPUs support fast short REP MOV[CPUID.(EAX=7,ECX=0):EDX(bit4)], e.g +Icelake and Tigerlake, expose it to the guest VM. + +Reviewed-by: Eduardo Habkost +Signed-off-by: Chenyi Qiang +Message-Id: <20200714084148.26690-2-chenyi.qiang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit 5cb287d2bd578dfe4897458793b4fce35bc4f744) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 2 +- + target/i386/cpu.h | 2 ++ + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 67dab94aa5..f6a9ed84b3 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1077,7 +1077,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + NULL, NULL, "avx512-4vnniw", "avx512-4fmaps", +- NULL, NULL, NULL, NULL, ++ "fsrm", NULL, NULL, NULL, + "avx512-vp2intersect", NULL, "md-clear", NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL /* pconfig */, NULL, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 8e2e52ed31..f5a4efcec6 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -770,6 +770,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_0_EDX_AVX512_4VNNIW (1U << 2) + /* AVX512 Multiply Accumulation Single Precision */ + #define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) ++/* Fast Short Rep Mov */ ++#define CPUID_7_0_EDX_FSRM (1U << 4) + /* AVX512 Vector Pair Intersection to a Pair of Mask Registers */ + #define CPUID_7_0_EDX_AVX512_VP2INTERSECT (1U << 8) + /* Speculation Control */ +-- +2.27.0 + diff --git a/kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch b/kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch new file mode 100755 index 0000000000000000000000000000000000000000..a80c9d32ac2f35240ef4d2234593be2578a066c1 --- /dev/null +++ b/kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch @@ -0,0 +1,72 @@ +From 27d7b085f2f568050d638b694ed2f51495db718c Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:15 +0000 +Subject: [PATCH 5/9] target/i386: check for availability of MSR_IA32_UCODE_REV + as an emulated MSR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-6-pbonzini@redhat.com> +Patchwork-id: 93898 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/6] target/i386: check for availability of MSR_IA32_UCODE_REV as an emulated MSR +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +Even though MSR_IA32_UCODE_REV has been available long before Linux 5.6, +which added it to the emulated MSR list, a bug caused the microcode +version to revert to 0x100000000 on INIT. As a result, processors other +than the bootstrap processor would not see the host microcode revision; +some Windows version complain loudly about this and crash with a +fairly explicit MICROCODE REVISION MISMATCH error. + +[If running 5.6 prereleases, the kernel fix "KVM: x86: do not reset + microcode version on INIT or RESET" should also be applied.] + +Reported-by: Alex Williamson +Message-id: <20200211175516.10716-1-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 6702514814c7e7b4cbf179624539b5f38c72740b) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/kvm.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 6c61aef..99840ca 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -105,6 +105,7 @@ static bool has_msr_smi_count; + static bool has_msr_arch_capabs; + static bool has_msr_core_capabs; + static bool has_msr_vmx_vmfunc; ++static bool has_msr_ucode_rev; + + static uint32_t has_architectural_pmu_version; + static uint32_t num_architectural_pmu_gp_counters; +@@ -2056,6 +2057,9 @@ static int kvm_get_supported_msrs(KVMState *s) + case MSR_IA32_VMX_VMFUNC: + has_msr_vmx_vmfunc = true; + break; ++ case MSR_IA32_UCODE_REV: ++ has_msr_ucode_rev = true; ++ break; + } + } + } +@@ -2696,8 +2700,7 @@ static void kvm_init_msrs(X86CPU *cpu) + env->features[FEAT_CORE_CAPABILITY]); + } + +- if (kvm_arch_get_supported_msr_feature(kvm_state, +- MSR_IA32_UCODE_REV)) { ++ if (has_msr_ucode_rev) { + kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); + } + +-- +1.8.3.1 + diff --git a/kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch b/kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch new file mode 100755 index 0000000000000000000000000000000000000000..4c2362d12b0f03378eb2dfc3987e0d47b743e95b --- /dev/null +++ b/kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch @@ -0,0 +1,112 @@ +From 77cdcccc49ba988e3b5bcb66decdee2e99fdcd72 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Tue, 14 Apr 2020 15:00:36 +0100 +Subject: [PATCH] target/i386: do not set unsupported VMX secondary execution + controls + +RH-Author: Vitaly Kuznetsov +Message-id: <20200414150036.625732-2-vkuznets@redhat.com> +Patchwork-id: 94674 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] target/i386: do not set unsupported VMX secondary execution controls +Bugzilla: 1822682 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Paolo Bonzini + +Commit 048c95163b4 ("target/i386: work around KVM_GET_MSRS bug for +secondary execution controls") added a workaround for KVM pre-dating +commit 6defc591846d ("KVM: nVMX: include conditional controls in /dev/kvm +KVM_GET_MSRS") which wasn't setting certain available controls. The +workaround uses generic CPUID feature bits to set missing VMX controls. + +It was found that in some cases it is possible to observe hosts which +have certain CPUID features but lack the corresponding VMX control. + +In particular, it was reported that Azure VMs have RDSEED but lack +VMX_SECONDARY_EXEC_RDSEED_EXITING; attempts to enable this feature +bit result in QEMU abort. + +Resolve the issue but not applying the workaround when we don't have +to. As there is no good way to find out if KVM has the fix itself, use +95c5c7c77c ("KVM: nVMX: list VMX MSRs in KVM_GET_MSR_INDEX_LIST") instead +as these [are supposed to] come together. + +Fixes: 048c95163b4 ("target/i386: work around KVM_GET_MSRS bug for secondary execution controls") +Suggested-by: Paolo Bonzini +Signed-off-by: Vitaly Kuznetsov +Message-Id: <20200331162752.1209928-1-vkuznets@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 4a910e1f6ab4155ec8b24c49b2585cc486916985) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/kvm.c | 41 ++++++++++++++++++++++++++--------------- + 1 file changed, 26 insertions(+), 15 deletions(-) + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 99840ca..fcc8f7d 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -106,6 +106,7 @@ static bool has_msr_arch_capabs; + static bool has_msr_core_capabs; + static bool has_msr_vmx_vmfunc; + static bool has_msr_ucode_rev; ++static bool has_msr_vmx_procbased_ctls2; + + static uint32_t has_architectural_pmu_version; + static uint32_t num_architectural_pmu_gp_counters; +@@ -490,21 +491,28 @@ uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index) + value = msr_data.entries[0].data; + switch (index) { + case MSR_IA32_VMX_PROCBASED_CTLS2: +- /* KVM forgot to add these bits for some time, do this ourselves. */ +- if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) & CPUID_XSAVE_XSAVES) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & CPUID_EXT_RDRAND) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & CPUID_7_0_EBX_INVPCID) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & CPUID_7_0_EBX_RDSEED) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32; +- } +- if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) & CPUID_EXT2_RDTSCP) { +- value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32; ++ if (!has_msr_vmx_procbased_ctls2) { ++ /* KVM forgot to add these bits for some time, do this ourselves. */ ++ if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) & ++ CPUID_XSAVE_XSAVES) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & ++ CPUID_EXT_RDRAND) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & ++ CPUID_7_0_EBX_INVPCID) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & ++ CPUID_7_0_EBX_RDSEED) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32; ++ } ++ if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) & ++ CPUID_EXT2_RDTSCP) { ++ value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32; ++ } + } + /* fall through */ + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: +@@ -2060,6 +2068,9 @@ static int kvm_get_supported_msrs(KVMState *s) + case MSR_IA32_UCODE_REV: + has_msr_ucode_rev = true; + break; ++ case MSR_IA32_VMX_PROCBASED_CTLS2: ++ has_msr_vmx_procbased_ctls2 = true; ++ break; + } + } + } +-- +1.8.3.1 + diff --git a/kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch b/kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch new file mode 100755 index 0000000000000000000000000000000000000000..47438a3cf31babe200d7049581c9a9eed08954da --- /dev/null +++ b/kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch @@ -0,0 +1,49 @@ +From 7b71a7011437ebfa3bc7df9297e892b82293ec98 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:16 +0000 +Subject: [PATCH 6/9] target/i386: enable monitor and ucode revision with -cpu + max +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-7-pbonzini@redhat.com> +Patchwork-id: 93910 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 6/6] target/i386: enable monitor and ucode revision with -cpu max +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +These two features were incorrectly tied to host_cpuid_required rather than +cpu->max_features. As a result, -cpu max was not enabling either MONITOR +features or ucode revision. + +Signed-off-by: Paolo Bonzini +(cherry picked from commit be02cda3afde60d219786e23c3f8edb53aec8e17) + +[RHEL7: context, upstream uses g_autofree] + +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 5ac843d..1685a8c 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6317,7 +6317,9 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) + g_free(name); + goto out; + } ++ } + ++ if (cpu->max_features && accel_uses_host_cpuid()) { + if (enable_cpu_pm) { + host_cpuid(5, 0, &cpu->mwait.eax, &cpu->mwait.ebx, + &cpu->mwait.ecx, &cpu->mwait.edx); +-- +1.8.3.1 + diff --git a/kvm-target-i386-fix-TCG-UCODE_REV-access.patch b/kvm-target-i386-fix-TCG-UCODE_REV-access.patch new file mode 100755 index 0000000000000000000000000000000000000000..c7ced8a2a341fb8d47647adf5851e173d1f0a3d5 --- /dev/null +++ b/kvm-target-i386-fix-TCG-UCODE_REV-access.patch @@ -0,0 +1,73 @@ +From 3d16f05359e6277da1f970f71aa9f76337d655dc Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:14 +0000 +Subject: [PATCH 4/9] target/i386: fix TCG UCODE_REV access +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-5-pbonzini@redhat.com> +Patchwork-id: 93904 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/6] target/i386: fix TCG UCODE_REV access +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +This was a very interesting semantic conflict that caused git to move +the MSR_IA32_UCODE_REV read to helper_wrmsr. Not a big deal, but +still should be fixed... + +Fixes: 4e45aff398 ("target/i386: add a ucode-rev property", 2020-01-24) +Message-id: <20200206171022.9289-1-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 9028c75c9d08be303ccc425bfe3d3b23d8f4cac7) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/misc_helper.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/target/i386/misc_helper.c b/target/i386/misc_helper.c +index aed16fe..7d61221 100644 +--- a/target/i386/misc_helper.c ++++ b/target/i386/misc_helper.c +@@ -229,7 +229,6 @@ void helper_rdmsr(CPUX86State *env) + #else + void helper_wrmsr(CPUX86State *env) + { +- X86CPU *x86_cpu = env_archcpu(env); + uint64_t val; + + cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 1, GETPC()); +@@ -372,9 +371,6 @@ void helper_wrmsr(CPUX86State *env) + env->msr_bndcfgs = val; + cpu_sync_bndcs_hflags(env); + break; +- case MSR_IA32_UCODE_REV: +- val = x86_cpu->ucode_rev; +- break; + default: + if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL + && (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL + +@@ -393,6 +389,7 @@ void helper_wrmsr(CPUX86State *env) + + void helper_rdmsr(CPUX86State *env) + { ++ X86CPU *x86_cpu = env_archcpu(env); + uint64_t val; + + cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 0, GETPC()); +@@ -526,6 +523,9 @@ void helper_rdmsr(CPUX86State *env) + case MSR_IA32_BNDCFGS: + val = env->msr_bndcfgs; + break; ++ case MSR_IA32_UCODE_REV: ++ val = x86_cpu->ucode_rev; ++ break; + default: + if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL + && (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL + +-- +1.8.3.1 + diff --git a/kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch b/kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch new file mode 100755 index 0000000000000000000000000000000000000000..5118aed110069e5eb19c105b1b5f9dce9a3df897 --- /dev/null +++ b/kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch @@ -0,0 +1,178 @@ +From eb0fc0ae2750a0462698d6d21ebb56a4249539f9 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:11 +0000 +Subject: [PATCH 1/9] target/i386: kvm: initialize feature MSRs very early +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-2-pbonzini@redhat.com> +Patchwork-id: 93899 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/6] target/i386: kvm: initialize feature MSRs very early +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +Some read-only MSRs affect the behavior of ioctls such as +KVM_SET_NESTED_STATE. We can initialize them once and for all +right after the CPU is realized, since they will never be modified +by the guest. + +Reported-by: Qingua Cheng +Cc: qemu-stable@nongnu.org +Signed-off-by: Paolo Bonzini +Message-Id: <1579544504-3616-2-git-send-email-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 420ae1fc51c99abfd03b1c590f55617edd2a2bed) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/kvm.c | 81 ++++++++++++++++++++++++++++++-------------------- + target/i386/kvm_i386.h | 1 + + 2 files changed, 49 insertions(+), 33 deletions(-) + +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index 86d9a1f..f41605b 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -67,6 +67,8 @@ + * 255 kvm_msr_entry structs */ + #define MSR_BUF_SIZE 4096 + ++static void kvm_init_msrs(X86CPU *cpu); ++ + const KVMCapabilityInfo kvm_arch_required_capabilities[] = { + KVM_CAP_INFO(SET_TSS_ADDR), + KVM_CAP_INFO(EXT_CPUID), +@@ -1842,6 +1844,8 @@ int kvm_arch_init_vcpu(CPUState *cs) + has_msr_tsc_aux = false; + } + ++ kvm_init_msrs(cpu); ++ + r = hyperv_init_vcpu(cpu); + if (r) { + goto fail; +@@ -2660,11 +2664,53 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) + VMCS12_MAX_FIELD_INDEX << 1); + } + ++static int kvm_buf_set_msrs(X86CPU *cpu) ++{ ++ int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ if (ret < cpu->kvm_msr_buf->nmsrs) { ++ struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; ++ error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64, ++ (uint32_t)e->index, (uint64_t)e->data); ++ } ++ ++ assert(ret == cpu->kvm_msr_buf->nmsrs); ++ return 0; ++} ++ ++static void kvm_init_msrs(X86CPU *cpu) ++{ ++ CPUX86State *env = &cpu->env; ++ ++ kvm_msr_buf_reset(cpu); ++ if (has_msr_arch_capabs) { ++ kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, ++ env->features[FEAT_ARCH_CAPABILITIES]); ++ } ++ ++ if (has_msr_core_capabs) { ++ kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, ++ env->features[FEAT_CORE_CAPABILITY]); ++ } ++ ++ /* ++ * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but ++ * all kernels with MSR features should have them. ++ */ ++ if (kvm_feature_msrs && cpu_has_vmx(env)) { ++ kvm_msr_entry_add_vmx(cpu, env->features); ++ } ++ ++ assert(kvm_buf_set_msrs(cpu) == 0); ++} ++ + static int kvm_put_msrs(X86CPU *cpu, int level) + { + CPUX86State *env = &cpu->env; + int i; +- int ret; + + kvm_msr_buf_reset(cpu); + +@@ -2722,17 +2768,6 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + } + #endif + +- /* If host supports feature MSR, write down. */ +- if (has_msr_arch_capabs) { +- kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, +- env->features[FEAT_ARCH_CAPABILITIES]); +- } +- +- if (has_msr_core_capabs) { +- kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, +- env->features[FEAT_CORE_CAPABILITY]); +- } +- + /* + * The following MSRs have side effects on the guest or are too heavy + * for normal writeback. Limit them to reset or full state updates. +@@ -2910,14 +2945,6 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + + /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see + * kvm_put_msr_feature_control. */ +- +- /* +- * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but +- * all kernels with MSR features should have them. +- */ +- if (kvm_feature_msrs && cpu_has_vmx(env)) { +- kvm_msr_entry_add_vmx(cpu, env->features); +- } + } + + if (env->mcg_cap) { +@@ -2933,19 +2960,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + } + } + +- ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); +- if (ret < 0) { +- return ret; +- } +- +- if (ret < cpu->kvm_msr_buf->nmsrs) { +- struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; +- error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64, +- (uint32_t)e->index, (uint64_t)e->data); +- } +- +- assert(ret == cpu->kvm_msr_buf->nmsrs); +- return 0; ++ return kvm_buf_set_msrs(cpu); + } + + +diff --git a/target/i386/kvm_i386.h b/target/i386/kvm_i386.h +index 06fe06b..d98c6f6 100644 +--- a/target/i386/kvm_i386.h ++++ b/target/i386/kvm_i386.h +@@ -66,4 +66,5 @@ bool kvm_enable_x2apic(void); + bool kvm_has_x2apic_api(void); + + bool kvm_hv_vpindex_settable(void); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch b/kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch new file mode 100755 index 0000000000000000000000000000000000000000..99b18fc982ff4a0d17238163a63f8cfb0e0ac3b4 --- /dev/null +++ b/kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch @@ -0,0 +1,64 @@ +From 8f39b0c9523630efeb451e2298cf64b88cd2ac81 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 17 Feb 2020 16:23:13 +0000 +Subject: [PATCH 3/9] target/i386: kvm: initialize microcode revision from KVM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200217162316.2464-4-pbonzini@redhat.com> +Patchwork-id: 93897 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/6] target/i386: kvm: initialize microcode revision from KVM +Bugzilla: 1791648 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Dr. David Alan Gilbert + +KVM can return the host microcode revision as a feature MSR. +Use it as the default value for -cpu host. + +Signed-off-by: Paolo Bonzini +Message-Id: <1579544504-3616-4-git-send-email-pbonzini@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 32c87d70ff55b96741f08c35108935cac6f40fe4) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 4 ++++ + target/i386/kvm.c | 5 +++++ + 2 files changed, 9 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index e505d3e..5ac843d 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6323,6 +6323,10 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) + &cpu->mwait.ecx, &cpu->mwait.edx); + env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR; + } ++ if (kvm_enabled() && cpu->ucode_rev == 0) { ++ cpu->ucode_rev = kvm_arch_get_supported_msr_feature(kvm_state, ++ MSR_IA32_UCODE_REV); ++ } + } + + if (cpu->ucode_rev == 0) { +diff --git a/target/i386/kvm.c b/target/i386/kvm.c +index f41605b..6c61aef 100644 +--- a/target/i386/kvm.c ++++ b/target/i386/kvm.c +@@ -2696,6 +2696,11 @@ static void kvm_init_msrs(X86CPU *cpu) + env->features[FEAT_CORE_CAPABILITY]); + } + ++ if (kvm_arch_get_supported_msr_feature(kvm_state, ++ MSR_IA32_UCODE_REV)) { ++ kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); ++ } ++ + /* + * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but + * all kernels with MSR features should have them. +-- +1.8.3.1 + diff --git a/kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch b/kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch new file mode 100755 index 0000000000000000000000000000000000000000..49e54ba3d25a824da8fcb90cd2c766dbfec57c41 --- /dev/null +++ b/kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch @@ -0,0 +1,69 @@ +From 72a1827006be22791017ff2b671eac1c96be5d12 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 7 May 2020 22:09:23 +0100 +Subject: [PATCH 01/26] target/i386: set the CPUID level to 0x14 on old + machine-type + +RH-Author: plai@redhat.com +Message-id: <20200507220923.13723-1-plai@redhat.com> +Patchwork-id: 96347 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH RESEND] target/i386: set the CPUID level to 0x14 on old machine-type +Bugzilla: 1513681 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Igor Mammedov +RH-Acked-by: Danilo de Paula + +From: Luwei Kang + +BZ https://bugzilla.redhat.com/show_bug.cgi?id=1513681 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=28146304 +Branch: rhel-av-8.2.1 + +Tested on intel-icelake-y-01.ml3.eng.bos.redhat.com. + +The CPUID level need to be set to 0x14 manually on old +machine-type if Intel PT is enabled in guest. E.g. the +CPUID[0].EAX(level)=7 and CPUID[7].EBX[25](intel-pt)=1 when the +Qemu with "-machine pc-i440fx-3.1 -cpu qemu64,+intel-pt" parameter. + +Some Intel PT capabilities are exposed by leaf 0x14 and the +missing capabilities will cause some MSRs access failed. +This patch add a warning message to inform the user to extend +the CPUID level. + +Suggested-by: Eduardo Habkost +Signed-off-by: Luwei Kang +Message-Id: <1584031686-16444-1-git-send-email-luwei.kang@intel.com> +Signed-off-by: Eduardo Habkost +(cherry picked from commit ddc2fc9e4e42ebce48b088963dc7fbd1c08d5f33) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 1685a8c..0f0a2db 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6206,9 +6206,14 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error **errp) + x86_cpu_adjust_feat_level(cpu, FEAT_XSAVE); + + /* Intel Processor Trace requires CPUID[0x14] */ +- if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) && +- kvm_enabled() && cpu->intel_pt_auto_level) { +- x86_cpu_adjust_level(cpu, &cpu->env.cpuid_min_level, 0x14); ++ if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT)) { ++ if (cpu->intel_pt_auto_level) { ++ x86_cpu_adjust_level(cpu, &cpu->env.cpuid_min_level, 0x14); ++ } else if (cpu->env.cpuid_min_level < 0x14) { ++ mark_unavailable_features(cpu, FEAT_7_0_EBX, ++ CPUID_7_0_EBX_INTEL_PT, ++ "Intel PT need CPUID leaf 0x14, please set by \"-cpu ...,+intel-pt,level=0x14\""); ++ } + } + + /* CPU topology with multi-dies support requires CPUID[0x1F] */ +-- +1.8.3.1 + diff --git a/kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch b/kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch new file mode 100755 index 0000000000000000000000000000000000000000..60abc1b7b98fc4cbd79494241e9d6dee15760afa --- /dev/null +++ b/kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch @@ -0,0 +1,56 @@ +From 9adf5e57df32df464e7465b1df72c993d0ed4ed4 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 31 Jul 2020 18:08:35 -0400 +Subject: [PATCH 3/4] target/i386: sev: fail query-sev-capabilities if QEMU + cannot use SEV +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200731180835.86786-3-pbonzini@redhat.com> +Patchwork-id: 98124 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/2] target/i386: sev: fail query-sev-capabilities if QEMU cannot use SEV +Bugzilla: 1689341 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Dr. David Alan Gilbert + +In some cases, such as if the kvm-amd "sev" module parameter is set +to 0, SEV will be unavailable but query-sev-capabilities will still +return all the information. This tricks libvirt into erroneously +reporting that SEV is available. Check the actual usability of the +feature and return the appropriate error if QEMU cannot use KVM +or KVM cannot use SEV. + +Reviewed-by: Eric Blake +Signed-off-by: Paolo Bonzini +cherry picked from commit 1b38750c40281dd0d068f8536b2ea95d7b9bd585 +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/sev.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/target/i386/sev.c b/target/i386/sev.c +index 054f2d846a..a47f0d3880 100644 +--- a/target/i386/sev.c ++++ b/target/i386/sev.c +@@ -504,6 +504,15 @@ sev_get_capabilities(Error **errp) + uint32_t ebx; + int fd; + ++ if (!kvm_enabled()) { ++ error_setg(errp, "KVM not enabled"); ++ return NULL; ++ } ++ if (kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, NULL) < 0) { ++ error_setg(errp, "SEV is not enabled in KVM"); ++ return NULL; ++ } ++ + fd = open(DEFAULT_SEV_DEVICE, O_RDWR); + if (fd < 0) { + error_setg_errno(errp, errno, "Failed to open %s", +-- +2.27.0 + diff --git a/kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch b/kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch new file mode 100755 index 0000000000000000000000000000000000000000..e5f345984e7c1dee606ba944178e82a5f78de246 --- /dev/null +++ b/kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch @@ -0,0 +1,142 @@ +From 8789f2662c6ddacc5472a803d253b94d93c6e9f0 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 31 Jul 2020 18:08:34 -0400 +Subject: [PATCH 2/4] target/i386: sev: provide proper error reporting for + query-sev-capabilities +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Paolo Bonzini +Message-id: <20200731180835.86786-2-pbonzini@redhat.com> +Patchwork-id: 98123 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/2] target/i386: sev: provide proper error reporting for query-sev-capabilities +Bugzilla: 1689341 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Dr. David Alan Gilbert + +The query-sev-capabilities was reporting errors through error_report; +change it to use Error** so that the cause of the failure is clearer. + +Reviewed-by: Eric Blake +Signed-off-by: Paolo Bonzini +Cherry picked from commit e4f6278557148151e77260b872b41bcd7ceb4737 +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/monitor.c | 10 +--------- + target/i386/sev-stub.c | 3 ++- + target/i386/sev.c | 18 +++++++++--------- + target/i386/sev_i386.h | 2 +- + 4 files changed, 13 insertions(+), 20 deletions(-) + +diff --git a/target/i386/monitor.c b/target/i386/monitor.c +index 9fb4d641d5..cfd8075e4f 100644 +--- a/target/i386/monitor.c ++++ b/target/i386/monitor.c +@@ -727,13 +727,5 @@ SevLaunchMeasureInfo *qmp_query_sev_launch_measure(Error **errp) + + SevCapability *qmp_query_sev_capabilities(Error **errp) + { +- SevCapability *data; +- +- data = sev_get_capabilities(); +- if (!data) { +- error_setg(errp, "SEV feature is not available"); +- return NULL; +- } +- +- return data; ++ return sev_get_capabilities(errp); + } +diff --git a/target/i386/sev-stub.c b/target/i386/sev-stub.c +index e5ee13309c..88e3f39a1e 100644 +--- a/target/i386/sev-stub.c ++++ b/target/i386/sev-stub.c +@@ -44,7 +44,8 @@ char *sev_get_launch_measurement(void) + return NULL; + } + +-SevCapability *sev_get_capabilities(void) ++SevCapability *sev_get_capabilities(Error **errp) + { ++ error_setg(errp, "SEV is not available in this QEMU"); + return NULL; + } +diff --git a/target/i386/sev.c b/target/i386/sev.c +index 024bb24e51..054f2d846a 100644 +--- a/target/i386/sev.c ++++ b/target/i386/sev.c +@@ -453,7 +453,7 @@ sev_get_info(void) + + static int + sev_get_pdh_info(int fd, guchar **pdh, size_t *pdh_len, guchar **cert_chain, +- size_t *cert_chain_len) ++ size_t *cert_chain_len, Error **errp) + { + guchar *pdh_data = NULL; + guchar *cert_chain_data = NULL; +@@ -464,8 +464,8 @@ sev_get_pdh_info(int fd, guchar **pdh, size_t *pdh_len, guchar **cert_chain, + r = sev_platform_ioctl(fd, SEV_PDH_CERT_EXPORT, &export, &err); + if (r < 0) { + if (err != SEV_RET_INVALID_LEN) { +- error_report("failed to export PDH cert ret=%d fw_err=%d (%s)", +- r, err, fw_error_to_str(err)); ++ error_setg(errp, "failed to export PDH cert ret=%d fw_err=%d (%s)", ++ r, err, fw_error_to_str(err)); + return 1; + } + } +@@ -477,8 +477,8 @@ sev_get_pdh_info(int fd, guchar **pdh, size_t *pdh_len, guchar **cert_chain, + + r = sev_platform_ioctl(fd, SEV_PDH_CERT_EXPORT, &export, &err); + if (r < 0) { +- error_report("failed to export PDH cert ret=%d fw_err=%d (%s)", +- r, err, fw_error_to_str(err)); ++ error_setg(errp, "failed to export PDH cert ret=%d fw_err=%d (%s)", ++ r, err, fw_error_to_str(err)); + goto e_free; + } + +@@ -495,7 +495,7 @@ e_free: + } + + SevCapability * +-sev_get_capabilities(void) ++sev_get_capabilities(Error **errp) + { + SevCapability *cap = NULL; + guchar *pdh_data = NULL; +@@ -506,13 +506,13 @@ sev_get_capabilities(void) + + fd = open(DEFAULT_SEV_DEVICE, O_RDWR); + if (fd < 0) { +- error_report("%s: Failed to open %s '%s'", __func__, +- DEFAULT_SEV_DEVICE, strerror(errno)); ++ error_setg_errno(errp, errno, "Failed to open %s", ++ DEFAULT_SEV_DEVICE); + return NULL; + } + + if (sev_get_pdh_info(fd, &pdh_data, &pdh_len, +- &cert_chain_data, &cert_chain_len)) { ++ &cert_chain_data, &cert_chain_len, errp)) { + goto out; + } + +diff --git a/target/i386/sev_i386.h b/target/i386/sev_i386.h +index 8ada9d385d..1e073342ba 100644 +--- a/target/i386/sev_i386.h ++++ b/target/i386/sev_i386.h +@@ -38,7 +38,7 @@ extern SevInfo *sev_get_info(void); + extern uint32_t sev_get_cbit_position(void); + extern uint32_t sev_get_reduced_phys_bits(void); + extern char *sev_get_launch_measurement(void); +-extern SevCapability *sev_get_capabilities(void); ++extern SevCapability *sev_get_capabilities(Error **errp); + + typedef struct QSevGuestInfo QSevGuestInfo; + typedef struct QSevGuestInfoClass QSevGuestInfoClass; +-- +2.27.0 + diff --git a/kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch b/kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch new file mode 100755 index 0000000000000000000000000000000000000000..38e56374ae572908c1b0738deb7c5214367174ef --- /dev/null +++ b/kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch @@ -0,0 +1,60 @@ +From c4fe37ae6d75ed72e6a3bde01fea053eb508274c Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 5 Jun 2020 07:41:11 -0400 +Subject: [PATCH 41/42] target/s390x/kvm: Enable adapter interruption + suppression again +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +Message-id: <20200605074111.2185-4-thuth@redhat.com> +Patchwork-id: 97370 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 3/3] target/s390x/kvm: Enable adapter interruption suppression again +Bugzilla: 1756946 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +The AIS feature has been disabled late in the v2.10 development cycle since +there were some issues with migration (see commit 3f2d07b3b01ea61126b - +"s390x/ais: for 2.10 stable: disable ais facility"). We originally wanted +to enable it again for newer machine types, but apparently we forgot to do +this so far. Let's do it now for the machines that support proper CPU models. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1756946 +Signed-off-by: Thomas Huth +Message-Id: <20200122101437.5069-1-thuth@redhat.com> +Reviewed-by: David Hildenbrand +Tested-by: Matthew Rosato +Signed-off-by: Cornelia Huck +(cherry picked from commit a5c8617af6919515b84256978452edf07401c45e) +Signed-off-by: Danilo C. L. de Paula +--- + target/s390x/kvm.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c +index c589ef9034..0bbf8f81b0 100644 +--- a/target/s390x/kvm.c ++++ b/target/s390x/kvm.c +@@ -377,10 +377,13 @@ int kvm_arch_init(MachineState *ms, KVMState *s) + /* + * The migration interface for ais was introduced with kernel 4.13 + * but the capability itself had been active since 4.12. As migration +- * support is considered necessary let's disable ais in the 2.10 +- * machine. ++ * support is considered necessary, we only try to enable this for ++ * newer machine types if KVM_CAP_S390_AIS_MIGRATION is available. + */ +- /* kvm_vm_enable_cap(s, KVM_CAP_S390_AIS, 0); */ ++ if (cpu_model_allowed() && kvm_kernel_irqchip_allowed() && ++ kvm_check_extension(s, KVM_CAP_S390_AIS_MIGRATION)) { ++ kvm_vm_enable_cap(s, KVM_CAP_S390_AIS, 0); ++ } + + kvm_set_max_memslot_size(KVM_SLOT_MAX_BYTES); + return 0; +-- +2.27.0 + diff --git a/kvm-tcp_emu-Fix-oob-access.patch b/kvm-tcp_emu-Fix-oob-access.patch new file mode 100755 index 0000000000000000000000000000000000000000..e53287736374c581df9c893870b40effd1fdff83 --- /dev/null +++ b/kvm-tcp_emu-Fix-oob-access.patch @@ -0,0 +1,59 @@ +From 5c2c5496083fa549e1dff903413bb6136fc19d8d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Fri, 17 Jan 2020 12:07:56 +0100 +Subject: [PATCH 1/4] tcp_emu: Fix oob access +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200117120758.1076549-2-marcandre.lureau@redhat.com> +Patchwork-id: 93399 +O-Subject: [RHEL-AV-8.1.0 qemu-kvm + RHEL-AV-8.2.0 qemu-kvm PATCH 1/3] tcp_emu: Fix oob access +Bugzilla: 1791568 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +From: Samuel Thibault + +The main loop only checks for one available byte, while we sometimes +need two bytes. + +[ MA - minor conflict, CHANGELOG.md absent ] +(cherry picked from libslirp commit 2655fffed7a9e765bcb4701dd876e9dab975f289) +Signed-off-by: Marc-André Lureau + +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tcp_subr.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index d6dd133..cbecd64 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -886,6 +886,9 @@ int tcp_emu(struct socket *so, struct mbuf *m) + break; + + case 5: ++ if (bptr == m->m_data + m->m_len - 1) ++ return 1; /* We need two bytes */ ++ + /* + * The difference between versions 1.0 and + * 2.0 is here. For future versions of +@@ -901,6 +904,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + /* This is the field containing the port + * number that RA-player is listening to. + */ ++ ++ if (bptr == m->m_data + m->m_len - 1) ++ return 1; /* We need two bytes */ ++ + lport = (((uint8_t *)bptr)[0] << 8) + ((uint8_t *)bptr)[1]; + if (lport < 6970) + lport += 256; /* don't know why */ +-- +1.8.3.1 + diff --git a/kvm-tcp_emu-fix-unsafe-snprintf-usages.patch b/kvm-tcp_emu-fix-unsafe-snprintf-usages.patch new file mode 100755 index 0000000000000000000000000000000000000000..846da73ce823cf1c75d300a4aa98d3f664ba5fa5 --- /dev/null +++ b/kvm-tcp_emu-fix-unsafe-snprintf-usages.patch @@ -0,0 +1,149 @@ +From 9a7810c257711ce02627916d886fc1029f7a8190 Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Thu, 13 Feb 2020 15:50:49 +0000 +Subject: [PATCH 3/7] tcp_emu: fix unsafe snprintf() usages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200213155049.3936-3-jmaloy@redhat.com> +Patchwork-id: 93826 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] tcp_emu: fix unsafe snprintf() usages +Bugzilla: 1798994 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Various calls to snprintf() assume that snprintf() returns "only" the +number of bytes written (excluding terminating NUL). + +https://pubs.opengroup.org/onlinepubs/9699919799/functions/snprintf.html#tag_16_159_04 + +"Upon successful completion, the snprintf() function shall return the +number of bytes that would be written to s had n been sufficiently +large excluding the terminating null byte." + +Before patch ce131029, if there isn't enough room in "m_data" for the +"DCC ..." message, we overflow "m_data". + +After the patch, if there isn't enough room for the same, we don't +overflow "m_data", but we set "m_len" out-of-bounds. The next time an +access is bounded by "m_len", we'll have a buffer overflow then. + +Use slirp_fmt*() to fix potential OOB memory access. + +Reported-by: Laszlo Ersek +Signed-off-by: Marc-André Lureau +Reviewed-by: Samuel Thibault +Message-Id: <20200127092414.169796-7-marcandre.lureau@redhat.com> +(cherry picked from libslirp commit 68ccb8021a838066f0951d4b2817eb6b6f10a843) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/tcp_subr.c | 44 +++++++++++++++++++++----------------------- + 1 file changed, 21 insertions(+), 23 deletions(-) + +diff --git a/slirp/src/tcp_subr.c b/slirp/src/tcp_subr.c +index 954d1a6..26d4ead 100644 +--- a/slirp/src/tcp_subr.c ++++ b/slirp/src/tcp_subr.c +@@ -655,8 +655,7 @@ int tcp_emu(struct socket *so, struct mbuf *m) + NTOHS(n1); + NTOHS(n2); + m_inc(m, snprintf(NULL, 0, "%d,%d\r\n", n1, n2) + 1); +- m->m_len = snprintf(m->m_data, M_ROOM(m), "%d,%d\r\n", n1, n2); +- assert(m->m_len < M_ROOM(m)); ++ m->m_len = slirp_fmt(m->m_data, M_ROOM(m), "%d,%d\r\n", n1, n2); + } else { + *eol = '\r'; + } +@@ -696,9 +695,9 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "ORT %d,%d,%d,%d,%d,%d\r\n%s", n1, n2, n3, n4, +- n5, n6, x == 7 ? buff : ""); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "ORT %d,%d,%d,%d,%d,%d\r\n%s", ++ n1, n2, n3, n4, n5, n6, x == 7 ? buff : ""); + return 1; + } else if ((bptr = (char *)strstr(m->m_data, "27 Entering")) != NULL) { + /* +@@ -731,10 +730,9 @@ int tcp_emu(struct socket *so, struct mbuf *m) + n4 = (laddr & 0xff); + + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "27 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n%s", +- n1, n2, n3, n4, n5, n6, x == 7 ? buff : ""); +- ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "27 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n%s", ++ n1, n2, n3, n4, n5, n6, x == 7 ? buff : ""); + return 1; + } + +@@ -757,8 +755,8 @@ int tcp_emu(struct socket *so, struct mbuf *m) + if (m->m_data[m->m_len - 1] == '\0' && lport != 0 && + (so = tcp_listen(slirp, INADDR_ANY, 0, so->so_laddr.s_addr, + htons(lport), SS_FACCEPTONCE)) != NULL) +- m->m_len = snprintf(m->m_data, M_ROOM(m), +- "%d", ntohs(so->so_fport)) + 1; ++ m->m_len = slirp_fmt0(m->m_data, M_ROOM(m), ++ "%d", ntohs(so->so_fport)); + return 1; + + case EMU_IRC: +@@ -777,10 +775,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "DCC CHAT chat %lu %u%c\n", +- (unsigned long)ntohl(so->so_faddr.s_addr), +- ntohs(so->so_fport), 1); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "DCC CHAT chat %lu %u%c\n", ++ (unsigned long)ntohl(so->so_faddr.s_addr), ++ ntohs(so->so_fport), 1); + } else if (sscanf(bptr, "DCC SEND %256s %u %u %u", buff, &laddr, &lport, + &n1) == 4) { + if ((so = tcp_listen(slirp, INADDR_ANY, 0, htonl(laddr), +@@ -788,10 +786,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "DCC SEND %s %lu %u %u%c\n", buff, +- (unsigned long)ntohl(so->so_faddr.s_addr), +- ntohs(so->so_fport), n1, 1); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "DCC SEND %s %lu %u %u%c\n", buff, ++ (unsigned long)ntohl(so->so_faddr.s_addr), ++ ntohs(so->so_fport), n1, 1); + } else if (sscanf(bptr, "DCC MOVE %256s %u %u %u", buff, &laddr, &lport, + &n1) == 4) { + if ((so = tcp_listen(slirp, INADDR_ANY, 0, htonl(laddr), +@@ -799,10 +797,10 @@ int tcp_emu(struct socket *so, struct mbuf *m) + return 1; + } + m->m_len = bptr - m->m_data; /* Adjust length */ +- m->m_len += snprintf(bptr, M_FREEROOM(m), +- "DCC MOVE %s %lu %u %u%c\n", buff, +- (unsigned long)ntohl(so->so_faddr.s_addr), +- ntohs(so->so_fport), n1, 1); ++ m->m_len += slirp_fmt(bptr, M_FREEROOM(m), ++ "DCC MOVE %s %lu %u %u%c\n", buff, ++ (unsigned long)ntohl(so->so_faddr.s_addr), ++ ntohs(so->so_fport), n1, 1); + } + return 1; + +-- +1.8.3.1 + diff --git a/kvm-tests-arm-cpu-features-Check-feature-default-values.patch b/kvm-tests-arm-cpu-features-Check-feature-default-values.patch new file mode 100755 index 0000000000000000000000000000000000000000..e8a48bf0164cbd7fda24008d2c1b6f7cc8509b87 --- /dev/null +++ b/kvm-tests-arm-cpu-features-Check-feature-default-values.patch @@ -0,0 +1,106 @@ +From 323889aa2182bf39df10f1caf43f22daea2d7d37 Mon Sep 17 00:00:00 2001 +From: Andrew Jones +Date: Fri, 31 Jan 2020 14:23:12 +0000 +Subject: [PATCH 10/15] tests/arm-cpu-features: Check feature default values +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Andrew Jones +Message-id: <20200131142314.13175-4-drjones@redhat.com> +Patchwork-id: 93626 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/5] tests/arm-cpu-features: Check feature default values +Bugzilla: 1647366 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Auger Eric +RH-Acked-by: Gavin Shan + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1647366 + +Author: Andrew Jones +Date: Thu, 30 Jan 2020 16:02:06 +0000 + + tests/arm-cpu-features: Check feature default values + + If we know what the default value should be then we can test for + that as well as the feature existence. + + Signed-off-by: Andrew Jones + Reviewed-by: Richard Henderson + Message-id: 20200120101023.16030-5-drjones@redhat.com + Signed-off-by: Peter Maydell + +(cherry picked from commit 789a35efb583464f9fcd5d871a7fd6164318bb91) +Signed-off-by: Danilo C. L. de Paula +--- + tests/arm-cpu-features.c | 37 ++++++++++++++++++++++++++++--------- + 1 file changed, 28 insertions(+), 9 deletions(-) + +diff --git a/tests/arm-cpu-features.c b/tests/arm-cpu-features.c +index 6e99aa9..89285ca 100644 +--- a/tests/arm-cpu-features.c ++++ b/tests/arm-cpu-features.c +@@ -159,6 +159,25 @@ static bool resp_get_feature(QDict *resp, const char *feature) + qobject_unref(_resp); \ + }) + ++#define assert_feature(qts, cpu_type, feature, expected_value) \ ++({ \ ++ QDict *_resp, *_props; \ ++ \ ++ _resp = do_query_no_props(qts, cpu_type); \ ++ g_assert(_resp); \ ++ g_assert(resp_has_props(_resp)); \ ++ _props = resp_get_props(_resp); \ ++ g_assert(qdict_get(_props, feature)); \ ++ g_assert(qdict_get_bool(_props, feature) == (expected_value)); \ ++ qobject_unref(_resp); \ ++}) ++ ++#define assert_has_feature_enabled(qts, cpu_type, feature) \ ++ assert_feature(qts, cpu_type, feature, true) ++ ++#define assert_has_feature_disabled(qts, cpu_type, feature) \ ++ assert_feature(qts, cpu_type, feature, false) ++ + static void assert_type_full(QTestState *qts) + { + const char *error; +@@ -405,16 +424,16 @@ static void test_query_cpu_model_expansion(const void *data) + assert_error(qts, "host", "The CPU type 'host' requires KVM", NULL); + + /* Test expected feature presence/absence for some cpu types */ +- assert_has_feature(qts, "max", "pmu"); +- assert_has_feature(qts, "cortex-a15", "pmu"); ++ assert_has_feature_enabled(qts, "max", "pmu"); ++ assert_has_feature_enabled(qts, "cortex-a15", "pmu"); + assert_has_not_feature(qts, "cortex-a15", "aarch64"); + + if (g_str_equal(qtest_get_arch(), "aarch64")) { +- assert_has_feature(qts, "max", "aarch64"); +- assert_has_feature(qts, "max", "sve"); +- assert_has_feature(qts, "max", "sve128"); +- assert_has_feature(qts, "cortex-a57", "pmu"); +- assert_has_feature(qts, "cortex-a57", "aarch64"); ++ assert_has_feature_enabled(qts, "max", "aarch64"); ++ assert_has_feature_enabled(qts, "max", "sve"); ++ assert_has_feature_enabled(qts, "max", "sve128"); ++ assert_has_feature_enabled(qts, "cortex-a57", "pmu"); ++ assert_has_feature_enabled(qts, "cortex-a57", "aarch64"); + + sve_tests_default(qts, "max"); + +@@ -451,8 +470,8 @@ static void test_query_cpu_model_expansion_kvm(const void *data) + QDict *resp; + char *error; + +- assert_has_feature(qts, "host", "aarch64"); +- assert_has_feature(qts, "host", "pmu"); ++ assert_has_feature_enabled(qts, "host", "aarch64"); ++ assert_has_feature_enabled(qts, "host", "pmu"); + + assert_error(qts, "cortex-a15", + "We cannot guarantee the CPU type 'cortex-a15' works " +-- +1.8.3.1 + diff --git a/kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch b/kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch new file mode 100755 index 0000000000000000000000000000000000000000..12df637631b29eb6432949948578951c3fdaf9fc --- /dev/null +++ b/kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch @@ -0,0 +1,127 @@ +From 6d549629becb69f315dd4213f730122d19c9c566 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:54 +0100 +Subject: [PATCH 11/12] tests/bios-tables-test: add test cases for ACPI HMAT + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-11-plai@redhat.com> +Patchwork-id: 96739 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 10/11] tests/bios-tables-test: add test cases for ACPI HMAT +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Tao Xu + +ACPI table HMAT has been introduced, QEMU now builds HMAT tables for +Heterogeneous Memory with boot option '-numa node'. + +Add test cases on PC and Q35 machines with 2 numa nodes. +Because HMAT is generated when system enable numa, the +following tables need to be added for this test: + tests/data/acpi/pc/APIC.acpihmat + tests/data/acpi/pc/SRAT.acpihmat + tests/data/acpi/pc/HMAT.acpihmat + tests/data/acpi/pc/DSDT.acpihmat + tests/data/acpi/q35/APIC.acpihmat + tests/data/acpi/q35/SRAT.acpihmat + tests/data/acpi/q35/HMAT.acpihmat + tests/data/acpi/q35/DSDT.acpihmat + +Acked-by: Markus Armbruster +Reviewed-by: Igor Mammedov +Reviewed-by: Daniel Black +Reviewed-by: Jingqi Liu +Suggested-by: Igor Mammedov +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-9-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 1c8f85d93d261dc555a0aad6f54f2b5e8009d859) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + tests/bios-tables-test-allowed-diff.h | 8 +++++++ + tests/bios-tables-test.c | 44 +++++++++++++++++++++++++++++++++++ + 2 files changed, 52 insertions(+) + +diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h +index dfb8523..3c9e0c9 100644 +--- a/tests/bios-tables-test-allowed-diff.h ++++ b/tests/bios-tables-test-allowed-diff.h +@@ -1 +1,9 @@ + /* List of comma-separated changed AML files to ignore */ ++"tests/data/acpi/pc/APIC.acpihmat", ++"tests/data/acpi/pc/SRAT.acpihmat", ++"tests/data/acpi/pc/HMAT.acpihmat", ++"tests/data/acpi/pc/DSDT.acpihmat", ++"tests/data/acpi/q35/APIC.acpihmat", ++"tests/data/acpi/q35/SRAT.acpihmat", ++"tests/data/acpi/q35/HMAT.acpihmat", ++"tests/data/acpi/q35/DSDT.acpihmat", +diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c +index 79f5da0..9823820 100644 +--- a/tests/bios-tables-test.c ++++ b/tests/bios-tables-test.c +@@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) + + } + ++static void test_acpi_tcg_acpi_hmat(const char *machine) ++{ ++ test_data data; ++ ++ memset(&data, 0, sizeof(data)); ++ data.machine = machine; ++ data.variant = ".acpihmat"; ++ test_acpi_one(" -machine hmat=on" ++ " -smp 2,sockets=2" ++ " -m 128M,slots=2,maxmem=1G" ++ " -object memory-backend-ram,size=64M,id=m0" ++ " -object memory-backend-ram,size=64M,id=m1" ++ " -numa node,nodeid=0,memdev=m0" ++ " -numa node,nodeid=1,memdev=m1,initiator=0" ++ " -numa cpu,node-id=0,socket-id=0" ++ " -numa cpu,node-id=0,socket-id=1" ++ " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," ++ "data-type=access-latency,latency=1" ++ " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," ++ "data-type=access-bandwidth,bandwidth=65534M" ++ " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," ++ "data-type=access-latency,latency=65534" ++ " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," ++ "data-type=access-bandwidth,bandwidth=32767M" ++ " -numa hmat-cache,node-id=0,size=10K,level=1," ++ "associativity=direct,policy=write-back,line=8" ++ " -numa hmat-cache,node-id=1,size=10K,level=1," ++ "associativity=direct,policy=write-back,line=8", ++ &data); ++ free_test_data(&data); ++} ++ ++static void test_acpi_q35_tcg_acpi_hmat(void) ++{ ++ test_acpi_tcg_acpi_hmat(MACHINE_Q35); ++} ++ ++static void test_acpi_piix4_tcg_acpi_hmat(void) ++{ ++ test_acpi_tcg_acpi_hmat(MACHINE_PC); ++} ++ + static void test_acpi_virt_tcg(void) + { + test_data data = { +@@ -991,6 +1033,8 @@ int main(int argc, char *argv[]) + qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem); + qtest_add_func("acpi/piix4/dimmpxm", test_acpi_piix4_tcg_dimm_pxm); + qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm); ++ qtest_add_func("acpi/piix4/acpihmat", test_acpi_piix4_tcg_acpi_hmat); ++ qtest_add_func("acpi/q35/acpihmat", test_acpi_q35_tcg_acpi_hmat); + } else if (strcmp(arch, "aarch64") == 0) { + qtest_add_func("acpi/virt", test_acpi_virt_tcg); + qtest_add_func("acpi/virt/numamem", test_acpi_virt_tcg_numamem); +-- +1.8.3.1 + diff --git a/kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch b/kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch new file mode 100755 index 0000000000000000000000000000000000000000..240c4082ef9f323585803f313aab996ed5657304 --- /dev/null +++ b/kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch @@ -0,0 +1,60 @@ +From f73b18e03c6758500bf367b1575205772d1f878f Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:53:52 -0400 +Subject: [PATCH 10/42] tests/boot-sector: Fix the bad s390x assembler code + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-11-thuth@redhat.com> +Patchwork-id: 97031 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 10/38] tests/boot-sector: Fix the bad s390x assembler code +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +There are currently two bugs in s390x_code[]: First, the initial jump +uses the wrong offset, so it was jumping to 0x10014 instead of 0x10010. +Second, LHI only loads the lower 32-bit of the register. + +Everything worked fine as long as the s390-ccw bios code was jumping +here with r3 containing zeroes in the uppermost 48 bit - which just +happened to be the case so far by accident. But we can not rely on this +fact, and indeed one of the recent suggested patches to jump2ipl.c cause +the newer GCCs to put different values into r3. In that case the code +from s390x_code[] crashes very ungracefully. + +Thus let's make sure to jump to the right instruction, and use LGHI +instead of LHI to make sure that we always zero out the upper bits +of the register. + +Signed-off-by: Thomas Huth +Message-Id: <20191217150642.27946-1-thuth@redhat.com> +Reviewed-by: Christian Borntraeger +Signed-off-by: Cornelia Huck +(cherry picked from commit 5afec76fbe2c07d03fd8c9ac525140059499637a) +Signed-off-by: Danilo C. L. de Paula +--- + tests/boot-sector.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tests/boot-sector.c b/tests/boot-sector.c +index 7824286b9a..9e66c6d013 100644 +--- a/tests/boot-sector.c ++++ b/tests/boot-sector.c +@@ -75,11 +75,11 @@ static const uint8_t s390x_psw_and_magic[] = { + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* in the s390-ccw bios */ + }; + static const uint8_t s390x_code[] = { +- 0xa7, 0xf4, 0x00, 0x0a, /* j 0x10010 */ ++ 0xa7, 0xf4, 0x00, 0x08, /* j 0x10010 */ + 0x00, 0x00, 0x00, 0x00, + 'S', '3', '9', '0', + 'E', 'P', 0x00, 0x01, +- 0xa7, 0x38, HIGH(SIGNATURE_ADDR), LOW(SIGNATURE_ADDR), /* lhi r3,0x7c10 */ ++ 0xa7, 0x39, HIGH(SIGNATURE_ADDR), LOW(SIGNATURE_ADDR), /* lghi r3,0x7c10 */ + 0xa7, 0x48, LOW(SIGNATURE), HIGH(SIGNATURE), /* lhi r4,0xadde */ + 0x40, 0x40, 0x30, 0x00, /* sth r4,0(r3) */ + 0xa7, 0xf4, 0xff, 0xfa /* j 0x10010 */ +-- +2.27.0 + diff --git a/kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch b/kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch new file mode 100755 index 0000000000000000000000000000000000000000..41ee71ca96d4e4b8ea204419f1242412fab15089 --- /dev/null +++ b/kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch @@ -0,0 +1,266 @@ +From 0f11aae02dcabd3a5ee0b5946aec39da6dddea52 Mon Sep 17 00:00:00 2001 +From: "plai@redhat.com" +Date: Thu, 21 May 2020 23:56:53 +0100 +Subject: [PATCH 10/12] tests/numa: Add case for QMP build HMAT + +RH-Author: plai@redhat.com +Message-id: <20200521235655.27141-10-plai@redhat.com> +Patchwork-id: 96735 +O-Subject: [RHEL8.2.1 AV qemu-kvm PATCH 09/11] tests/numa: Add case for QMP build HMAT +Bugzilla: 1600217 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Igor Mammedov +RH-Acked-by: Eduardo Habkost + +From: Tao Xu + +Check configuring HMAT usecase + +Acked-by: Markus Armbruster +Suggested-by: Igor Mammedov +Signed-off-by: Tao Xu +Message-Id: <20191213011929.2520-8-tao3.xu@intel.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Igor Mammedov +(cherry picked from commit d00817c944ed15fbe4a61d44fe7f9fe166c7df88) +Signed-off-by: Paul Lai +Signed-off-by: Danilo C. L. de Paula +--- + tests/numa-test.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 213 insertions(+) + +diff --git a/tests/numa-test.c b/tests/numa-test.c +index 8de8581..17dd807 100644 +--- a/tests/numa-test.c ++++ b/tests/numa-test.c +@@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) + qtest_quit(qs); + } + ++static void pc_hmat_build_cfg(const void *data) ++{ ++ QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " ++ "-smp 2,sockets=2 " ++ "-m 128M,slots=2,maxmem=1G " ++ "-object memory-backend-ram,size=64M,id=m0 " ++ "-object memory-backend-ram,size=64M,id=m1 " ++ "-numa node,nodeid=0,memdev=m0 " ++ "-numa node,nodeid=1,memdev=m1,initiator=0 " ++ "-numa cpu,node-id=0,socket-id=0 " ++ "-numa cpu,node-id=0,socket-id=1", ++ data ? (char *)data : ""); ++ ++ /* Fail: Initiator should be less than the number of nodes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); ++ ++ /* Fail: Target should be less than the number of nodes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); ++ ++ /* Fail: Initiator should contain cpu */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); ++ ++ /* Fail: Data-type mismatch */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"write-latency\"," ++ " 'bandwidth': 524288000 } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\"," ++ " 'latency': 5 } }"))); ++ ++ /* Fail: Bandwidth should be 1MB (1048576) aligned */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 1048575 } }"))); ++ ++ /* Configuring HMAT bandwidth and latency details */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 1 } }"))); /* 1 ns */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 5 } }"))); /* Fail: Duplicate configuration */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 68717379584 } }"))); /* 65534 MB/s */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 65534 } }"))); /* 65534 ns */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 34358689792 } }"))); /* 32767 MB/s */ ++ ++ /* Fail: node_id should be less than the number of nodes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 2, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* Fail: level should be less than HMAT_LB_LEVELS (4) */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 4, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* Fail: associativity option should be 'none', if level is 0 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"direct\", 'policy': \"none\"," ++ " 'line': 0 } }"))); ++ /* Fail: policy option should be 'none', if level is 0 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"none\", 'policy': \"write-back\"," ++ " 'line': 0 } }"))); ++ /* Fail: line option should be 0, if level is 0 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"none\", 'policy': \"none\"," ++ " 'line': 8 } }"))); ++ ++ /* Configuring HMAT memory side cache attributes */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); /* Fail: Duplicate configuration */ ++ /* Fail: The size of level 2 size should be small than level 1 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 2, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ /* Fail: The size of level 0 size should be larger than level 1 */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 0, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 1, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* let machine initialization to complete and run */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, ++ "{ 'execute': 'x-exit-preconfig' }"))); ++ qtest_qmp_eventwait(qs, "RESUME"); ++ ++ qtest_quit(qs); ++} ++ ++static void pc_hmat_off_cfg(const void *data) ++{ ++ QTestState *qs = qtest_initf("%s -nodefaults --preconfig " ++ "-smp 2,sockets=2 " ++ "-m 128M,slots=2,maxmem=1G " ++ "-object memory-backend-ram,size=64M,id=m0 " ++ "-object memory-backend-ram,size=64M,id=m1 " ++ "-numa node,nodeid=0,memdev=m0", ++ data ? (char *)data : ""); ++ ++ /* ++ * Fail: Enable HMAT with -machine hmat=on ++ * before using any of hmat specific options ++ */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\"," ++ " 'initiator': 0 } }"))); ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'node', 'nodeid': 1, 'memdev': \"m1\" } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 1 } }"))); ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* let machine initialization to complete and run */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, ++ "{ 'execute': 'x-exit-preconfig' }"))); ++ qtest_qmp_eventwait(qs, "RESUME"); ++ ++ qtest_quit(qs); ++} ++ ++static void pc_hmat_erange_cfg(const void *data) ++{ ++ QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " ++ "-smp 2,sockets=2 " ++ "-m 128M,slots=2,maxmem=1G " ++ "-object memory-backend-ram,size=64M,id=m0 " ++ "-object memory-backend-ram,size=64M,id=m1 " ++ "-numa node,nodeid=0,memdev=m0 " ++ "-numa node,nodeid=1,memdev=m1,initiator=0 " ++ "-numa cpu,node-id=0,socket-id=0 " ++ "-numa cpu,node-id=0,socket-id=1", ++ data ? (char *)data : ""); ++ ++ /* Can't store the compressed latency */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 1 } }"))); /* 1 ns */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-latency\"," ++ " 'latency': 65535 } }"))); /* 65535 ns */ ++ ++ /* Test the 0 input (bandwidth not provided) */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 0 } }"))); /* 0 MB/s */ ++ /* Fail: bandwidth should be provided before memory side cache attributes */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-cache', 'node-id': 0, 'size': 10240," ++ " 'level': 1, 'associativity': \"direct\", 'policy': \"write-back\"," ++ " 'line': 8 } }"))); ++ ++ /* Can't store the compressed bandwidth */ ++ g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," ++ " 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 1," ++ " 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," ++ " 'bandwidth': 68718428160 } }"))); /* 65535 MB/s */ ++ ++ /* let machine initialization to complete and run */ ++ g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, ++ "{ 'execute': 'x-exit-preconfig' }"))); ++ qtest_qmp_eventwait(qs, "RESUME"); ++ ++ qtest_quit(qs); ++} ++ + int main(int argc, char **argv) + { + const char *args = NULL; +@@ -346,6 +556,9 @@ int main(int argc, char **argv) + if (!strcmp(arch, "i386") || !strcmp(arch, "x86_64")) { + qtest_add_data_func("/numa/pc/cpu/explicit", args, pc_numa_cpu); + qtest_add_data_func("/numa/pc/dynamic/cpu", args, pc_dynamic_cpu_cfg); ++ qtest_add_data_func("/numa/pc/hmat/build", args, pc_hmat_build_cfg); ++ qtest_add_data_func("/numa/pc/hmat/off", args, pc_hmat_off_cfg); ++ qtest_add_data_func("/numa/pc/hmat/erange", args, pc_hmat_erange_cfg); + } + + if (!strcmp(arch, "ppc64")) { +-- +1.8.3.1 + diff --git a/kvm-tftp-check-tftp_input-buffer-size.patch b/kvm-tftp-check-tftp_input-buffer-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..85ed81107514d7dabf2cd66c2ff8d585fe6f5a24 --- /dev/null +++ b/kvm-tftp-check-tftp_input-buffer-size.patch @@ -0,0 +1,53 @@ +From 6bd4d80f9274f76eb402ce85aa60729150b39980 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:34 -0400 +Subject: [PATCH 09/14] tftp: check tftp_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-6-marcandre.lureau@redhat.com> +Patchwork-id: 101823 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 5/8] tftp: check tftp_input buffer size +Bugzilla: 1970843 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3595 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/46 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970843 + +(cherry picked from commit 3f17948137155f025f7809fdc38576d5d2451c3d) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tftp.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/slirp/src/tftp.c b/slirp/src/tftp.c +index 093c2e06a3..07e8f3cb2f 100644 +--- a/slirp/src/tftp.c ++++ b/slirp/src/tftp.c +@@ -444,7 +444,11 @@ static void tftp_handle_error(Slirp *slirp, struct sockaddr_storage *srcsas, + + void tftp_input(struct sockaddr_storage *srcsas, struct mbuf *m) + { +- struct tftp_t *tp = (struct tftp_t *)m->m_data; ++ struct tftp_t *tp = mtod_check(m, offsetof(struct tftp_t, x.tp_buf)); ++ ++ if (tp == NULL) { ++ return; ++ } + + switch (ntohs(tp->tp_op)) { + case TFTP_RRQ: +-- +2.27.0 + diff --git a/kvm-tftp-introduce-a-header-structure.patch b/kvm-tftp-introduce-a-header-structure.patch new file mode 100755 index 0000000000000000000000000000000000000000..d8c8ddb568e64be17e70739343ad52db680b362a --- /dev/null +++ b/kvm-tftp-introduce-a-header-structure.patch @@ -0,0 +1,263 @@ +From af72e344459614fcf2746739f05494ef7e691a78 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:36 -0400 +Subject: [PATCH 10/14] tftp: introduce a header structure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-7-marcandre.lureau@redhat.com> +Patchwork-id: 101825 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 6/8] tftp: introduce a header structure +Bugzilla: 1970819 1970835 1970843 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Instead of using a composed structure and potentially reading past the +incoming buffer, use a different structure for the header. + +Signed-off-by: Marc-André Lureau + +(cherry picked from commit 990163cf3ac86b7875559f49602c4d76f46f6f30) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/tftp.c | 58 +++++++++++++++++++++++++----------------------- + slirp/src/tftp.h | 6 ++++- + 2 files changed, 35 insertions(+), 29 deletions(-) + +diff --git a/slirp/src/tftp.c b/slirp/src/tftp.c +index 07e8f3cb2f..53e04d0aeb 100644 +--- a/slirp/src/tftp.c ++++ b/slirp/src/tftp.c +@@ -50,7 +50,7 @@ static void tftp_session_terminate(struct tftp_session *spt) + } + + static int tftp_session_allocate(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp) ++ struct tftphdr *hdr) + { + struct tftp_session *spt; + int k; +@@ -75,7 +75,7 @@ found: + memcpy(&spt->client_addr, srcsas, sockaddr_size(srcsas)); + spt->fd = -1; + spt->block_size = 512; +- spt->client_port = tp->udp.uh_sport; ++ spt->client_port = hdr->udp.uh_sport; + spt->slirp = slirp; + + tftp_session_update(spt); +@@ -84,7 +84,7 @@ found: + } + + static int tftp_session_find(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp) ++ struct tftphdr *hdr) + { + struct tftp_session *spt; + int k; +@@ -94,7 +94,7 @@ static int tftp_session_find(Slirp *slirp, struct sockaddr_storage *srcsas, + + if (tftp_session_in_use(spt)) { + if (sockaddr_equal(&spt->client_addr, srcsas)) { +- if (spt->client_port == tp->udp.uh_sport) { ++ if (spt->client_port == hdr->udp.uh_sport) { + return k; + } + } +@@ -146,13 +146,13 @@ static struct tftp_t *tftp_prep_mbuf_data(struct tftp_session *spt, + } + + static void tftp_udp_output(struct tftp_session *spt, struct mbuf *m, +- struct tftp_t *recv_tp) ++ struct tftphdr *hdr) + { + if (spt->client_addr.ss_family == AF_INET6) { + struct sockaddr_in6 sa6, da6; + + sa6.sin6_addr = spt->slirp->vhost_addr6; +- sa6.sin6_port = recv_tp->udp.uh_dport; ++ sa6.sin6_port = hdr->udp.uh_dport; + da6.sin6_addr = ((struct sockaddr_in6 *)&spt->client_addr)->sin6_addr; + da6.sin6_port = spt->client_port; + +@@ -161,7 +161,7 @@ static void tftp_udp_output(struct tftp_session *spt, struct mbuf *m, + struct sockaddr_in sa4, da4; + + sa4.sin_addr = spt->slirp->vhost_addr; +- sa4.sin_port = recv_tp->udp.uh_dport; ++ sa4.sin_port = hdr->udp.uh_dport; + da4.sin_addr = ((struct sockaddr_in *)&spt->client_addr)->sin_addr; + da4.sin_port = spt->client_port; + +@@ -183,7 +183,7 @@ static int tftp_send_oack(struct tftp_session *spt, const char *keys[], + + tp = tftp_prep_mbuf_data(spt, m); + +- tp->tp_op = htons(TFTP_OACK); ++ tp->hdr.tp_op = htons(TFTP_OACK); + for (i = 0; i < nb; i++) { + n += snprintf(tp->x.tp_buf + n, sizeof(tp->x.tp_buf) - n, "%s", + keys[i]) + +@@ -195,7 +195,7 @@ static int tftp_send_oack(struct tftp_session *spt, const char *keys[], + + m->m_len = sizeof(struct tftp_t) - (TFTP_BLOCKSIZE_MAX + 2) + n - + sizeof(struct udphdr); +- tftp_udp_output(spt, m, recv_tp); ++ tftp_udp_output(spt, m, &recv_tp->hdr); + + return 0; + } +@@ -216,21 +216,21 @@ static void tftp_send_error(struct tftp_session *spt, uint16_t errorcode, + + tp = tftp_prep_mbuf_data(spt, m); + +- tp->tp_op = htons(TFTP_ERROR); ++ tp->hdr.tp_op = htons(TFTP_ERROR); + tp->x.tp_error.tp_error_code = htons(errorcode); + slirp_pstrcpy((char *)tp->x.tp_error.tp_msg, sizeof(tp->x.tp_error.tp_msg), + msg); + + m->m_len = sizeof(struct tftp_t) - (TFTP_BLOCKSIZE_MAX + 2) + 3 + + strlen(msg) - sizeof(struct udphdr); +- tftp_udp_output(spt, m, recv_tp); ++ tftp_udp_output(spt, m, &recv_tp->hdr); + + out: + tftp_session_terminate(spt); + } + + static void tftp_send_next_block(struct tftp_session *spt, +- struct tftp_t *recv_tp) ++ struct tftphdr *hdr) + { + struct mbuf *m; + struct tftp_t *tp; +@@ -244,7 +244,7 @@ static void tftp_send_next_block(struct tftp_session *spt, + + tp = tftp_prep_mbuf_data(spt, m); + +- tp->tp_op = htons(TFTP_DATA); ++ tp->hdr.tp_op = htons(TFTP_DATA); + tp->x.tp_data.tp_block_nr = htons((spt->block_nr + 1) & 0xffff); + + nobytes = tftp_read_data(spt, spt->block_nr, tp->x.tp_data.tp_buf, +@@ -262,7 +262,7 @@ static void tftp_send_next_block(struct tftp_session *spt, + + m->m_len = sizeof(struct tftp_t) - (TFTP_BLOCKSIZE_MAX - nobytes) - + sizeof(struct udphdr); +- tftp_udp_output(spt, m, recv_tp); ++ tftp_udp_output(spt, m, hdr); + + if (nobytes == spt->block_size) { + tftp_session_update(spt); +@@ -285,12 +285,12 @@ static void tftp_handle_rrq(Slirp *slirp, struct sockaddr_storage *srcsas, + int nb_options = 0; + + /* check if a session already exists and if so terminate it */ +- s = tftp_session_find(slirp, srcsas, tp); ++ s = tftp_session_find(slirp, srcsas, &tp->hdr); + if (s >= 0) { + tftp_session_terminate(&slirp->tftp_sessions[s]); + } + +- s = tftp_session_allocate(slirp, srcsas, tp); ++ s = tftp_session_allocate(slirp, srcsas, &tp->hdr); + + if (s < 0) { + return; +@@ -411,29 +411,29 @@ static void tftp_handle_rrq(Slirp *slirp, struct sockaddr_storage *srcsas, + } + + spt->block_nr = 0; +- tftp_send_next_block(spt, tp); ++ tftp_send_next_block(spt, &tp->hdr); + } + + static void tftp_handle_ack(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp, int pktlen) ++ struct tftphdr *hdr) + { + int s; + +- s = tftp_session_find(slirp, srcsas, tp); ++ s = tftp_session_find(slirp, srcsas, hdr); + + if (s < 0) { + return; + } + +- tftp_send_next_block(&slirp->tftp_sessions[s], tp); ++ tftp_send_next_block(&slirp->tftp_sessions[s], hdr); + } + + static void tftp_handle_error(Slirp *slirp, struct sockaddr_storage *srcsas, +- struct tftp_t *tp, int pktlen) ++ struct tftphdr *hdr) + { + int s; + +- s = tftp_session_find(slirp, srcsas, tp); ++ s = tftp_session_find(slirp, srcsas, hdr); + + if (s < 0) { + return; +@@ -444,23 +444,25 @@ static void tftp_handle_error(Slirp *slirp, struct sockaddr_storage *srcsas, + + void tftp_input(struct sockaddr_storage *srcsas, struct mbuf *m) + { +- struct tftp_t *tp = mtod_check(m, offsetof(struct tftp_t, x.tp_buf)); ++ struct tftphdr *hdr = mtod_check(m, sizeof(struct tftphdr)); + +- if (tp == NULL) { ++ if (hdr == NULL) { + return; + } + +- switch (ntohs(tp->tp_op)) { ++ switch (ntohs(hdr->tp_op)) { + case TFTP_RRQ: +- tftp_handle_rrq(m->slirp, srcsas, tp, m->m_len); ++ tftp_handle_rrq(m->slirp, srcsas, ++ mtod(m, struct tftp_t *), ++ m->m_len); + break; + + case TFTP_ACK: +- tftp_handle_ack(m->slirp, srcsas, tp, m->m_len); ++ tftp_handle_ack(m->slirp, srcsas, hdr); + break; + + case TFTP_ERROR: +- tftp_handle_error(m->slirp, srcsas, tp, m->m_len); ++ tftp_handle_error(m->slirp, srcsas, hdr); + break; + } + } +diff --git a/slirp/src/tftp.h b/slirp/src/tftp.h +index c47bb43c7d..021f6cf109 100644 +--- a/slirp/src/tftp.h ++++ b/slirp/src/tftp.h +@@ -18,9 +18,13 @@ + #define TFTP_FILENAME_MAX 512 + #define TFTP_BLOCKSIZE_MAX 1428 + +-struct tftp_t { ++struct tftphdr { + struct udphdr udp; + uint16_t tp_op; ++} SLIRP_PACKED; ++ ++struct tftp_t { ++ struct tftphdr hdr; + union { + struct { + uint16_t tp_block_nr; +-- +2.27.0 + diff --git a/kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch b/kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch new file mode 100755 index 0000000000000000000000000000000000000000..3efef479757d24edba839648bb9278c909a347b0 --- /dev/null +++ b/kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch @@ -0,0 +1,55 @@ +From e483eea891139ee38138381ba6715b3a2be050cc Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:12 +0000 +Subject: [PATCH 16/18] tools/virtiofsd/fuse_lowlevel: Fix + fuse_out_header::error value +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-6-dgilbert@redhat.com> +Patchwork-id: 94128 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 5/7] tools/virtiofsd/fuse_lowlevel: Fix fuse_out_header::error value +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: Philippe Mathieu-Daudé + +Fix warning reported by Clang static code analyzer: + + CC tools/virtiofsd/fuse_lowlevel.o + tools/virtiofsd/fuse_lowlevel.c:195:9: warning: Value stored to 'error' is never read + error = -ERANGE; + ^ ~~~~~~~ + +Fixes: 3db2876 +Reported-by: Clang Static Analyzer +Reviewed-by: Ján Tomko +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 09c086b2a144324199f99a7d4de78c3276a486c1) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_lowlevel.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 704c036..2dd36ec 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -192,7 +192,7 @@ int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + + if (error <= -1000 || error > 0) { + fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); +- error = -ERANGE; ++ out.error = -ERANGE; + } + + iov[0].iov_base = &out; +-- +1.8.3.1 + diff --git a/kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch b/kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch new file mode 100755 index 0000000000000000000000000000000000000000..6af549a693772f43e31bc2de611c1b15b5494c67 --- /dev/null +++ b/kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch @@ -0,0 +1,56 @@ +From 8ce8ccc2a22798a89bac06a37427c3a3cea91a62 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:54 +0100 +Subject: [PATCH 3/9] tools/virtiofsd/passthrough_ll: Fix double close() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-2-dgilbert@redhat.com> +Patchwork-id: 96269 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/7] tools/virtiofsd/passthrough_ll: Fix double close() +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michael S. Tsirkin + +From: Philippe Mathieu-Daudé + +On success, the fdopendir() call closes fd. Later on the error +path we try to close an already-closed fd. This can lead to +use-after-free. Fix by only closing the fd if the fdopendir() +call failed. + +Cc: qemu-stable@nongnu.org +Fixes: b39bce121b (add dirp_map to hide lo_dirp pointers) +Reported-by: Coverity (CID 1421933 USE_AFTER_FREE) +Suggested-by: Peter Maydell +Signed-off-by: Philippe Mathieu-Daudé +Message-Id: <20200321120654.7985-1-philmd@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e1cd92d95cd4f97b3464c4e08cd5b22bf5ca05cb) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9cba3f1..50ff672 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1524,8 +1524,7 @@ out_err: + if (d) { + if (d->dp) { + closedir(d->dp); +- } +- if (fd != -1) { ++ } else if (fd != -1) { + close(fd); + } + free(d); +-- +1.8.3.1 + diff --git a/kvm-tpm-ppi-page-align-PPI-RAM.patch b/kvm-tpm-ppi-page-align-PPI-RAM.patch new file mode 100755 index 0000000000000000000000000000000000000000..32c971dc06ca5ca0a6c2f4d2ecfbbb0374cd4424 --- /dev/null +++ b/kvm-tpm-ppi-page-align-PPI-RAM.patch @@ -0,0 +1,58 @@ +From 7cb1c5e1416de9a09180f0930d2a216c77e8cdbd Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 30 Jan 2020 16:01:10 +0000 +Subject: [PATCH 07/15] tpm-ppi: page-align PPI RAM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20200130160110.126086-1-marcandre.lureau@redhat.com> +Patchwork-id: 93600 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH] tpm-ppi: page-align PPI RAM +Bugzilla: 1787444 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Philippe Mathieu-Daudé + +post-copy migration fails on destination with error such as: +2019-12-26T10:22:44.714644Z qemu-kvm: ram_block_discard_range: +Unaligned start address: 0x559d2afae9a0 + +Use qemu_memalign() to constrain the PPI RAM memory alignment. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Marc-André Lureau +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Stefan Berger +Signed-off-by: Stefan Berger +Message-id: 20200103074000.1006389-3-marcandre.lureau@redhat.com + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1787444 +Brew: http://brewweb.devel.redhat.com/brew/taskinfo?taskID=26122940 + +(cherry picked from commit 71e415c8a75c130875f14d6b2136825789feb297) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + hw/tpm/tpm_ppi.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/tpm/tpm_ppi.c b/hw/tpm/tpm_ppi.c +index ff31459..6d9c1a3 100644 +--- a/hw/tpm/tpm_ppi.c ++++ b/hw/tpm/tpm_ppi.c +@@ -43,7 +43,8 @@ void tpm_ppi_reset(TPMPPI *tpmppi) + void tpm_ppi_init(TPMPPI *tpmppi, struct MemoryRegion *m, + hwaddr addr, Object *obj) + { +- tpmppi->buf = g_malloc0(HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE)); ++ tpmppi->buf = qemu_memalign(qemu_real_host_page_size, ++ HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE)); + memory_region_init_ram_device_ptr(&tpmppi->ram, obj, "tpm-ppi", + TPM_PPI_ADDR_SIZE, tpmppi->buf); + vmstate_register_ram(&tpmppi->ram, DEVICE(obj)); +-- +1.8.3.1 + diff --git a/kvm-trace-update-qemu-trace-stap-to-Python-3.patch b/kvm-trace-update-qemu-trace-stap-to-Python-3.patch new file mode 100755 index 0000000000000000000000000000000000000000..c49aecdc1af7bc9cc6f4605ee9370f260d88dc5e --- /dev/null +++ b/kvm-trace-update-qemu-trace-stap-to-Python-3.patch @@ -0,0 +1,82 @@ +From e7cdcd1e39c4c030a32c9e8ef79316eae8555bc8 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 16 Jan 2020 17:52:48 +0000 +Subject: [PATCH 04/15] trace: update qemu-trace-stap to Python 3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +Message-id: <20200116175248.286556-2-stefanha@redhat.com> +Patchwork-id: 93365 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] trace: update qemu-trace-stap to Python 3 +Bugzilla: 1787395 +RH-Acked-by: John Snow +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Dr. David Alan Gilbert + +qemu-trace-stap does not support Python 3 yet: + + $ scripts/qemu-trace-stap list path/to/qemu-system-x86_64 + Traceback (most recent call last): + File "scripts/qemu-trace-stap", line 175, in + main() + File "scripts/qemu-trace-stap", line 171, in main + args.func(args) + File "scripts/qemu-trace-stap", line 118, in cmd_list + print_probes(args.verbose, "*") + File "scripts/qemu-trace-stap", line 114, in print_probes + if line.startswith(prefix): + TypeError: startswith first arg must be bytes or a tuple of bytes, not str + +Now that QEMU requires Python 3.5 or later we can switch to pure Python +3. Use Popen()'s universal_newlines=True argument to treat stdout as +text instead of binary. + +Fixes: 62dd1048c0bd ("trace: add ability to do simple printf logging via systemtap") +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1787395 +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Message-id: 20200107112438.383958-1-stefanha@redhat.com +Message-Id: <20200107112438.383958-1-stefanha@redhat.com> +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 3f0097169bb60268cc5dda0c5ea47c31ab57b22f) +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Danilo C. L. de Paula +--- + scripts/qemu-trace-stap | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/scripts/qemu-trace-stap b/scripts/qemu-trace-stap +index 91d1051..90527eb 100755 +--- a/scripts/qemu-trace-stap ++++ b/scripts/qemu-trace-stap +@@ -1,4 +1,4 @@ +-#!/usr/bin/python ++#!/usr/bin/env python3 + # -*- python -*- + # + # Copyright (C) 2019 Red Hat, Inc +@@ -18,8 +18,6 @@ + # You should have received a copy of the GNU General Public License + # along with this program; if not, see . + +-from __future__ import print_function +- + import argparse + import copy + import os.path +@@ -104,7 +102,9 @@ def cmd_list(args): + if verbose: + print("Listing probes with name '%s'" % script) + proc = subprocess.Popen(["stap", "-l", script], +- stdout=subprocess.PIPE, env=tapset_env(tapsets)) ++ stdout=subprocess.PIPE, ++ universal_newlines=True, ++ env=tapset_env(tapsets)) + out, err = proc.communicate() + if proc.returncode != 0: + print("No probes found, are the tapsets installed in %s" % tapset_dir(args.binary)) +-- +1.8.3.1 + diff --git a/kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch b/kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch new file mode 100755 index 0000000000000000000000000000000000000000..059445bb7dbcd8100d11234a59eefcfd9fdc93e8 --- /dev/null +++ b/kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch @@ -0,0 +1,116 @@ +From ba3068eb1a349ec4ed8b7ccdae76450f0c315be9 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 19 Nov 2020 17:23:11 -0500 +Subject: [PATCH 18/18] trace: use STAP_SDT_V2 to work around symbol visibility +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +Message-id: <20201119172311.942629-2-stefanha@redhat.com> +Patchwork-id: 99779 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/1] trace: use STAP_SDT_V2 to work around symbol visibility +Bugzilla: 1898700 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Gerd Hoffmann +RH-Acked-by: Philippe Mathieu-Daudé + +QEMU binaries no longer launch successfully with recent SystemTap +releases. This is because modular QEMU builds link the sdt semaphores +into the main binary instead of into the shared objects where they are +used. The symbol visibility of semaphores is 'hidden' and the dynamic +linker prints an error during module loading: + + $ ./configure --enable-trace-backends=dtrace --enable-modules ... + ... + Failed to open module: /builddir/build/BUILD/qemu-4.2.0/s390x-softmmu/../block-curl.so: undefined symbol: qemu_curl_close_semaphore + +The long-term solution is to generate per-module dtrace .o files and +link them into the module instead of the main binary. + +In the short term we can define STAP_SDT_V2 so dtrace(1) produces a .o +file with 'default' symbol visibility instead of 'hidden'. This +workaround is small and easier to merge for QEMU 5.2 and downstream +backports. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1898700 +Cc: wcohen@redhat.com +Cc: fche@redhat.com +Cc: kraxel@redhat.com +Cc: rjones@redhat.com +Cc: ddepaula@redhat.com +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Miroslav Rezanina + +(cherry picked from commit 4b265c79a85bb35abe19aacea6954c1616521639) +Signed-off-by: Stefan Hajnoczi + +Conflicts: + trace/meson.build + Downstream uses makefiles, so move the dtrace invocation changes to + rules.mak and Makefile. +Signed-off-by: Danilo C. L. de Paula +--- + Makefile | 4 ++-- + configure | 7 +++++++ + rules.mak | 2 +- + 3 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/Makefile b/Makefile +index ff05c309497..29b01a13ee3 100644 +--- a/Makefile ++++ b/Makefile +@@ -198,7 +198,7 @@ tracetool-y += $(shell find $(SRC_PATH)/scripts/tracetool -name "*.py") + $< > $@,"GEN","$(@:%-timestamp=%)") + + %/trace-dtrace.h: %/trace-dtrace.dtrace $(tracetool-y) +- $(call quiet-command,dtrace -o $@ -h -s $<, "GEN","$@") ++ $(call quiet-command,dtrace -o $@ -DSTAP_SDT_V2 -h -s $<, "GEN","$@") + + %/trace-dtrace.o: %/trace-dtrace.dtrace $(tracetool-y) + +@@ -258,7 +258,7 @@ trace-dtrace-root.dtrace-timestamp: $(SRC_PATH)/trace-events $(BUILD_DIR)/config + $< > $@,"GEN","$(@:%-timestamp=%)") + + trace-dtrace-root.h: trace-dtrace-root.dtrace +- $(call quiet-command,dtrace -o $@ -h -s $<, "GEN","$@") ++ $(call quiet-command,dtrace -o $@ -DSTAP_SDT_V2 -h -s $<, "GEN","$@") + + trace-dtrace-root.o: trace-dtrace-root.dtrace + +diff --git a/configure b/configure +index 5120c1409a7..c62b61403f6 100755 +--- a/configure ++++ b/configure +@@ -5275,6 +5275,13 @@ if have_backend "dtrace"; then + trace_backend_stap="no" + if has 'stap' ; then + trace_backend_stap="yes" ++ ++ # Workaround to avoid dtrace(1) producing a file with 'hidden' symbol ++ # visibility. Define STAP_SDT_V2 to produce 'default' symbol visibility ++ # instead. QEMU --enable-modules depends on this because the SystemTap ++ # semaphores are linked into the main binary and not the module's shared ++ # object. ++ QEMU_CFLAGS="$QEMU_CFLAGS -DSTAP_SDT_V2" + fi + fi + +diff --git a/rules.mak b/rules.mak +index 967295dd2b6..bdfc223a5a1 100644 +--- a/rules.mak ++++ b/rules.mak +@@ -101,7 +101,7 @@ LINK = $(call quiet-command, $(LINKPROG) $(QEMU_LDFLAGS) $(QEMU_CFLAGS) $(CFLAGS + -c -o $@ $<,"OBJC","$(TARGET_DIR)$@") + + %.o: %.dtrace +- $(call quiet-command,dtrace -o $@ -G -s $<,"GEN","$(TARGET_DIR)$@") ++ $(call quiet-command,dtrace -o $@ -DSTAP_SDT_V2 -G -s $<,"GEN","$(TARGET_DIR)$@") + + DSO_OBJ_CFLAGS := -fPIC -DBUILD_DSO + module-common.o: CFLAGS += $(DSO_OBJ_CFLAGS) +-- +2.27.0 + diff --git a/kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch b/kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch new file mode 100755 index 0000000000000000000000000000000000000000..4da71cccd35f4332ae2c3acf81fb08ed42c12a59 --- /dev/null +++ b/kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch @@ -0,0 +1,53 @@ +From 87cacc268f37758553ad93fefa8b312ed0bd2520 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 29 Jun 2021 03:42:43 -0400 +Subject: [PATCH 5/9] tx_pkt: switch to use qemu_receive_packet_iov() for + loopback +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210629034247.3286477-6-jmaloy@redhat.com> +Patchwork-id: 101788 +O-Subject: [RHEL-8.4.0.z qemu-kvm PATCH v2 5/9] tx_pkt: switch to use qemu_receive_packet_iov() for loopback +Bugzilla: 1932917 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth + +From: Jason Wang + +This patch switches to use qemu_receive_receive_iov() which can detect +reentrancy and return early. + +This is intended to address CVE-2021-3416. + +Cc: Prasad J Pandit +Cc: qemu-stable@nongnu.org +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Jason Wang + +(cherry picked from commit 8c552542b81e56ff532dd27ec6e5328954bdda73) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/net_tx_pkt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c +index 54d4c3bbd0..646cdfaf4d 100644 +--- a/hw/net/net_tx_pkt.c ++++ b/hw/net/net_tx_pkt.c +@@ -544,7 +544,7 @@ static inline void net_tx_pkt_sendv(struct NetTxPkt *pkt, + NetClientState *nc, const struct iovec *iov, int iov_cnt) + { + if (pkt->is_loopback) { +- nc->info->receive_iov(nc, iov, iov_cnt); ++ qemu_receive_packet_iov(nc, iov, iov_cnt); + } else { + qemu_sendv_packet(nc, iov, iov_cnt); + } +-- +2.27.0 + diff --git a/kvm-udp-check-upd_input-buffer-size.patch b/kvm-udp-check-upd_input-buffer-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..0f3c6f3e24bf1201a54a70942364780e7d8cb268 --- /dev/null +++ b/kvm-udp-check-upd_input-buffer-size.patch @@ -0,0 +1,52 @@ +From 1b8aa33b218a8ff3e8aa2f1b6875df40fd70f0ed Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:40 -0400 +Subject: [PATCH 11/14] udp: check upd_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-8-marcandre.lureau@redhat.com> +Patchwork-id: 101826 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 7/8] udp: check upd_input buffer size +Bugzilla: 1970853 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3594 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/47 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970853 + +(cherry picked from commit 74572be49247c8c5feae7c6e0b50c4f569ca9824) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/udp.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/slirp/src/udp.c b/slirp/src/udp.c +index ae23ba4b2a..86142bba14 100644 +--- a/slirp/src/udp.c ++++ b/slirp/src/udp.c +@@ -90,7 +90,10 @@ void udp_input(register struct mbuf *m, int iphlen) + /* + * Get IP and UDP header together in first mbuf. + */ +- ip = mtod(m, struct ip *); ++ ip = mtod_check(m, iphlen + sizeof(struct udphdr)); ++ if (ip == NULL) { ++ goto bad; ++ } + uh = (struct udphdr *)((char *)ip + iphlen); + + /* +-- +2.27.0 + diff --git a/kvm-upd6-check-udp6_input-buffer-size.patch b/kvm-upd6-check-udp6_input-buffer-size.patch new file mode 100755 index 0000000000000000000000000000000000000000..2aa3a2431aae199fdec6e3f36d113997272955e1 --- /dev/null +++ b/kvm-upd6-check-udp6_input-buffer-size.patch @@ -0,0 +1,52 @@ +From 6808086932ddc83fd748c46fea495e7004299b55 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 29 Jul 2021 04:56:31 -0400 +Subject: [PATCH 08/14] upd6: check udp6_input buffer size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20210708082537.1550263-5-marcandre.lureau@redhat.com> +Patchwork-id: 101822 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 4/8] upd6: check udp6_input buffer size +Bugzilla: 1970835 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Fixes: CVE-2021-3593 +Fixes: https://gitlab.freedesktop.org/slirp/libslirp/-/issues/45 + +Signed-off-by: Marc-André Lureau + +BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1970835 + +(cherry picked from commit de71c15de66ba9350bf62c45b05f8fbff166517b) +Signed-off-by: Marc-André Lureau +Signed-off-by: Miroslav Rezanina +--- + slirp/src/udp6.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/slirp/src/udp6.c b/slirp/src/udp6.c +index 6f9486bbca..8c490e4d10 100644 +--- a/slirp/src/udp6.c ++++ b/slirp/src/udp6.c +@@ -28,7 +28,10 @@ void udp6_input(struct mbuf *m) + ip = mtod(m, struct ip6 *); + m->m_len -= iphlen; + m->m_data += iphlen; +- uh = mtod(m, struct udphdr *); ++ uh = mtod_check(m, sizeof(struct udphdr)); ++ if (uh == NULL) { ++ goto bad; ++ } + m->m_len += iphlen; + m->m_data -= iphlen; + +-- +2.27.0 + diff --git a/kvm-usb-fix-setup_len-init-CVE-2020-14364.patch b/kvm-usb-fix-setup_len-init-CVE-2020-14364.patch new file mode 100755 index 0000000000000000000000000000000000000000..5e632997af9a50e73bb0d36408ebfa5c5330afd2 --- /dev/null +++ b/kvm-usb-fix-setup_len-init-CVE-2020-14364.patch @@ -0,0 +1,102 @@ +From feb16ff29a13a4286389bb8b9d4f541aab9b84f1 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Thu, 3 Sep 2020 15:27:13 -0400 +Subject: [PATCH] usb: fix setup_len init (CVE-2020-14364) + +RH-Author: Jon Maloy +Message-id: <20200903152713.1420531-2-jmaloy@redhat.com> +Patchwork-id: 98271 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] usb: fix setup_len init (CVE-2020-14364) +Bugzilla: 1869710 +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Thomas Huth +RH-Acked-by: Gerd Hoffmann + +From: Gerd Hoffmann + +Store calculated setup_len in a local variable, verify it, and only +write it to the struct (USBDevice->setup_len) in case it passed the +sanity checks. + +This prevents other code (do_token_{in,out} functions specifically) +from working with invalid USBDevice->setup_len values and overrunning +the USBDevice->setup_buf[] buffer. + +Fixes: CVE-2020-14364 +Signed-off-by: Gerd Hoffmann +Tested-by: Gonglei +Reviewed-by: Li Qiang +Message-id: 20200825053636.29648-1-kraxel@redhat.com +(cherry picked from commit b946434f2659a182afc17e155be6791ebfb302eb) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/core.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +diff --git a/hw/usb/core.c b/hw/usb/core.c +index 5abd128b6b..5234dcc73f 100644 +--- a/hw/usb/core.c ++++ b/hw/usb/core.c +@@ -129,6 +129,7 @@ void usb_wakeup(USBEndpoint *ep, unsigned int stream) + static void do_token_setup(USBDevice *s, USBPacket *p) + { + int request, value, index; ++ unsigned int setup_len; + + if (p->iov.size != 8) { + p->status = USB_RET_STALL; +@@ -138,14 +139,15 @@ static void do_token_setup(USBDevice *s, USBPacket *p) + usb_packet_copy(p, s->setup_buf, p->iov.size); + s->setup_index = 0; + p->actual_length = 0; +- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; +- if (s->setup_len > sizeof(s->data_buf)) { ++ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; ++ if (setup_len > sizeof(s->data_buf)) { + fprintf(stderr, + "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", +- s->setup_len, sizeof(s->data_buf)); ++ setup_len, sizeof(s->data_buf)); + p->status = USB_RET_STALL; + return; + } ++ s->setup_len = setup_len; + + request = (s->setup_buf[0] << 8) | s->setup_buf[1]; + value = (s->setup_buf[3] << 8) | s->setup_buf[2]; +@@ -259,26 +261,28 @@ static void do_token_out(USBDevice *s, USBPacket *p) + static void do_parameter(USBDevice *s, USBPacket *p) + { + int i, request, value, index; ++ unsigned int setup_len; + + for (i = 0; i < 8; i++) { + s->setup_buf[i] = p->parameter >> (i*8); + } + + s->setup_state = SETUP_STATE_PARAM; +- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; + s->setup_index = 0; + + request = (s->setup_buf[0] << 8) | s->setup_buf[1]; + value = (s->setup_buf[3] << 8) | s->setup_buf[2]; + index = (s->setup_buf[5] << 8) | s->setup_buf[4]; + +- if (s->setup_len > sizeof(s->data_buf)) { ++ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; ++ if (setup_len > sizeof(s->data_buf)) { + fprintf(stderr, + "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", +- s->setup_len, sizeof(s->data_buf)); ++ setup_len, sizeof(s->data_buf)); + p->status = USB_RET_STALL; + return; + } ++ s->setup_len = setup_len; + + if (p->pid == USB_TOKEN_OUT) { + usb_packet_copy(p, s->data_buf, s->setup_len); +-- +2.27.0 + diff --git a/kvm-usbredir-Prevent-recursion-in-usbredir_write.patch b/kvm-usbredir-Prevent-recursion-in-usbredir_write.patch new file mode 100755 index 0000000000000000000000000000000000000000..8f082565249f227520479a8f0c1e9f5f9d85f67e --- /dev/null +++ b/kvm-usbredir-Prevent-recursion-in-usbredir_write.patch @@ -0,0 +1,106 @@ +From 8f6311159977b8ee4b78172caa411d3cee4d2ae5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 14 Jan 2020 20:23:30 +0000 +Subject: [PATCH 4/5] usbredir: Prevent recursion in usbredir_write +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200114202331.51831-2-dgilbert@redhat.com> +Patchwork-id: 93344 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] usbredir: Prevent recursion in usbredir_write +Bugzilla: 1790844 +RH-Acked-by: Peter Xu +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Gerd Hoffmann + +From: "Dr. David Alan Gilbert" + +I've got a case where usbredir_write manages to call back into itself +via spice; this patch causes the recursion to fail (0 bytes) the write; +this seems to avoid the deadlock I was previously seeing. + +I can't say I fully understand the interaction of usbredir and spice; +but there are a few similar guards in spice and usbredir +to catch other cases especially onces also related to spice_server_char_device_wakeup + +This case seems to be triggered by repeated migration+repeated +reconnection of the viewer; but my debugging suggests the migration +finished before this hits. + +The backtrace of the hang looks like: + reds_handle_ticket + reds_handle_other_links + reds_channel_do_link + red_channel_connect + spicevmc_connect + usbredir_create_parser + usbredirparser_do_write + usbredir_write + qemu_chr_fe_write + qemu_chr_write + qemu_chr_write_buffer + spice_chr_write + spice_server_char_device_wakeup + red_char_device_wakeup + red_char_device_write_to_device + vmc_write + usbredirparser_do_write + usbredir_write + qemu_chr_fe_write + qemu_chr_write + qemu_chr_write_buffer + qemu_mutex_lock_impl + +and we fail as we land through qemu_chr_write_buffer's lock +twice. + +Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1752320 + +Signed-off-by: Dr. David Alan Gilbert +Message-Id: <20191218113012.13331-1-dgilbert@redhat.com> +Signed-off-by: Gerd Hoffmann +(cherry picked from commit 394642a8d3742c885e397d5bb5ee0ec40743cdc6) +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/redirect.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/hw/usb/redirect.c b/hw/usb/redirect.c +index e0f5ca6..97f2c3a 100644 +--- a/hw/usb/redirect.c ++++ b/hw/usb/redirect.c +@@ -113,6 +113,7 @@ struct USBRedirDevice { + /* Properties */ + CharBackend cs; + bool enable_streams; ++ bool in_write; + uint8_t debug; + int32_t bootindex; + char *filter_str; +@@ -290,6 +291,13 @@ static int usbredir_write(void *priv, uint8_t *data, int count) + return 0; + } + ++ /* Recursion check */ ++ if (dev->in_write) { ++ DPRINTF("usbredir_write recursion\n"); ++ return 0; ++ } ++ dev->in_write = true; ++ + r = qemu_chr_fe_write(&dev->cs, data, count); + if (r < count) { + if (!dev->watch) { +@@ -300,6 +308,7 @@ static int usbredir_write(void *priv, uint8_t *data, int count) + r = 0; + } + } ++ dev->in_write = false; + return r; + } + +-- +1.8.3.1 + diff --git a/kvm-util-Introduce-qemu_get_host_name.patch b/kvm-util-Introduce-qemu_get_host_name.patch new file mode 100755 index 0000000000000000000000000000000000000000..da21888112a0b1e583b7822d8e2ef28a6a448ce6 --- /dev/null +++ b/kvm-util-Introduce-qemu_get_host_name.patch @@ -0,0 +1,123 @@ +From 41510fba34cda98cb85a8d04e46dcfdd9a91aa61 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= +Date: Thu, 24 Dec 2020 12:53:03 -0500 +Subject: [PATCH 3/5] util: Introduce qemu_get_host_name() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +Message-id: <20201224125304.62697-3-marcandre.lureau@redhat.com> +Patchwork-id: 100499 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/3] util: Introduce qemu_get_host_name() +Bugzilla: 1910326 +RH-Acked-by: Daniel P. Berrange +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé + +From: Michal Privoznik + +This function offers operating system agnostic way to fetch host +name. It is implemented for both POSIX-like and Windows systems. + +Signed-off-by: Michal Privoznik +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Daniel P. Berrangé +Cc: qemu-stable@nongnu.org +Signed-off-by: Michael Roth + +(cherry picked from commit e47f4765afcab2b78dfa5b0115abf64d1d49a5d3) +Signed-off-by: Marc-André Lureau +Signed-off-by: Danilo C. L. de Paula +--- + include/qemu/osdep.h | 10 ++++++++++ + util/oslib-posix.c | 35 +++++++++++++++++++++++++++++++++++ + util/oslib-win32.c | 13 +++++++++++++ + 3 files changed, 58 insertions(+) + +diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h +index 0f97d68586a..d427e81a427 100644 +--- a/include/qemu/osdep.h ++++ b/include/qemu/osdep.h +@@ -620,4 +620,14 @@ static inline void qemu_reset_optind(void) + #endif + } + ++/** ++ * qemu_get_host_name: ++ * @errp: Error object ++ * ++ * Operating system agnostic way of querying host name. ++ * ++ * Returns allocated hostname (caller should free), NULL on failure. ++ */ ++char *qemu_get_host_name(Error **errp); ++ + #endif +diff --git a/util/oslib-posix.c b/util/oslib-posix.c +index 5a291cc9820..8f88e4dbe10 100644 +--- a/util/oslib-posix.c ++++ b/util/oslib-posix.c +@@ -726,3 +726,38 @@ void sigaction_invoke(struct sigaction *action, + } + action->sa_sigaction(info->ssi_signo, &si, NULL); + } ++ ++#ifndef HOST_NAME_MAX ++# ifdef _POSIX_HOST_NAME_MAX ++# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX ++# else ++# define HOST_NAME_MAX 255 ++# endif ++#endif ++ ++char *qemu_get_host_name(Error **errp) ++{ ++ long len = -1; ++ g_autofree char *hostname = NULL; ++ ++#ifdef _SC_HOST_NAME_MAX ++ len = sysconf(_SC_HOST_NAME_MAX); ++#endif /* _SC_HOST_NAME_MAX */ ++ ++ if (len < 0) { ++ len = HOST_NAME_MAX; ++ } ++ ++ /* Unfortunately, gethostname() below does not guarantee a ++ * NULL terminated string. Therefore, allocate one byte more ++ * to be sure. */ ++ hostname = g_new0(char, len + 1); ++ ++ if (gethostname(hostname, len) < 0) { ++ error_setg_errno(errp, errno, ++ "cannot get hostname"); ++ return NULL; ++ } ++ ++ return g_steal_pointer(&hostname); ++} +diff --git a/util/oslib-win32.c b/util/oslib-win32.c +index e9b14ab1784..3b49d272972 100644 +--- a/util/oslib-win32.c ++++ b/util/oslib-win32.c +@@ -808,3 +808,16 @@ bool qemu_write_pidfile(const char *filename, Error **errp) + } + return true; + } ++ ++char *qemu_get_host_name(Error **errp) ++{ ++ wchar_t tmp[MAX_COMPUTERNAME_LENGTH + 1]; ++ DWORD size = G_N_ELEMENTS(tmp); ++ ++ if (GetComputerNameW(tmp, &size) == 0) { ++ error_setg_win32(errp, GetLastError(), "failed close handle"); ++ return NULL; ++ } ++ ++ return g_utf16_to_utf8(tmp, size, NULL, NULL, NULL); ++} +-- +2.27.0 + diff --git a/kvm-util-add-slirp_fmt-helpers.patch b/kvm-util-add-slirp_fmt-helpers.patch new file mode 100755 index 0000000000000000000000000000000000000000..31af5996628bd8bec5e6fe0395c571b07111f080 --- /dev/null +++ b/kvm-util-add-slirp_fmt-helpers.patch @@ -0,0 +1,140 @@ +From 5dc50c6bca059a9cda6677b1fd0187df1de78ed7 Mon Sep 17 00:00:00 2001 +From: jmaloy +Date: Thu, 13 Feb 2020 15:50:48 +0000 +Subject: [PATCH 2/7] util: add slirp_fmt() helpers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: jmaloy +Message-id: <20200213155049.3936-2-jmaloy@redhat.com> +Patchwork-id: 93824 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/2] util: add slirp_fmt() helpers +Bugzilla: 1798994 +RH-Acked-by: Eduardo Habkost +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi + +From: Marc-André Lureau + +Various calls to snprintf() in libslirp assume that snprintf() returns +"only" the number of bytes written (excluding terminating NUL). + +https://pubs.opengroup.org/onlinepubs/9699919799/functions/snprintf.html#tag_16_159_04 + +"Upon successful completion, the snprintf() function shall return the +number of bytes that would be written to s had n been sufficiently +large excluding the terminating null byte." + +Introduce slirp_fmt() that handles several pathological cases the +way libslirp usually expect: + +- treat error as fatal (instead of silently returning -1) + +- fmt0() will always \0 end + +- return the number of bytes actually written (instead of what would +have been written, which would usually result in OOB later), including +the ending \0 for fmt0() + +- warn if truncation happened (instead of ignoring) + +Other less common cases can still be handled with strcpy/snprintf() etc. + +Signed-off-by: Marc-André Lureau +Reviewed-by: Samuel Thibault +Message-Id: <20200127092414.169796-2-marcandre.lureau@redhat.com> +(cherry picked from libslirp commit 30648c03b27fb8d9611b723184216cd3174b6775) +Signed-off-by: Jon Maloy + +Signed-off-by: Danilo C. L. de Paula +--- + slirp/src/util.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + slirp/src/util.h | 3 +++ + 2 files changed, 65 insertions(+) + +diff --git a/slirp/src/util.c b/slirp/src/util.c +index e596087..e3b6257 100644 +--- a/slirp/src/util.c ++++ b/slirp/src/util.c +@@ -364,3 +364,65 @@ void slirp_pstrcpy(char *buf, int buf_size, const char *str) + } + *q = '\0'; + } ++ ++static int slirp_vsnprintf(char *str, size_t size, ++ const char *format, va_list args) ++{ ++ int rv = vsnprintf(str, size, format, args); ++ ++ if (rv < 0) { ++ g_error("vsnprintf() failed: %s", g_strerror(errno)); ++ } ++ ++ return rv; ++} ++ ++/* ++ * A snprintf()-like function that: ++ * - returns the number of bytes written (excluding optional \0-ending) ++ * - dies on error ++ * - warn on truncation ++ */ ++int slirp_fmt(char *str, size_t size, const char *format, ...) ++{ ++ va_list args; ++ int rv; ++ ++ va_start(args, format); ++ rv = slirp_vsnprintf(str, size, format, args); ++ va_end(args); ++ ++ if (rv > size) { ++ g_critical("vsnprintf() truncation"); ++ } ++ ++ return MIN(rv, size); ++} ++ ++/* ++ * A snprintf()-like function that: ++ * - always \0-end (unless size == 0) ++ * - returns the number of bytes actually written, including \0 ending ++ * - dies on error ++ * - warn on truncation ++ */ ++int slirp_fmt0(char *str, size_t size, const char *format, ...) ++{ ++ va_list args; ++ int rv; ++ ++ va_start(args, format); ++ rv = slirp_vsnprintf(str, size, format, args); ++ va_end(args); ++ ++ if (rv >= size) { ++ g_critical("vsnprintf() truncation"); ++ if (size > 0) ++ str[size - 1] = '\0'; ++ rv = size; ++ } else { ++ rv += 1; /* include \0 */ ++ } ++ ++ return rv; ++} +diff --git a/slirp/src/util.h b/slirp/src/util.h +index 3c6223c..0558dfc 100644 +--- a/slirp/src/util.h ++++ b/slirp/src/util.h +@@ -177,4 +177,7 @@ static inline int slirp_socket_set_fast_reuse(int fd) + + void slirp_pstrcpy(char *buf, int buf_size, const char *str); + ++int slirp_fmt(char *str, size_t size, const char *format, ...); ++int slirp_fmt0(char *str, size_t size, const char *format, ...); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch b/kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch new file mode 100755 index 0000000000000000000000000000000000000000..8e584734d850b626ecf7682c09180d34bca86900 --- /dev/null +++ b/kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch @@ -0,0 +1,79 @@ +From f53c2c68db7780353a915072f8c953a74149b1f7 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:42 -0500 +Subject: [PATCH 3/7] vfio: Create shared routine for scanning info + capabilities +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-4-cohuck@redhat.com> +Patchwork-id: 100678 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/7] vfio: Create shared routine for scanning info capabilities +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +Rather than duplicating the same loop in multiple locations, +create a static function to do the work. + +Signed-off-by: Matthew Rosato +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Cornelia Huck +Signed-off-by: Alex Williamson +(cherry picked from commit 3ab7a0b40d4be5ade3b61d4afd1518193b199423) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/common.c | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 5ca11488d67..77d62d2dcdf 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -826,17 +826,12 @@ static void vfio_listener_release(VFIOContainer *container) + } + } + +-struct vfio_info_cap_header * +-vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) ++static struct vfio_info_cap_header * ++vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) + { + struct vfio_info_cap_header *hdr; +- void *ptr = info; +- +- if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { +- return NULL; +- } + +- for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { ++ for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } +@@ -845,6 +840,16 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) + return NULL; + } + ++struct vfio_info_cap_header * ++vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) ++{ ++ if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { ++ return NULL; ++ } ++ ++ return vfio_get_cap((void *)info, info->cap_offset, id); ++} ++ + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) + { +-- +2.27.0 + diff --git a/kvm-vfio-Find-DMA-available-capability.patch b/kvm-vfio-Find-DMA-available-capability.patch new file mode 100755 index 0000000000000000000000000000000000000000..b81bcc4bd9e4a71fa35e1fa359d2eaf6cea7e629 --- /dev/null +++ b/kvm-vfio-Find-DMA-available-capability.patch @@ -0,0 +1,91 @@ +From e6147c5a23a75361b1374bfb4b96403d243b5c38 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 19 Jan 2021 12:50:43 -0500 +Subject: [PATCH 4/7] vfio: Find DMA available capability + +RH-Author: Cornelia Huck +Message-id: <20210119125046.472811-5-cohuck@redhat.com> +Patchwork-id: 100677 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 4/7] vfio: Find DMA available capability +Bugzilla: 1905391 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Auger Eric +RH-Acked-by: Thomas Huth + +From: Matthew Rosato + +The underlying host may be limiting the number of outstanding DMA +requests for type 1 IOMMU. Add helper functions to check for the +DMA available capability and retrieve the current number of DMA +mappings allowed. + +Signed-off-by: Matthew Rosato +Reviewed-by: Cornelia Huck +[aw: vfio_get_info_dma_avail moved inside CONFIG_LINUX] +Signed-off-by: Alex Williamson +(cherry picked from commit 7486a62845b1e12011dd99973e4739f69d57cd38) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/common.c | 31 +++++++++++++++++++++++++++++++ + include/hw/vfio/vfio-common.h | 2 ++ + 2 files changed, 33 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 77d62d2dcdf..23efdfadebd 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -850,6 +850,37 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) + return vfio_get_cap((void *)info, info->cap_offset, id); + } + ++static struct vfio_info_cap_header * ++vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) ++{ ++ if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { ++ return NULL; ++ } ++ ++ return vfio_get_cap((void *)info, info->cap_offset, id); ++} ++ ++bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, ++ unsigned int *avail) ++{ ++ struct vfio_info_cap_header *hdr; ++ struct vfio_iommu_type1_info_dma_avail *cap; ++ ++ /* If the capability cannot be found, assume no DMA limiting */ ++ hdr = vfio_get_iommu_type1_info_cap(info, ++ VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); ++ if (hdr == NULL) { ++ return false; ++ } ++ ++ if (avail != NULL) { ++ cap = (void *) hdr; ++ *avail = cap->avail; ++ } ++ ++ return true; ++} ++ + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) + { +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index fd564209ac7..aa6cbe4a998 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -191,6 +191,8 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type); + struct vfio_info_cap_header * + vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); ++bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, ++ unsigned int *avail); + #endif + extern const MemoryListener vfio_prereg_listener; + +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch b/kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch new file mode 100755 index 0000000000000000000000000000000000000000..c5156769ee384b4ca6a82de56e46cc235a5fa38f --- /dev/null +++ b/kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch @@ -0,0 +1,175 @@ +From 58edd0fba4d9e98edfeb16139467d6035a1f4e61 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:42 -0400 +Subject: [PATCH 08/12] vfio-ccw: Add support for the CRW region and IRQ + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-9-cohuck@redhat.com> +Patchwork-id: 97698 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 8/9] vfio-ccw: Add support for the CRW region and IRQ +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Farhan Ali + +The crw region can be used to obtain information about +Channel Report Words (CRW) from vfio-ccw driver. + +Currently only channel-path related CRWs are passed to +QEMU from vfio-ccw driver. + +Signed-off-by: Farhan Ali +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-7-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit f030532f2ad6eeb200034915e9c6357cce81b538) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 73 insertions(+) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 94a0d9840d..b72a505893 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -44,7 +44,11 @@ struct VFIOCCWDevice { + uint64_t schib_region_size; + uint64_t schib_region_offset; + struct ccw_schib_region *schib_region; ++ uint64_t crw_region_size; ++ uint64_t crw_region_offset; ++ struct ccw_crw_region *crw_region; + EventNotifier io_notifier; ++ EventNotifier crw_notifier; + bool force_orb_pfch; + bool warned_orb_pfch; + }; +@@ -254,6 +258,44 @@ static void vfio_ccw_reset(DeviceState *dev) + ioctl(vcdev->vdev.fd, VFIO_DEVICE_RESET); + } + ++static void vfio_ccw_crw_read(VFIOCCWDevice *vcdev) ++{ ++ struct ccw_crw_region *region = vcdev->crw_region; ++ CRW crw; ++ int size; ++ ++ /* Keep reading CRWs as long as data is returned */ ++ do { ++ memset(region, 0, sizeof(*region)); ++ size = pread(vcdev->vdev.fd, region, vcdev->crw_region_size, ++ vcdev->crw_region_offset); ++ ++ if (size == -1) { ++ error_report("vfio-ccw: Read crw region failed with errno=%d", ++ errno); ++ break; ++ } ++ ++ if (region->crw == 0) { ++ /* No more CRWs to queue */ ++ break; ++ } ++ ++ memcpy(&crw, ®ion->crw, sizeof(CRW)); ++ ++ css_crw_add_to_queue(crw); ++ } while (1); ++} ++ ++static void vfio_ccw_crw_notifier_handler(void *opaque) ++{ ++ VFIOCCWDevice *vcdev = opaque; ++ ++ while (event_notifier_test_and_clear(&vcdev->crw_notifier)) { ++ vfio_ccw_crw_read(vcdev); ++ } ++} ++ + static void vfio_ccw_io_notifier_handler(void *opaque) + { + VFIOCCWDevice *vcdev = opaque; +@@ -340,6 +382,10 @@ static void vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, + notifier = &vcdev->io_notifier; + fd_read = vfio_ccw_io_notifier_handler; + break; ++ case VFIO_CCW_CRW_IRQ_INDEX: ++ notifier = &vcdev->crw_notifier; ++ fd_read = vfio_ccw_crw_notifier_handler; ++ break; + default: + error_setg(errp, "vfio: Unsupported device irq(%d)", irq); + return; +@@ -391,6 +437,9 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, + case VFIO_CCW_IO_IRQ_INDEX: + notifier = &vcdev->io_notifier; + break; ++ case VFIO_CCW_CRW_IRQ_INDEX: ++ notifier = &vcdev->crw_notifier; ++ break; + default: + error_report("vfio: Unsupported device irq(%d)", irq); + return; +@@ -468,10 +517,24 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->schib_region = g_malloc(info->size); + } + ++ ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, ++ VFIO_REGION_SUBTYPE_CCW_CRW, &info); ++ ++ if (!ret) { ++ vcdev->crw_region_size = info->size; ++ if (sizeof(*vcdev->crw_region) != vcdev->crw_region_size) { ++ error_setg(errp, "vfio: Unexpected size of the CRW region"); ++ goto out_err; ++ } ++ vcdev->crw_region_offset = info->offset; ++ vcdev->crw_region = g_malloc(info->size); ++ } ++ + g_free(info); + return; + + out_err: ++ g_free(vcdev->crw_region); + g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); +@@ -481,6 +544,7 @@ out_err: + + static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) + { ++ g_free(vcdev->crw_region); + g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); +@@ -596,6 +660,14 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + goto out_notifier_err; + } + ++ if (vcdev->crw_region) { ++ vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX, &err); ++ if (err) { ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); ++ goto out_notifier_err; ++ } ++ } ++ + return; + + out_notifier_err: +@@ -620,6 +692,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp) + S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); + VFIOGroup *group = vcdev->vdev.group; + ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); + vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); + vfio_ccw_put_region(vcdev); + vfio_ccw_put_device(vcdev); +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Add-support-for-the-schib-region.patch b/kvm-vfio-ccw-Add-support-for-the-schib-region.patch new file mode 100755 index 0000000000000000000000000000000000000000..667e5cf79dd82b39cffb7ee298dd60d1928007bc --- /dev/null +++ b/kvm-vfio-ccw-Add-support-for-the-schib-region.patch @@ -0,0 +1,254 @@ +From b73e3e52f76db823d7bffe3f705f575ca413863b Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:39 -0400 +Subject: [PATCH 05/12] vfio-ccw: Add support for the schib region + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-6-cohuck@redhat.com> +Patchwork-id: 97697 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 5/9] vfio-ccw: Add support for the schib region +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Farhan Ali + +The schib region can be used to obtain the latest SCHIB from the host +passthrough subchannel. Since the guest SCHIB is virtualized, +we currently only update the path related information so that the +guest is aware of any path related changes when it issues the +'stsch' instruction. + +Signed-off-by: Farhan Ali +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-4-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 46ea3841edaff2a7657b8f6c7f474e5e3850cd62) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/css.c | 13 ++++++-- + hw/s390x/s390-ccw.c | 21 +++++++++++++ + hw/vfio/ccw.c | 63 +++++++++++++++++++++++++++++++++++++ + include/hw/s390x/css.h | 3 +- + include/hw/s390x/s390-ccw.h | 1 + + target/s390x/ioinst.c | 3 +- + 6 files changed, 99 insertions(+), 5 deletions(-) + +diff --git a/hw/s390x/css.c b/hw/s390x/css.c +index 844caab408..71fd3f9a00 100644 +--- a/hw/s390x/css.c ++++ b/hw/s390x/css.c +@@ -1335,11 +1335,20 @@ static void copy_schib_to_guest(SCHIB *dest, const SCHIB *src) + } + } + +-int css_do_stsch(SubchDev *sch, SCHIB *schib) ++IOInstEnding css_do_stsch(SubchDev *sch, SCHIB *schib) + { ++ int ret; ++ ++ /* ++ * For some subchannels, we may want to update parts of ++ * the schib (e.g., update path masks from the host device ++ * for passthrough subchannels). ++ */ ++ ret = s390_ccw_store(sch); ++ + /* Use current status. */ + copy_schib_to_guest(schib, &sch->curr_status); +- return 0; ++ return ret; + } + + static void copy_pmcw_from_guest(PMCW *dest, const PMCW *src) +diff --git a/hw/s390x/s390-ccw.c b/hw/s390x/s390-ccw.c +index 0c5a5b60bd..75b788c95e 100644 +--- a/hw/s390x/s390-ccw.c ++++ b/hw/s390x/s390-ccw.c +@@ -51,6 +51,27 @@ int s390_ccw_clear(SubchDev *sch) + return cdc->handle_clear(sch); + } + ++IOInstEnding s390_ccw_store(SubchDev *sch) ++{ ++ S390CCWDeviceClass *cdc = NULL; ++ int ret = IOINST_CC_EXPECTED; ++ ++ /* ++ * This code is called for both virtual and passthrough devices, ++ * but only applies to to the latter. This ugly check makes that ++ * distinction for us. ++ */ ++ if (object_dynamic_cast(OBJECT(sch->driver_data), TYPE_S390_CCW)) { ++ cdc = S390_CCW_DEVICE_GET_CLASS(sch->driver_data); ++ } ++ ++ if (cdc && cdc->handle_store) { ++ ret = cdc->handle_store(sch); ++ } ++ ++ return ret; ++} ++ + static void s390_ccw_get_dev_info(S390CCWDevice *cdev, + char *sysfsdev, + Error **errp) +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 17eb4c4048..859ad646f1 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -41,6 +41,9 @@ struct VFIOCCWDevice { + uint64_t async_cmd_region_size; + uint64_t async_cmd_region_offset; + struct ccw_cmd_region *async_cmd_region; ++ uint64_t schib_region_size; ++ uint64_t schib_region_offset; ++ struct ccw_schib_region *schib_region; + EventNotifier io_notifier; + bool force_orb_pfch; + bool warned_orb_pfch; +@@ -116,6 +119,51 @@ again: + } + } + ++static IOInstEnding vfio_ccw_handle_store(SubchDev *sch) ++{ ++ S390CCWDevice *cdev = sch->driver_data; ++ VFIOCCWDevice *vcdev = DO_UPCAST(VFIOCCWDevice, cdev, cdev); ++ SCHIB *schib = &sch->curr_status; ++ struct ccw_schib_region *region = vcdev->schib_region; ++ SCHIB *s; ++ int ret; ++ ++ /* schib region not available so nothing else to do */ ++ if (!region) { ++ return IOINST_CC_EXPECTED; ++ } ++ ++ memset(region, 0, sizeof(*region)); ++ ret = pread(vcdev->vdev.fd, region, vcdev->schib_region_size, ++ vcdev->schib_region_offset); ++ ++ if (ret == -1) { ++ /* ++ * Device is probably damaged, but store subchannel does not ++ * have a nonzero cc defined for this scenario. Log an error, ++ * and presume things are otherwise fine. ++ */ ++ error_report("vfio-ccw: store region read failed with errno=%d", errno); ++ return IOINST_CC_EXPECTED; ++ } ++ ++ /* ++ * Selectively copy path-related bits of the SCHIB, ++ * rather than copying the entire struct. ++ */ ++ s = (SCHIB *)region->schib_area; ++ schib->pmcw.pnom = s->pmcw.pnom; ++ schib->pmcw.lpum = s->pmcw.lpum; ++ schib->pmcw.pam = s->pmcw.pam; ++ schib->pmcw.pom = s->pmcw.pom; ++ ++ if (s->scsw.flags & SCSW_FLAGS_MASK_PNO) { ++ schib->scsw.flags |= SCSW_FLAGS_MASK_PNO; ++ } ++ ++ return IOINST_CC_EXPECTED; ++} ++ + static int vfio_ccw_handle_clear(SubchDev *sch) + { + S390CCWDevice *cdev = sch->driver_data; +@@ -382,10 +430,23 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->async_cmd_region = g_malloc0(info->size); + } + ++ ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, ++ VFIO_REGION_SUBTYPE_CCW_SCHIB, &info); ++ if (!ret) { ++ vcdev->schib_region_size = info->size; ++ if (sizeof(*vcdev->schib_region) != vcdev->schib_region_size) { ++ error_setg(errp, "vfio: Unexpected size of the schib region"); ++ goto out_err; ++ } ++ vcdev->schib_region_offset = info->offset; ++ vcdev->schib_region = g_malloc(info->size); ++ } ++ + g_free(info); + return; + + out_err: ++ g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); + g_free(info); +@@ -394,6 +455,7 @@ out_err: + + static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) + { ++ g_free(vcdev->schib_region); + g_free(vcdev->async_cmd_region); + g_free(vcdev->io_region); + } +@@ -569,6 +631,7 @@ static void vfio_ccw_class_init(ObjectClass *klass, void *data) + cdc->handle_request = vfio_ccw_handle_request; + cdc->handle_halt = vfio_ccw_handle_halt; + cdc->handle_clear = vfio_ccw_handle_clear; ++ cdc->handle_store = vfio_ccw_handle_store; + } + + static const TypeInfo vfio_ccw_info = { +diff --git a/include/hw/s390x/css.h b/include/hw/s390x/css.h +index f46bcafb16..7e3a5e7433 100644 +--- a/include/hw/s390x/css.h ++++ b/include/hw/s390x/css.h +@@ -218,6 +218,7 @@ IOInstEnding do_subchannel_work_passthrough(SubchDev *sub); + + int s390_ccw_halt(SubchDev *sch); + int s390_ccw_clear(SubchDev *sch); ++IOInstEnding s390_ccw_store(SubchDev *sch); + + typedef enum { + CSS_IO_ADAPTER_VIRTIO = 0, +@@ -242,7 +243,7 @@ SubchDev *css_find_subch(uint8_t m, uint8_t cssid, uint8_t ssid, + uint16_t schid); + bool css_subch_visible(SubchDev *sch); + void css_conditional_io_interrupt(SubchDev *sch); +-int css_do_stsch(SubchDev *sch, SCHIB *schib); ++IOInstEnding css_do_stsch(SubchDev *sch, SCHIB *schib); + bool css_schid_final(int m, uint8_t cssid, uint8_t ssid, uint16_t schid); + IOInstEnding css_do_msch(SubchDev *sch, const SCHIB *schib); + IOInstEnding css_do_xsch(SubchDev *sch); +diff --git a/include/hw/s390x/s390-ccw.h b/include/hw/s390x/s390-ccw.h +index fffb54562f..4a43803ef2 100644 +--- a/include/hw/s390x/s390-ccw.h ++++ b/include/hw/s390x/s390-ccw.h +@@ -37,6 +37,7 @@ typedef struct S390CCWDeviceClass { + IOInstEnding (*handle_request) (SubchDev *sch); + int (*handle_halt) (SubchDev *sch); + int (*handle_clear) (SubchDev *sch); ++ IOInstEnding (*handle_store) (SubchDev *sch); + } S390CCWDeviceClass; + + #endif +diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c +index f40c35c6ff..b6be300cc4 100644 +--- a/target/s390x/ioinst.c ++++ b/target/s390x/ioinst.c +@@ -292,8 +292,7 @@ void ioinst_handle_stsch(S390CPU *cpu, uint64_t reg1, uint32_t ipb, + sch = css_find_subch(m, cssid, ssid, schid); + if (sch) { + if (css_subch_visible(sch)) { +- css_do_stsch(sch, &schib); +- cc = 0; ++ cc = css_do_stsch(sch, &schib); + } else { + /* Indicate no more subchannels in this css/ss */ + cc = 3; +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Connect-the-device-request-notifier.patch b/kvm-vfio-ccw-Connect-the-device-request-notifier.patch new file mode 100755 index 0000000000000000000000000000000000000000..298fb29f5dd2264c437451f5ea3150a74c11e322 --- /dev/null +++ b/kvm-vfio-ccw-Connect-the-device-request-notifier.patch @@ -0,0 +1,128 @@ +From db6a782f8b9ba062f195ff504b4d2f93e471fecc Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 11 May 2021 11:24:05 -0400 +Subject: [PATCH 2/5] vfio-ccw: Connect the device request notifier + +RH-Author: Thomas Huth +Message-id: <20210511112405.297037-3-thuth@redhat.com> +Patchwork-id: 101536 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] vfio-ccw: Connect the device request notifier +Bugzilla: 1940450 +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +Now that the vfio-ccw code has a notifier interface to request that +a device be unplugged, let's wire that together. + +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20210104202057.48048-4-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit b2f96f9e4f5fbc8f2770a436191cb328da4d5350) +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1940450 +Signed-off-by: Thomas Huth +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 40 ++++++++++++++++++++++++++++++++++++---- + 1 file changed, 36 insertions(+), 4 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index b72a505893..3d450fe1c9 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -49,6 +49,7 @@ struct VFIOCCWDevice { + struct ccw_crw_region *crw_region; + EventNotifier io_notifier; + EventNotifier crw_notifier; ++ EventNotifier req_notifier; + bool force_orb_pfch; + bool warned_orb_pfch; + }; +@@ -287,6 +288,21 @@ static void vfio_ccw_crw_read(VFIOCCWDevice *vcdev) + } while (1); + } + ++static void vfio_ccw_req_notifier_handler(void *opaque) ++{ ++ VFIOCCWDevice *vcdev = opaque; ++ Error *err = NULL; ++ ++ if (!event_notifier_test_and_clear(&vcdev->req_notifier)) { ++ return; ++ } ++ ++ qdev_unplug(DEVICE(vcdev), &err); ++ if (err) { ++ warn_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name); ++ } ++} ++ + static void vfio_ccw_crw_notifier_handler(void *opaque) + { + VFIOCCWDevice *vcdev = opaque; +@@ -386,6 +402,10 @@ static void vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, + notifier = &vcdev->crw_notifier; + fd_read = vfio_ccw_crw_notifier_handler; + break; ++ case VFIO_CCW_REQ_IRQ_INDEX: ++ notifier = &vcdev->req_notifier; ++ fd_read = vfio_ccw_req_notifier_handler; ++ break; + default: + error_setg(errp, "vfio: Unsupported device irq(%d)", irq); + return; +@@ -440,6 +460,9 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, + case VFIO_CCW_CRW_IRQ_INDEX: + notifier = &vcdev->crw_notifier; + break; ++ case VFIO_CCW_REQ_IRQ_INDEX: ++ notifier = &vcdev->req_notifier; ++ break; + default: + error_report("vfio: Unsupported device irq(%d)", irq); + return; +@@ -657,20 +680,28 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + + vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX, &err); + if (err) { +- goto out_notifier_err; ++ goto out_io_notifier_err; + } + + if (vcdev->crw_region) { + vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX, &err); + if (err) { +- vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); +- goto out_notifier_err; ++ goto out_crw_notifier_err; + } + } + ++ vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_REQ_IRQ_INDEX, &err); ++ if (err) { ++ goto out_req_notifier_err; ++ } ++ + return; + +-out_notifier_err: ++out_req_notifier_err: ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); ++out_crw_notifier_err: ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); ++out_io_notifier_err: + vfio_ccw_put_region(vcdev); + out_region_err: + vfio_ccw_put_device(vcdev); +@@ -692,6 +723,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp) + S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); + VFIOGroup *group = vcdev->vdev.group; + ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_REQ_IRQ_INDEX); + vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); + vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); + vfio_ccw_put_region(vcdev); +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Fix-error-message.patch b/kvm-vfio-ccw-Fix-error-message.patch new file mode 100755 index 0000000000000000000000000000000000000000..86d2fdfd00f21a6c9970e749ffec1fc84ccfc978 --- /dev/null +++ b/kvm-vfio-ccw-Fix-error-message.patch @@ -0,0 +1,48 @@ +From 7258b1fabcd152c2ad9b61485b869a41d1bc64e2 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:35 -0400 +Subject: [PATCH 01/12] vfio-ccw: Fix error message +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-2-cohuck@redhat.com> +Patchwork-id: 97693 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/9] vfio-ccw: Fix error message +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth +RH-Acked-by: Philippe Mathieu-Daudé + +From: Boris Fiuczynski + +Signed-off-by: Boris Fiuczynski +Reviewed-by: Eric Farman +Message-Id: <20191128143015.5231-1-fiuczy@linux.ibm.com> +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Cornelia Huck +(cherry picked from commit 91f751dc111b270b1e81d80ac92cf479e7620fa4) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 6863f6c69f..3b5520ae75 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -102,7 +102,7 @@ again: + if (errno == EAGAIN) { + goto again; + } +- error_report("vfio-ccw: wirte I/O region failed with errno=%d", errno); ++ error_report("vfio-ccw: write I/O region failed with errno=%d", errno); + ret = -errno; + } else { + ret = region->ret_code; +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Refactor-ccw-irq-handler.patch b/kvm-vfio-ccw-Refactor-ccw-irq-handler.patch new file mode 100755 index 0000000000000000000000000000000000000000..8a3514df58b346bf739dfe14991767b5b1bcc229 --- /dev/null +++ b/kvm-vfio-ccw-Refactor-ccw-irq-handler.patch @@ -0,0 +1,155 @@ +From ee9b03e774641fba8baaf85256706fcc5e8d8efa Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:40 -0400 +Subject: [PATCH 06/12] vfio-ccw: Refactor ccw irq handler + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-7-cohuck@redhat.com> +Patchwork-id: 97695 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 6/9] vfio-ccw: Refactor ccw irq handler +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Eric Farman + +Make it easier to add new ones in the future. + +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-5-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 690e29b91102ac69810b35fe72cd90bc9fa1fff7) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 58 +++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 42 insertions(+), 16 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 859ad646f1..94a0d9840d 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -324,22 +324,36 @@ read_err: + css_inject_io_interrupt(sch); + } + +-static void vfio_ccw_register_io_notifier(VFIOCCWDevice *vcdev, Error **errp) ++static void vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, ++ unsigned int irq, ++ Error **errp) + { + VFIODevice *vdev = &vcdev->vdev; + struct vfio_irq_info *irq_info; + size_t argsz; + int fd; ++ EventNotifier *notifier; ++ IOHandler *fd_read; ++ ++ switch (irq) { ++ case VFIO_CCW_IO_IRQ_INDEX: ++ notifier = &vcdev->io_notifier; ++ fd_read = vfio_ccw_io_notifier_handler; ++ break; ++ default: ++ error_setg(errp, "vfio: Unsupported device irq(%d)", irq); ++ return; ++ } + +- if (vdev->num_irqs < VFIO_CCW_IO_IRQ_INDEX + 1) { +- error_setg(errp, "vfio: unexpected number of io irqs %u", ++ if (vdev->num_irqs < irq + 1) { ++ error_setg(errp, "vfio: unexpected number of irqs %u", + vdev->num_irqs); + return; + } + + argsz = sizeof(*irq_info); + irq_info = g_malloc0(argsz); +- irq_info->index = VFIO_CCW_IO_IRQ_INDEX; ++ irq_info->index = irq; + irq_info->argsz = argsz; + if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, + irq_info) < 0 || irq_info->count < 1) { +@@ -347,37 +361,49 @@ static void vfio_ccw_register_io_notifier(VFIOCCWDevice *vcdev, Error **errp) + goto out_free_info; + } + +- if (event_notifier_init(&vcdev->io_notifier, 0)) { ++ if (event_notifier_init(notifier, 0)) { + error_setg_errno(errp, errno, +- "vfio: Unable to init event notifier for IO"); ++ "vfio: Unable to init event notifier for irq (%d)", ++ irq); + goto out_free_info; + } + +- fd = event_notifier_get_fd(&vcdev->io_notifier); +- qemu_set_fd_handler(fd, vfio_ccw_io_notifier_handler, NULL, vcdev); ++ fd = event_notifier_get_fd(notifier); ++ qemu_set_fd_handler(fd, fd_read, NULL, vcdev); + +- if (vfio_set_irq_signaling(vdev, VFIO_CCW_IO_IRQ_INDEX, 0, ++ if (vfio_set_irq_signaling(vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { + qemu_set_fd_handler(fd, NULL, NULL, vcdev); +- event_notifier_cleanup(&vcdev->io_notifier); ++ event_notifier_cleanup(notifier); + } + + out_free_info: + g_free(irq_info); + } + +-static void vfio_ccw_unregister_io_notifier(VFIOCCWDevice *vcdev) ++static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, ++ unsigned int irq) + { + Error *err = NULL; ++ EventNotifier *notifier; ++ ++ switch (irq) { ++ case VFIO_CCW_IO_IRQ_INDEX: ++ notifier = &vcdev->io_notifier; ++ break; ++ default: ++ error_report("vfio: Unsupported device irq(%d)", irq); ++ return; ++ } + +- if (vfio_set_irq_signaling(&vcdev->vdev, VFIO_CCW_IO_IRQ_INDEX, 0, ++ if (vfio_set_irq_signaling(&vcdev->vdev, irq, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name); + } + +- qemu_set_fd_handler(event_notifier_get_fd(&vcdev->io_notifier), ++ qemu_set_fd_handler(event_notifier_get_fd(notifier), + NULL, NULL, vcdev); +- event_notifier_cleanup(&vcdev->io_notifier); ++ event_notifier_cleanup(notifier); + } + + static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) +@@ -565,7 +591,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + goto out_region_err; + } + +- vfio_ccw_register_io_notifier(vcdev, &err); ++ vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX, &err); + if (err) { + goto out_notifier_err; + } +@@ -594,7 +620,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp) + S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); + VFIOGroup *group = vcdev->vdev.group; + +- vfio_ccw_unregister_io_notifier(vcdev); ++ vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); + vfio_ccw_put_region(vcdev); + vfio_ccw_put_device(vcdev); + vfio_put_group(group); +-- +2.27.0 + diff --git a/kvm-vfio-ccw-Refactor-cleanup-of-regions.patch b/kvm-vfio-ccw-Refactor-cleanup-of-regions.patch new file mode 100755 index 0000000000000000000000000000000000000000..1741f4b9c5ae91e0b415b9d7ddb68a4efd6d907c --- /dev/null +++ b/kvm-vfio-ccw-Refactor-cleanup-of-regions.patch @@ -0,0 +1,73 @@ +From 30906c9c78af2710a2b86c096cc7b18bbc4b4e69 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:38 -0400 +Subject: [PATCH 04/12] vfio-ccw: Refactor cleanup of regions + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-5-cohuck@redhat.com> +Patchwork-id: 97694 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 4/9] vfio-ccw: Refactor cleanup of regions +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Eric Farman + +While we're at it, add a g_free() for the async_cmd_region that +is the last thing currently created. g_free() knows how to handle +NULL pointers, so this makes it easier to remember what cleanups +need to be performed when new regions are added. + +Signed-off-by: Eric Farman +Reviewed-by: Cornelia Huck +Message-Id: <20200505125757.98209-3-farman@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 2a3b9cbaa7b25a4db4cdcfe1c65279c5464f2923) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 6bc612b5b7..17eb4c4048 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -363,8 +363,7 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->io_region_size = info->size; + if (sizeof(*vcdev->io_region) != vcdev->io_region_size) { + error_setg(errp, "vfio: Unexpected size of the I/O region"); +- g_free(info); +- return; ++ goto out_err; + } + + vcdev->io_region_offset = info->offset; +@@ -377,15 +376,20 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) + vcdev->async_cmd_region_size = info->size; + if (sizeof(*vcdev->async_cmd_region) != vcdev->async_cmd_region_size) { + error_setg(errp, "vfio: Unexpected size of the async cmd region"); +- g_free(vcdev->io_region); +- g_free(info); +- return; ++ goto out_err; + } + vcdev->async_cmd_region_offset = info->offset; + vcdev->async_cmd_region = g_malloc0(info->size); + } + + g_free(info); ++ return; ++ ++out_err: ++ g_free(vcdev->async_cmd_region); ++ g_free(vcdev->io_region); ++ g_free(info); ++ return; + } + + static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) +-- +2.27.0 + diff --git a/kvm-vfio-ccw-allow-non-prefetch-ORBs.patch b/kvm-vfio-ccw-allow-non-prefetch-ORBs.patch new file mode 100755 index 0000000000000000000000000000000000000000..da2fc5c7b28416b3a1b13c0c5bbbf5476c3ac6ed --- /dev/null +++ b/kvm-vfio-ccw-allow-non-prefetch-ORBs.patch @@ -0,0 +1,61 @@ +From d5f5a307f3396064d29ef0d300c7377756dd165b Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Tue, 23 Jun 2020 09:25:36 -0400 +Subject: [PATCH 02/12] vfio-ccw: allow non-prefetch ORBs + +RH-Author: Cornelia Huck +Message-id: <20200623092543.358315-3-cohuck@redhat.com> +Patchwork-id: 97692 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 2/9] vfio-ccw: allow non-prefetch ORBs +Bugzilla: 1660916 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: David Hildenbrand +RH-Acked-by: Thomas Huth + +From: Jared Rossi + +Remove the explicit prefetch check when using vfio-ccw devices. +This check does not trigger in practice as all Linux channel programs +are intended to use prefetch. + +Newer Linux kernel versions do not require to force the PFCH flag with +vfio-ccw devices anymore. + +Signed-off-by: Jared Rossi +Reviewed-by: Eric Farman +Message-Id: <20200512181535.18630-2-jrossi@linux.ibm.com> +Signed-off-by: Cornelia Huck +(cherry picked from commit 24e58a7b1d411627e326144030a20dcf0093fed0) +Signed-off-by: Cornelia Huck +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/ccw.c | 13 +++---------- + 1 file changed, 3 insertions(+), 10 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 3b5520ae75..6bc612b5b7 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -74,16 +74,9 @@ static IOInstEnding vfio_ccw_handle_request(SubchDev *sch) + struct ccw_io_region *region = vcdev->io_region; + int ret; + +- if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH)) { +- if (!(vcdev->force_orb_pfch)) { +- warn_once_pfch(vcdev, sch, "requires PFCH flag set"); +- sch_gen_unit_exception(sch); +- css_inject_io_interrupt(sch); +- return IOINST_CC_EXPECTED; +- } else { +- sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH; +- warn_once_pfch(vcdev, sch, "PFCH flag forced"); +- } ++ if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH) && vcdev->force_orb_pfch) { ++ sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH; ++ warn_once_pfch(vcdev, sch, "PFCH flag forced"); + } + + QEMU_BUILD_BUG_ON(sizeof(region->orb_area) != sizeof(ORB)); +-- +2.27.0 + diff --git a/kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch b/kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch new file mode 100755 index 0000000000000000000000000000000000000000..81cf80ed0aa3385b19474a58607c629a8f5ced65 --- /dev/null +++ b/kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch @@ -0,0 +1,75 @@ +From f01098bb86c12f485895f38f7a24170ec84b60b6 Mon Sep 17 00:00:00 2001 +From: Greg Kurz +Date: Mon, 8 Jun 2020 16:25:21 -0400 +Subject: [PATCH 42/42] vfio/nvlink: Remove exec permission to avoid SELinux + AVCs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Greg Kurz +Message-id: <20200608162521.382858-2-gkurz@redhat.com> +Patchwork-id: 97459 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH 1/1] vfio/nvlink: Remove exec permission to avoid SELinux AVCs +Bugzilla: 1823275 +RH-Acked-by: David Gibson +RH-Acked-by: Laurent Vivier +RH-Acked-by: Philippe Mathieu-Daudé + +From: Leonardo Bras + +If SELinux is setup without 'execmem' permission for qemu, all mmap +with (PROT_WRITE | PROT_EXEC) will fail and print a warning in +SELinux log. + +If "nvlink2-mr" memory allocation fails (fist diff), it will cause +guest NUMA nodes to not be correctly configured (V100 memory will +not be visible for guest, nor its NUMA nodes). + +Not having 'execmem' permission is intesting for virtual machines to +avoid buffer-overflow based attacks, and it's adopted in distros +like RHEL. + +So, removing the PROT_EXEC flag seems the right thing to do. + +Browsing some other code that mmaps memory for usage with +memory_region_init_ram_device_ptr, I could notice it's usual to +not have PROT_EXEC (only PROT_READ | PROT_WRITE), so it should be +no problem around this. + +Signed-off-by: Leonardo Bras +Message-Id: <20200501055448.286518-1-leobras.c@gmail.com> +Acked-by: Alex Williamson +Signed-off-by: David Gibson +(cherry picked from commit 9c7c0407028355ca83349b8a60fddfad46f2ebd8) +Signed-off-by: Greg Kurz +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/pci-quirks.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c +index 4505ffe48a..1c5fe014cf 100644 +--- a/hw/vfio/pci-quirks.c ++++ b/hw/vfio/pci-quirks.c +@@ -2237,7 +2237,7 @@ int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp) + } + cap = (void *) hdr; + +- p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC, ++ p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE, + MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset); + if (p == MAP_FAILED) { + ret = -errno; +@@ -2297,7 +2297,7 @@ int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp) + + /* Some NVLink bridges may not have assigned ATSD */ + if (atsdreg->size) { +- p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC, ++ p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE, + MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset); + if (p == MAP_FAILED) { + ret = -errno; +-- +2.27.0 + diff --git a/kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch b/kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch new file mode 100755 index 0000000000000000000000000000000000000000..d416e0fdfbb0b7ffbc5c2735465593c7b74f1f0f --- /dev/null +++ b/kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch @@ -0,0 +1,58 @@ +From e4631c00d8e9ee3608ef3196cbe8bec4841ee988 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 8 Jan 2020 15:04:57 +0000 +Subject: [PATCH 2/5] vfio/pci: Don't remove irqchip notifier if not registered + +RH-Author: Peter Xu +Message-id: <20200108150457.12324-2-peterx@redhat.com> +Patchwork-id: 93291 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] vfio/pci: Don't remove irqchip notifier if not registered +Bugzilla: 1782678 +RH-Acked-by: Alex Williamson +RH-Acked-by: Cornelia Huck +RH-Acked-by: Auger Eric +RH-Acked-by: Jens Freimann + +The kvm irqchip notifier is only registered if the device supports +INTx, however it's unconditionally removed. If the assigned device +does not support INTx, this will cause QEMU to crash when unplugging +the device from the system. Change it to conditionally remove the +notifier only if the notify hook is setup. + +CC: Eduardo Habkost +CC: David Gibson +CC: Alex Williamson +Cc: qemu-stable@nongnu.org # v4.2 +Reported-by: yanghliu@redhat.com +Debugged-by: Eduardo Habkost +Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change notifier") +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1782678 +Signed-off-by: Peter Xu +Reviewed-by: David Gibson +Reviewed-by: Greg Kurz +Signed-off-by: Alex Williamson +(cherry picked from commit 0446f8121723b134ca1d1ed0b73e96d4a0a8689d) +Signed-off-by: Peter Xu +Signed-off-by: Danilo C. L. de Paula +--- + hw/vfio/pci.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 309535f..d717520 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3100,7 +3100,9 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); +- kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); ++ if (vdev->irqchip_change_notifier.notify) { ++ kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); ++ } + vfio_disable_interrupts(vdev); + if (vdev->intx.mmap_timer) { + timer_free(vdev->intx.mmap_timer); +-- +1.8.3.1 + diff --git a/kvm-vhost-Add-names-to-section-rounded-warning.patch b/kvm-vhost-Add-names-to-section-rounded-warning.patch new file mode 100755 index 0000000000000000000000000000000000000000..c41a14c2a12afb63e6b2cd267bebe6605d092f85 --- /dev/null +++ b/kvm-vhost-Add-names-to-section-rounded-warning.patch @@ -0,0 +1,53 @@ +From 0d545c5850caf76ad3e8dd9bb0fbc9f86b08e220 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 24 Jan 2020 19:46:11 +0100 +Subject: [PATCH 002/116] vhost: Add names to section rounded warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200124194613.41119-2-dgilbert@redhat.com> +Patchwork-id: 93450 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 1/3] vhost: Add names to section rounded warning +Bugzilla: 1779041 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Dr. David Alan Gilbert" + +Add the memory region names to section rounding/alignment +warnings. + +Signed-off-by: Dr. David Alan Gilbert +Message-Id: <20200116202414.157959-2-dgilbert@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit ff4776147e960b128ee68f94c728659f662f4378) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 4da0d5a..774d87d 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -590,9 +590,10 @@ static void vhost_region_add_section(struct vhost_dev *dev, + * match up in the same RAMBlock if they do. + */ + if (mrs_gpa < prev_gpa_start) { +- error_report("%s:Section rounded to %"PRIx64 +- " prior to previous %"PRIx64, +- __func__, mrs_gpa, prev_gpa_start); ++ error_report("%s:Section '%s' rounded to %"PRIx64 ++ " prior to previous '%s' %"PRIx64, ++ __func__, section->mr->name, mrs_gpa, ++ prev_sec->mr->name, prev_gpa_start); + /* A way to cleanly fail here would be better */ + return; + } +-- +1.8.3.1 + diff --git a/kvm-vhost-Only-align-sections-for-vhost-user.patch b/kvm-vhost-Only-align-sections-for-vhost-user.patch new file mode 100755 index 0000000000000000000000000000000000000000..e082ce80fc62d69f6953b1c3b4e3586cb9f7aa52 --- /dev/null +++ b/kvm-vhost-Only-align-sections-for-vhost-user.patch @@ -0,0 +1,97 @@ +From c35466c168e5219bf585aa65ac31fc9bdc7cbf36 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 24 Jan 2020 19:46:12 +0100 +Subject: [PATCH 003/116] vhost: Only align sections for vhost-user +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200124194613.41119-3-dgilbert@redhat.com> +Patchwork-id: 93452 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 2/3] vhost: Only align sections for vhost-user +Bugzilla: 1779041 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Dr. David Alan Gilbert" + +I added hugepage alignment code in c1ece84e7c9 to deal with +vhost-user + postcopy which needs aligned pages when using userfault. +However, on x86 the lower 2MB of address space tends to be shotgun'd +with small fragments around the 512-640k range - e.g. video RAM, and +with HyperV synic pages tend to sit around there - again splitting +it up. The alignment code complains with a 'Section rounded to ...' +error and gives up. + +Since vhost-user already filters out devices without an fd +(see vhost-user.c vhost_user_mem_section_filter) it shouldn't be +affected by those overlaps. + +Turn the alignment off on vhost-kernel so that it doesn't try +and align, and thus won't hit the rounding issues. + +Signed-off-by: Dr. David Alan Gilbert +Message-Id: <20200116202414.157959-3-dgilbert@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Paolo Bonzini +(cherry picked from commit 76525114736e8f669766e69b715fa59ce8648aae) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost.c | 34 ++++++++++++++++++---------------- + 1 file changed, 18 insertions(+), 16 deletions(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 774d87d..25fd469 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -547,26 +547,28 @@ static void vhost_region_add_section(struct vhost_dev *dev, + uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + RAMBlock *mrs_rb = section->mr->ram_block; +- size_t mrs_page = qemu_ram_pagesize(mrs_rb); + + trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, + mrs_host); + +- /* Round the section to it's page size */ +- /* First align the start down to a page boundary */ +- uint64_t alignage = mrs_host & (mrs_page - 1); +- if (alignage) { +- mrs_host -= alignage; +- mrs_size += alignage; +- mrs_gpa -= alignage; +- } +- /* Now align the size up to a page boundary */ +- alignage = mrs_size & (mrs_page - 1); +- if (alignage) { +- mrs_size += mrs_page - alignage; +- } +- trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, +- mrs_host); ++ if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { ++ /* Round the section to it's page size */ ++ /* First align the start down to a page boundary */ ++ size_t mrs_page = qemu_ram_pagesize(mrs_rb); ++ uint64_t alignage = mrs_host & (mrs_page - 1); ++ if (alignage) { ++ mrs_host -= alignage; ++ mrs_size += alignage; ++ mrs_gpa -= alignage; ++ } ++ /* Now align the size up to a page boundary */ ++ alignage = mrs_size & (mrs_page - 1); ++ if (alignage) { ++ mrs_size += mrs_page - alignage; ++ } ++ trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, ++ mrs_host); ++ } + + if (dev->n_tmp_sections) { + /* Since we already have at least one section, lets see if +-- +1.8.3.1 + diff --git a/kvm-vhost-coding-style-fix.patch b/kvm-vhost-coding-style-fix.patch new file mode 100755 index 0000000000000000000000000000000000000000..45461304f778a626bee860272df3d09d86eb9475 --- /dev/null +++ b/kvm-vhost-coding-style-fix.patch @@ -0,0 +1,56 @@ +From 624d96c456536e1471968a59fbeea206309cc33b Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Fri, 24 Jan 2020 19:46:13 +0100 +Subject: [PATCH 004/116] vhost: coding style fix +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200124194613.41119-4-dgilbert@redhat.com> +Patchwork-id: 93453 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 3/3] vhost: coding style fix +Bugzilla: 1779041 +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Philippe Mathieu-Daudé + +From: "Michael S. Tsirkin" + +Drop a trailing whitespace. Make line shorter. + +Fixes: 76525114736e8 ("vhost: Only align sections for vhost-user") +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 8347505640238d3b80f9bb7510fdc1bb574bad19) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 25fd469..9edfadc 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -551,7 +551,7 @@ static void vhost_region_add_section(struct vhost_dev *dev, + trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, + mrs_host); + +- if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { ++ if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { + /* Round the section to it's page size */ + /* First align the start down to a page boundary */ + size_t mrs_page = qemu_ram_pagesize(mrs_rb); +@@ -566,8 +566,8 @@ static void vhost_region_add_section(struct vhost_dev *dev, + if (alignage) { + mrs_size += mrs_page - alignage; + } +- trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, +- mrs_host); ++ trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, ++ mrs_size, mrs_host); + } + + if (dev->n_tmp_sections) { +-- +1.8.3.1 + diff --git a/kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch b/kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch new file mode 100755 index 0000000000000000000000000000000000000000..7e1353c63db3ab523a602707fc6bf1a2dbdfc8b3 --- /dev/null +++ b/kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch @@ -0,0 +1,69 @@ +From e06655cfe0fa9473b1e8b311571f36d787472834 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Fri, 29 May 2020 05:54:02 -0400 +Subject: [PATCH 20/42] vhost: correctly turn on VIRTIO_F_IOMMU_PLATFORM + +RH-Author: Thomas Huth +Message-id: <20200529055420.16855-21-thuth@redhat.com> +Patchwork-id: 97041 +O-Subject: [RHEL-8.3.0 qemu-kvm PATCH v2 20/38] vhost: correctly turn on VIRTIO_F_IOMMU_PLATFORM +Bugzilla: 1828317 +RH-Acked-by: Claudio Imbrenda +RH-Acked-by: Cornelia Huck +RH-Acked-by: David Hildenbrand + +From: Jason Wang + +We turn on device IOTLB via VIRTIO_F_IOMMU_PLATFORM unconditionally on +platform without IOMMU support. This can lead unnecessary IOTLB +transactions which will damage the performance. + +Fixing this by check whether the device is backed by IOMMU and disable +device IOTLB. + +Reported-by: Halil Pasic +Tested-by: Halil Pasic +Reviewed-by: Halil Pasic +Signed-off-by: Jason Wang +Message-Id: <20200302042454.24814-1-jasowang@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit f7ef7e6e3ba6e994e070cc609eb154339d1c4a11) +Signed-off-by: Danilo C. L. de Paula +--- + hw/virtio/vhost.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 9edfadc81d..9182a00495 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -290,7 +290,14 @@ static int vhost_dev_has_iommu(struct vhost_dev *dev) + { + VirtIODevice *vdev = dev->vdev; + +- return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); ++ /* ++ * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support ++ * incremental memory mapping API via IOTLB API. For platform that ++ * does not have IOMMU, there's no need to enable this feature ++ * which may cause unnecessary IOTLB miss/update trnasactions. ++ */ ++ return vdev->dma_as != &address_space_memory && ++ virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + } + + static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, +@@ -765,6 +772,9 @@ static int vhost_dev_set_features(struct vhost_dev *dev, + if (enable_log) { + features |= 0x1ULL << VHOST_F_LOG_ALL; + } ++ if (!vhost_dev_has_iommu(dev)) { ++ features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); ++ } + r = dev->vhost_ops->vhost_set_features(dev, features); + if (r < 0) { + VHOST_OPS_DEBUG("vhost_set_features failed"); +-- +2.27.0 + diff --git a/kvm-vhost-user-Print-unexpected-slave-message-types.patch b/kvm-vhost-user-Print-unexpected-slave-message-types.patch new file mode 100755 index 0000000000000000000000000000000000000000..e5776e7580b58d05afd9168e7bbcafd133e06f62 --- /dev/null +++ b/kvm-vhost-user-Print-unexpected-slave-message-types.patch @@ -0,0 +1,48 @@ +From d6abbdaeb2c35efe6793a599c98116e250b1f179 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:43 +0100 +Subject: [PATCH 072/116] vhost-user: Print unexpected slave message types +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-69-dgilbert@redhat.com> +Patchwork-id: 93519 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 068/112] vhost-user: Print unexpected slave message types +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +When we receive an unexpected message type on the slave fd, print +the type. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0fdc465d7d5aafeae127eba488f247ac6f58df4c) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost-user.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c +index 02a9b25..e4f46ec 100644 +--- a/hw/virtio/vhost-user.c ++++ b/hw/virtio/vhost-user.c +@@ -1055,7 +1055,7 @@ static void slave_read(void *opaque) + fd[0]); + break; + default: +- error_report("Received unexpected msg type."); ++ error_report("Received unexpected msg type: %d.", hdr.request); + ret = -EINVAL; + } + +-- +1.8.3.1 + diff --git a/kvm-vhost-user-fs-remove-vhostfd-property.patch b/kvm-vhost-user-fs-remove-vhostfd-property.patch new file mode 100755 index 0000000000000000000000000000000000000000..5904e82b722bec38a111fd65da29a9616480a8b5 --- /dev/null +++ b/kvm-vhost-user-fs-remove-vhostfd-property.patch @@ -0,0 +1,59 @@ +From 912af6f7c270e2939a91c9b3f62b6ba1202edc43 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:37 +0100 +Subject: [PATCH 006/116] vhost-user-fs: remove "vhostfd" property +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-3-dgilbert@redhat.com> +Patchwork-id: 93458 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 002/112] vhost-user-fs: remove "vhostfd" property +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Marc-André Lureau + +The property doesn't make much sense for a vhost-user device. + +Signed-off-by: Marc-André Lureau +Message-Id: <20191116112016.14872-1-marcandre.lureau@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 703857348724319735d9be7b5b996e6445c6e6b9) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost-user-fs.c | 1 - + include/hw/virtio/vhost-user-fs.h | 1 - + 2 files changed, 2 deletions(-) + +diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c +index f0df7f4..ca0b7fc 100644 +--- a/hw/virtio/vhost-user-fs.c ++++ b/hw/virtio/vhost-user-fs.c +@@ -263,7 +263,6 @@ static Property vuf_properties[] = { + DEFINE_PROP_UINT16("num-request-queues", VHostUserFS, + conf.num_request_queues, 1), + DEFINE_PROP_UINT16("queue-size", VHostUserFS, conf.queue_size, 128), +- DEFINE_PROP_STRING("vhostfd", VHostUserFS, conf.vhostfd), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/include/hw/virtio/vhost-user-fs.h b/include/hw/virtio/vhost-user-fs.h +index 539885b..9ff1bdb 100644 +--- a/include/hw/virtio/vhost-user-fs.h ++++ b/include/hw/virtio/vhost-user-fs.h +@@ -28,7 +28,6 @@ typedef struct { + char *tag; + uint16_t num_request_queues; + uint16_t queue_size; +- char *vhostfd; + } VHostUserFSConf; + + typedef struct { +-- +1.8.3.1 + diff --git a/kvm-vhost-user-gpu-Drop-trailing-json-comma.patch b/kvm-vhost-user-gpu-Drop-trailing-json-comma.patch new file mode 100755 index 0000000000000000000000000000000000000000..3a50632485185dae8c473f42547a9496d138ccce --- /dev/null +++ b/kvm-vhost-user-gpu-Drop-trailing-json-comma.patch @@ -0,0 +1,52 @@ +From 044feb40e3041759ee77d08136f334cf3ad67c1e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=A1n=20Tomko?= +Date: Fri, 21 Feb 2020 09:49:23 +0000 +Subject: [PATCH] vhost-user-gpu: Drop trailing json comma +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Ján Tomko +Message-id: <07fed9a38495938a7180819e27f590d80cd6668d.1582278173.git.jtomko@redhat.com> +Patchwork-id: 94019 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] vhost-user-gpu: Drop trailing json comma +Bugzilla: 1805334 +RH-Acked-by: Marc-André Lureau +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Stefan Hajnoczi + +From: Cole Robinson + +Trailing comma is not valid json: + +$ cat contrib/vhost-user-gpu/50-qemu-gpu.json.in | jq +parse error: Expected another key-value pair at line 5, column 1 + +Signed-off-by: Cole Robinson +Reviewed-by: Marc-André Lureau +Reviewed-by: Li Qiang +Reviewed-by: Philippe Mathieu-Daudé +Message-id: 7f5dd2ac9f3504e2699f23e69bc3d8051b729832.1568925097.git.crobinso@redhat.com +Signed-off-by: Gerd Hoffmann +(cherry picked from commit ca26b032e5a0e8a190c763ce828a8740d24b9b65) +Signed-off-by: Ján Tomko +Signed-off-by: Danilo C. L. de Paula +--- + contrib/vhost-user-gpu/50-qemu-gpu.json.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/contrib/vhost-user-gpu/50-qemu-gpu.json.in b/contrib/vhost-user-gpu/50-qemu-gpu.json.in +index 658b545..f5edd09 100644 +--- a/contrib/vhost-user-gpu/50-qemu-gpu.json.in ++++ b/contrib/vhost-user-gpu/50-qemu-gpu.json.in +@@ -1,5 +1,5 @@ + { + "description": "QEMU vhost-user-gpu", + "type": "gpu", +- "binary": "@libexecdir@/vhost-user-gpu", ++ "binary": "@libexecdir@/vhost-user-gpu" + } +-- +1.8.3.1 + diff --git a/kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch b/kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch new file mode 100755 index 0000000000000000000000000000000000000000..ed107016f0d5edd978fa066ed40dce23f221b5c1 --- /dev/null +++ b/kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch @@ -0,0 +1,80 @@ +From b395ad369278d0923a590975fabbb99ec7716c6b Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:28 +0000 +Subject: [PATCH 4/7] virtio: add ability to delete vq through a pointer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-2-jusual@redhat.com> +Patchwork-id: 93980 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/4] virtio: add ability to delete vq through a pointer +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin + +From: "Michael S. Tsirkin" + +Devices tend to maintain vq pointers, allow deleting them trough a vq pointer. + +Signed-off-by: Michael S. Tsirkin +Reviewed-by: David Hildenbrand +Reviewed-by: David Hildenbrand +(cherry picked from commit 722f8c51d8af223751dfb1d02de40043e8ba067e) +Signed-off-by: Danilo C. L. de Paula +--- + hw/virtio/virtio.c | 15 ++++++++++----- + include/hw/virtio/virtio.h | 2 ++ + 2 files changed, 12 insertions(+), 5 deletions(-) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index 3211135..d63a369 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -2335,17 +2335,22 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + return &vdev->vq[i]; + } + ++void virtio_delete_queue(VirtQueue *vq) ++{ ++ vq->vring.num = 0; ++ vq->vring.num_default = 0; ++ vq->handle_output = NULL; ++ vq->handle_aio_output = NULL; ++ g_free(vq->used_elems); ++} ++ + void virtio_del_queue(VirtIODevice *vdev, int n) + { + if (n < 0 || n >= VIRTIO_QUEUE_MAX) { + abort(); + } + +- vdev->vq[n].vring.num = 0; +- vdev->vq[n].vring.num_default = 0; +- vdev->vq[n].handle_output = NULL; +- vdev->vq[n].handle_aio_output = NULL; +- g_free(vdev->vq[n].used_elems); ++ virtio_delete_queue(&vdev->vq[n]); + } + + static void virtio_set_isr(VirtIODevice *vdev, int value) +diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h +index 6a20442..91167f6 100644 +--- a/include/hw/virtio/virtio.h ++++ b/include/hw/virtio/virtio.h +@@ -183,6 +183,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + + void virtio_del_queue(VirtIODevice *vdev, int n); + ++void virtio_delete_queue(VirtQueue *vq); ++ + void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len); + void virtqueue_flush(VirtQueue *vq, unsigned int count); +-- +1.8.3.1 + diff --git a/kvm-virtio-add-vhost-user-fs-ccw-device.patch b/kvm-virtio-add-vhost-user-fs-ccw-device.patch new file mode 100755 index 0000000000000000000000000000000000000000..d7d41af9897ec14e42cc7b0e7aca4d45e8854d5e --- /dev/null +++ b/kvm-virtio-add-vhost-user-fs-ccw-device.patch @@ -0,0 +1,136 @@ +From fc5d5887462da813d91a3a0649214313d580d7af Mon Sep 17 00:00:00 2001 +From: Claudio Imbrenda +Date: Tue, 27 Oct 2020 12:02:16 -0400 +Subject: [PATCH 03/18] virtio: add vhost-user-fs-ccw device + +RH-Author: Claudio Imbrenda +Message-id: <20201027120217.2997314-3-cimbrend@redhat.com> +Patchwork-id: 98720 +O-Subject: [RHEL8.4 qemu-kvm PATCH 2/3] virtio: add vhost-user-fs-ccw device +Bugzilla: 1857733 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Thomas Huth +RH-Acked-by: Dr. David Alan Gilbert +RH-Acked-by: Cornelia Huck + +From: Halil Pasic + +upstream bd0bbb9aba2afbc2ea24b0475be04f795468b381 + +fixed for the backport: +* makefile logic instead of meson +* old style qdev initialization +* old style device class properties + +-- + +Wire up the CCW device for vhost-user-fs. + +Reviewed-by: Cornelia Huck +Signed-off-by: Halil Pasic +Message-id: 20200901150019.29229-2-mhartmay@linux.ibm.com +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Danilo C. L. de Paula +--- + hw/s390x/Makefile.objs | 1 + + hw/s390x/vhost-user-fs-ccw.c | 76 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 77 insertions(+) + create mode 100644 hw/s390x/vhost-user-fs-ccw.c + +diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs +index a46a1c7894e..c4086ec3171 100644 +--- a/hw/s390x/Makefile.objs ++++ b/hw/s390x/Makefile.objs +@@ -20,6 +20,7 @@ obj-$(CONFIG_VIRTIO_NET) += virtio-ccw-net.o + obj-$(CONFIG_VIRTIO_BLK) += virtio-ccw-blk.o + obj-$(call land,$(CONFIG_VIRTIO_9P),$(CONFIG_VIRTFS)) += virtio-ccw-9p.o + obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-ccw.o ++obj-$(CONFIG_VHOST_USER_FS) += vhost-user-fs-ccw.o + endif + obj-y += css-bridge.o + obj-y += ccw-device.o +diff --git a/hw/s390x/vhost-user-fs-ccw.c b/hw/s390x/vhost-user-fs-ccw.c +new file mode 100644 +index 00000000000..e7b165d5f61 +--- /dev/null ++++ b/hw/s390x/vhost-user-fs-ccw.c +@@ -0,0 +1,76 @@ ++/* ++ * virtio ccw vhost-user-fs implementation ++ * ++ * Copyright 2020 IBM Corp. ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or (at ++ * your option) any later version. See the COPYING file in the top-level ++ * directory. ++ */ ++#include "qemu/osdep.h" ++#include "hw/qdev-properties.h" ++#include "qapi/error.h" ++#include "hw/virtio/vhost-user-fs.h" ++#include "virtio-ccw.h" ++ ++typedef struct VHostUserFSCcw { ++ VirtioCcwDevice parent_obj; ++ VHostUserFS vdev; ++} VHostUserFSCcw; ++ ++#define TYPE_VHOST_USER_FS_CCW "vhost-user-fs-ccw" ++#define VHOST_USER_FS_CCW(obj) \ ++ OBJECT_CHECK(VHostUserFSCcw, (obj), TYPE_VHOST_USER_FS_CCW) ++ ++ ++static Property vhost_user_fs_ccw_properties[] = { ++ DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags, ++ VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true), ++ DEFINE_PROP_UINT32("max_revision", VirtioCcwDevice, max_rev, ++ VIRTIO_CCW_MAX_REV), ++ DEFINE_PROP_END_OF_LIST(), ++}; ++ ++static void vhost_user_fs_ccw_realize(VirtioCcwDevice *ccw_dev, Error **errp) ++{ ++ VHostUserFSCcw *dev = VHOST_USER_FS_CCW(ccw_dev); ++ DeviceState *vdev = DEVICE(&dev->vdev); ++ ++ qdev_set_parent_bus(vdev, BUS(&ccw_dev->bus)); ++ object_property_set_bool(OBJECT(vdev), true, "realized", errp); ++} ++ ++static void vhost_user_fs_ccw_instance_init(Object *obj) ++{ ++ VHostUserFSCcw *dev = VHOST_USER_FS_CCW(obj); ++ VirtioCcwDevice *ccw_dev = VIRTIO_CCW_DEVICE(obj); ++ ++ ccw_dev->force_revision_1 = true; ++ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), ++ TYPE_VHOST_USER_FS); ++} ++ ++static void vhost_user_fs_ccw_class_init(ObjectClass *klass, void *data) ++{ ++ DeviceClass *dc = DEVICE_CLASS(klass); ++ VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass); ++ ++ k->realize = vhost_user_fs_ccw_realize; ++ dc->props = vhost_user_fs_ccw_properties; ++ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); ++} ++ ++static const TypeInfo vhost_user_fs_ccw = { ++ .name = TYPE_VHOST_USER_FS_CCW, ++ .parent = TYPE_VIRTIO_CCW_DEVICE, ++ .instance_size = sizeof(VHostUserFSCcw), ++ .instance_init = vhost_user_fs_ccw_instance_init, ++ .class_init = vhost_user_fs_ccw_class_init, ++}; ++ ++static void vhost_user_fs_ccw_register(void) ++{ ++ type_register_static(&vhost_user_fs_ccw); ++} ++ ++type_init(vhost_user_fs_ccw_register) +-- +2.27.0 + diff --git a/kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch b/kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch new file mode 100755 index 0000000000000000000000000000000000000000..9e46be1b2b212b693897a1844bb27ae7583a4b19 --- /dev/null +++ b/kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch @@ -0,0 +1,203 @@ +From fdd1f3bf672ad8bb0a6db896ec8cbc797c31da1f Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Wed, 24 Jun 2020 13:24:53 -0400 +Subject: [PATCH 11/12] virtio-blk: On restart, process queued requests in the + proper context + +RH-Author: Sergio Lopez Pascual +Message-id: <20200624132453.111276-3-slp@redhat.com> +Patchwork-id: 97798 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/2] virtio-blk: On restart, process queued requests in the proper context +Bugzilla: +RH-Acked-by: John Snow +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Kevin Wolf + +On restart, we were scheduling a BH to process queued requests, which +would run before starting up the data plane, leading to those requests +being assigned and started on coroutines on the main context. + +This could cause requests to be wrongly processed in parallel from +different threads (the main thread and the iothread managing the data +plane), potentially leading to multiple issues. + +For example, stopping and resuming a VM multiple times while the guest +is generating I/O on a virtio_blk device can trigger a crash with a +stack tracing looking like this one: + +<------> + Thread 2 (Thread 0x7ff736765700 (LWP 1062503)): + #0 0x00005567a13b99d6 in iov_memset + (iov=0x6563617073206f4e, iov_cnt=1717922848, offset=516096, fillc=0, bytes=7018105756081554803) + at util/iov.c:69 + #1 0x00005567a13bab73 in qemu_iovec_memset + (qiov=0x7ff73ec99748, offset=516096, fillc=0, bytes=7018105756081554803) at util/iov.c:530 + #2 0x00005567a12f411c in qemu_laio_process_completion (laiocb=0x7ff6512ee6c0) at block/linux-aio.c:86 + #3 0x00005567a12f42ff in qemu_laio_process_completions (s=0x7ff7182e8420) at block/linux-aio.c:217 + #4 0x00005567a12f480d in ioq_submit (s=0x7ff7182e8420) at block/linux-aio.c:323 + #5 0x00005567a12f43d9 in qemu_laio_process_completions_and_submit (s=0x7ff7182e8420) + at block/linux-aio.c:236 + #6 0x00005567a12f44c2 in qemu_laio_poll_cb (opaque=0x7ff7182e8430) at block/linux-aio.c:267 + #7 0x00005567a13aed83 in run_poll_handlers_once (ctx=0x5567a2b58c70, timeout=0x7ff7367645f8) + at util/aio-posix.c:520 + #8 0x00005567a13aee9f in run_poll_handlers (ctx=0x5567a2b58c70, max_ns=16000, timeout=0x7ff7367645f8) + at util/aio-posix.c:562 + #9 0x00005567a13aefde in try_poll_mode (ctx=0x5567a2b58c70, timeout=0x7ff7367645f8) + at util/aio-posix.c:597 + #10 0x00005567a13af115 in aio_poll (ctx=0x5567a2b58c70, blocking=true) at util/aio-posix.c:639 + #11 0x00005567a109acca in iothread_run (opaque=0x5567a2b29760) at iothread.c:75 + #12 0x00005567a13b2790 in qemu_thread_start (args=0x5567a2b694c0) at util/qemu-thread-posix.c:519 + #13 0x00007ff73eedf2de in start_thread () at /lib64/libpthread.so.0 + #14 0x00007ff73ec10e83 in clone () at /lib64/libc.so.6 + + Thread 1 (Thread 0x7ff743986f00 (LWP 1062500)): + #0 0x00005567a13b99d6 in iov_memset + (iov=0x6563617073206f4e, iov_cnt=1717922848, offset=516096, fillc=0, bytes=7018105756081554803) + at util/iov.c:69 + #1 0x00005567a13bab73 in qemu_iovec_memset + (qiov=0x7ff73ec99748, offset=516096, fillc=0, bytes=7018105756081554803) at util/iov.c:530 + #2 0x00005567a12f411c in qemu_laio_process_completion (laiocb=0x7ff6512ee6c0) at block/linux-aio.c:86 + #3 0x00005567a12f42ff in qemu_laio_process_completions (s=0x7ff7182e8420) at block/linux-aio.c:217 + #4 0x00005567a12f480d in ioq_submit (s=0x7ff7182e8420) at block/linux-aio.c:323 + #5 0x00005567a12f4a2f in laio_do_submit (fd=19, laiocb=0x7ff5f4ff9ae0, offset=472363008, type=2) + at block/linux-aio.c:375 + #6 0x00005567a12f4af2 in laio_co_submit + (bs=0x5567a2b8c460, s=0x7ff7182e8420, fd=19, offset=472363008, qiov=0x7ff5f4ff9ca0, type=2) + at block/linux-aio.c:394 + #7 0x00005567a12f1803 in raw_co_prw + (bs=0x5567a2b8c460, offset=472363008, bytes=20480, qiov=0x7ff5f4ff9ca0, type=2) + at block/file-posix.c:1892 + #8 0x00005567a12f1941 in raw_co_pwritev + (bs=0x5567a2b8c460, offset=472363008, bytes=20480, qiov=0x7ff5f4ff9ca0, flags=0) + at block/file-posix.c:1925 + #9 0x00005567a12fe3e1 in bdrv_driver_pwritev + (bs=0x5567a2b8c460, offset=472363008, bytes=20480, qiov=0x7ff5f4ff9ca0, qiov_offset=0, flags=0) + at block/io.c:1183 + #10 0x00005567a1300340 in bdrv_aligned_pwritev + (child=0x5567a2b5b070, req=0x7ff5f4ff9db0, offset=472363008, bytes=20480, align=512, qiov=0x7ff72c0425b8, qiov_offset=0, flags=0) at block/io.c:1980 + #11 0x00005567a1300b29 in bdrv_co_pwritev_part + (child=0x5567a2b5b070, offset=472363008, bytes=20480, qiov=0x7ff72c0425b8, qiov_offset=0, flags=0) + at block/io.c:2137 + #12 0x00005567a12baba1 in qcow2_co_pwritev_task + (bs=0x5567a2b92740, file_cluster_offset=472317952, offset=487305216, bytes=20480, qiov=0x7ff72c0425b8, qiov_offset=0, l2meta=0x0) at block/qcow2.c:2444 + #13 0x00005567a12bacdb in qcow2_co_pwritev_task_entry (task=0x5567a2b48540) at block/qcow2.c:2475 + #14 0x00005567a13167d8 in aio_task_co (opaque=0x5567a2b48540) at block/aio_task.c:45 + #15 0x00005567a13cf00c in coroutine_trampoline (i0=738245600, i1=32759) at util/coroutine-ucontext.c:115 + #16 0x00007ff73eb622e0 in __start_context () at /lib64/libc.so.6 + #17 0x00007ff6626f1350 in () + #18 0x0000000000000000 in () +<------> + +This is also known to cause crashes with this message (assertion +failed): + + aio_co_schedule: Co-routine was already scheduled in 'aio_co_schedule' + +RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1812765 +Signed-off-by: Sergio Lopez +Message-Id: <20200603093240.40489-3-slp@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 49b44549ace7890fffdf027fd3695218ee7f1121) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + hw/block/dataplane/virtio-blk.c | 8 ++++++++ + hw/block/virtio-blk.c | 18 ++++++++++++------ + include/hw/virtio/virtio-blk.h | 2 +- + 3 files changed, 21 insertions(+), 7 deletions(-) + +diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c +index 119906a5fe..ac495fd72a 100644 +--- a/hw/block/dataplane/virtio-blk.c ++++ b/hw/block/dataplane/virtio-blk.c +@@ -220,6 +220,9 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) + goto fail_guest_notifiers; + } + ++ /* Process queued requests before the ones in vring */ ++ virtio_blk_process_queued_requests(vblk, false); ++ + /* Kick right away to begin processing requests already in vring */ + for (i = 0; i < nvqs; i++) { + VirtQueue *vq = virtio_get_queue(s->vdev, i); +@@ -239,6 +242,11 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) + return 0; + + fail_guest_notifiers: ++ /* ++ * If we failed to set up the guest notifiers queued requests will be ++ * processed on the main context. ++ */ ++ virtio_blk_process_queued_requests(vblk, false); + vblk->dataplane_disabled = true; + s->starting = false; + vblk->dataplane_started = true; +diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c +index 6ff29a05d6..493a263fa6 100644 +--- a/hw/block/virtio-blk.c ++++ b/hw/block/virtio-blk.c +@@ -819,7 +819,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) + virtio_blk_handle_output_do(s, vq); + } + +-void virtio_blk_process_queued_requests(VirtIOBlock *s) ++void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh) + { + VirtIOBlockReq *req = s->rq; + MultiReqBuffer mrb = {}; +@@ -847,7 +847,9 @@ void virtio_blk_process_queued_requests(VirtIOBlock *s) + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s->blk, &mrb); + } +- blk_dec_in_flight(s->conf.conf.blk); ++ if (is_bh) { ++ blk_dec_in_flight(s->conf.conf.blk); ++ } + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); + } + +@@ -858,21 +860,25 @@ static void virtio_blk_dma_restart_bh(void *opaque) + qemu_bh_delete(s->bh); + s->bh = NULL; + +- virtio_blk_process_queued_requests(s); ++ virtio_blk_process_queued_requests(s, true); + } + + static void virtio_blk_dma_restart_cb(void *opaque, int running, + RunState state) + { + VirtIOBlock *s = opaque; ++ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s))); ++ VirtioBusState *bus = VIRTIO_BUS(qbus); + + if (!running) { + return; + } + +- if (!s->bh) { +- /* FIXME The data plane is not started yet, so these requests are +- * processed in the main thread. */ ++ /* ++ * If ioeventfd is enabled, don't schedule the BH here as queued ++ * requests will be processed while starting the data plane. ++ */ ++ if (!s->bh && !virtio_bus_ioeventfd_enabled(bus)) { + s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk), + virtio_blk_dma_restart_bh, s); + blk_inc_in_flight(s->conf.conf.blk); +diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h +index cf8eea2f58..e77f0db3b0 100644 +--- a/include/hw/virtio/virtio-blk.h ++++ b/include/hw/virtio/virtio-blk.h +@@ -84,6 +84,6 @@ typedef struct MultiReqBuffer { + } MultiReqBuffer; + + bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq); +-void virtio_blk_process_queued_requests(VirtIOBlock *s); ++void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh); + + #endif +-- +2.27.0 + diff --git a/kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch b/kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch new file mode 100755 index 0000000000000000000000000000000000000000..148045d822d377ae90b778e02788edef2066af61 --- /dev/null +++ b/kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch @@ -0,0 +1,83 @@ +From 73d83d8880e85eedc22c9651b67d1eacd5de5ff4 Mon Sep 17 00:00:00 2001 +From: Sergio Lopez Pascual +Date: Wed, 24 Jun 2020 13:24:52 -0400 +Subject: [PATCH 10/12] virtio-blk: Refactor the code that processes queued + requests + +RH-Author: Sergio Lopez Pascual +Message-id: <20200624132453.111276-2-slp@redhat.com> +Patchwork-id: 97797 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/2] virtio-blk: Refactor the code that processes queued requests +Bugzilla: +RH-Acked-by: John Snow +RH-Acked-by: Michael S. Tsirkin +RH-Acked-by: Kevin Wolf + +Move the code that processes queued requests from +virtio_blk_dma_restart_bh() to its own, non-static, function. This +will allow us to call it from the virtio_blk_data_plane_start() in a +future patch. + +Signed-off-by: Sergio Lopez +Message-Id: <20200603093240.40489-2-slp@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 7aa1c247b466870b0704d3ccdc3755e5e7394dca) +Signed-off-by: Sergio Lopez +Signed-off-by: Danilo C. L. de Paula +--- + hw/block/virtio-blk.c | 16 +++++++++++----- + include/hw/virtio/virtio-blk.h | 1 + + 2 files changed, 12 insertions(+), 5 deletions(-) + +diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c +index c4e55fb3de..6ff29a05d6 100644 +--- a/hw/block/virtio-blk.c ++++ b/hw/block/virtio-blk.c +@@ -819,15 +819,11 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) + virtio_blk_handle_output_do(s, vq); + } + +-static void virtio_blk_dma_restart_bh(void *opaque) ++void virtio_blk_process_queued_requests(VirtIOBlock *s) + { +- VirtIOBlock *s = opaque; + VirtIOBlockReq *req = s->rq; + MultiReqBuffer mrb = {}; + +- qemu_bh_delete(s->bh); +- s->bh = NULL; +- + s->rq = NULL; + + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); +@@ -855,6 +851,16 @@ static void virtio_blk_dma_restart_bh(void *opaque) + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); + } + ++static void virtio_blk_dma_restart_bh(void *opaque) ++{ ++ VirtIOBlock *s = opaque; ++ ++ qemu_bh_delete(s->bh); ++ s->bh = NULL; ++ ++ virtio_blk_process_queued_requests(s); ++} ++ + static void virtio_blk_dma_restart_cb(void *opaque, int running, + RunState state) + { +diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h +index cddcfbebe9..cf8eea2f58 100644 +--- a/include/hw/virtio/virtio-blk.h ++++ b/include/hw/virtio/virtio-blk.h +@@ -84,5 +84,6 @@ typedef struct MultiReqBuffer { + } MultiReqBuffer; + + bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq); ++void virtio_blk_process_queued_requests(VirtIOBlock *s); + + #endif +-- +2.27.0 + diff --git a/kvm-virtio-don-t-enable-notifications-during-polling.patch b/kvm-virtio-don-t-enable-notifications-during-polling.patch new file mode 100755 index 0000000000000000000000000000000000000000..2dffc0118d214e63334722e37eec2af701c36c59 --- /dev/null +++ b/kvm-virtio-don-t-enable-notifications-during-polling.patch @@ -0,0 +1,158 @@ +From 351dd07d7b5e69cdf47260c9ea848c0c93cd2c8a Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 9 Jan 2020 11:13:25 +0000 +Subject: [PATCH 3/5] virtio: don't enable notifications during polling + +RH-Author: Stefan Hajnoczi +Message-id: <20200109111325.559557-2-stefanha@redhat.com> +Patchwork-id: 93298 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/1] virtio: don't enable notifications during polling +Bugzilla: 1789301 +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Michael S. Tsirkin + +Virtqueue notifications are not necessary during polling, so we disable +them. This allows the guest driver to avoid MMIO vmexits. +Unfortunately the virtio-blk and virtio-scsi handler functions re-enable +notifications, defeating this optimization. + +Fix virtio-blk and virtio-scsi emulation so they leave notifications +disabled. The key thing to remember for correctness is that polling +always checks one last time after ending its loop, therefore it's safe +to lose the race when re-enabling notifications at the end of polling. + +There is a measurable performance improvement of 5-10% with the null-co +block driver. Real-life storage configurations will see a smaller +improvement because the MMIO vmexit overhead contributes less to +latency. + +Signed-off-by: Stefan Hajnoczi +Message-Id: <20191209210957.65087-1-stefanha@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit d0435bc513e23a4961b6af20164d1c6c219eb4ea) +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Danilo C. L. de Paula +--- + hw/block/virtio-blk.c | 9 +++++++-- + hw/scsi/virtio-scsi.c | 9 +++++++-- + hw/virtio/virtio.c | 12 ++++++------ + include/hw/virtio/virtio.h | 1 + + 4 files changed, 21 insertions(+), 10 deletions(-) + +diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c +index 4c357d2..c4e55fb 100644 +--- a/hw/block/virtio-blk.c ++++ b/hw/block/virtio-blk.c +@@ -764,13 +764,16 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) + { + VirtIOBlockReq *req; + MultiReqBuffer mrb = {}; ++ bool suppress_notifications = virtio_queue_get_notification(vq); + bool progress = false; + + aio_context_acquire(blk_get_aio_context(s->blk)); + blk_io_plug(s->blk); + + do { +- virtio_queue_set_notification(vq, 0); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 0); ++ } + + while ((req = virtio_blk_get_request(s, vq))) { + progress = true; +@@ -781,7 +784,9 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) + } + } + +- virtio_queue_set_notification(vq, 1); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 1); ++ } + } while (!virtio_queue_empty(vq)); + + if (mrb.num_reqs) { +diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c +index 54108c0..e2cd1df 100644 +--- a/hw/scsi/virtio-scsi.c ++++ b/hw/scsi/virtio-scsi.c +@@ -597,12 +597,15 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) + { + VirtIOSCSIReq *req, *next; + int ret = 0; ++ bool suppress_notifications = virtio_queue_get_notification(vq); + bool progress = false; + + QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); + + do { +- virtio_queue_set_notification(vq, 0); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 0); ++ } + + while ((req = virtio_scsi_pop_req(s, vq))) { + progress = true; +@@ -622,7 +625,9 @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) + } + } + +- virtio_queue_set_notification(vq, 1); ++ if (suppress_notifications) { ++ virtio_queue_set_notification(vq, 1); ++ } + } while (ret != -EINVAL && !virtio_queue_empty(vq)); + + QTAILQ_FOREACH_SAFE(req, &reqs, next, next) { +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index 04716b5..3211135 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -432,6 +432,11 @@ static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable) + } + } + ++bool virtio_queue_get_notification(VirtQueue *vq) ++{ ++ return vq->notification; ++} ++ + void virtio_queue_set_notification(VirtQueue *vq, int enable) + { + vq->notification = enable; +@@ -3384,17 +3389,12 @@ static bool virtio_queue_host_notifier_aio_poll(void *opaque) + { + EventNotifier *n = opaque; + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); +- bool progress; + + if (!vq->vring.desc || virtio_queue_empty(vq)) { + return false; + } + +- progress = virtio_queue_notify_aio_vq(vq); +- +- /* In case the handler function re-enabled notifications */ +- virtio_queue_set_notification(vq, 0); +- return progress; ++ return virtio_queue_notify_aio_vq(vq); + } + + static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) +diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h +index c32a815..6a20442 100644 +--- a/include/hw/virtio/virtio.h ++++ b/include/hw/virtio/virtio.h +@@ -224,6 +224,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id); + + void virtio_notify_config(VirtIODevice *vdev); + ++bool virtio_queue_get_notification(VirtQueue *vq); + void virtio_queue_set_notification(VirtQueue *vq, int enable); + + int virtio_queue_ready(VirtQueue *vq); +-- +1.8.3.1 + diff --git a/kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch b/kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch new file mode 100755 index 0000000000000000000000000000000000000000..9a69ed135a283d303d0721a4709da3899cf81bcd --- /dev/null +++ b/kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch @@ -0,0 +1,60 @@ +From c0cf6d8a1d3b9bf3928f37fcfd5aa8ae6f1338ca Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:36 +0100 +Subject: [PATCH 005/116] virtio-fs: fix MSI-X nvectors calculation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-2-dgilbert@redhat.com> +Patchwork-id: 93455 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 001/112] virtio-fs: fix MSI-X nvectors calculation +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +The following MSI-X vectors are required: + * VIRTIO Configuration Change + * hiprio virtqueue + * requests virtqueues + +Fix the calculation to reserve enough MSI-X vectors. Otherwise guest +drivers fall back to a sub-optional configuration where all virtqueues +share a single vector. + +This change does not break live migration compatibility since +vhost-user-fs-pci devices are not migratable yet. + +Reported-by: Vivek Goyal +Signed-off-by: Stefan Hajnoczi +Message-Id: <20191209110759.35227-1-stefanha@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 366844f3d1329c6423dd752891a28ccb3ee8fddd) +Signed-off-by: Miroslav Rezanina +--- + hw/virtio/vhost-user-fs-pci.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c +index 933a3f2..e3a649d 100644 +--- a/hw/virtio/vhost-user-fs-pci.c ++++ b/hw/virtio/vhost-user-fs-pci.c +@@ -40,7 +40,8 @@ static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) + DeviceState *vdev = DEVICE(&dev->vdev); + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { +- vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 1; ++ /* Also reserve config change and hiprio queue vectors */ ++ vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 2; + } + + qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); +-- +1.8.3.1 + diff --git a/kvm-virtio-make-virtio_delete_queue-idempotent.patch b/kvm-virtio-make-virtio_delete_queue-idempotent.patch new file mode 100755 index 0000000000000000000000000000000000000000..16eb1da415c901e816b6bb7a3d4d5bb2ac48a69a --- /dev/null +++ b/kvm-virtio-make-virtio_delete_queue-idempotent.patch @@ -0,0 +1,42 @@ +From 901e65fa6ccbadeacd6c585cf49a0a7cdafb4737 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:29 +0000 +Subject: [PATCH 5/7] virtio: make virtio_delete_queue idempotent + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-3-jusual@redhat.com> +Patchwork-id: 93981 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/4] virtio: make virtio_delete_queue idempotent +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin + +From: "Michael S. Tsirkin" + +Let's make sure calling this twice is harmless - +no known instances, but seems safer. + +Suggested-by: Pan Nengyuan +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 8cd353ea0fbf0e334e015d833f612799be642296) +Signed-off-by: Danilo C. L. de Paula +--- + hw/virtio/virtio.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index d63a369..e6a9ba4 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -2342,6 +2342,7 @@ void virtio_delete_queue(VirtQueue *vq) + vq->handle_output = NULL; + vq->handle_aio_output = NULL; + g_free(vq->used_elems); ++ vq->used_elems = NULL; + } + + void virtio_del_queue(VirtIODevice *vdev, int n) +-- +1.8.3.1 + diff --git a/kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch b/kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch new file mode 100755 index 0000000000000000000000000000000000000000..c21c699acecfc771e755ac201c2f0d294e21715e --- /dev/null +++ b/kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch @@ -0,0 +1,49 @@ +From 2f494c41715193522c52eafc6af2a5e33f88ceb9 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:31 +0000 +Subject: [PATCH 7/7] virtio-net: delete also control queue when TX/RX deleted + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-5-jusual@redhat.com> +Patchwork-id: 93983 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/4] virtio-net: delete also control queue when TX/RX deleted +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin + +From: Yuri Benditovich + +https://bugzilla.redhat.com/show_bug.cgi?id=1708480 +If the control queue is not deleted together with TX/RX, it +later will be ignored in freeing cache resources and hot +unplug will not be completed. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Yuri Benditovich +Message-Id: <20191226043649.14481-3-yuri.benditovich@daynix.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit d945d9f1731244ef341f74ede93120fc9de35913) +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/virtio-net.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index db3d7c3..f325440 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -3101,7 +3101,8 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) + for (i = 0; i < max_queues; i++) { + virtio_net_del_queue(n, i); + } +- ++ /* delete also control vq */ ++ virtio_del_queue(vdev, max_queues * 2); + qemu_announce_timer_del(&n->announce_timer, false); + g_free(n->vqs); + qemu_del_nic(n->nic); +-- +1.8.3.1 + diff --git a/kvm-virtio-net-fix-removal-of-failover-device.patch b/kvm-virtio-net-fix-removal-of-failover-device.patch new file mode 100755 index 0000000000000000000000000000000000000000..6044f3db2c2fb608dd26239d9f0356011a66e1a3 --- /dev/null +++ b/kvm-virtio-net-fix-removal-of-failover-device.patch @@ -0,0 +1,52 @@ +From 92fb4f6cdde32652352a0a831a2ba815701a4014 Mon Sep 17 00:00:00 2001 +From: Juan Quintela +Date: Fri, 3 Jul 2020 12:37:05 -0400 +Subject: [PATCH 4/4] virtio-net: fix removal of failover device + +RH-Author: Juan Quintela +Message-id: <20200703123705.7175-2-quintela@redhat.com> +Patchwork-id: 97901 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 1/1] virtio-net: fix removal of failover device +Bugzilla: +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Laurent Vivier +RH-Acked-by: Dr. David Alan Gilbert + +If you have a networking device and its virtio failover device, and +you remove them in this order: +- virtio device +- the real device + +You get qemu crash. +See bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1820120 + +Bug exist on qemu 4.2 and 5.0. +But in 5.0 don't shows because commit +77b06bba62034a87cc61a9c8de1309ae3e527d97 + +somehow papers over it. + +CC: Jason Wang +CC: Michael S. Tsirkin + +Signed-off-by: Juan Quintela +Signed-off-by: Danilo C. L. de Paula +--- + hw/net/virtio-net.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index f325440d01..dabeb9e720 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -3091,6 +3091,7 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) + g_free(n->vlans); + + if (n->failover) { ++ device_listener_unregister(&n->primary_listener); + g_free(n->primary_device_id); + g_free(n->standby_id); + qobject_unref(n->primary_device_dict); +-- +2.27.0 + diff --git a/kvm-virtio-reset-region-cache-when-on-queue-deletion.patch b/kvm-virtio-reset-region-cache-when-on-queue-deletion.patch new file mode 100755 index 0000000000000000000000000000000000000000..c9f1086611188ab9b830296c919e4f296ef7065e --- /dev/null +++ b/kvm-virtio-reset-region-cache-when-on-queue-deletion.patch @@ -0,0 +1,46 @@ +From 8bf4f561262d9282cebdb3418cdb9a69c92216a0 Mon Sep 17 00:00:00 2001 +From: Julia Suvorova +Date: Wed, 19 Feb 2020 21:34:30 +0000 +Subject: [PATCH 6/7] virtio: reset region cache when on queue deletion + +RH-Author: Julia Suvorova +Message-id: <20200219213431.11913-4-jusual@redhat.com> +Patchwork-id: 93982 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/4] virtio: reset region cache when on queue deletion +Bugzilla: 1791590 +RH-Acked-by: Danilo de Paula +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Michael S. Tsirkin + +From: Yuri Benditovich + +https://bugzilla.redhat.com/show_bug.cgi?id=1708480 +Fix leak of region reference that prevents complete +device deletion on hot unplug. + +Cc: qemu-stable@nongnu.org +Signed-off-by: Yuri Benditovich +Message-Id: <20191226043649.14481-2-yuri.benditovich@daynix.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 421afd2fe8dd4603216cbf36081877c391f5a2a4) +Signed-off-by: Danilo C. L. de Paula +--- + hw/virtio/virtio.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index e6a9ba4..f644d9a 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -2343,6 +2343,7 @@ void virtio_delete_queue(VirtQueue *vq) + vq->handle_aio_output = NULL; + g_free(vq->used_elems); + vq->used_elems = NULL; ++ virtio_virtqueue_reset_region_cache(vq); + } + + void virtio_del_queue(VirtIODevice *vdev, int n) +-- +1.8.3.1 + diff --git a/kvm-virtiofs-Add-maintainers-entry.patch b/kvm-virtiofs-Add-maintainers-entry.patch new file mode 100755 index 0000000000000000000000000000000000000000..fec9371caea84b6fdddf851a577ee2b34226d31e --- /dev/null +++ b/kvm-virtiofs-Add-maintainers-entry.patch @@ -0,0 +1,52 @@ +From f4144443eacceb04823ee72cb2d4f9f841f05495 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:11 +0100 +Subject: [PATCH 040/116] virtiofs: Add maintainers entry +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-37-dgilbert@redhat.com> +Patchwork-id: 93491 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 036/112] virtiofs: Add maintainers entry +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit bad7d2c3ad1af9344df035aedaf8e0967a543070) +Signed-off-by: Miroslav Rezanina +--- + MAINTAINERS | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/MAINTAINERS b/MAINTAINERS +index 5e5e3e5..d1b3e26 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -1575,6 +1575,14 @@ T: git https://github.com/cohuck/qemu.git s390-next + T: git https://github.com/borntraeger/qemu.git s390-next + L: qemu-s390x@nongnu.org + ++virtiofs ++M: Dr. David Alan Gilbert ++M: Stefan Hajnoczi ++S: Supported ++F: tools/virtiofsd/* ++F: hw/virtio/vhost-user-fs* ++F: include/hw/virtio/vhost-user-fs.h ++ + virtio-input + M: Gerd Hoffmann + S: Maintained +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch b/kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch new file mode 100755 index 0000000000000000000000000000000000000000..a2b91beb1092465ff8db191ac59b18b065225374 --- /dev/null +++ b/kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch @@ -0,0 +1,86 @@ +From 4d9106acfd7ed9e4d197ddf9f22b79ba6c8afdd8 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:38 +0100 +Subject: [PATCH 067/116] virtiofsd: Add ID to the log with FUSE_LOG_DEBUG + level +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-64-dgilbert@redhat.com> +Patchwork-id: 93514 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 063/112] virtiofsd: Add ID to the log with FUSE_LOG_DEBUG level +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +virtiofsd has some threads, so we see a lot of logs with debug option. +It would be useful for debugging if we can identify the specific thread +from the log. + +Add ID, which is got by gettid(), to the log with FUSE_LOG_DEBUG level +so that we can grep the specific thread. + +The log is like as: + + ]# ./virtiofsd -d -o vhost_user_socket=/tmp/vhostqemu0 -o source=/tmp/share0 -o cache=auto + ... + [ID: 00000097] unique: 12696, success, outsize: 120 + [ID: 00000097] virtio_send_msg: elem 18: with 2 in desc of length 120 + [ID: 00000003] fv_queue_thread: Got queue event on Queue 1 + [ID: 00000003] fv_queue_thread: Queue 1 gave evalue: 1 available: in: 65552 out: 80 + [ID: 00000003] fv_queue_thread: Waiting for Queue 1 event + [ID: 00000071] fv_queue_worker: elem 33: with 2 out desc of length 80 bad_in_num=0 bad_out_num=0 + [ID: 00000071] unique: 12694, opcode: READ (15), nodeid: 2, insize: 80, pid: 2014 + [ID: 00000071] lo_read(ino=2, size=65536, off=131072) + +Signed-off-by: Masayoshi Mizuma + +Signed-off-by: Dr. David Alan Gilbert + added rework as suggested by Daniel P. Berrangé during review +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 36f3846902bd41413f6c0bf797dee509028c29f4) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ff6910f..f08324f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -2268,10 +2269,17 @@ static void setup_nofile_rlimit(void) + + static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + { ++ g_autofree char *localfmt = NULL; ++ + if (current_log_level < level) { + return; + } + ++ if (current_log_level == FUSE_LOG_DEBUG) { ++ localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), fmt); ++ fmt = localfmt; ++ } ++ + if (use_syslog) { + int priority = LOG_ERR; + switch (level) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch b/kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch new file mode 100755 index 0000000000000000000000000000000000000000..b017bf4c4dba9d3ef1dae8355518485b85516580 --- /dev/null +++ b/kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch @@ -0,0 +1,106 @@ +From 709408de33112d32b7c6675f8c9320b8bebccd58 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:05 +0100 +Subject: [PATCH 034/116] virtiofsd: Add Makefile wiring for virtiofsd contrib +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-31-dgilbert@redhat.com> +Patchwork-id: 93482 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 030/112] virtiofsd: Add Makefile wiring for virtiofsd contrib +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Wire up the building of the virtiofsd in tools. + +virtiofsd relies on Linux-specific system calls and seccomp. Anyone +wishing to port it to other host operating systems should do so +carefully and without reducing security. + +Only allow building on Linux hosts. + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Liam Merwick +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 81bfc42dcf473bc8d3790622633410da72d8e622) +Signed-off-by: Miroslav Rezanina +--- + Makefile | 10 ++++++++++ + Makefile.objs | 1 + + tools/virtiofsd/Makefile.objs | 9 +++++++++ + 3 files changed, 20 insertions(+) + create mode 100644 tools/virtiofsd/Makefile.objs + +diff --git a/Makefile b/Makefile +index 4254950..1526775 100644 +--- a/Makefile ++++ b/Makefile +@@ -330,6 +330,10 @@ endif + endif + endif + ++ifdef CONFIG_LINUX ++HELPERS-y += virtiofsd$(EXESUF) ++endif ++ + # Sphinx does not allow building manuals into the same directory as + # the source files, so if we're doing an in-tree QEMU build we must + # build the manuals into a subdirectory (and then install them from +@@ -430,6 +434,7 @@ dummy := $(call unnest-vars,, \ + elf2dmp-obj-y \ + ivshmem-client-obj-y \ + ivshmem-server-obj-y \ ++ virtiofsd-obj-y \ + rdmacm-mux-obj-y \ + libvhost-user-obj-y \ + vhost-user-scsi-obj-y \ +@@ -675,6 +680,11 @@ rdmacm-mux$(EXESUF): LIBS += "-libumad" + rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) + $(call LINK, $^) + ++ifdef CONFIG_LINUX # relies on Linux-specific syscalls ++virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a $(COMMON_LDADDS) ++ $(call LINK, $^) ++endif ++ + vhost-user-gpu$(EXESUF): $(vhost-user-gpu-obj-y) $(libvhost-user-obj-y) libqemuutil.a libqemustub.a + $(call LINK, $^) + +diff --git a/Makefile.objs b/Makefile.objs +index fcf63e1..1a8f288 100644 +--- a/Makefile.objs ++++ b/Makefile.objs +@@ -125,6 +125,7 @@ vhost-user-blk-obj-y = contrib/vhost-user-blk/ + rdmacm-mux-obj-y = contrib/rdmacm-mux/ + vhost-user-input-obj-y = contrib/vhost-user-input/ + vhost-user-gpu-obj-y = contrib/vhost-user-gpu/ ++virtiofsd-obj-y = tools/virtiofsd/ + + ###################################################################### + trace-events-subdirs = +diff --git a/tools/virtiofsd/Makefile.objs b/tools/virtiofsd/Makefile.objs +new file mode 100644 +index 0000000..45a8075 +--- /dev/null ++++ b/tools/virtiofsd/Makefile.objs +@@ -0,0 +1,9 @@ ++virtiofsd-obj-y = buffer.o \ ++ fuse_opt.o \ ++ fuse_log.o \ ++ fuse_lowlevel.o \ ++ fuse_signals.o \ ++ fuse_virtio.o \ ++ helper.o \ ++ passthrough_ll.o ++ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-auxiliary-.c-s.patch b/kvm-virtiofsd-Add-auxiliary-.c-s.patch new file mode 100755 index 0000000000000000000000000000000000000000..90150d9f82c0435488f9558754815836e3303074 --- /dev/null +++ b/kvm-virtiofsd-Add-auxiliary-.c-s.patch @@ -0,0 +1,1387 @@ +From 55b4059d6399c212109c758190e15b574accdd07 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:41 +0100 +Subject: [PATCH 010/116] virtiofsd: Add auxiliary .c's +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-7-dgilbert@redhat.com> +Patchwork-id: 93461 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 006/112] virtiofsd: Add auxiliary .c's +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Add most of the non-main .c files we need from upstream fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit ffcf8d9f8649c6e56b1193bbbc9c9f7388920043) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 321 ++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_log.c | 40 ++++ + tools/virtiofsd/fuse_opt.c | 423 +++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_signals.c | 91 +++++++++ + tools/virtiofsd/helper.c | 440 +++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 1315 insertions(+) + create mode 100644 tools/virtiofsd/buffer.c + create mode 100644 tools/virtiofsd/fuse_log.c + create mode 100644 tools/virtiofsd/fuse_opt.c + create mode 100644 tools/virtiofsd/fuse_signals.c + create mode 100644 tools/virtiofsd/helper.c + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +new file mode 100644 +index 0000000..5ab9b87 +--- /dev/null ++++ b/tools/virtiofsd/buffer.c +@@ -0,0 +1,321 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2010 Miklos Szeredi ++ ++ Functions for dealing with `struct fuse_buf` and `struct ++ fuse_bufvec`. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#define _GNU_SOURCE ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_lowlevel.h" ++#include ++#include ++#include ++#include ++ ++size_t fuse_buf_size(const struct fuse_bufvec *bufv) ++{ ++ size_t i; ++ size_t size = 0; ++ ++ for (i = 0; i < bufv->count; i++) { ++ if (bufv->buf[i].size == SIZE_MAX) ++ size = SIZE_MAX; ++ else ++ size += bufv->buf[i].size; ++ } ++ ++ return size; ++} ++ ++static size_t min_size(size_t s1, size_t s2) ++{ ++ return s1 < s2 ? s1 : s2; ++} ++ ++static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) ++{ ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (dst->flags & FUSE_BUF_FD_SEEK) { ++ res = pwrite(dst->fd, (char *)src->mem + src_off, len, ++ dst->pos + dst_off); ++ } else { ++ res = write(dst->fd, (char *)src->mem + src_off, len); ++ } ++ if (res == -1) { ++ if (!copied) ++ return -errno; ++ break; ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ if (!(dst->flags & FUSE_BUF_FD_RETRY)) ++ break; ++ ++ src_off += res; ++ dst_off += res; ++ len -= res; ++ } ++ ++ return copied; ++} ++ ++static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) ++{ ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (src->flags & FUSE_BUF_FD_SEEK) { ++ res = pread(src->fd, (char *)dst->mem + dst_off, len, ++ src->pos + src_off); ++ } else { ++ res = read(src->fd, (char *)dst->mem + dst_off, len); ++ } ++ if (res == -1) { ++ if (!copied) ++ return -errno; ++ break; ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ if (!(src->flags & FUSE_BUF_FD_RETRY)) ++ break; ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; ++} ++ ++static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) ++{ ++ char buf[4096]; ++ struct fuse_buf tmp = { ++ .size = sizeof(buf), ++ .flags = 0, ++ }; ++ ssize_t res; ++ size_t copied = 0; ++ ++ tmp.mem = buf; ++ ++ while (len) { ++ size_t this_len = min_size(tmp.size, len); ++ size_t read_len; ++ ++ res = fuse_buf_read(&tmp, 0, src, src_off, this_len); ++ if (res < 0) { ++ if (!copied) ++ return res; ++ break; ++ } ++ if (res == 0) ++ break; ++ ++ read_len = res; ++ res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); ++ if (res < 0) { ++ if (!copied) ++ return res; ++ break; ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ ++ if (res < this_len) ++ break; ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; ++} ++ ++#ifdef HAVE_SPLICE ++static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) ++{ ++ int splice_flags = 0; ++ off_t *srcpos = NULL; ++ off_t *dstpos = NULL; ++ off_t srcpos_val; ++ off_t dstpos_val; ++ ssize_t res; ++ size_t copied = 0; ++ ++ if (flags & FUSE_BUF_SPLICE_MOVE) ++ splice_flags |= SPLICE_F_MOVE; ++ if (flags & FUSE_BUF_SPLICE_NONBLOCK) ++ splice_flags |= SPLICE_F_NONBLOCK; ++ ++ if (src->flags & FUSE_BUF_FD_SEEK) { ++ srcpos_val = src->pos + src_off; ++ srcpos = &srcpos_val; ++ } ++ if (dst->flags & FUSE_BUF_FD_SEEK) { ++ dstpos_val = dst->pos + dst_off; ++ dstpos = &dstpos_val; ++ } ++ ++ while (len) { ++ res = splice(src->fd, srcpos, dst->fd, dstpos, len, ++ splice_flags); ++ if (res == -1) { ++ if (copied) ++ break; ++ ++ if (errno != EINVAL || (flags & FUSE_BUF_FORCE_SPLICE)) ++ return -errno; ++ ++ /* Maybe splice is not supported for this combination */ ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, ++ len); ++ } ++ if (res == 0) ++ break; ++ ++ copied += res; ++ if (!(src->flags & FUSE_BUF_FD_RETRY) && ++ !(dst->flags & FUSE_BUF_FD_RETRY)) { ++ break; ++ } ++ ++ len -= res; ++ } ++ ++ return copied; ++} ++#else ++static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) ++{ ++ (void) flags; ++ ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); ++} ++#endif ++ ++ ++static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) ++{ ++ int src_is_fd = src->flags & FUSE_BUF_IS_FD; ++ int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; ++ ++ if (!src_is_fd && !dst_is_fd) { ++ char *dstmem = (char *)dst->mem + dst_off; ++ char *srcmem = (char *)src->mem + src_off; ++ ++ if (dstmem != srcmem) { ++ if (dstmem + len <= srcmem || srcmem + len <= dstmem) ++ memcpy(dstmem, srcmem, len); ++ else ++ memmove(dstmem, srcmem, len); ++ } ++ ++ return len; ++ } else if (!src_is_fd) { ++ return fuse_buf_write(dst, dst_off, src, src_off, len); ++ } else if (!dst_is_fd) { ++ return fuse_buf_read(dst, dst_off, src, src_off, len); ++ } else if (flags & FUSE_BUF_NO_SPLICE) { ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); ++ } else { ++ return fuse_buf_splice(dst, dst_off, src, src_off, len, flags); ++ } ++} ++ ++static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv) ++{ ++ if (bufv->idx < bufv->count) ++ return &bufv->buf[bufv->idx]; ++ else ++ return NULL; ++} ++ ++static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) ++{ ++ const struct fuse_buf *buf = fuse_bufvec_current(bufv); ++ ++ bufv->off += len; ++ assert(bufv->off <= buf->size); ++ if (bufv->off == buf->size) { ++ assert(bufv->idx < bufv->count); ++ bufv->idx++; ++ if (bufv->idx == bufv->count) ++ return 0; ++ bufv->off = 0; ++ } ++ return 1; ++} ++ ++ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, ++ enum fuse_buf_copy_flags flags) ++{ ++ size_t copied = 0; ++ ++ if (dstv == srcv) ++ return fuse_buf_size(dstv); ++ ++ for (;;) { ++ const struct fuse_buf *src = fuse_bufvec_current(srcv); ++ const struct fuse_buf *dst = fuse_bufvec_current(dstv); ++ size_t src_len; ++ size_t dst_len; ++ size_t len; ++ ssize_t res; ++ ++ if (src == NULL || dst == NULL) ++ break; ++ ++ src_len = src->size - srcv->off; ++ dst_len = dst->size - dstv->off; ++ len = min_size(src_len, dst_len); ++ ++ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); ++ if (res < 0) { ++ if (!copied) ++ return res; ++ break; ++ } ++ copied += res; ++ ++ if (!fuse_bufvec_advance(srcv, res) || ++ !fuse_bufvec_advance(dstv, res)) ++ break; ++ ++ if (res < len) ++ break; ++ } ++ ++ return copied; ++} +diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c +new file mode 100644 +index 0000000..0d268ab +--- /dev/null ++++ b/tools/virtiofsd/fuse_log.c +@@ -0,0 +1,40 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2019 Red Hat, Inc. ++ ++ Logging API. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include "fuse_log.h" ++ ++#include ++#include ++ ++static void default_log_func( ++ __attribute__(( unused )) enum fuse_log_level level, ++ const char *fmt, va_list ap) ++{ ++ vfprintf(stderr, fmt, ap); ++} ++ ++static fuse_log_func_t log_func = default_log_func; ++ ++void fuse_set_log_func(fuse_log_func_t func) ++{ ++ if (!func) ++ func = default_log_func; ++ ++ log_func = func; ++} ++ ++void fuse_log(enum fuse_log_level level, const char *fmt, ...) ++{ ++ va_list ap; ++ ++ va_start(ap, fmt); ++ log_func(level, fmt, ap); ++ va_end(ap); ++} +diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c +new file mode 100644 +index 0000000..93066b9 +--- /dev/null ++++ b/tools/virtiofsd/fuse_opt.c +@@ -0,0 +1,423 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Implementation of option parsing routines (dealing with `struct ++ fuse_args`). ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_opt.h" ++#include "fuse_misc.h" ++ ++#include ++#include ++#include ++#include ++ ++struct fuse_opt_context { ++ void *data; ++ const struct fuse_opt *opt; ++ fuse_opt_proc_t proc; ++ int argctr; ++ int argc; ++ char **argv; ++ struct fuse_args outargs; ++ char *opts; ++ int nonopt; ++}; ++ ++void fuse_opt_free_args(struct fuse_args *args) ++{ ++ if (args) { ++ if (args->argv && args->allocated) { ++ int i; ++ for (i = 0; i < args->argc; i++) ++ free(args->argv[i]); ++ free(args->argv); ++ } ++ args->argc = 0; ++ args->argv = NULL; ++ args->allocated = 0; ++ } ++} ++ ++static int alloc_failed(void) ++{ ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++} ++ ++int fuse_opt_add_arg(struct fuse_args *args, const char *arg) ++{ ++ char **newargv; ++ char *newarg; ++ ++ assert(!args->argv || args->allocated); ++ ++ newarg = strdup(arg); ++ if (!newarg) ++ return alloc_failed(); ++ ++ newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *)); ++ if (!newargv) { ++ free(newarg); ++ return alloc_failed(); ++ } ++ ++ args->argv = newargv; ++ args->allocated = 1; ++ args->argv[args->argc++] = newarg; ++ args->argv[args->argc] = NULL; ++ return 0; ++} ++ ++static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos, ++ const char *arg) ++{ ++ assert(pos <= args->argc); ++ if (fuse_opt_add_arg(args, arg) == -1) ++ return -1; ++ ++ if (pos != args->argc - 1) { ++ char *newarg = args->argv[args->argc - 1]; ++ memmove(&args->argv[pos + 1], &args->argv[pos], ++ sizeof(char *) * (args->argc - pos - 1)); ++ args->argv[pos] = newarg; ++ } ++ return 0; ++} ++ ++int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg) ++{ ++ return fuse_opt_insert_arg_common(args, pos, arg); ++} ++ ++static int next_arg(struct fuse_opt_context *ctx, const char *opt) ++{ ++ if (ctx->argctr + 1 >= ctx->argc) { ++ fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt); ++ return -1; ++ } ++ ctx->argctr++; ++ return 0; ++} ++ ++static int add_arg(struct fuse_opt_context *ctx, const char *arg) ++{ ++ return fuse_opt_add_arg(&ctx->outargs, arg); ++} ++ ++static int add_opt_common(char **opts, const char *opt, int esc) ++{ ++ unsigned oldlen = *opts ? strlen(*opts) : 0; ++ char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); ++ ++ if (!d) ++ return alloc_failed(); ++ ++ *opts = d; ++ if (oldlen) { ++ d += oldlen; ++ *d++ = ','; ++ } ++ ++ for (; *opt; opt++) { ++ if (esc && (*opt == ',' || *opt == '\\')) ++ *d++ = '\\'; ++ *d++ = *opt; ++ } ++ *d = '\0'; ++ ++ return 0; ++} ++ ++int fuse_opt_add_opt(char **opts, const char *opt) ++{ ++ return add_opt_common(opts, opt, 0); ++} ++ ++int fuse_opt_add_opt_escaped(char **opts, const char *opt) ++{ ++ return add_opt_common(opts, opt, 1); ++} ++ ++static int add_opt(struct fuse_opt_context *ctx, const char *opt) ++{ ++ return add_opt_common(&ctx->opts, opt, 1); ++} ++ ++static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key, ++ int iso) ++{ ++ if (key == FUSE_OPT_KEY_DISCARD) ++ return 0; ++ ++ if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { ++ int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); ++ if (res == -1 || !res) ++ return res; ++ } ++ if (iso) ++ return add_opt(ctx, arg); ++ else ++ return add_arg(ctx, arg); ++} ++ ++static int match_template(const char *t, const char *arg, unsigned *sepp) ++{ ++ int arglen = strlen(arg); ++ const char *sep = strchr(t, '='); ++ sep = sep ? sep : strchr(t, ' '); ++ if (sep && (!sep[1] || sep[1] == '%')) { ++ int tlen = sep - t; ++ if (sep[0] == '=') ++ tlen ++; ++ if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { ++ *sepp = sep - t; ++ return 1; ++ } ++ } ++ if (strcmp(t, arg) == 0) { ++ *sepp = 0; ++ return 1; ++ } ++ return 0; ++} ++ ++static const struct fuse_opt *find_opt(const struct fuse_opt *opt, ++ const char *arg, unsigned *sepp) ++{ ++ for (; opt && opt->templ; opt++) ++ if (match_template(opt->templ, arg, sepp)) ++ return opt; ++ return NULL; ++} ++ ++int fuse_opt_match(const struct fuse_opt *opts, const char *opt) ++{ ++ unsigned dummy; ++ return find_opt(opts, opt, &dummy) ? 1 : 0; ++} ++ ++static int process_opt_param(void *var, const char *format, const char *param, ++ const char *arg) ++{ ++ assert(format[0] == '%'); ++ if (format[1] == 's') { ++ char **s = var; ++ char *copy = strdup(param); ++ if (!copy) ++ return alloc_failed(); ++ ++ free(*s); ++ *s = copy; ++ } else { ++ if (sscanf(param, format, var) != 1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", arg); ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static int process_opt(struct fuse_opt_context *ctx, ++ const struct fuse_opt *opt, unsigned sep, ++ const char *arg, int iso) ++{ ++ if (opt->offset == -1U) { ++ if (call_proc(ctx, arg, opt->value, iso) == -1) ++ return -1; ++ } else { ++ void *var = (char *)ctx->data + opt->offset; ++ if (sep && opt->templ[sep + 1]) { ++ const char *param = arg + sep; ++ if (opt->templ[sep] == '=') ++ param ++; ++ if (process_opt_param(var, opt->templ + sep + 1, ++ param, arg) == -1) ++ return -1; ++ } else ++ *(int *)var = opt->value; ++ } ++ return 0; ++} ++ ++static int process_opt_sep_arg(struct fuse_opt_context *ctx, ++ const struct fuse_opt *opt, unsigned sep, ++ const char *arg, int iso) ++{ ++ int res; ++ char *newarg; ++ char *param; ++ ++ if (next_arg(ctx, arg) == -1) ++ return -1; ++ ++ param = ctx->argv[ctx->argctr]; ++ newarg = malloc(sep + strlen(param) + 1); ++ if (!newarg) ++ return alloc_failed(); ++ ++ memcpy(newarg, arg, sep); ++ strcpy(newarg + sep, param); ++ res = process_opt(ctx, opt, sep, newarg, iso); ++ free(newarg); ++ ++ return res; ++} ++ ++static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso) ++{ ++ unsigned sep; ++ const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); ++ if (opt) { ++ for (; opt; opt = find_opt(opt + 1, arg, &sep)) { ++ int res; ++ if (sep && opt->templ[sep] == ' ' && !arg[sep]) ++ res = process_opt_sep_arg(ctx, opt, sep, arg, ++ iso); ++ else ++ res = process_opt(ctx, opt, sep, arg, iso); ++ if (res == -1) ++ return -1; ++ } ++ return 0; ++ } else ++ return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); ++} ++ ++static int process_real_option_group(struct fuse_opt_context *ctx, char *opts) ++{ ++ char *s = opts; ++ char *d = s; ++ int end = 0; ++ ++ while (!end) { ++ if (*s == '\0') ++ end = 1; ++ if (*s == ',' || end) { ++ int res; ++ ++ *d = '\0'; ++ res = process_gopt(ctx, opts, 1); ++ if (res == -1) ++ return -1; ++ d = opts; ++ } else { ++ if (s[0] == '\\' && s[1] != '\0') { ++ s++; ++ if (s[0] >= '0' && s[0] <= '3' && ++ s[1] >= '0' && s[1] <= '7' && ++ s[2] >= '0' && s[2] <= '7') { ++ *d++ = (s[0] - '0') * 0100 + ++ (s[1] - '0') * 0010 + ++ (s[2] - '0'); ++ s += 2; ++ } else { ++ *d++ = *s; ++ } ++ } else { ++ *d++ = *s; ++ } ++ } ++ s++; ++ } ++ ++ return 0; ++} ++ ++static int process_option_group(struct fuse_opt_context *ctx, const char *opts) ++{ ++ int res; ++ char *copy = strdup(opts); ++ ++ if (!copy) { ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++ } ++ res = process_real_option_group(ctx, copy); ++ free(copy); ++ return res; ++} ++ ++static int process_one(struct fuse_opt_context *ctx, const char *arg) ++{ ++ if (ctx->nonopt || arg[0] != '-') ++ return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); ++ else if (arg[1] == 'o') { ++ if (arg[2]) ++ return process_option_group(ctx, arg + 2); ++ else { ++ if (next_arg(ctx, arg) == -1) ++ return -1; ++ ++ return process_option_group(ctx, ++ ctx->argv[ctx->argctr]); ++ } ++ } else if (arg[1] == '-' && !arg[2]) { ++ if (add_arg(ctx, arg) == -1) ++ return -1; ++ ctx->nonopt = ctx->outargs.argc; ++ return 0; ++ } else ++ return process_gopt(ctx, arg, 0); ++} ++ ++static int opt_parse(struct fuse_opt_context *ctx) ++{ ++ if (ctx->argc) { ++ if (add_arg(ctx, ctx->argv[0]) == -1) ++ return -1; ++ } ++ ++ for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) ++ if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) ++ return -1; ++ ++ if (ctx->opts) { ++ if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || ++ fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) ++ return -1; ++ } ++ ++ /* If option separator ("--") is the last argument, remove it */ ++ if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && ++ strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { ++ free(ctx->outargs.argv[ctx->outargs.argc - 1]); ++ ctx->outargs.argv[--ctx->outargs.argc] = NULL; ++ } ++ ++ return 0; ++} ++ ++int fuse_opt_parse(struct fuse_args *args, void *data, ++ const struct fuse_opt opts[], fuse_opt_proc_t proc) ++{ ++ int res; ++ struct fuse_opt_context ctx = { ++ .data = data, ++ .opt = opts, ++ .proc = proc, ++ }; ++ ++ if (!args || !args->argv || !args->argc) ++ return 0; ++ ++ ctx.argc = args->argc; ++ ctx.argv = args->argv; ++ ++ res = opt_parse(&ctx); ++ if (res != -1) { ++ struct fuse_args tmp = *args; ++ *args = ctx.outargs; ++ ctx.outargs = tmp; ++ } ++ free(ctx.opts); ++ fuse_opt_free_args(&ctx.outargs); ++ return res; ++} +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +new file mode 100644 +index 0000000..4271947 +--- /dev/null ++++ b/tools/virtiofsd/fuse_signals.c +@@ -0,0 +1,91 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Utility functions for setting signal handlers. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include "config.h" ++#include "fuse_lowlevel.h" ++#include "fuse_i.h" ++ ++#include ++#include ++#include ++#include ++ ++static struct fuse_session *fuse_instance; ++ ++static void exit_handler(int sig) ++{ ++ if (fuse_instance) { ++ fuse_session_exit(fuse_instance); ++ if(sig <= 0) { ++ fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); ++ abort(); ++ } ++ fuse_instance->error = sig; ++ } ++} ++ ++static void do_nothing(int sig) ++{ ++ (void) sig; ++} ++ ++static int set_one_signal_handler(int sig, void (*handler)(int), int remove) ++{ ++ struct sigaction sa; ++ struct sigaction old_sa; ++ ++ memset(&sa, 0, sizeof(struct sigaction)); ++ sa.sa_handler = remove ? SIG_DFL : handler; ++ sigemptyset(&(sa.sa_mask)); ++ sa.sa_flags = 0; ++ ++ if (sigaction(sig, NULL, &old_sa) == -1) { ++ perror("fuse: cannot get old signal handler"); ++ return -1; ++ } ++ ++ if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && ++ sigaction(sig, &sa, NULL) == -1) { ++ perror("fuse: cannot set signal handler"); ++ return -1; ++ } ++ return 0; ++} ++ ++int fuse_set_signal_handlers(struct fuse_session *se) ++{ ++ /* If we used SIG_IGN instead of the do_nothing function, ++ then we would be unable to tell if we set SIG_IGN (and ++ thus should reset to SIG_DFL in fuse_remove_signal_handlers) ++ or if it was already set to SIG_IGN (and should be left ++ untouched. */ ++ if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) ++ return -1; ++ ++ fuse_instance = se; ++ return 0; ++} ++ ++void fuse_remove_signal_handlers(struct fuse_session *se) ++{ ++ if (fuse_instance != se) ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: fuse_remove_signal_handlers: unknown session\n"); ++ else ++ fuse_instance = NULL; ++ ++ set_one_signal_handler(SIGHUP, exit_handler, 1); ++ set_one_signal_handler(SIGINT, exit_handler, 1); ++ set_one_signal_handler(SIGTERM, exit_handler, 1); ++ set_one_signal_handler(SIGPIPE, do_nothing, 1); ++} +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +new file mode 100644 +index 0000000..64ff7ad +--- /dev/null ++++ b/tools/virtiofsd/helper.c +@@ -0,0 +1,440 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Helper functions to create (simple) standalone programs. With the ++ aid of these functions it should be possible to create full FUSE ++ file system by implementing nothing but the request handlers. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_misc.h" ++#include "fuse_opt.h" ++#include "fuse_lowlevel.h" ++#include "mount_util.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define FUSE_HELPER_OPT(t, p) \ ++ { t, offsetof(struct fuse_cmdline_opts, p), 1 } ++ ++static const struct fuse_opt fuse_helper_opts[] = { ++ FUSE_HELPER_OPT("-h", show_help), ++ FUSE_HELPER_OPT("--help", show_help), ++ FUSE_HELPER_OPT("-V", show_version), ++ FUSE_HELPER_OPT("--version", show_version), ++ FUSE_HELPER_OPT("-d", debug), ++ FUSE_HELPER_OPT("debug", debug), ++ FUSE_HELPER_OPT("-d", foreground), ++ FUSE_HELPER_OPT("debug", foreground), ++ FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), ++ FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("-f", foreground), ++ FUSE_HELPER_OPT("-s", singlethread), ++ FUSE_HELPER_OPT("fsname=", nodefault_subtype), ++ FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), ++#ifndef __FreeBSD__ ++ FUSE_HELPER_OPT("subtype=", nodefault_subtype), ++ FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), ++#endif ++ FUSE_HELPER_OPT("clone_fd", clone_fd), ++ FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_OPT_END ++}; ++ ++struct fuse_conn_info_opts { ++ int atomic_o_trunc; ++ int no_remote_posix_lock; ++ int no_remote_flock; ++ int splice_write; ++ int splice_move; ++ int splice_read; ++ int no_splice_write; ++ int no_splice_move; ++ int no_splice_read; ++ int auto_inval_data; ++ int no_auto_inval_data; ++ int no_readdirplus; ++ int no_readdirplus_auto; ++ int async_dio; ++ int no_async_dio; ++ int writeback_cache; ++ int no_writeback_cache; ++ int async_read; ++ int sync_read; ++ unsigned max_write; ++ unsigned max_readahead; ++ unsigned max_background; ++ unsigned congestion_threshold; ++ unsigned time_gran; ++ int set_max_write; ++ int set_max_readahead; ++ int set_max_background; ++ int set_congestion_threshold; ++ int set_time_gran; ++}; ++ ++#define CONN_OPTION(t, p, v) \ ++ { t, offsetof(struct fuse_conn_info_opts, p), v } ++static const struct fuse_opt conn_info_opt_spec[] = { ++ CONN_OPTION("max_write=%u", max_write, 0), ++ CONN_OPTION("max_write=", set_max_write, 1), ++ CONN_OPTION("max_readahead=%u", max_readahead, 0), ++ CONN_OPTION("max_readahead=", set_max_readahead, 1), ++ CONN_OPTION("max_background=%u", max_background, 0), ++ CONN_OPTION("max_background=", set_max_background, 1), ++ CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), ++ CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), ++ CONN_OPTION("sync_read", sync_read, 1), ++ CONN_OPTION("async_read", async_read, 1), ++ CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), ++ CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("no_remote_lock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_flock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("splice_write", splice_write, 1), ++ CONN_OPTION("no_splice_write", no_splice_write, 1), ++ CONN_OPTION("splice_move", splice_move, 1), ++ CONN_OPTION("no_splice_move", no_splice_move, 1), ++ CONN_OPTION("splice_read", splice_read, 1), ++ CONN_OPTION("no_splice_read", no_splice_read, 1), ++ CONN_OPTION("auto_inval_data", auto_inval_data, 1), ++ CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), ++ CONN_OPTION("readdirplus=no", no_readdirplus, 1), ++ CONN_OPTION("readdirplus=yes", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), ++ CONN_OPTION("readdirplus=auto", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), ++ CONN_OPTION("async_dio", async_dio, 1), ++ CONN_OPTION("no_async_dio", no_async_dio, 1), ++ CONN_OPTION("writeback_cache", writeback_cache, 1), ++ CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), ++ CONN_OPTION("time_gran=%u", time_gran, 0), ++ CONN_OPTION("time_gran=", set_time_gran, 1), ++ FUSE_OPT_END ++}; ++ ++ ++void fuse_cmdline_help(void) ++{ ++ printf(" -h --help print help\n" ++ " -V --version print version\n" ++ " -d -o debug enable debug output (implies -f)\n" ++ " -f foreground operation\n" ++ " -s disable multi-threaded operation\n" ++ " -o clone_fd use separate fuse device fd for each thread\n" ++ " (may improve performance)\n" ++ " -o max_idle_threads the maximum number of idle worker threads\n" ++ " allowed (default: 10)\n"); ++} ++ ++static int fuse_helper_opt_proc(void *data, const char *arg, int key, ++ struct fuse_args *outargs) ++{ ++ (void) outargs; ++ struct fuse_cmdline_opts *opts = data; ++ ++ switch (key) { ++ case FUSE_OPT_KEY_NONOPT: ++ if (!opts->mountpoint) { ++ if (fuse_mnt_parse_fuse_fd(arg) != -1) { ++ return fuse_opt_add_opt(&opts->mountpoint, arg); ++ } ++ ++ char mountpoint[PATH_MAX] = ""; ++ if (realpath(arg, mountpoint) == NULL) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: bad mount point `%s': %s\n", ++ arg, strerror(errno)); ++ return -1; ++ } ++ return fuse_opt_add_opt(&opts->mountpoint, mountpoint); ++ } else { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); ++ return -1; ++ } ++ ++ default: ++ /* Pass through unknown options */ ++ return 1; ++ } ++} ++ ++/* Under FreeBSD, there is no subtype option so this ++ function actually sets the fsname */ ++static int add_default_subtype(const char *progname, struct fuse_args *args) ++{ ++ int res; ++ char *subtype_opt; ++ ++ const char *basename = strrchr(progname, '/'); ++ if (basename == NULL) ++ basename = progname; ++ else if (basename[1] != '\0') ++ basename++; ++ ++ subtype_opt = (char *) malloc(strlen(basename) + 64); ++ if (subtype_opt == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++ } ++#ifdef __FreeBSD__ ++ sprintf(subtype_opt, "-ofsname=%s", basename); ++#else ++ sprintf(subtype_opt, "-osubtype=%s", basename); ++#endif ++ res = fuse_opt_add_arg(args, subtype_opt); ++ free(subtype_opt); ++ return res; ++} ++ ++int fuse_parse_cmdline(struct fuse_args *args, ++ struct fuse_cmdline_opts *opts) ++{ ++ memset(opts, 0, sizeof(struct fuse_cmdline_opts)); ++ ++ opts->max_idle_threads = 10; ++ ++ if (fuse_opt_parse(args, opts, fuse_helper_opts, ++ fuse_helper_opt_proc) == -1) ++ return -1; ++ ++ /* *Linux*: if neither -o subtype nor -o fsname are specified, ++ set subtype to program's basename. ++ *FreeBSD*: if fsname is not specified, set to program's ++ basename. */ ++ if (!opts->nodefault_subtype) ++ if (add_default_subtype(args->argv[0], args) == -1) ++ return -1; ++ ++ return 0; ++} ++ ++ ++int fuse_daemonize(int foreground) ++{ ++ if (!foreground) { ++ int nullfd; ++ int waiter[2]; ++ char completed; ++ ++ if (pipe(waiter)) { ++ perror("fuse_daemonize: pipe"); ++ return -1; ++ } ++ ++ /* ++ * demonize current process by forking it and killing the ++ * parent. This makes current process as a child of 'init'. ++ */ ++ switch(fork()) { ++ case -1: ++ perror("fuse_daemonize: fork"); ++ return -1; ++ case 0: ++ break; ++ default: ++ (void) read(waiter[0], &completed, sizeof(completed)); ++ _exit(0); ++ } ++ ++ if (setsid() == -1) { ++ perror("fuse_daemonize: setsid"); ++ return -1; ++ } ++ ++ (void) chdir("/"); ++ ++ nullfd = open("/dev/null", O_RDWR, 0); ++ if (nullfd != -1) { ++ (void) dup2(nullfd, 0); ++ (void) dup2(nullfd, 1); ++ (void) dup2(nullfd, 2); ++ if (nullfd > 2) ++ close(nullfd); ++ } ++ ++ /* Propagate completion of daemon initialization */ ++ completed = 1; ++ (void) write(waiter[1], &completed, sizeof(completed)); ++ close(waiter[0]); ++ close(waiter[1]); ++ } else { ++ (void) chdir("/"); ++ } ++ return 0; ++} ++ ++int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, ++ size_t op_size, void *user_data) ++{ ++ struct fuse_args args = FUSE_ARGS_INIT(argc, argv); ++ struct fuse *fuse; ++ struct fuse_cmdline_opts opts; ++ int res; ++ ++ if (fuse_parse_cmdline(&args, &opts) != 0) ++ return 1; ++ ++ if (opts.show_version) { ++ printf("FUSE library version %s\n", PACKAGE_VERSION); ++ fuse_lowlevel_version(); ++ res = 0; ++ goto out1; ++ } ++ ++ if (opts.show_help) { ++ if(args.argv[0][0] != '\0') ++ printf("usage: %s [options] \n\n", ++ args.argv[0]); ++ printf("FUSE options:\n"); ++ fuse_cmdline_help(); ++ fuse_lib_help(&args); ++ res = 0; ++ goto out1; ++ } ++ ++ if (!opts.show_help && ++ !opts.mountpoint) { ++ fuse_log(FUSE_LOG_ERR, "error: no mountpoint specified\n"); ++ res = 2; ++ goto out1; ++ } ++ ++ ++ fuse = fuse_new_31(&args, op, op_size, user_data); ++ if (fuse == NULL) { ++ res = 3; ++ goto out1; ++ } ++ ++ if (fuse_mount(fuse,opts.mountpoint) != 0) { ++ res = 4; ++ goto out2; ++ } ++ ++ if (fuse_daemonize(opts.foreground) != 0) { ++ res = 5; ++ goto out3; ++ } ++ ++ struct fuse_session *se = fuse_get_session(fuse); ++ if (fuse_set_signal_handlers(se) != 0) { ++ res = 6; ++ goto out3; ++ } ++ ++ if (opts.singlethread) ++ res = fuse_loop(fuse); ++ else { ++ struct fuse_loop_config loop_config; ++ loop_config.clone_fd = opts.clone_fd; ++ loop_config.max_idle_threads = opts.max_idle_threads; ++ res = fuse_loop_mt_32(fuse, &loop_config); ++ } ++ if (res) ++ res = 7; ++ ++ fuse_remove_signal_handlers(se); ++out3: ++ fuse_unmount(fuse); ++out2: ++ fuse_destroy(fuse); ++out1: ++ free(opts.mountpoint); ++ fuse_opt_free_args(&args); ++ return res; ++} ++ ++ ++void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, ++ struct fuse_conn_info *conn) ++{ ++ if(opts->set_max_write) ++ conn->max_write = opts->max_write; ++ if(opts->set_max_background) ++ conn->max_background = opts->max_background; ++ if(opts->set_congestion_threshold) ++ conn->congestion_threshold = opts->congestion_threshold; ++ if(opts->set_time_gran) ++ conn->time_gran = opts->time_gran; ++ if(opts->set_max_readahead) ++ conn->max_readahead = opts->max_readahead; ++ ++#define LL_ENABLE(cond,cap) \ ++ if (cond) conn->want |= (cap) ++#define LL_DISABLE(cond,cap) \ ++ if (cond) conn->want &= ~(cap) ++ ++ LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); ++ LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); ++ ++ LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); ++ LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); ++ ++ LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); ++ LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); ++ ++ LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ ++ LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO); ++ ++ LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); ++ LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); ++ ++ LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ ++ LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); ++ LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); ++ ++ LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); ++ LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); ++} ++ ++struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args) ++{ ++ struct fuse_conn_info_opts *opts; ++ ++ opts = calloc(1, sizeof(struct fuse_conn_info_opts)); ++ if(opts == NULL) { ++ fuse_log(FUSE_LOG_ERR, "calloc failed\n"); ++ return NULL; ++ } ++ if(fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { ++ free(opts); ++ return NULL; ++ } ++ return opts; ++} ++ ++int fuse_open_channel(const char *mountpoint, const char* options) ++{ ++ struct mount_opts *opts = NULL; ++ int fd = -1; ++ const char *argv[] = { "", "-o", options }; ++ int argc = sizeof(argv) / sizeof(argv[0]); ++ struct fuse_args args = FUSE_ARGS_INIT(argc, (char**) argv); ++ ++ opts = parse_mount_opts(&args); ++ if (opts == NULL) ++ return -1; ++ ++ fd = fuse_kern_mount(mountpoint, opts); ++ destroy_mount_opts(opts); ++ ++ return fd; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-fuse_lowlevel.c.patch b/kvm-virtiofsd-Add-fuse_lowlevel.c.patch new file mode 100755 index 0000000000000000000000000000000000000000..1318fef39327e3581ae37216373e5754b18d4b67 --- /dev/null +++ b/kvm-virtiofsd-Add-fuse_lowlevel.c.patch @@ -0,0 +1,3172 @@ +From f6c6830f772e8060255323d2a458cd0e774d9654 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:42 +0100 +Subject: [PATCH 011/116] virtiofsd: Add fuse_lowlevel.c +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-8-dgilbert@redhat.com> +Patchwork-id: 93456 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 007/112] virtiofsd: Add fuse_lowlevel.c +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +fuse_lowlevel is one of the largest files from the library +and does most of the work. Add it separately to keep the diff +sizes small. +Again this is from upstream fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 2de121f01e37e2fe98a4362f4abf7c0848697f76) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 3129 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 3129 insertions(+) + create mode 100644 tools/virtiofsd/fuse_lowlevel.c + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +new file mode 100644 +index 0000000..f2d7038 +--- /dev/null ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -0,0 +1,3129 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ Implementation of (most of) the low-level FUSE API. The session loop ++ functions are implemented in separate files. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#define _GNU_SOURCE ++ ++#include "config.h" ++#include "fuse_i.h" ++#include "fuse_kernel.h" ++#include "fuse_opt.h" ++#include "fuse_misc.h" ++#include "mount_util.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifndef F_LINUX_SPECIFIC_BASE ++#define F_LINUX_SPECIFIC_BASE 1024 ++#endif ++#ifndef F_SETPIPE_SZ ++#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) ++#endif ++ ++ ++#define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) ++#define OFFSET_MAX 0x7fffffffffffffffLL ++ ++#define container_of(ptr, type, member) ({ \ ++ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ ++ (type *)( (char *)__mptr - offsetof(type,member) );}) ++ ++struct fuse_pollhandle { ++ uint64_t kh; ++ struct fuse_session *se; ++}; ++ ++static size_t pagesize; ++ ++static __attribute__((constructor)) void fuse_ll_init_pagesize(void) ++{ ++ pagesize = getpagesize(); ++} ++ ++static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) ++{ ++ attr->ino = stbuf->st_ino; ++ attr->mode = stbuf->st_mode; ++ attr->nlink = stbuf->st_nlink; ++ attr->uid = stbuf->st_uid; ++ attr->gid = stbuf->st_gid; ++ attr->rdev = stbuf->st_rdev; ++ attr->size = stbuf->st_size; ++ attr->blksize = stbuf->st_blksize; ++ attr->blocks = stbuf->st_blocks; ++ attr->atime = stbuf->st_atime; ++ attr->mtime = stbuf->st_mtime; ++ attr->ctime = stbuf->st_ctime; ++ attr->atimensec = ST_ATIM_NSEC(stbuf); ++ attr->mtimensec = ST_MTIM_NSEC(stbuf); ++ attr->ctimensec = ST_CTIM_NSEC(stbuf); ++} ++ ++static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) ++{ ++ stbuf->st_mode = attr->mode; ++ stbuf->st_uid = attr->uid; ++ stbuf->st_gid = attr->gid; ++ stbuf->st_size = attr->size; ++ stbuf->st_atime = attr->atime; ++ stbuf->st_mtime = attr->mtime; ++ stbuf->st_ctime = attr->ctime; ++ ST_ATIM_NSEC_SET(stbuf, attr->atimensec); ++ ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); ++ ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); ++} ++ ++static size_t iov_length(const struct iovec *iov, size_t count) ++{ ++ size_t seg; ++ size_t ret = 0; ++ ++ for (seg = 0; seg < count; seg++) ++ ret += iov[seg].iov_len; ++ return ret; ++} ++ ++static void list_init_req(struct fuse_req *req) ++{ ++ req->next = req; ++ req->prev = req; ++} ++ ++static void list_del_req(struct fuse_req *req) ++{ ++ struct fuse_req *prev = req->prev; ++ struct fuse_req *next = req->next; ++ prev->next = next; ++ next->prev = prev; ++} ++ ++static void list_add_req(struct fuse_req *req, struct fuse_req *next) ++{ ++ struct fuse_req *prev = next->prev; ++ req->next = next; ++ req->prev = prev; ++ prev->next = req; ++ next->prev = req; ++} ++ ++static void destroy_req(fuse_req_t req) ++{ ++ pthread_mutex_destroy(&req->lock); ++ free(req); ++} ++ ++void fuse_free_req(fuse_req_t req) ++{ ++ int ctr; ++ struct fuse_session *se = req->se; ++ ++ pthread_mutex_lock(&se->lock); ++ req->u.ni.func = NULL; ++ req->u.ni.data = NULL; ++ list_del_req(req); ++ ctr = --req->ctr; ++ fuse_chan_put(req->ch); ++ req->ch = NULL; ++ pthread_mutex_unlock(&se->lock); ++ if (!ctr) ++ destroy_req(req); ++} ++ ++static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se) ++{ ++ struct fuse_req *req; ++ ++ req = (struct fuse_req *) calloc(1, sizeof(struct fuse_req)); ++ if (req == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); ++ } else { ++ req->se = se; ++ req->ctr = 1; ++ list_init_req(req); ++ fuse_mutex_init(&req->lock); ++ } ++ ++ return req; ++} ++ ++/* Send data. If *ch* is NULL, send via session master fd */ ++static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count) ++{ ++ struct fuse_out_header *out = iov[0].iov_base; ++ ++ out->len = iov_length(iov, count); ++ if (se->debug) { ++ if (out->unique == 0) { ++ fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", ++ out->error, out->len); ++ } else if (out->error) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, error: %i (%s), outsize: %i\n", ++ (unsigned long long) out->unique, out->error, ++ strerror(-out->error), out->len); ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, success, outsize: %i\n", ++ (unsigned long long) out->unique, out->len); ++ } ++ } ++ ++ ssize_t res = writev(ch ? ch->fd : se->fd, ++ iov, count); ++ int err = errno; ++ ++ if (res == -1) { ++ assert(se != NULL); ++ ++ /* ENOENT means the operation was interrupted */ ++ if (!fuse_session_exited(se) && err != ENOENT) ++ perror("fuse: writing device"); ++ return -err; ++ } ++ ++ return 0; ++} ++ ++ ++int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, ++ int count) ++{ ++ struct fuse_out_header out; ++ ++ if (error <= -1000 || error > 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); ++ error = -ERANGE; ++ } ++ ++ out.unique = req->unique; ++ out.error = error; ++ ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); ++ ++ return fuse_send_msg(req->se, req->ch, iov, count); ++} ++ ++static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov, ++ int count) ++{ ++ int res; ++ ++ res = fuse_send_reply_iov_nofree(req, error, iov, count); ++ fuse_free_req(req); ++ return res; ++} ++ ++static int send_reply(fuse_req_t req, int error, const void *arg, ++ size_t argsize) ++{ ++ struct iovec iov[2]; ++ int count = 1; ++ if (argsize) { ++ iov[1].iov_base = (void *) arg; ++ iov[1].iov_len = argsize; ++ count++; ++ } ++ return send_reply_iov(req, error, iov, count); ++} ++ ++int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count) ++{ ++ int res; ++ struct iovec *padded_iov; ++ ++ padded_iov = malloc((count + 1) * sizeof(struct iovec)); ++ if (padded_iov == NULL) ++ return fuse_reply_err(req, ENOMEM); ++ ++ memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); ++ count++; ++ ++ res = send_reply_iov(req, 0, padded_iov, count); ++ free(padded_iov); ++ ++ return res; ++} ++ ++ ++/* `buf` is allowed to be empty so that the proper size may be ++ allocated by the caller */ ++size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, const struct stat *stbuf, off_t off) ++{ ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ struct fuse_dirent *dirent; ++ ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ ++ if ((buf == NULL) || (entlen_padded > bufsize)) ++ return entlen_padded; ++ ++ dirent = (struct fuse_dirent*) buf; ++ dirent->ino = stbuf->st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (stbuf->st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); ++ ++ return entlen_padded; ++} ++ ++static void convert_statfs(const struct statvfs *stbuf, ++ struct fuse_kstatfs *kstatfs) ++{ ++ kstatfs->bsize = stbuf->f_bsize; ++ kstatfs->frsize = stbuf->f_frsize; ++ kstatfs->blocks = stbuf->f_blocks; ++ kstatfs->bfree = stbuf->f_bfree; ++ kstatfs->bavail = stbuf->f_bavail; ++ kstatfs->files = stbuf->f_files; ++ kstatfs->ffree = stbuf->f_ffree; ++ kstatfs->namelen = stbuf->f_namemax; ++} ++ ++static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) ++{ ++ return send_reply(req, 0, arg, argsize); ++} ++ ++int fuse_reply_err(fuse_req_t req, int err) ++{ ++ return send_reply(req, -err, NULL, 0); ++} ++ ++void fuse_reply_none(fuse_req_t req) ++{ ++ fuse_free_req(req); ++} ++ ++static unsigned long calc_timeout_sec(double t) ++{ ++ if (t > (double) ULONG_MAX) ++ return ULONG_MAX; ++ else if (t < 0.0) ++ return 0; ++ else ++ return (unsigned long) t; ++} ++ ++static unsigned int calc_timeout_nsec(double t) ++{ ++ double f = t - (double) calc_timeout_sec(t); ++ if (f < 0.0) ++ return 0; ++ else if (f >= 0.999999999) ++ return 999999999; ++ else ++ return (unsigned int) (f * 1.0e9); ++} ++ ++static void fill_entry(struct fuse_entry_out *arg, ++ const struct fuse_entry_param *e) ++{ ++ arg->nodeid = e->ino; ++ arg->generation = e->generation; ++ arg->entry_valid = calc_timeout_sec(e->entry_timeout); ++ arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout); ++ arg->attr_valid = calc_timeout_sec(e->attr_timeout); ++ arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); ++ convert_stat(&e->attr, &arg->attr); ++} ++ ++/* `buf` is allowed to be empty so that the proper size may be ++ allocated by the caller */ ++size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, ++ const struct fuse_entry_param *e, off_t off) ++{ ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ if ((buf == NULL) || (entlen_padded > bufsize)) ++ return entlen_padded; ++ ++ struct fuse_direntplus *dp = (struct fuse_direntplus *) buf; ++ memset(&dp->entry_out, 0, sizeof(dp->entry_out)); ++ fill_entry(&dp->entry_out, e); ++ ++ struct fuse_dirent *dirent = &dp->dirent; ++ dirent->ino = e->attr.st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (e->attr.st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); ++ ++ return entlen_padded; ++} ++ ++static void fill_open(struct fuse_open_out *arg, ++ const struct fuse_file_info *f) ++{ ++ arg->fh = f->fh; ++ if (f->direct_io) ++ arg->open_flags |= FOPEN_DIRECT_IO; ++ if (f->keep_cache) ++ arg->open_flags |= FOPEN_KEEP_CACHE; ++ if (f->cache_readdir) ++ arg->open_flags |= FOPEN_CACHE_DIR; ++ if (f->nonseekable) ++ arg->open_flags |= FOPEN_NONSEEKABLE; ++} ++ ++int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) ++{ ++ struct fuse_entry_out arg; ++ size_t size = req->se->conn.proto_minor < 9 ? ++ FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(arg); ++ ++ /* before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant ++ negative entry */ ++ if (!e->ino && req->se->conn.proto_minor < 4) ++ return fuse_reply_err(req, ENOENT); ++ ++ memset(&arg, 0, sizeof(arg)); ++ fill_entry(&arg, e); ++ return send_reply_ok(req, &arg, size); ++} ++ ++int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, ++ const struct fuse_file_info *f) ++{ ++ char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; ++ size_t entrysize = req->se->conn.proto_minor < 9 ? ++ FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(struct fuse_entry_out); ++ struct fuse_entry_out *earg = (struct fuse_entry_out *) buf; ++ struct fuse_open_out *oarg = (struct fuse_open_out *) (buf + entrysize); ++ ++ memset(buf, 0, sizeof(buf)); ++ fill_entry(earg, e); ++ fill_open(oarg, f); ++ return send_reply_ok(req, buf, ++ entrysize + sizeof(struct fuse_open_out)); ++} ++ ++int fuse_reply_attr(fuse_req_t req, const struct stat *attr, ++ double attr_timeout) ++{ ++ struct fuse_attr_out arg; ++ size_t size = req->se->conn.proto_minor < 9 ? ++ FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.attr_valid = calc_timeout_sec(attr_timeout); ++ arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); ++ convert_stat(attr, &arg.attr); ++ ++ return send_reply_ok(req, &arg, size); ++} ++ ++int fuse_reply_readlink(fuse_req_t req, const char *linkname) ++{ ++ return send_reply_ok(req, linkname, strlen(linkname)); ++} ++ ++int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f) ++{ ++ struct fuse_open_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ fill_open(&arg, f); ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_write(fuse_req_t req, size_t count) ++{ ++ struct fuse_write_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size) ++{ ++ return send_reply_ok(req, buf, size); ++} ++ ++static int fuse_send_data_iov_fallback(struct fuse_session *se, ++ struct fuse_chan *ch, ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, ++ size_t len) ++{ ++ struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); ++ void *mbuf; ++ int res; ++ ++ /* Optimize common case */ ++ if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && ++ !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { ++ /* FIXME: also avoid memory copy if there are multiple buffers ++ but none of them contain an fd */ ++ ++ iov[iov_count].iov_base = buf->buf[0].mem; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ return fuse_send_msg(se, ch, iov, iov_count); ++ } ++ ++ res = posix_memalign(&mbuf, pagesize, len); ++ if (res != 0) ++ return res; ++ ++ mem_buf.buf[0].mem = mbuf; ++ res = fuse_buf_copy(&mem_buf, buf, 0); ++ if (res < 0) { ++ free(mbuf); ++ return -res; ++ } ++ len = res; ++ ++ iov[iov_count].iov_base = mbuf; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ res = fuse_send_msg(se, ch, iov, iov_count); ++ free(mbuf); ++ ++ return res; ++} ++ ++struct fuse_ll_pipe { ++ size_t size; ++ int can_grow; ++ int pipe[2]; ++}; ++ ++static void fuse_ll_pipe_free(struct fuse_ll_pipe *llp) ++{ ++ close(llp->pipe[0]); ++ close(llp->pipe[1]); ++ free(llp); ++} ++ ++#ifdef HAVE_SPLICE ++#if !defined(HAVE_PIPE2) || !defined(O_CLOEXEC) ++static int fuse_pipe(int fds[2]) ++{ ++ int rv = pipe(fds); ++ ++ if (rv == -1) ++ return rv; ++ ++ if (fcntl(fds[0], F_SETFL, O_NONBLOCK) == -1 || ++ fcntl(fds[1], F_SETFL, O_NONBLOCK) == -1 || ++ fcntl(fds[0], F_SETFD, FD_CLOEXEC) == -1 || ++ fcntl(fds[1], F_SETFD, FD_CLOEXEC) == -1) { ++ close(fds[0]); ++ close(fds[1]); ++ rv = -1; ++ } ++ return rv; ++} ++#else ++static int fuse_pipe(int fds[2]) ++{ ++ return pipe2(fds, O_CLOEXEC | O_NONBLOCK); ++} ++#endif ++ ++static struct fuse_ll_pipe *fuse_ll_get_pipe(struct fuse_session *se) ++{ ++ struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); ++ if (llp == NULL) { ++ int res; ++ ++ llp = malloc(sizeof(struct fuse_ll_pipe)); ++ if (llp == NULL) ++ return NULL; ++ ++ res = fuse_pipe(llp->pipe); ++ if (res == -1) { ++ free(llp); ++ return NULL; ++ } ++ ++ /* ++ *the default size is 16 pages on linux ++ */ ++ llp->size = pagesize * 16; ++ llp->can_grow = 1; ++ ++ pthread_setspecific(se->pipe_key, llp); ++ } ++ ++ return llp; ++} ++#endif ++ ++static void fuse_ll_clear_pipe(struct fuse_session *se) ++{ ++ struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); ++ if (llp) { ++ pthread_setspecific(se->pipe_key, NULL); ++ fuse_ll_pipe_free(llp); ++ } ++} ++ ++#if defined(HAVE_SPLICE) && defined(HAVE_VMSPLICE) ++static int read_back(int fd, char *buf, size_t len) ++{ ++ int res; ++ ++ res = read(fd, buf, len); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: internal error: failed to read back from pipe: %s\n", strerror(errno)); ++ return -EIO; ++ } ++ if (res != len) { ++ fuse_log(FUSE_LOG_ERR, "fuse: internal error: short read back from pipe: %i from %zi\n", res, len); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int grow_pipe_to_max(int pipefd) ++{ ++ int max; ++ int res; ++ int maxfd; ++ char buf[32]; ++ ++ maxfd = open("/proc/sys/fs/pipe-max-size", O_RDONLY); ++ if (maxfd < 0) ++ return -errno; ++ ++ res = read(maxfd, buf, sizeof(buf) - 1); ++ if (res < 0) { ++ int saved_errno; ++ ++ saved_errno = errno; ++ close(maxfd); ++ return -saved_errno; ++ } ++ close(maxfd); ++ buf[res] = '\0'; ++ ++ max = atoi(buf); ++ res = fcntl(pipefd, F_SETPIPE_SZ, max); ++ if (res < 0) ++ return -errno; ++ return max; ++} ++ ++static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, unsigned int flags) ++{ ++ int res; ++ size_t len = fuse_buf_size(buf); ++ struct fuse_out_header *out = iov[0].iov_base; ++ struct fuse_ll_pipe *llp; ++ int splice_flags; ++ size_t pipesize; ++ size_t total_fd_size; ++ size_t idx; ++ size_t headerlen; ++ struct fuse_bufvec pipe_buf = FUSE_BUFVEC_INIT(len); ++ ++ if (se->broken_splice_nonblock) ++ goto fallback; ++ ++ if (flags & FUSE_BUF_NO_SPLICE) ++ goto fallback; ++ ++ total_fd_size = 0; ++ for (idx = buf->idx; idx < buf->count; idx++) { ++ if (buf->buf[idx].flags & FUSE_BUF_IS_FD) { ++ total_fd_size = buf->buf[idx].size; ++ if (idx == buf->idx) ++ total_fd_size -= buf->off; ++ } ++ } ++ if (total_fd_size < 2 * pagesize) ++ goto fallback; ++ ++ if (se->conn.proto_minor < 14 || ++ !(se->conn.want & FUSE_CAP_SPLICE_WRITE)) ++ goto fallback; ++ ++ llp = fuse_ll_get_pipe(se); ++ if (llp == NULL) ++ goto fallback; ++ ++ ++ headerlen = iov_length(iov, iov_count); ++ ++ out->len = headerlen + len; ++ ++ /* ++ * Heuristic for the required pipe size, does not work if the ++ * source contains less than page size fragments ++ */ ++ pipesize = pagesize * (iov_count + buf->count + 1) + out->len; ++ ++ if (llp->size < pipesize) { ++ if (llp->can_grow) { ++ res = fcntl(llp->pipe[0], F_SETPIPE_SZ, pipesize); ++ if (res == -1) { ++ res = grow_pipe_to_max(llp->pipe[0]); ++ if (res > 0) ++ llp->size = res; ++ llp->can_grow = 0; ++ goto fallback; ++ } ++ llp->size = res; ++ } ++ if (llp->size < pipesize) ++ goto fallback; ++ } ++ ++ ++ res = vmsplice(llp->pipe[1], iov, iov_count, SPLICE_F_NONBLOCK); ++ if (res == -1) ++ goto fallback; ++ ++ if (res != headerlen) { ++ res = -EIO; ++ fuse_log(FUSE_LOG_ERR, "fuse: short vmsplice to pipe: %u/%zu\n", res, ++ headerlen); ++ goto clear_pipe; ++ } ++ ++ pipe_buf.buf[0].flags = FUSE_BUF_IS_FD; ++ pipe_buf.buf[0].fd = llp->pipe[1]; ++ ++ res = fuse_buf_copy(&pipe_buf, buf, ++ FUSE_BUF_FORCE_SPLICE | FUSE_BUF_SPLICE_NONBLOCK); ++ if (res < 0) { ++ if (res == -EAGAIN || res == -EINVAL) { ++ /* ++ * Should only get EAGAIN on kernels with ++ * broken SPLICE_F_NONBLOCK support (<= ++ * 2.6.35) where this error or a short read is ++ * returned even if the pipe itself is not ++ * full ++ * ++ * EINVAL might mean that splice can't handle ++ * this combination of input and output. ++ */ ++ if (res == -EAGAIN) ++ se->broken_splice_nonblock = 1; ++ ++ pthread_setspecific(se->pipe_key, NULL); ++ fuse_ll_pipe_free(llp); ++ goto fallback; ++ } ++ res = -res; ++ goto clear_pipe; ++ } ++ ++ if (res != 0 && res < len) { ++ struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); ++ void *mbuf; ++ size_t now_len = res; ++ /* ++ * For regular files a short count is either ++ * 1) due to EOF, or ++ * 2) because of broken SPLICE_F_NONBLOCK (see above) ++ * ++ * For other inputs it's possible that we overflowed ++ * the pipe because of small buffer fragments. ++ */ ++ ++ res = posix_memalign(&mbuf, pagesize, len); ++ if (res != 0) ++ goto clear_pipe; ++ ++ mem_buf.buf[0].mem = mbuf; ++ mem_buf.off = now_len; ++ res = fuse_buf_copy(&mem_buf, buf, 0); ++ if (res > 0) { ++ char *tmpbuf; ++ size_t extra_len = res; ++ /* ++ * Trickiest case: got more data. Need to get ++ * back the data from the pipe and then fall ++ * back to regular write. ++ */ ++ tmpbuf = malloc(headerlen); ++ if (tmpbuf == NULL) { ++ free(mbuf); ++ res = ENOMEM; ++ goto clear_pipe; ++ } ++ res = read_back(llp->pipe[0], tmpbuf, headerlen); ++ free(tmpbuf); ++ if (res != 0) { ++ free(mbuf); ++ goto clear_pipe; ++ } ++ res = read_back(llp->pipe[0], mbuf, now_len); ++ if (res != 0) { ++ free(mbuf); ++ goto clear_pipe; ++ } ++ len = now_len + extra_len; ++ iov[iov_count].iov_base = mbuf; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ res = fuse_send_msg(se, ch, iov, iov_count); ++ free(mbuf); ++ return res; ++ } ++ free(mbuf); ++ res = now_len; ++ } ++ len = res; ++ out->len = headerlen + len; ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, success, outsize: %i (splice)\n", ++ (unsigned long long) out->unique, out->len); ++ } ++ ++ splice_flags = 0; ++ if ((flags & FUSE_BUF_SPLICE_MOVE) && ++ (se->conn.want & FUSE_CAP_SPLICE_MOVE)) ++ splice_flags |= SPLICE_F_MOVE; ++ ++ res = splice(llp->pipe[0], NULL, ch ? ch->fd : se->fd, ++ NULL, out->len, splice_flags); ++ if (res == -1) { ++ res = -errno; ++ perror("fuse: splice from pipe"); ++ goto clear_pipe; ++ } ++ if (res != out->len) { ++ res = -EIO; ++ fuse_log(FUSE_LOG_ERR, "fuse: short splice from pipe: %u/%u\n", ++ res, out->len); ++ goto clear_pipe; ++ } ++ return 0; ++ ++clear_pipe: ++ fuse_ll_clear_pipe(se); ++ return res; ++ ++fallback: ++ return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); ++} ++#else ++static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, unsigned int flags) ++{ ++ size_t len = fuse_buf_size(buf); ++ (void) flags; ++ ++ return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); ++} ++#endif ++ ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags) ++{ ++ struct iovec iov[2]; ++ struct fuse_out_header out; ++ int res; ++ ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); ++ ++ out.unique = req->unique; ++ out.error = 0; ++ ++ res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); ++ if (res <= 0) { ++ fuse_free_req(req); ++ return res; ++ } else { ++ return fuse_reply_err(req, res); ++ } ++} ++ ++int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) ++{ ++ struct fuse_statfs_out arg; ++ size_t size = req->se->conn.proto_minor < 4 ? ++ FUSE_COMPAT_STATFS_SIZE : sizeof(arg); ++ ++ memset(&arg, 0, sizeof(arg)); ++ convert_statfs(stbuf, &arg.st); ++ ++ return send_reply_ok(req, &arg, size); ++} ++ ++int fuse_reply_xattr(fuse_req_t req, size_t count) ++{ ++ struct fuse_getxattr_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_lock(fuse_req_t req, const struct flock *lock) ++{ ++ struct fuse_lk_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.lk.type = lock->l_type; ++ if (lock->l_type != F_UNLCK) { ++ arg.lk.start = lock->l_start; ++ if (lock->l_len == 0) ++ arg.lk.end = OFFSET_MAX; ++ else ++ arg.lk.end = lock->l_start + lock->l_len - 1; ++ } ++ arg.lk.pid = lock->l_pid; ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_bmap(fuse_req_t req, uint64_t idx) ++{ ++ struct fuse_bmap_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.block = idx; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov, ++ size_t count) ++{ ++ struct fuse_ioctl_iovec *fiov; ++ size_t i; ++ ++ fiov = malloc(sizeof(fiov[0]) * count); ++ if (!fiov) ++ return NULL; ++ ++ for (i = 0; i < count; i++) { ++ fiov[i].base = (uintptr_t) iov[i].iov_base; ++ fiov[i].len = iov[i].iov_len; ++ } ++ ++ return fiov; ++} ++ ++int fuse_reply_ioctl_retry(fuse_req_t req, ++ const struct iovec *in_iov, size_t in_count, ++ const struct iovec *out_iov, size_t out_count) ++{ ++ struct fuse_ioctl_out arg; ++ struct fuse_ioctl_iovec *in_fiov = NULL; ++ struct fuse_ioctl_iovec *out_fiov = NULL; ++ struct iovec iov[4]; ++ size_t count = 1; ++ int res; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.flags |= FUSE_IOCTL_RETRY; ++ arg.in_iovs = in_count; ++ arg.out_iovs = out_count; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; ++ ++ if (req->se->conn.proto_minor < 16) { ++ if (in_count) { ++ iov[count].iov_base = (void *)in_iov; ++ iov[count].iov_len = sizeof(in_iov[0]) * in_count; ++ count++; ++ } ++ ++ if (out_count) { ++ iov[count].iov_base = (void *)out_iov; ++ iov[count].iov_len = sizeof(out_iov[0]) * out_count; ++ count++; ++ } ++ } else { ++ /* Can't handle non-compat 64bit ioctls on 32bit */ ++ if (sizeof(void *) == 4 && req->ioctl_64bit) { ++ res = fuse_reply_err(req, EINVAL); ++ goto out; ++ } ++ ++ if (in_count) { ++ in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); ++ if (!in_fiov) ++ goto enomem; ++ ++ iov[count].iov_base = (void *)in_fiov; ++ iov[count].iov_len = sizeof(in_fiov[0]) * in_count; ++ count++; ++ } ++ if (out_count) { ++ out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); ++ if (!out_fiov) ++ goto enomem; ++ ++ iov[count].iov_base = (void *)out_fiov; ++ iov[count].iov_len = sizeof(out_fiov[0]) * out_count; ++ count++; ++ } ++ } ++ ++ res = send_reply_iov(req, 0, iov, count); ++out: ++ free(in_fiov); ++ free(out_fiov); ++ ++ return res; ++ ++enomem: ++ res = fuse_reply_err(req, ENOMEM); ++ goto out; ++} ++ ++int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size) ++{ ++ struct fuse_ioctl_out arg; ++ struct iovec iov[3]; ++ size_t count = 1; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; ++ ++ if (size) { ++ iov[count].iov_base = (char *) buf; ++ iov[count].iov_len = size; ++ count++; ++ } ++ ++ return send_reply_iov(req, 0, iov, count); ++} ++ ++int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, ++ int count) ++{ ++ struct iovec *padded_iov; ++ struct fuse_ioctl_out arg; ++ int res; ++ ++ padded_iov = malloc((count + 2) * sizeof(struct iovec)); ++ if (padded_iov == NULL) ++ return fuse_reply_err(req, ENOMEM); ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ padded_iov[1].iov_base = &arg; ++ padded_iov[1].iov_len = sizeof(arg); ++ ++ memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); ++ ++ res = send_reply_iov(req, 0, padded_iov, count + 2); ++ free(padded_iov); ++ ++ return res; ++} ++ ++int fuse_reply_poll(fuse_req_t req, unsigned revents) ++{ ++ struct fuse_poll_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.revents = revents; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++int fuse_reply_lseek(fuse_req_t req, off_t off) ++{ ++ struct fuse_lseek_out arg; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.offset = off; ++ ++ return send_reply_ok(req, &arg, sizeof(arg)); ++} ++ ++static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.lookup) ++ req->se->op.lookup(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_forget_in *arg = (struct fuse_forget_in *) inarg; ++ ++ if (req->se->op.forget) ++ req->se->op.forget(req, nodeid, arg->nlookup); ++ else ++ fuse_reply_none(req); ++} ++ ++static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, ++ const void *inarg) ++{ ++ struct fuse_batch_forget_in *arg = (void *) inarg; ++ struct fuse_forget_one *param = (void *) PARAM(arg); ++ unsigned int i; ++ ++ (void) nodeid; ++ ++ if (req->se->op.forget_multi) { ++ req->se->op.forget_multi(req, arg->count, ++ (struct fuse_forget_data *) param); ++ } else if (req->se->op.forget) { ++ for (i = 0; i < arg->count; i++) { ++ struct fuse_forget_one *forget = ¶m[i]; ++ struct fuse_req *dummy_req; ++ ++ dummy_req = fuse_ll_alloc_req(req->se); ++ if (dummy_req == NULL) ++ break; ++ ++ dummy_req->unique = req->unique; ++ dummy_req->ctx = req->ctx; ++ dummy_req->ch = NULL; ++ ++ req->se->op.forget(dummy_req, forget->nodeid, ++ forget->nlookup); ++ } ++ fuse_reply_none(req); ++ } else { ++ fuse_reply_none(req); ++ } ++} ++ ++static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_file_info *fip = NULL; ++ struct fuse_file_info fi; ++ ++ if (req->se->conn.proto_minor >= 9) { ++ struct fuse_getattr_in *arg = (struct fuse_getattr_in *) inarg; ++ ++ if (arg->getattr_flags & FUSE_GETATTR_FH) { ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fip = &fi; ++ } ++ } ++ ++ if (req->se->op.getattr) ++ req->se->op.getattr(req, nodeid, fip); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_setattr_in *arg = (struct fuse_setattr_in *) inarg; ++ ++ if (req->se->op.setattr) { ++ struct fuse_file_info *fi = NULL; ++ struct fuse_file_info fi_store; ++ struct stat stbuf; ++ memset(&stbuf, 0, sizeof(stbuf)); ++ convert_attr(arg, &stbuf); ++ if (arg->valid & FATTR_FH) { ++ arg->valid &= ~FATTR_FH; ++ memset(&fi_store, 0, sizeof(fi_store)); ++ fi = &fi_store; ++ fi->fh = arg->fh; ++ } ++ arg->valid &= ++ FUSE_SET_ATTR_MODE | ++ FUSE_SET_ATTR_UID | ++ FUSE_SET_ATTR_GID | ++ FUSE_SET_ATTR_SIZE | ++ FUSE_SET_ATTR_ATIME | ++ FUSE_SET_ATTR_MTIME | ++ FUSE_SET_ATTR_ATIME_NOW | ++ FUSE_SET_ATTR_MTIME_NOW | ++ FUSE_SET_ATTR_CTIME; ++ ++ req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); ++ } else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_access_in *arg = (struct fuse_access_in *) inarg; ++ ++ if (req->se->op.access) ++ req->se->op.access(req, nodeid, arg->mask); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ (void) inarg; ++ ++ if (req->se->op.readlink) ++ req->se->op.readlink(req, nodeid); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_mknod_in *arg = (struct fuse_mknod_in *) inarg; ++ char *name = PARAM(arg); ++ ++ if (req->se->conn.proto_minor >= 12) ++ req->ctx.umask = arg->umask; ++ else ++ name = (char *) inarg + FUSE_COMPAT_MKNOD_IN_SIZE; ++ ++ if (req->se->op.mknod) ++ req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *) inarg; ++ ++ if (req->se->conn.proto_minor >= 12) ++ req->ctx.umask = arg->umask; ++ ++ if (req->se->op.mkdir) ++ req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.unlink) ++ req->se->op.unlink(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.rmdir) ++ req->se->op.rmdir(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ char *linkname = ((char *) inarg) + strlen((char *) inarg) + 1; ++ ++ if (req->se->op.symlink) ++ req->se->op.symlink(req, linkname, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_rename_in *arg = (struct fuse_rename_in *) inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; ++ ++ if (req->se->op.rename) ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, ++ 0); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_rename2_in *arg = (struct fuse_rename2_in *) inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; ++ ++ if (req->se->op.rename) ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, ++ arg->flags); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_link(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_link_in *arg = (struct fuse_link_in *) inarg; ++ ++ if (req->se->op.link) ++ req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_create_in *arg = (struct fuse_create_in *) inarg; ++ ++ if (req->se->op.create) { ++ struct fuse_file_info fi; ++ char *name = PARAM(arg); ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ ++ if (req->se->conn.proto_minor >= 12) ++ req->ctx.umask = arg->umask; ++ else ++ name = (char *) inarg + sizeof(struct fuse_open_in); ++ ++ req->se->op.create(req, nodeid, name, arg->mode, &fi); ++ } else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_open_in *arg = (struct fuse_open_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ ++ if (req->se->op.open) ++ req->se->op.open(req, nodeid, &fi); ++ else ++ fuse_reply_open(req, &fi); ++} ++ ++static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ ++ if (req->se->op.read) { ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 9) { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ } ++ req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); ++ } else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_write_in *arg = (struct fuse_write_in *) inarg; ++ struct fuse_file_info fi; ++ char *param; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; ++ ++ if (req->se->conn.proto_minor < 9) { ++ param = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ param = PARAM(arg); ++ } ++ ++ if (req->se->op.write) ++ req->se->op.write(req, nodeid, param, arg->size, ++ arg->offset, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, ++ const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ struct fuse_write_in *arg = (struct fuse_write_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; ++ ++ if (se->conn.proto_minor < 9) { ++ bufv.buf[0].mem = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ bufv.buf[0].size -= sizeof(struct fuse_in_header) + ++ FUSE_COMPAT_WRITE_IN_SIZE; ++ assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) ++ bufv.buf[0].mem = PARAM(arg); ++ ++ bufv.buf[0].size -= sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_write_in); ++ } ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); ++ fuse_reply_err(req, EIO); ++ goto out; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); ++ ++out: ++ /* Need to reset the pipe if ->write_buf() didn't consume all data */ ++ if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) ++ fuse_ll_clear_pipe(se); ++} ++ ++static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_flush_in *arg = (struct fuse_flush_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.flush = 1; ++ if (req->se->conn.proto_minor >= 7) ++ fi.lock_owner = arg->lock_owner; ++ ++ if (req->se->op.flush) ++ req->se->op.flush(req, nodeid, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_release_in *arg = (struct fuse_release_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 8) { ++ fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; ++ fi.lock_owner = arg->lock_owner; ++ } ++ if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { ++ fi.flock_release = 1; ++ fi.lock_owner = arg->lock_owner; ++ } ++ ++ if (req->se->op.release) ++ req->se->op.release(req, nodeid, &fi); ++ else ++ fuse_reply_err(req, 0); ++} ++ ++static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.fsync) ++ req->se->op.fsync(req, nodeid, datasync, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_open_in *arg = (struct fuse_open_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ ++ if (req->se->op.opendir) ++ req->se->op.opendir(req, nodeid, &fi); ++ else ++ fuse_reply_open(req, &fi); ++} ++ ++static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.readdir) ++ req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.readdirplus) ++ req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_release_in *arg = (struct fuse_release_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; ++ ++ if (req->se->op.releasedir) ++ req->se->op.releasedir(req, nodeid, &fi); ++ else ++ fuse_reply_err(req, 0); ++} ++ ++static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.fsyncdir) ++ req->se->op.fsyncdir(req, nodeid, datasync, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ (void) nodeid; ++ (void) inarg; ++ ++ if (req->se->op.statfs) ++ req->se->op.statfs(req, nodeid); ++ else { ++ struct statvfs buf = { ++ .f_namemax = 255, ++ .f_bsize = 512, ++ }; ++ fuse_reply_statfs(req, &buf); ++ } ++} ++ ++static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *) inarg; ++ char *name = PARAM(arg); ++ char *value = name + strlen(name) + 1; ++ ++ if (req->se->op.setxattr) ++ req->se->op.setxattr(req, nodeid, name, value, arg->size, ++ arg->flags); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ ++ if (req->se->op.getxattr) ++ req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ ++ if (req->se->op.listxattr) ++ req->se->op.listxattr(req, nodeid, arg->size); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ char *name = (char *) inarg; ++ ++ if (req->se->op.removexattr) ++ req->se->op.removexattr(req, nodeid, name); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void convert_fuse_file_lock(struct fuse_file_lock *fl, ++ struct flock *flock) ++{ ++ memset(flock, 0, sizeof(struct flock)); ++ flock->l_type = fl->type; ++ flock->l_whence = SEEK_SET; ++ flock->l_start = fl->start; ++ if (fl->end == OFFSET_MAX) ++ flock->l_len = 0; ++ else ++ flock->l_len = fl->end - fl->start + 1; ++ flock->l_pid = fl->pid; ++} ++ ++static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; ++ struct fuse_file_info fi; ++ struct flock flock; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; ++ ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.getlk) ++ req->se->op.getlk(req, nodeid, &fi, &flock); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, ++ const void *inarg, int sleep) ++{ ++ struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; ++ struct fuse_file_info fi; ++ struct flock flock; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; ++ ++ if (arg->lk_flags & FUSE_LK_FLOCK) { ++ int op = 0; ++ ++ switch (arg->lk.type) { ++ case F_RDLCK: ++ op = LOCK_SH; ++ break; ++ case F_WRLCK: ++ op = LOCK_EX; ++ break; ++ case F_UNLCK: ++ op = LOCK_UN; ++ break; ++ } ++ if (!sleep) ++ op |= LOCK_NB; ++ ++ if (req->se->op.flock) ++ req->se->op.flock(req, nodeid, &fi, op); ++ else ++ fuse_reply_err(req, ENOSYS); ++ } else { ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.setlk) ++ req->se->op.setlk(req, nodeid, &fi, &flock, sleep); ++ else ++ fuse_reply_err(req, ENOSYS); ++ } ++} ++ ++static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ do_setlk_common(req, nodeid, inarg, 0); ++} ++ ++static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ do_setlk_common(req, nodeid, inarg, 1); ++} ++ ++static int find_interrupted(struct fuse_session *se, struct fuse_req *req) ++{ ++ struct fuse_req *curr; ++ ++ for (curr = se->list.next; curr != &se->list; curr = curr->next) { ++ if (curr->unique == req->u.i.unique) { ++ fuse_interrupt_func_t func; ++ void *data; ++ ++ curr->ctr++; ++ pthread_mutex_unlock(&se->lock); ++ ++ /* Ugh, ugly locking */ ++ pthread_mutex_lock(&curr->lock); ++ pthread_mutex_lock(&se->lock); ++ curr->interrupted = 1; ++ func = curr->u.ni.func; ++ data = curr->u.ni.data; ++ pthread_mutex_unlock(&se->lock); ++ if (func) ++ func(curr, data); ++ pthread_mutex_unlock(&curr->lock); ++ ++ pthread_mutex_lock(&se->lock); ++ curr->ctr--; ++ if (!curr->ctr) ++ destroy_req(curr); ++ ++ return 1; ++ } ++ } ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->u.i.unique) ++ return 1; ++ } ++ return 0; ++} ++ ++static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *) inarg; ++ struct fuse_session *se = req->se; ++ ++ (void) nodeid; ++ if (se->debug) ++ fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", ++ (unsigned long long) arg->unique); ++ ++ req->u.i.unique = arg->unique; ++ ++ pthread_mutex_lock(&se->lock); ++ if (find_interrupted(se, req)) ++ destroy_req(req); ++ else ++ list_add_req(req, &se->interrupts); ++ pthread_mutex_unlock(&se->lock); ++} ++ ++static struct fuse_req *check_interrupt(struct fuse_session *se, ++ struct fuse_req *req) ++{ ++ struct fuse_req *curr; ++ ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->unique) { ++ req->interrupted = 1; ++ list_del_req(curr); ++ free(curr); ++ return NULL; ++ } ++ } ++ curr = se->interrupts.next; ++ if (curr != &se->interrupts) { ++ list_del_req(curr); ++ list_init_req(curr); ++ return curr; ++ } else ++ return NULL; ++} ++ ++static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_bmap_in *arg = (struct fuse_bmap_in *) inarg; ++ ++ if (req->se->op.bmap) ++ req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *) inarg; ++ unsigned int flags = arg->flags; ++ void *in_buf = arg->in_size ? PARAM(arg) : NULL; ++ struct fuse_file_info fi; ++ ++ if (flags & FUSE_IOCTL_DIR && ++ !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { ++ fuse_reply_err(req, ENOTTY); ++ return; ++ } ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && ++ !(flags & FUSE_IOCTL_32BIT)) { ++ req->ioctl_64bit = 1; ++ } ++ ++ if (req->se->op.ioctl) ++ req->se->op.ioctl(req, nodeid, arg->cmd, ++ (void *)(uintptr_t)arg->arg, &fi, flags, ++ in_buf, arg->in_size, arg->out_size); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) ++{ ++ free(ph); ++} ++ ++static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_poll_in *arg = (struct fuse_poll_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.poll_events = arg->events; ++ ++ if (req->se->op.poll) { ++ struct fuse_pollhandle *ph = NULL; ++ ++ if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { ++ ph = malloc(sizeof(struct fuse_pollhandle)); ++ if (ph == NULL) { ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ph->kh = arg->kh; ++ ph->se = req->se; ++ } ++ ++ req->se->op.poll(req, nodeid, &fi, ph); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } ++} ++ ++static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.fallocate) ++ req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, const void *inarg) ++{ ++ struct fuse_copy_file_range_in *arg = (struct fuse_copy_file_range_in *) inarg; ++ struct fuse_file_info fi_in, fi_out; ++ ++ memset(&fi_in, 0, sizeof(fi_in)); ++ fi_in.fh = arg->fh_in; ++ ++ memset(&fi_out, 0, sizeof(fi_out)); ++ fi_out.fh = arg->fh_out; ++ ++ ++ if (req->se->op.copy_file_range) ++ req->se->op.copy_file_range(req, nodeid_in, arg->off_in, ++ &fi_in, arg->nodeid_out, ++ arg->off_out, &fi_out, arg->len, ++ arg->flags); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_lseek_in *arg = (struct fuse_lseek_in *) inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ ++ if (req->se->op.lseek) ++ req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); ++ else ++ fuse_reply_err(req, ENOSYS); ++} ++ ++static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_init_in *arg = (struct fuse_init_in *) inarg; ++ struct fuse_init_out outarg; ++ struct fuse_session *se = req->se; ++ size_t bufsize = se->bufsize; ++ size_t outargsize = sizeof(outarg); ++ ++ (void) nodeid; ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); ++ if (arg->major == 7 && arg->minor >= 6) { ++ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); ++ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", ++ arg->max_readahead); ++ } ++ } ++ se->conn.proto_major = arg->major; ++ se->conn.proto_minor = arg->minor; ++ se->conn.capable = 0; ++ se->conn.want = 0; ++ ++ memset(&outarg, 0, sizeof(outarg)); ++ outarg.major = FUSE_KERNEL_VERSION; ++ outarg.minor = FUSE_KERNEL_MINOR_VERSION; ++ ++ if (arg->major < 7) { ++ fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", ++ arg->major, arg->minor); ++ fuse_reply_err(req, EPROTO); ++ return; ++ } ++ ++ if (arg->major > 7) { ++ /* Wait for a second INIT request with a 7.X version */ ++ send_reply_ok(req, &outarg, sizeof(outarg)); ++ return; ++ } ++ ++ if (arg->minor >= 6) { ++ if (arg->max_readahead < se->conn.max_readahead) ++ se->conn.max_readahead = arg->max_readahead; ++ if (arg->flags & FUSE_ASYNC_READ) ++ se->conn.capable |= FUSE_CAP_ASYNC_READ; ++ if (arg->flags & FUSE_POSIX_LOCKS) ++ se->conn.capable |= FUSE_CAP_POSIX_LOCKS; ++ if (arg->flags & FUSE_ATOMIC_O_TRUNC) ++ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; ++ if (arg->flags & FUSE_EXPORT_SUPPORT) ++ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; ++ if (arg->flags & FUSE_DONT_MASK) ++ se->conn.capable |= FUSE_CAP_DONT_MASK; ++ if (arg->flags & FUSE_FLOCK_LOCKS) ++ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; ++ if (arg->flags & FUSE_AUTO_INVAL_DATA) ++ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; ++ if (arg->flags & FUSE_DO_READDIRPLUS) ++ se->conn.capable |= FUSE_CAP_READDIRPLUS; ++ if (arg->flags & FUSE_READDIRPLUS_AUTO) ++ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; ++ if (arg->flags & FUSE_ASYNC_DIO) ++ se->conn.capable |= FUSE_CAP_ASYNC_DIO; ++ if (arg->flags & FUSE_WRITEBACK_CACHE) ++ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; ++ if (arg->flags & FUSE_NO_OPEN_SUPPORT) ++ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; ++ if (arg->flags & FUSE_PARALLEL_DIROPS) ++ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; ++ if (arg->flags & FUSE_POSIX_ACL) ++ se->conn.capable |= FUSE_CAP_POSIX_ACL; ++ if (arg->flags & FUSE_HANDLE_KILLPRIV) ++ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; ++ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) ++ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; ++ if (!(arg->flags & FUSE_MAX_PAGES)) { ++ size_t max_bufsize = ++ FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() ++ + FUSE_BUFFER_HEADER_SIZE; ++ if (bufsize > max_bufsize) { ++ bufsize = max_bufsize; ++ } ++ } ++ } else { ++ se->conn.max_readahead = 0; ++ } ++ ++ if (se->conn.proto_minor >= 14) { ++#ifdef HAVE_SPLICE ++#ifdef HAVE_VMSPLICE ++ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; ++#endif ++ se->conn.capable |= FUSE_CAP_SPLICE_READ; ++#endif ++ } ++ if (se->conn.proto_minor >= 18) ++ se->conn.capable |= FUSE_CAP_IOCTL_DIR; ++ ++ /* Default settings for modern filesystems. ++ * ++ * Most of these capabilities were disabled by default in ++ * libfuse2 for backwards compatibility reasons. In libfuse3, ++ * we can finally enable them by default (as long as they're ++ * supported by the kernel). ++ */ ++#define LL_SET_DEFAULT(cond, cap) \ ++ if ((cond) && (se->conn.capable & (cap))) \ ++ se->conn.want |= (cap) ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); ++ LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); ++ LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); ++ LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); ++ LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); ++ LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); ++ LL_SET_DEFAULT(se->op.getlk && se->op.setlk, ++ FUSE_CAP_POSIX_LOCKS); ++ LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); ++ LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, ++ FUSE_CAP_READDIRPLUS_AUTO); ++ se->conn.time_gran = 1; ++ ++ if (bufsize < FUSE_MIN_READ_BUFFER) { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", ++ bufsize); ++ bufsize = FUSE_MIN_READ_BUFFER; ++ } ++ se->bufsize = bufsize; ++ ++ if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) ++ se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; ++ ++ se->got_init = 1; ++ if (se->op.init) ++ se->op.init(se->userdata, &se->conn); ++ ++ if (se->conn.want & (~se->conn.capable)) { ++ fuse_log(FUSE_LOG_ERR, "fuse: error: filesystem requested capabilities " ++ "0x%x that are not supported by kernel, aborting.\n", ++ se->conn.want & (~se->conn.capable)); ++ fuse_reply_err(req, EPROTO); ++ se->error = -EPROTO; ++ fuse_session_exit(se); ++ return; ++ } ++ ++ unsigned max_read_mo = get_max_read(se->mo); ++ if (se->conn.max_read != max_read_mo) { ++ fuse_log(FUSE_LOG_ERR, "fuse: error: init() and fuse_session_new() " ++ "requested different maximum read size (%u vs %u)\n", ++ se->conn.max_read, max_read_mo); ++ fuse_reply_err(req, EPROTO); ++ se->error = -EPROTO; ++ fuse_session_exit(se); ++ return; ++ } ++ ++ if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { ++ se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; ++ } ++ if (arg->flags & FUSE_MAX_PAGES) { ++ outarg.flags |= FUSE_MAX_PAGES; ++ outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; ++ } ++ ++ /* Always enable big writes, this is superseded ++ by the max_write option */ ++ outarg.flags |= FUSE_BIG_WRITES; ++ ++ if (se->conn.want & FUSE_CAP_ASYNC_READ) ++ outarg.flags |= FUSE_ASYNC_READ; ++ if (se->conn.want & FUSE_CAP_POSIX_LOCKS) ++ outarg.flags |= FUSE_POSIX_LOCKS; ++ if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) ++ outarg.flags |= FUSE_ATOMIC_O_TRUNC; ++ if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) ++ outarg.flags |= FUSE_EXPORT_SUPPORT; ++ if (se->conn.want & FUSE_CAP_DONT_MASK) ++ outarg.flags |= FUSE_DONT_MASK; ++ if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) ++ outarg.flags |= FUSE_FLOCK_LOCKS; ++ if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) ++ outarg.flags |= FUSE_AUTO_INVAL_DATA; ++ if (se->conn.want & FUSE_CAP_READDIRPLUS) ++ outarg.flags |= FUSE_DO_READDIRPLUS; ++ if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) ++ outarg.flags |= FUSE_READDIRPLUS_AUTO; ++ if (se->conn.want & FUSE_CAP_ASYNC_DIO) ++ outarg.flags |= FUSE_ASYNC_DIO; ++ if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) ++ outarg.flags |= FUSE_WRITEBACK_CACHE; ++ if (se->conn.want & FUSE_CAP_POSIX_ACL) ++ outarg.flags |= FUSE_POSIX_ACL; ++ outarg.max_readahead = se->conn.max_readahead; ++ outarg.max_write = se->conn.max_write; ++ if (se->conn.proto_minor >= 13) { ++ if (se->conn.max_background >= (1 << 16)) ++ se->conn.max_background = (1 << 16) - 1; ++ if (se->conn.congestion_threshold > se->conn.max_background) ++ se->conn.congestion_threshold = se->conn.max_background; ++ if (!se->conn.congestion_threshold) { ++ se->conn.congestion_threshold = ++ se->conn.max_background * 3 / 4; ++ } ++ ++ outarg.max_background = se->conn.max_background; ++ outarg.congestion_threshold = se->conn.congestion_threshold; ++ } ++ if (se->conn.proto_minor >= 23) ++ outarg.time_gran = se->conn.time_gran; ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); ++ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); ++ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", ++ outarg.max_readahead); ++ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); ++ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", ++ outarg.max_background); ++ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", ++ outarg.congestion_threshold); ++ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", ++ outarg.time_gran); ++ } ++ if (arg->minor < 5) ++ outargsize = FUSE_COMPAT_INIT_OUT_SIZE; ++ else if (arg->minor < 23) ++ outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; ++ ++ send_reply_ok(req, &outarg, outargsize); ++} ++ ++static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++{ ++ struct fuse_session *se = req->se; ++ ++ (void) nodeid; ++ (void) inarg; ++ ++ se->got_destroy = 1; ++ if (se->op.destroy) ++ se->op.destroy(se->userdata); ++ ++ send_reply_ok(req, NULL, 0); ++} ++ ++static void list_del_nreq(struct fuse_notify_req *nreq) ++{ ++ struct fuse_notify_req *prev = nreq->prev; ++ struct fuse_notify_req *next = nreq->next; ++ prev->next = next; ++ next->prev = prev; ++} ++ ++static void list_add_nreq(struct fuse_notify_req *nreq, ++ struct fuse_notify_req *next) ++{ ++ struct fuse_notify_req *prev = next->prev; ++ nreq->next = next; ++ nreq->prev = prev; ++ prev->next = nreq; ++ next->prev = nreq; ++} ++ ++static void list_init_nreq(struct fuse_notify_req *nreq) ++{ ++ nreq->next = nreq; ++ nreq->prev = nreq; ++} ++ ++static void do_notify_reply(fuse_req_t req, fuse_ino_t nodeid, ++ const void *inarg, const struct fuse_buf *buf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_notify_req *nreq; ++ struct fuse_notify_req *head; ++ ++ pthread_mutex_lock(&se->lock); ++ head = &se->notify_list; ++ for (nreq = head->next; nreq != head; nreq = nreq->next) { ++ if (nreq->unique == req->unique) { ++ list_del_nreq(nreq); ++ break; ++ } ++ } ++ pthread_mutex_unlock(&se->lock); ++ ++ if (nreq != head) ++ nreq->reply(nreq, req, nodeid, inarg, buf); ++} ++ ++static int send_notify_iov(struct fuse_session *se, int notify_code, ++ struct iovec *iov, int count) ++{ ++ struct fuse_out_header out; ++ ++ if (!se->got_init) ++ return -ENOTCONN; ++ ++ out.unique = 0; ++ out.error = notify_code; ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); ++ ++ return fuse_send_msg(se, NULL, iov, count); ++} ++ ++int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) ++{ ++ if (ph != NULL) { ++ struct fuse_notify_poll_wakeup_out outarg; ++ struct iovec iov[2]; ++ ++ outarg.kh = ph->kh; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2); ++ } else { ++ return 0; ++ } ++} ++ ++int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, ++ off_t off, off_t len) ++{ ++ struct fuse_notify_inval_inode_out outarg; ++ struct iovec iov[2]; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) ++ return -ENOSYS; ++ ++ outarg.ino = ino; ++ outarg.off = off; ++ outarg.len = len; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2); ++} ++ ++int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, ++ const char *name, size_t namelen) ++{ ++ struct fuse_notify_inval_entry_out outarg; ++ struct iovec iov[3]; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) ++ return -ENOSYS; ++ ++ outarg.parent = parent; ++ outarg.namelen = namelen; ++ outarg.padding = 0; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = (void *)name; ++ iov[2].iov_len = namelen + 1; ++ ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3); ++} ++ ++int fuse_lowlevel_notify_delete(struct fuse_session *se, ++ fuse_ino_t parent, fuse_ino_t child, ++ const char *name, size_t namelen) ++{ ++ struct fuse_notify_delete_out outarg; ++ struct iovec iov[3]; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) ++ return -ENOSYS; ++ ++ outarg.parent = parent; ++ outarg.child = child; ++ outarg.namelen = namelen; ++ outarg.padding = 0; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = (void *)name; ++ iov[2].iov_len = namelen + 1; ++ ++ return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3); ++} ++ ++int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags) ++{ ++ struct fuse_out_header out; ++ struct fuse_notify_store_out outarg; ++ struct iovec iov[3]; ++ size_t size = fuse_buf_size(bufv); ++ int res; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) ++ return -ENOSYS; ++ ++ out.unique = 0; ++ out.error = FUSE_NOTIFY_STORE; ++ ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; ++ ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(out); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); ++ if (res > 0) ++ res = -res; ++ ++ return res; ++} ++ ++struct fuse_retrieve_req { ++ struct fuse_notify_req nreq; ++ void *cookie; ++}; ++ ++static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, ++ fuse_req_t req, fuse_ino_t ino, ++ const void *inarg, ++ const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_retrieve_req *rreq = ++ container_of(nreq, struct fuse_retrieve_req, nreq); ++ const struct fuse_notify_retrieve_in *arg = inarg; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) ++ bufv.buf[0].mem = PARAM(arg); ++ ++ bufv.buf[0].size -= sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_notify_retrieve_in); ++ ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); ++ fuse_reply_none(req); ++ goto out; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ if (se->op.retrieve_reply) { ++ se->op.retrieve_reply(req, rreq->cookie, ino, ++ arg->offset, &bufv); ++ } else { ++ fuse_reply_none(req); ++ } ++out: ++ free(rreq); ++ if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) ++ fuse_ll_clear_pipe(se); ++} ++ ++int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, ++ size_t size, off_t offset, void *cookie) ++{ ++ struct fuse_notify_retrieve_out outarg; ++ struct iovec iov[2]; ++ struct fuse_retrieve_req *rreq; ++ int err; ++ ++ if (!se) ++ return -EINVAL; ++ ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) ++ return -ENOSYS; ++ ++ rreq = malloc(sizeof(*rreq)); ++ if (rreq == NULL) ++ return -ENOMEM; ++ ++ pthread_mutex_lock(&se->lock); ++ rreq->cookie = cookie; ++ rreq->nreq.unique = se->notify_ctr++; ++ rreq->nreq.reply = fuse_ll_retrieve_reply; ++ list_add_nreq(&rreq->nreq, &se->notify_list); ++ pthread_mutex_unlock(&se->lock); ++ ++ outarg.notify_unique = rreq->nreq.unique; ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; ++ ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ ++ err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); ++ if (err) { ++ pthread_mutex_lock(&se->lock); ++ list_del_nreq(&rreq->nreq); ++ pthread_mutex_unlock(&se->lock); ++ free(rreq); ++ } ++ ++ return err; ++} ++ ++void *fuse_req_userdata(fuse_req_t req) ++{ ++ return req->se->userdata; ++} ++ ++const struct fuse_ctx *fuse_req_ctx(fuse_req_t req) ++{ ++ return &req->ctx; ++} ++ ++void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, ++ void *data) ++{ ++ pthread_mutex_lock(&req->lock); ++ pthread_mutex_lock(&req->se->lock); ++ req->u.ni.func = func; ++ req->u.ni.data = data; ++ pthread_mutex_unlock(&req->se->lock); ++ if (req->interrupted && func) ++ func(req, data); ++ pthread_mutex_unlock(&req->lock); ++} ++ ++int fuse_req_interrupted(fuse_req_t req) ++{ ++ int interrupted; ++ ++ pthread_mutex_lock(&req->se->lock); ++ interrupted = req->interrupted; ++ pthread_mutex_unlock(&req->se->lock); ++ ++ return interrupted; ++} ++ ++static struct { ++ void (*func)(fuse_req_t, fuse_ino_t, const void *); ++ const char *name; ++} fuse_ll_ops[] = { ++ [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, ++ [FUSE_FORGET] = { do_forget, "FORGET" }, ++ [FUSE_GETATTR] = { do_getattr, "GETATTR" }, ++ [FUSE_SETATTR] = { do_setattr, "SETATTR" }, ++ [FUSE_READLINK] = { do_readlink, "READLINK" }, ++ [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, ++ [FUSE_MKNOD] = { do_mknod, "MKNOD" }, ++ [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, ++ [FUSE_UNLINK] = { do_unlink, "UNLINK" }, ++ [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, ++ [FUSE_RENAME] = { do_rename, "RENAME" }, ++ [FUSE_LINK] = { do_link, "LINK" }, ++ [FUSE_OPEN] = { do_open, "OPEN" }, ++ [FUSE_READ] = { do_read, "READ" }, ++ [FUSE_WRITE] = { do_write, "WRITE" }, ++ [FUSE_STATFS] = { do_statfs, "STATFS" }, ++ [FUSE_RELEASE] = { do_release, "RELEASE" }, ++ [FUSE_FSYNC] = { do_fsync, "FSYNC" }, ++ [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, ++ [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, ++ [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, ++ [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, ++ [FUSE_FLUSH] = { do_flush, "FLUSH" }, ++ [FUSE_INIT] = { do_init, "INIT" }, ++ [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, ++ [FUSE_READDIR] = { do_readdir, "READDIR" }, ++ [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, ++ [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, ++ [FUSE_GETLK] = { do_getlk, "GETLK" }, ++ [FUSE_SETLK] = { do_setlk, "SETLK" }, ++ [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, ++ [FUSE_ACCESS] = { do_access, "ACCESS" }, ++ [FUSE_CREATE] = { do_create, "CREATE" }, ++ [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, ++ [FUSE_BMAP] = { do_bmap, "BMAP" }, ++ [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, ++ [FUSE_POLL] = { do_poll, "POLL" }, ++ [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, ++ [FUSE_DESTROY] = { do_destroy, "DESTROY" }, ++ [FUSE_NOTIFY_REPLY] = { (void *) 1, "NOTIFY_REPLY" }, ++ [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, ++ [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS"}, ++ [FUSE_RENAME2] = { do_rename2, "RENAME2" }, ++ [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, ++ [FUSE_LSEEK] = { do_lseek, "LSEEK" }, ++ [CUSE_INIT] = { cuse_lowlevel_init, "CUSE_INIT" }, ++}; ++ ++#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) ++ ++static const char *opname(enum fuse_opcode opcode) ++{ ++ if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) ++ return "???"; ++ else ++ return fuse_ll_ops[opcode].name; ++} ++ ++static int fuse_ll_copy_from_pipe(struct fuse_bufvec *dst, ++ struct fuse_bufvec *src) ++{ ++ ssize_t res = fuse_buf_copy(dst, src, 0); ++ if (res < 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", strerror(-res)); ++ return res; ++ } ++ if ((size_t)res < fuse_buf_size(dst)) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); ++ return -1; ++ } ++ return 0; ++} ++ ++void fuse_session_process_buf(struct fuse_session *se, ++ const struct fuse_buf *buf) ++{ ++ fuse_session_process_buf_int(se, buf, NULL); ++} ++ ++void fuse_session_process_buf_int(struct fuse_session *se, ++ const struct fuse_buf *buf, struct fuse_chan *ch) ++{ ++ const size_t write_header_size = sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_write_in); ++ struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 }; ++ struct fuse_bufvec tmpbuf = FUSE_BUFVEC_INIT(write_header_size); ++ struct fuse_in_header *in; ++ const void *inarg; ++ struct fuse_req *req; ++ void *mbuf = NULL; ++ int err; ++ int res; ++ ++ if (buf->flags & FUSE_BUF_IS_FD) { ++ if (buf->size < tmpbuf.buf[0].size) ++ tmpbuf.buf[0].size = buf->size; ++ ++ mbuf = malloc(tmpbuf.buf[0].size); ++ if (mbuf == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate header\n"); ++ goto clear_pipe; ++ } ++ tmpbuf.buf[0].mem = mbuf; ++ ++ res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); ++ if (res < 0) ++ goto clear_pipe; ++ ++ in = mbuf; ++ } else { ++ in = buf->mem; ++ } ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", ++ (unsigned long long) in->unique, ++ opname((enum fuse_opcode) in->opcode), in->opcode, ++ (unsigned long long) in->nodeid, buf->size, in->pid); ++ } ++ ++ req = fuse_ll_alloc_req(se); ++ if (req == NULL) { ++ struct fuse_out_header out = { ++ .unique = in->unique, ++ .error = -ENOMEM, ++ }; ++ struct iovec iov = { ++ .iov_base = &out, ++ .iov_len = sizeof(struct fuse_out_header), ++ }; ++ ++ fuse_send_msg(se, ch, &iov, 1); ++ goto clear_pipe; ++ } ++ ++ req->unique = in->unique; ++ req->ctx.uid = in->uid; ++ req->ctx.gid = in->gid; ++ req->ctx.pid = in->pid; ++ req->ch = ch ? fuse_chan_get(ch) : NULL; ++ ++ err = EIO; ++ if (!se->got_init) { ++ enum fuse_opcode expected; ++ ++ expected = se->cuse_data ? CUSE_INIT : FUSE_INIT; ++ if (in->opcode != expected) ++ goto reply_err; ++ } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) ++ goto reply_err; ++ ++ err = EACCES; ++ /* Implement -o allow_root */ ++ if (se->deny_others && in->uid != se->owner && in->uid != 0 && ++ in->opcode != FUSE_INIT && in->opcode != FUSE_READ && ++ in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && ++ in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && ++ in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && ++ in->opcode != FUSE_NOTIFY_REPLY && ++ in->opcode != FUSE_READDIRPLUS) ++ goto reply_err; ++ ++ err = ENOSYS; ++ if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) ++ goto reply_err; ++ if (in->opcode != FUSE_INTERRUPT) { ++ struct fuse_req *intr; ++ pthread_mutex_lock(&se->lock); ++ intr = check_interrupt(se, req); ++ list_add_req(req, &se->list); ++ pthread_mutex_unlock(&se->lock); ++ if (intr) ++ fuse_reply_err(intr, EAGAIN); ++ } ++ ++ if ((buf->flags & FUSE_BUF_IS_FD) && write_header_size < buf->size && ++ (in->opcode != FUSE_WRITE || !se->op.write_buf) && ++ in->opcode != FUSE_NOTIFY_REPLY) { ++ void *newmbuf; ++ ++ err = ENOMEM; ++ newmbuf = realloc(mbuf, buf->size); ++ if (newmbuf == NULL) ++ goto reply_err; ++ mbuf = newmbuf; ++ ++ tmpbuf = FUSE_BUFVEC_INIT(buf->size - write_header_size); ++ tmpbuf.buf[0].mem = (char *)mbuf + write_header_size; ++ ++ res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); ++ err = -res; ++ if (res < 0) ++ goto reply_err; ++ ++ in = mbuf; ++ } ++ ++ inarg = (void *) &in[1]; ++ if (in->opcode == FUSE_WRITE && se->op.write_buf) ++ do_write_buf(req, in->nodeid, inarg, buf); ++ else if (in->opcode == FUSE_NOTIFY_REPLY) ++ do_notify_reply(req, in->nodeid, inarg, buf); ++ else ++ fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); ++ ++out_free: ++ free(mbuf); ++ return; ++ ++reply_err: ++ fuse_reply_err(req, err); ++clear_pipe: ++ if (buf->flags & FUSE_BUF_IS_FD) ++ fuse_ll_clear_pipe(se); ++ goto out_free; ++} ++ ++#define LL_OPTION(n,o,v) \ ++ { n, offsetof(struct fuse_session, o), v } ++ ++static const struct fuse_opt fuse_ll_opts[] = { ++ LL_OPTION("debug", debug, 1), ++ LL_OPTION("-d", debug, 1), ++ LL_OPTION("--debug", debug, 1), ++ LL_OPTION("allow_root", deny_others, 1), ++ FUSE_OPT_END ++}; ++ ++void fuse_lowlevel_version(void) ++{ ++ printf("using FUSE kernel interface version %i.%i\n", ++ FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); ++ fuse_mount_version(); ++} ++ ++void fuse_lowlevel_help(void) ++{ ++ /* These are not all options, but the ones that are ++ potentially of interest to an end-user */ ++ printf( ++" -o allow_other allow access by all users\n" ++" -o allow_root allow access by root\n" ++" -o auto_unmount auto unmount on process termination\n"); ++} ++ ++void fuse_session_destroy(struct fuse_session *se) ++{ ++ struct fuse_ll_pipe *llp; ++ ++ if (se->got_init && !se->got_destroy) { ++ if (se->op.destroy) ++ se->op.destroy(se->userdata); ++ } ++ llp = pthread_getspecific(se->pipe_key); ++ if (llp != NULL) ++ fuse_ll_pipe_free(llp); ++ pthread_key_delete(se->pipe_key); ++ pthread_mutex_destroy(&se->lock); ++ free(se->cuse_data); ++ if (se->fd != -1) ++ close(se->fd); ++ destroy_mount_opts(se->mo); ++ free(se); ++} ++ ++ ++static void fuse_ll_pipe_destructor(void *data) ++{ ++ struct fuse_ll_pipe *llp = data; ++ fuse_ll_pipe_free(llp); ++} ++ ++int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf) ++{ ++ return fuse_session_receive_buf_int(se, buf, NULL); ++} ++ ++int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, ++ struct fuse_chan *ch) ++{ ++ int err; ++ ssize_t res; ++#ifdef HAVE_SPLICE ++ size_t bufsize = se->bufsize; ++ struct fuse_ll_pipe *llp; ++ struct fuse_buf tmpbuf; ++ ++ if (se->conn.proto_minor < 14 || !(se->conn.want & FUSE_CAP_SPLICE_READ)) ++ goto fallback; ++ ++ llp = fuse_ll_get_pipe(se); ++ if (llp == NULL) ++ goto fallback; ++ ++ if (llp->size < bufsize) { ++ if (llp->can_grow) { ++ res = fcntl(llp->pipe[0], F_SETPIPE_SZ, bufsize); ++ if (res == -1) { ++ llp->can_grow = 0; ++ res = grow_pipe_to_max(llp->pipe[0]); ++ if (res > 0) ++ llp->size = res; ++ goto fallback; ++ } ++ llp->size = res; ++ } ++ if (llp->size < bufsize) ++ goto fallback; ++ } ++ ++ res = splice(ch ? ch->fd : se->fd, ++ NULL, llp->pipe[1], NULL, bufsize, 0); ++ err = errno; ++ ++ if (fuse_session_exited(se)) ++ return 0; ++ ++ if (res == -1) { ++ if (err == ENODEV) { ++ /* Filesystem was unmounted, or connection was aborted ++ via /sys/fs/fuse/connections */ ++ fuse_session_exit(se); ++ return 0; ++ } ++ if (err != EINTR && err != EAGAIN) ++ perror("fuse: splice from device"); ++ return -err; ++ } ++ ++ if (res < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "short splice from fuse device\n"); ++ return -EIO; ++ } ++ ++ tmpbuf = (struct fuse_buf) { ++ .size = res, ++ .flags = FUSE_BUF_IS_FD, ++ .fd = llp->pipe[0], ++ }; ++ ++ /* ++ * Don't bother with zero copy for small requests. ++ * fuse_loop_mt() needs to check for FORGET so this more than ++ * just an optimization. ++ */ ++ if (res < sizeof(struct fuse_in_header) + ++ sizeof(struct fuse_write_in) + pagesize) { ++ struct fuse_bufvec src = { .buf[0] = tmpbuf, .count = 1 }; ++ struct fuse_bufvec dst = { .count = 1 }; ++ ++ if (!buf->mem) { ++ buf->mem = malloc(se->bufsize); ++ if (!buf->mem) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: failed to allocate read buffer\n"); ++ return -ENOMEM; ++ } ++ } ++ buf->size = se->bufsize; ++ buf->flags = 0; ++ dst.buf[0] = *buf; ++ ++ res = fuse_buf_copy(&dst, &src, 0); ++ if (res < 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", ++ strerror(-res)); ++ fuse_ll_clear_pipe(se); ++ return res; ++ } ++ if (res < tmpbuf.size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); ++ fuse_ll_clear_pipe(se); ++ return -EIO; ++ } ++ assert(res == tmpbuf.size); ++ ++ } else { ++ /* Don't overwrite buf->mem, as that would cause a leak */ ++ buf->fd = tmpbuf.fd; ++ buf->flags = tmpbuf.flags; ++ } ++ buf->size = tmpbuf.size; ++ ++ return res; ++ ++fallback: ++#endif ++ if (!buf->mem) { ++ buf->mem = malloc(se->bufsize); ++ if (!buf->mem) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: failed to allocate read buffer\n"); ++ return -ENOMEM; ++ } ++ } ++ ++restart: ++ res = read(ch ? ch->fd : se->fd, buf->mem, se->bufsize); ++ err = errno; ++ ++ if (fuse_session_exited(se)) ++ return 0; ++ if (res == -1) { ++ /* ENOENT means the operation was interrupted, it's safe ++ to restart */ ++ if (err == ENOENT) ++ goto restart; ++ ++ if (err == ENODEV) { ++ /* Filesystem was unmounted, or connection was aborted ++ via /sys/fs/fuse/connections */ ++ fuse_session_exit(se); ++ return 0; ++ } ++ /* Errors occurring during normal operation: EINTR (read ++ interrupted), EAGAIN (nonblocking I/O), ENODEV (filesystem ++ umounted) */ ++ if (err != EINTR && err != EAGAIN) ++ perror("fuse: reading device"); ++ return -err; ++ } ++ if ((size_t) res < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "short read on fuse device\n"); ++ return -EIO; ++ } ++ ++ buf->size = res; ++ ++ return res; ++} ++ ++struct fuse_session *fuse_session_new(struct fuse_args *args, ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata) ++{ ++ int err; ++ struct fuse_session *se; ++ struct mount_opts *mo; ++ ++ if (sizeof(struct fuse_lowlevel_ops) < op_size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: library too old, some operations may not work\n"); ++ op_size = sizeof(struct fuse_lowlevel_ops); ++ } ++ ++ if (args->argc == 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: empty argv passed to fuse_session_new().\n"); ++ return NULL; ++ } ++ ++ se = (struct fuse_session *) calloc(1, sizeof(struct fuse_session)); ++ if (se == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n"); ++ goto out1; ++ } ++ se->fd = -1; ++ se->conn.max_write = UINT_MAX; ++ se->conn.max_readahead = UINT_MAX; ++ ++ /* Parse options */ ++ if(fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) ++ goto out2; ++ if(se->deny_others) { ++ /* Allowing access only by root is done by instructing ++ * kernel to allow access by everyone, and then restricting ++ * access to root and mountpoint owner in libfuse. ++ */ ++ // We may be adding the option a second time, but ++ // that doesn't hurt. ++ if(fuse_opt_add_arg(args, "-oallow_other") == -1) ++ goto out2; ++ } ++ mo = parse_mount_opts(args); ++ if (mo == NULL) ++ goto out3; ++ ++ if(args->argc == 1 && ++ args->argv[0][0] == '-') { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: argv[0] looks like an option, but " ++ "will be ignored\n"); ++ } else if (args->argc != 1) { ++ int i; ++ fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); ++ for(i = 1; i < args->argc-1; i++) ++ fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); ++ fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); ++ goto out4; ++ } ++ ++ if (se->debug) ++ fuse_log(FUSE_LOG_DEBUG, "FUSE library version: %s\n", PACKAGE_VERSION); ++ ++ se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + ++ FUSE_BUFFER_HEADER_SIZE; ++ ++ list_init_req(&se->list); ++ list_init_req(&se->interrupts); ++ list_init_nreq(&se->notify_list); ++ se->notify_ctr = 1; ++ fuse_mutex_init(&se->lock); ++ ++ err = pthread_key_create(&se->pipe_key, fuse_ll_pipe_destructor); ++ if (err) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to create thread specific key: %s\n", ++ strerror(err)); ++ goto out5; ++ } ++ ++ memcpy(&se->op, op, op_size); ++ se->owner = getuid(); ++ se->userdata = userdata; ++ ++ se->mo = mo; ++ return se; ++ ++out5: ++ pthread_mutex_destroy(&se->lock); ++out4: ++ fuse_opt_free_args(args); ++out3: ++ free(mo); ++out2: ++ free(se); ++out1: ++ return NULL; ++} ++ ++int fuse_session_mount(struct fuse_session *se, const char *mountpoint) ++{ ++ int fd; ++ ++ /* ++ * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos ++ * would ensue. ++ */ ++ do { ++ fd = open("/dev/null", O_RDWR); ++ if (fd > 2) ++ close(fd); ++ } while (fd >= 0 && fd <= 2); ++ ++ /* ++ * To allow FUSE daemons to run without privileges, the caller may open ++ * /dev/fuse before launching the file system and pass on the file ++ * descriptor by specifying /dev/fd/N as the mount point. Note that the ++ * parent process takes care of performing the mount in this case. ++ */ ++ fd = fuse_mnt_parse_fuse_fd(mountpoint); ++ if (fd != -1) { ++ if (fcntl(fd, F_GETFD) == -1) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: Invalid file descriptor /dev/fd/%u\n", ++ fd); ++ return -1; ++ } ++ se->fd = fd; ++ return 0; ++ } ++ ++ /* Open channel */ ++ fd = fuse_kern_mount(mountpoint, se->mo); ++ if (fd == -1) ++ return -1; ++ se->fd = fd; ++ ++ /* Save mountpoint */ ++ se->mountpoint = strdup(mountpoint); ++ if (se->mountpoint == NULL) ++ goto error_out; ++ ++ return 0; ++ ++error_out: ++ fuse_kern_unmount(mountpoint, fd); ++ return -1; ++} ++ ++int fuse_session_fd(struct fuse_session *se) ++{ ++ return se->fd; ++} ++ ++void fuse_session_unmount(struct fuse_session *se) ++{ ++ if (se->mountpoint != NULL) { ++ fuse_kern_unmount(se->mountpoint, se->fd); ++ free(se->mountpoint); ++ se->mountpoint = NULL; ++ } ++} ++ ++#ifdef linux ++int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) ++{ ++ char *buf; ++ size_t bufsize = 1024; ++ char path[128]; ++ int ret; ++ int fd; ++ unsigned long pid = req->ctx.pid; ++ char *s; ++ ++ sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); ++ ++retry: ++ buf = malloc(bufsize); ++ if (buf == NULL) ++ return -ENOMEM; ++ ++ ret = -EIO; ++ fd = open(path, O_RDONLY); ++ if (fd == -1) ++ goto out_free; ++ ++ ret = read(fd, buf, bufsize); ++ close(fd); ++ if (ret < 0) { ++ ret = -EIO; ++ goto out_free; ++ } ++ ++ if ((size_t)ret == bufsize) { ++ free(buf); ++ bufsize *= 4; ++ goto retry; ++ } ++ ++ ret = -EIO; ++ s = strstr(buf, "\nGroups:"); ++ if (s == NULL) ++ goto out_free; ++ ++ s += 8; ++ ret = 0; ++ while (1) { ++ char *end; ++ unsigned long val = strtoul(s, &end, 0); ++ if (end == s) ++ break; ++ ++ s = end; ++ if (ret < size) ++ list[ret] = val; ++ ret++; ++ } ++ ++out_free: ++ free(buf); ++ return ret; ++} ++#else /* linux */ ++/* ++ * This is currently not implemented on other than Linux... ++ */ ++int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) ++{ ++ (void) req; (void) size; (void) list; ++ return -ENOSYS; ++} ++#endif ++ ++void fuse_session_exit(struct fuse_session *se) ++{ ++ se->exited = 1; ++} ++ ++void fuse_session_reset(struct fuse_session *se) ++{ ++ se->exited = 0; ++ se->error = 0; ++} ++ ++int fuse_session_exited(struct fuse_session *se) ++{ ++ return se->exited; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-main-virtio-loop.patch b/kvm-virtiofsd-Add-main-virtio-loop.patch new file mode 100755 index 0000000000000000000000000000000000000000..c0ba96a7bf4e52d1ccc97202981b8d32ed13fe3c --- /dev/null +++ b/kvm-virtiofsd-Add-main-virtio-loop.patch @@ -0,0 +1,105 @@ +From 6f413d8b76ff38e5bc01f36515ca71d7fd6e6144 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:58 +0100 +Subject: [PATCH 027/116] virtiofsd: Add main virtio loop +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-24-dgilbert@redhat.com> +Patchwork-id: 93475 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 023/112] virtiofsd: Add main virtio loop +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Processes incoming requests on the vhost-user fd. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 204d8ae57b3c57098642c79b3c03d42495149c09) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 42 +++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 39 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 2ae3c76..1928a20 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -11,12 +11,14 @@ + * See the file COPYING.LIB + */ + ++#include "fuse_virtio.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" +-#include "fuse_virtio.h" + ++#include ++#include + #include + #include + #include +@@ -80,15 +82,49 @@ static const VuDevIface fv_iface = { + .queue_is_processed_in_order = fv_queue_order, + }; + ++/* ++ * Main loop; this mostly deals with events on the vhost-user ++ * socket itself, and not actual fuse data. ++ */ + int virtio_loop(struct fuse_session *se) + { + fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__); + +- while (1) { +- /* TODO: Add stuffing */ ++ while (!fuse_session_exited(se)) { ++ struct pollfd pf[1]; ++ pf[0].fd = se->vu_socketfd; ++ pf[0].events = POLLIN; ++ pf[0].revents = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__); ++ int poll_res = ppoll(pf, 1, NULL, NULL); ++ ++ if (poll_res == -1) { ++ if (errno == EINTR) { ++ fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", ++ __func__); ++ continue; ++ } ++ fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n"); ++ break; ++ } ++ assert(poll_res == 1); ++ if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { ++ fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__, ++ pf[0].revents); ++ break; ++ } ++ assert(pf[0].revents & POLLIN); ++ fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); ++ if (!vu_dispatch(&se->virtio_dev->dev)) { ++ fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); ++ break; ++ } + } + + fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); ++ ++ return 0; + } + + int virtio_session_mount(struct fuse_session *se) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-options-for-virtio.patch b/kvm-virtiofsd-Add-options-for-virtio.patch new file mode 100755 index 0000000000000000000000000000000000000000..8ac7fa7a4ed4da684d9290390ccbd7a9703997ed --- /dev/null +++ b/kvm-virtiofsd-Add-options-for-virtio.patch @@ -0,0 +1,103 @@ +From 9c1bbe327cf8f88ffc78eed0fce8cdd6f3f006ef Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:54 +0100 +Subject: [PATCH 023/116] virtiofsd: Add options for virtio +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-20-dgilbert@redhat.com> +Patchwork-id: 93473 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 019/112] virtiofsd: Add options for virtio +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Add options to specify parameters for virtio-fs paths, i.e. + + ./virtiofsd -o vhost_user_socket=/tmp/vhostqemu + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 205de006aab8dcbe546a7e3a51d295c2d05e654b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 11 ++++++++--- + tools/virtiofsd/helper.c | 14 +++++++------- + 3 files changed, 16 insertions(+), 10 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index bae0699..26b1a7d 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -63,6 +63,7 @@ struct fuse_session { + struct fuse_notify_req notify_list; + size_t bufsize; + int error; ++ char *vu_socket_path; + }; + + struct fuse_chan { +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 8552cfb..17e8718 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2115,8 +2115,11 @@ reply_err: + } + + static const struct fuse_opt fuse_ll_opts[] = { +- LL_OPTION("debug", debug, 1), LL_OPTION("-d", debug, 1), +- LL_OPTION("--debug", debug, 1), LL_OPTION("allow_root", deny_others, 1), ++ LL_OPTION("debug", debug, 1), ++ LL_OPTION("-d", debug, 1), ++ LL_OPTION("--debug", debug, 1), ++ LL_OPTION("allow_root", deny_others, 1), ++ LL_OPTION("--socket-path=%s", vu_socket_path, 0), + FUSE_OPT_END + }; + +@@ -2132,7 +2135,9 @@ void fuse_lowlevel_help(void) + * These are not all options, but the ones that are + * potentially of interest to an end-user + */ +- printf(" -o allow_root allow access by root\n"); ++ printf( ++ " -o allow_root allow access by root\n" ++ " --socket-path=PATH path for the vhost-user socket\n"); + } + + void fuse_session_destroy(struct fuse_session *se) +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 9333691..676032e 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -127,13 +127,13 @@ static const struct fuse_opt conn_info_opt_spec[] = { + + void fuse_cmdline_help(void) + { +- printf( +- " -h --help print help\n" +- " -V --version print version\n" +- " -d -o debug enable debug output (implies -f)\n" +- " -f foreground operation\n" +- " -o max_idle_threads the maximum number of idle worker threads\n" +- " allowed (default: 10)\n"); ++ printf(" -h --help print help\n" ++ " -V --version print version\n" ++ " -d -o debug enable debug output (implies -f)\n" ++ " -f foreground operation\n" ++ " -o max_idle_threads the maximum number of idle worker " ++ "threads\n" ++ " allowed (default: 10)\n"); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-passthrough_ll.patch b/kvm-virtiofsd-Add-passthrough_ll.patch new file mode 100755 index 0000000000000000000000000000000000000000..2510551f17d5b6a05dddab0e370433f05094a2fd --- /dev/null +++ b/kvm-virtiofsd-Add-passthrough_ll.patch @@ -0,0 +1,1387 @@ +From 18ef831cac81a6bd2336c73dda357d9d69f8fd25 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:43 +0100 +Subject: [PATCH 012/116] virtiofsd: Add passthrough_ll +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-9-dgilbert@redhat.com> +Patchwork-id: 93462 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 008/112] virtiofsd: Add passthrough_ll +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +passthrough_ll is one of the examples in the upstream fuse project +and is the main part of our daemon here. It passes through requests +from fuse to the underlying filesystem, using syscalls as directly +as possible. + +>From libfuse fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert + Fixed up 'GPL' to 'GPLv2' as per Dan's comments and consistent + with the 'LICENSE' file in libfuse; patch sent to libfuse to fix + it upstream. +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 7c6b66027241f41720240fc6ee1021cdbd975b2e) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 1338 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 1338 insertions(+) + create mode 100644 tools/virtiofsd/passthrough_ll.c + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +new file mode 100644 +index 0000000..e1a6056 +--- /dev/null ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -0,0 +1,1338 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU GPLv2. ++ See the file COPYING. ++*/ ++ ++/** @file ++ * ++ * This file system mirrors the existing file system hierarchy of the ++ * system, starting at the root file system. This is implemented by ++ * just "passing through" all requests to the corresponding user-space ++ * libc functions. In contrast to passthrough.c and passthrough_fh.c, ++ * this implementation uses the low-level API. Its performance should ++ * be the least bad among the three, but many operations are not ++ * implemented. In particular, it is not possible to remove files (or ++ * directories) because the code necessary to defer actual removal ++ * until the file is not opened anymore would make the example much ++ * more complicated. ++ * ++ * When writeback caching is enabled (-o writeback mount option), it ++ * is only possible to write to files for which the mounting user has ++ * read permissions. This is because the writeback cache requires the ++ * kernel to be able to issue read requests for all files (which the ++ * passthrough filesystem cannot satisfy if it can't read the file in ++ * the underlying filesystem). ++ * ++ * Compile with: ++ * ++ * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o passthrough_ll ++ * ++ * ## Source code ## ++ * \include passthrough_ll.c ++ */ ++ ++#define _GNU_SOURCE ++#define FUSE_USE_VERSION 31 ++ ++#include "config.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "passthrough_helpers.h" ++ ++/* We are re-using pointers to our `struct lo_inode` and `struct ++ lo_dirp` elements as inodes. This means that we must be able to ++ store uintptr_t values in a fuse_ino_t variable. The following ++ incantation checks this condition at compile time. */ ++#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++_Static_assert(sizeof(fuse_ino_t) >= sizeof(uintptr_t), ++ "fuse_ino_t too small to hold uintptr_t values!"); ++#else ++struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct \ ++ { unsigned _uintptr_to_must_hold_fuse_ino_t: ++ ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 1 : -1); }; ++#endif ++ ++struct lo_inode { ++ struct lo_inode *next; /* protected by lo->mutex */ ++ struct lo_inode *prev; /* protected by lo->mutex */ ++ int fd; ++ bool is_symlink; ++ ino_t ino; ++ dev_t dev; ++ uint64_t refcount; /* protected by lo->mutex */ ++}; ++ ++enum { ++ CACHE_NEVER, ++ CACHE_NORMAL, ++ CACHE_ALWAYS, ++}; ++ ++struct lo_data { ++ pthread_mutex_t mutex; ++ int debug; ++ int writeback; ++ int flock; ++ int xattr; ++ const char *source; ++ double timeout; ++ int cache; ++ int timeout_set; ++ struct lo_inode root; /* protected by lo->mutex */ ++}; ++ ++static const struct fuse_opt lo_opts[] = { ++ { "writeback", ++ offsetof(struct lo_data, writeback), 1 }, ++ { "no_writeback", ++ offsetof(struct lo_data, writeback), 0 }, ++ { "source=%s", ++ offsetof(struct lo_data, source), 0 }, ++ { "flock", ++ offsetof(struct lo_data, flock), 1 }, ++ { "no_flock", ++ offsetof(struct lo_data, flock), 0 }, ++ { "xattr", ++ offsetof(struct lo_data, xattr), 1 }, ++ { "no_xattr", ++ offsetof(struct lo_data, xattr), 0 }, ++ { "timeout=%lf", ++ offsetof(struct lo_data, timeout), 0 }, ++ { "timeout=", ++ offsetof(struct lo_data, timeout_set), 1 }, ++ { "cache=never", ++ offsetof(struct lo_data, cache), CACHE_NEVER }, ++ { "cache=auto", ++ offsetof(struct lo_data, cache), CACHE_NORMAL }, ++ { "cache=always", ++ offsetof(struct lo_data, cache), CACHE_ALWAYS }, ++ ++ FUSE_OPT_END ++}; ++ ++static struct lo_data *lo_data(fuse_req_t req) ++{ ++ return (struct lo_data *) fuse_req_userdata(req); ++} ++ ++static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) ++{ ++ if (ino == FUSE_ROOT_ID) ++ return &lo_data(req)->root; ++ else ++ return (struct lo_inode *) (uintptr_t) ino; ++} ++ ++static int lo_fd(fuse_req_t req, fuse_ino_t ino) ++{ ++ return lo_inode(req, ino)->fd; ++} ++ ++static bool lo_debug(fuse_req_t req) ++{ ++ return lo_data(req)->debug != 0; ++} ++ ++static void lo_init(void *userdata, ++ struct fuse_conn_info *conn) ++{ ++ struct lo_data *lo = (struct lo_data*) userdata; ++ ++ if(conn->capable & FUSE_CAP_EXPORT_SUPPORT) ++ conn->want |= FUSE_CAP_EXPORT_SUPPORT; ++ ++ if (lo->writeback && ++ conn->capable & FUSE_CAP_WRITEBACK_CACHE) { ++ if (lo->debug) ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); ++ conn->want |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { ++ if (lo->debug) ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); ++ conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ } ++} ++ ++static void lo_getattr(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) ++{ ++ int res; ++ struct stat buf; ++ struct lo_data *lo = lo_data(req); ++ ++ (void) fi; ++ ++ res = fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ fuse_reply_attr(req, &buf, lo->timeout); ++} ++ ++static int utimensat_empty_nofollow(struct lo_inode *inode, ++ const struct timespec *tv) ++{ ++ int res; ++ char procname[64]; ++ ++ if (inode->is_symlink) { ++ res = utimensat(inode->fd, "", tv, ++ AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1 && errno == EINVAL) { ++ /* Sorry, no race free way to set times on symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ return utimensat(AT_FDCWD, procname, tv, 0); ++} ++ ++static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, ++ int valid, struct fuse_file_info *fi) ++{ ++ int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ int ifd = inode->fd; ++ int res; ++ ++ if (valid & FUSE_SET_ATTR_MODE) { ++ if (fi) { ++ res = fchmod(fi->fh, attr->st_mode); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = chmod(procname, attr->st_mode); ++ } ++ if (res == -1) ++ goto out_err; ++ } ++ if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { ++ uid_t uid = (valid & FUSE_SET_ATTR_UID) ? ++ attr->st_uid : (uid_t) -1; ++ gid_t gid = (valid & FUSE_SET_ATTR_GID) ? ++ attr->st_gid : (gid_t) -1; ++ ++ res = fchownat(ifd, "", uid, gid, ++ AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ goto out_err; ++ } ++ if (valid & FUSE_SET_ATTR_SIZE) { ++ if (fi) { ++ res = ftruncate(fi->fh, attr->st_size); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = truncate(procname, attr->st_size); ++ } ++ if (res == -1) ++ goto out_err; ++ } ++ if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { ++ struct timespec tv[2]; ++ ++ tv[0].tv_sec = 0; ++ tv[1].tv_sec = 0; ++ tv[0].tv_nsec = UTIME_OMIT; ++ tv[1].tv_nsec = UTIME_OMIT; ++ ++ if (valid & FUSE_SET_ATTR_ATIME_NOW) ++ tv[0].tv_nsec = UTIME_NOW; ++ else if (valid & FUSE_SET_ATTR_ATIME) ++ tv[0] = attr->st_atim; ++ ++ if (valid & FUSE_SET_ATTR_MTIME_NOW) ++ tv[1].tv_nsec = UTIME_NOW; ++ else if (valid & FUSE_SET_ATTR_MTIME) ++ tv[1] = attr->st_mtim; ++ ++ if (fi) ++ res = futimens(fi->fh, tv); ++ else ++ res = utimensat_empty_nofollow(inode, tv); ++ if (res == -1) ++ goto out_err; ++ } ++ ++ return lo_getattr(req, ino, fi); ++ ++out_err: ++ saverr = errno; ++ fuse_reply_err(req, saverr); ++} ++ ++static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) ++{ ++ struct lo_inode *p; ++ struct lo_inode *ret = NULL; ++ ++ pthread_mutex_lock(&lo->mutex); ++ for (p = lo->root.next; p != &lo->root; p = p->next) { ++ if (p->ino == st->st_ino && p->dev == st->st_dev) { ++ assert(p->refcount > 0); ++ ret = p; ++ ret->refcount++; ++ break; ++ } ++ } ++ pthread_mutex_unlock(&lo->mutex); ++ return ret; ++} ++ ++static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, ++ struct fuse_entry_param *e) ++{ ++ int newfd; ++ int res; ++ int saverr; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ ++ memset(e, 0, sizeof(*e)); ++ e->attr_timeout = lo->timeout; ++ e->entry_timeout = lo->timeout; ++ ++ newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); ++ if (newfd == -1) ++ goto out_err; ++ ++ res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ goto out_err; ++ ++ inode = lo_find(lo_data(req), &e->attr); ++ if (inode) { ++ close(newfd); ++ newfd = -1; ++ } else { ++ struct lo_inode *prev, *next; ++ ++ saverr = ENOMEM; ++ inode = calloc(1, sizeof(struct lo_inode)); ++ if (!inode) ++ goto out_err; ++ ++ inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ inode->refcount = 1; ++ inode->fd = newfd; ++ inode->ino = e->attr.st_ino; ++ inode->dev = e->attr.st_dev; ++ ++ pthread_mutex_lock(&lo->mutex); ++ prev = &lo->root; ++ next = prev->next; ++ next->prev = inode; ++ inode->next = next; ++ inode->prev = prev; ++ prev->next = inode; ++ pthread_mutex_unlock(&lo->mutex); ++ } ++ e->ino = (uintptr_t) inode; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long) parent, name, (unsigned long long) e->ino); ++ ++ return 0; ++ ++out_err: ++ saverr = errno; ++ if (newfd != -1) ++ close(newfd); ++ return saverr; ++} ++ ++static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) ++{ ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) ++ fuse_reply_err(req, err); ++ else ++ fuse_reply_entry(req, &e); ++} ++ ++static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, ++ const char *name, mode_t mode, dev_t rdev, ++ const char *link) ++{ ++ int res; ++ int saverr; ++ struct lo_inode *dir = lo_inode(req, parent); ++ struct fuse_entry_param e; ++ ++ saverr = ENOMEM; ++ ++ res = mknod_wrapper(dir->fd, name, link, mode, rdev); ++ ++ saverr = errno; ++ if (res == -1) ++ goto out; ++ ++ saverr = lo_do_lookup(req, parent, name, &e); ++ if (saverr) ++ goto out; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long) parent, name, (unsigned long long) e.ino); ++ ++ fuse_reply_entry(req, &e); ++ return; ++ ++out: ++ fuse_reply_err(req, saverr); ++} ++ ++static void lo_mknod(fuse_req_t req, fuse_ino_t parent, ++ const char *name, mode_t mode, dev_t rdev) ++{ ++ lo_mknod_symlink(req, parent, name, mode, rdev, NULL); ++} ++ ++static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode) ++{ ++ lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); ++} ++ ++static void lo_symlink(fuse_req_t req, const char *link, ++ fuse_ino_t parent, const char *name) ++{ ++ lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); ++} ++ ++static int linkat_empty_nofollow(struct lo_inode *inode, int dfd, ++ const char *name) ++{ ++ int res; ++ char procname[64]; ++ ++ if (inode->is_symlink) { ++ res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); ++ if (res == -1 && (errno == ENOENT || errno == EINVAL)) { ++ /* Sorry, no race free way to hard-link a symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); ++} ++ ++static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, ++ const char *name) ++{ ++ int res; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ struct fuse_entry_param e; ++ int saverr; ++ ++ memset(&e, 0, sizeof(struct fuse_entry_param)); ++ e.attr_timeout = lo->timeout; ++ e.entry_timeout = lo->timeout; ++ ++ res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); ++ if (res == -1) ++ goto out_err; ++ ++ res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) ++ goto out_err; ++ ++ pthread_mutex_lock(&lo->mutex); ++ inode->refcount++; ++ pthread_mutex_unlock(&lo->mutex); ++ e.ino = (uintptr_t) inode; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long) parent, name, ++ (unsigned long long) e.ino); ++ ++ fuse_reply_entry(req, &e); ++ return; ++ ++out_err: ++ saverr = errno; ++ fuse_reply_err(req, saverr); ++} ++ ++static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) ++{ ++ int res; ++ ++ res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags) ++{ ++ int res; ++ ++ if (flags) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ res = renameat(lo_fd(req, parent), name, ++ lo_fd(req, newparent), newname); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) ++{ ++ int res; ++ ++ res = unlinkat(lo_fd(req, parent), name, 0); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) ++{ ++ if (!inode) ++ return; ++ ++ pthread_mutex_lock(&lo->mutex); ++ assert(inode->refcount >= n); ++ inode->refcount -= n; ++ if (!inode->refcount) { ++ struct lo_inode *prev, *next; ++ ++ prev = inode->prev; ++ next = inode->next; ++ next->prev = prev; ++ prev->next = next; ++ ++ pthread_mutex_unlock(&lo->mutex); ++ close(inode->fd); ++ free(inode); ++ ++ } else { ++ pthread_mutex_unlock(&lo->mutex); ++ } ++} ++ ++static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", ++ (unsigned long long) ino, ++ (unsigned long long) inode->refcount, ++ (unsigned long long) nlookup); ++ } ++ ++ unref_inode(lo, inode, nlookup); ++} ++ ++static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) ++{ ++ lo_forget_one(req, ino, nlookup); ++ fuse_reply_none(req); ++} ++ ++static void lo_forget_multi(fuse_req_t req, size_t count, ++ struct fuse_forget_data *forgets) ++{ ++ int i; ++ ++ for (i = 0; i < count; i++) ++ lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); ++ fuse_reply_none(req); ++} ++ ++static void lo_readlink(fuse_req_t req, fuse_ino_t ino) ++{ ++ char buf[PATH_MAX + 1]; ++ int res; ++ ++ res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); ++ if (res == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ if (res == sizeof(buf)) ++ return (void) fuse_reply_err(req, ENAMETOOLONG); ++ ++ buf[res] = '\0'; ++ ++ fuse_reply_readlink(req, buf); ++} ++ ++struct lo_dirp { ++ DIR *dp; ++ struct dirent *entry; ++ off_t offset; ++}; ++ ++static struct lo_dirp *lo_dirp(struct fuse_file_info *fi) ++{ ++ return (struct lo_dirp *) (uintptr_t) fi->fh; ++} ++ ++static void lo_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ int error = ENOMEM; ++ struct lo_data *lo = lo_data(req); ++ struct lo_dirp *d; ++ int fd; ++ ++ d = calloc(1, sizeof(struct lo_dirp)); ++ if (d == NULL) ++ goto out_err; ++ ++ fd = openat(lo_fd(req, ino), ".", O_RDONLY); ++ if (fd == -1) ++ goto out_errno; ++ ++ d->dp = fdopendir(fd); ++ if (d->dp == NULL) ++ goto out_errno; ++ ++ d->offset = 0; ++ d->entry = NULL; ++ ++ fi->fh = (uintptr_t) d; ++ if (lo->cache == CACHE_ALWAYS) ++ fi->keep_cache = 1; ++ fuse_reply_open(req, fi); ++ return; ++ ++out_errno: ++ error = errno; ++out_err: ++ if (d) { ++ if (fd != -1) ++ close(fd); ++ free(d); ++ } ++ fuse_reply_err(req, error); ++} ++ ++static int is_dot_or_dotdot(const char *name) ++{ ++ return name[0] == '.' && (name[1] == '\0' || ++ (name[1] == '.' && name[2] == '\0')); ++} ++ ++static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi, int plus) ++{ ++ struct lo_dirp *d = lo_dirp(fi); ++ char *buf; ++ char *p; ++ size_t rem = size; ++ int err; ++ ++ (void) ino; ++ ++ buf = calloc(1, size); ++ if (!buf) { ++ err = ENOMEM; ++ goto error; ++ } ++ p = buf; ++ ++ if (offset != d->offset) { ++ seekdir(d->dp, offset); ++ d->entry = NULL; ++ d->offset = offset; ++ } ++ while (1) { ++ size_t entsize; ++ off_t nextoff; ++ const char *name; ++ ++ if (!d->entry) { ++ errno = 0; ++ d->entry = readdir(d->dp); ++ if (!d->entry) { ++ if (errno) { // Error ++ err = errno; ++ goto error; ++ } else { // End of stream ++ break; ++ } ++ } ++ } ++ nextoff = d->entry->d_off; ++ name = d->entry->d_name; ++ fuse_ino_t entry_ino = 0; ++ if (plus) { ++ struct fuse_entry_param e; ++ if (is_dot_or_dotdot(name)) { ++ e = (struct fuse_entry_param) { ++ .attr.st_ino = d->entry->d_ino, ++ .attr.st_mode = d->entry->d_type << 12, ++ }; ++ } else { ++ err = lo_do_lookup(req, ino, name, &e); ++ if (err) ++ goto error; ++ entry_ino = e.ino; ++ } ++ ++ entsize = fuse_add_direntry_plus(req, p, rem, name, ++ &e, nextoff); ++ } else { ++ struct stat st = { ++ .st_ino = d->entry->d_ino, ++ .st_mode = d->entry->d_type << 12, ++ }; ++ entsize = fuse_add_direntry(req, p, rem, name, ++ &st, nextoff); ++ } ++ if (entsize > rem) { ++ if (entry_ino != 0) ++ lo_forget_one(req, entry_ino, 1); ++ break; ++ } ++ ++ p += entsize; ++ rem -= entsize; ++ ++ d->entry = NULL; ++ d->offset = nextoff; ++ } ++ ++ err = 0; ++error: ++ // If there's an error, we can only signal it if we haven't stored ++ // any entries yet - otherwise we'd end up with wrong lookup ++ // counts for the entries that are already in the buffer. So we ++ // return what we've collected until that point. ++ if (err && rem == size) ++ fuse_reply_err(req, err); ++ else ++ fuse_reply_buf(req, buf, size - rem); ++ free(buf); ++} ++ ++static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi) ++{ ++ lo_do_readdir(req, ino, size, offset, fi, 0); ++} ++ ++static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi) ++{ ++ lo_do_readdir(req, ino, size, offset, fi, 1); ++} ++ ++static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ struct lo_dirp *d = lo_dirp(fi); ++ (void) ino; ++ closedir(d->dp); ++ free(d); ++ fuse_reply_err(req, 0); ++} ++ ++static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, struct fuse_file_info *fi) ++{ ++ int fd; ++ struct lo_data *lo = lo_data(req); ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ ++ fd = openat(lo_fd(req, parent), name, ++ (fi->flags | O_CREAT) & ~O_NOFOLLOW, mode); ++ if (fd == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) ++ fi->direct_io = 1; ++ else if (lo->cache == CACHE_ALWAYS) ++ fi->keep_cache = 1; ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) ++ fuse_reply_err(req, err); ++ else ++ fuse_reply_create(req, &e, fi); ++} ++ ++static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi) ++{ ++ int res; ++ int fd = dirfd(lo_dirp(fi)->dp); ++ (void) ino; ++ if (datasync) ++ res = fdatasync(fd); ++ else ++ res = fsync(fd); ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ int fd; ++ char buf[64]; ++ struct lo_data *lo = lo_data(req); ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ++ ino, fi->flags); ++ ++ /* With writeback cache, kernel may send read requests even ++ when userspace opened write-only */ ++ if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { ++ fi->flags &= ~O_ACCMODE; ++ fi->flags |= O_RDWR; ++ } ++ ++ /* With writeback cache, O_APPEND is handled by the kernel. ++ This breaks atomicity (since the file may change in the ++ underlying filesystem, so that the kernel's idea of the ++ end of the file isn't accurate anymore). In this example, ++ we just accept that. A more rigorous filesystem may want ++ to return an error here */ ++ if (lo->writeback && (fi->flags & O_APPEND)) ++ fi->flags &= ~O_APPEND; ++ ++ sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ fd = open(buf, fi->flags & ~O_NOFOLLOW); ++ if (fd == -1) ++ return (void) fuse_reply_err(req, errno); ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) ++ fi->direct_io = 1; ++ else if (lo->cache == CACHE_ALWAYS) ++ fi->keep_cache = 1; ++ fuse_reply_open(req, fi); ++} ++ ++static void lo_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ (void) ino; ++ ++ close(fi->fh); ++ fuse_reply_err(req, 0); ++} ++ ++static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++{ ++ int res; ++ (void) ino; ++ res = close(dup(fi->fh)); ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi) ++{ ++ int res; ++ (void) ino; ++ if (datasync) ++ res = fdatasync(fi->fh); ++ else ++ res = fsync(fi->fh); ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, ++ off_t offset, struct fuse_file_info *fi) ++{ ++ struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_read(ino=%" PRIu64 ", size=%zd, " ++ "off=%lu)\n", ino, size, (unsigned long) offset); ++ ++ buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ buf.buf[0].fd = fi->fh; ++ buf.buf[0].pos = offset; ++ ++ fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); ++} ++ ++static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_bufvec *in_buf, off_t off, ++ struct fuse_file_info *fi) ++{ ++ (void) ino; ++ ssize_t res; ++ struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); ++ ++ out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ out_buf.buf[0].fd = fi->fh; ++ out_buf.buf[0].pos = off; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ++ ino, out_buf.buf[0].size, (unsigned long) off); ++ ++ res = fuse_buf_copy(&out_buf, in_buf, 0); ++ if(res < 0) ++ fuse_reply_err(req, -res); ++ else ++ fuse_reply_write(req, (size_t) res); ++} ++ ++static void lo_statfs(fuse_req_t req, fuse_ino_t ino) ++{ ++ int res; ++ struct statvfs stbuf; ++ ++ res = fstatvfs(lo_fd(req, ino), &stbuf); ++ if (res == -1) ++ fuse_reply_err(req, errno); ++ else ++ fuse_reply_statfs(req, &stbuf); ++} ++ ++static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, ++ off_t offset, off_t length, struct fuse_file_info *fi) ++{ ++ int err = EOPNOTSUPP; ++ (void) ino; ++ ++#ifdef HAVE_FALLOCATE ++ err = fallocate(fi->fh, mode, offset, length); ++ if (err < 0) ++ err = errno; ++ ++#elif defined(HAVE_POSIX_FALLOCATE) ++ if (mode) { ++ fuse_reply_err(req, EOPNOTSUPP); ++ return; ++ } ++ ++ err = posix_fallocate(fi->fh, offset, length); ++#endif ++ ++ fuse_reply_err(req, err); ++} ++ ++static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ int op) ++{ ++ int res; ++ (void) ino; ++ ++ res = flock(fi->fh, op); ++ ++ fuse_reply_err(req, res == -1 ? errno : 0); ++} ++ ++static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, ++ size_t size) ++{ ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ++ ino, name, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to getxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) ++ goto out_err; ++ ++ ret = getxattr(procname, name, value, size); ++ if (ret == -1) ++ goto out_err; ++ saverr = 0; ++ if (ret == 0) ++ goto out; ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = getxattr(procname, name, NULL, 0); ++ if (ret == -1) ++ goto out_err; ++ ++ fuse_reply_xattr(req, ret); ++ } ++out_free: ++ free(value); ++ return; ++ ++out_err: ++ saverr = errno; ++out: ++ fuse_reply_err(req, saverr); ++ goto out_free; ++} ++ ++static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) ++{ ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ++ ino, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to listxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) ++ goto out_err; ++ ++ ret = listxattr(procname, value, size); ++ if (ret == -1) ++ goto out_err; ++ saverr = 0; ++ if (ret == 0) ++ goto out; ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = listxattr(procname, NULL, 0); ++ if (ret == -1) ++ goto out_err; ++ ++ fuse_reply_xattr(req, ret); ++ } ++out_free: ++ free(value); ++ return; ++ ++out_err: ++ saverr = errno; ++out: ++ fuse_reply_err(req, saverr); ++ goto out_free; ++} ++ ++static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, ++ const char *value, size_t size, int flags) ++{ ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", ++ ino, name, value, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ ret = setxattr(procname, name, value, size, flags); ++ saverr = ret == -1 ? errno : 0; ++ ++out: ++ fuse_reply_err(req, saverr); ++} ++ ++static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) ++{ ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) ++ goto out; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ++ ino, name); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ ret = removexattr(procname, name); ++ saverr = ret == -1 ? errno : 0; ++ ++out: ++ fuse_reply_err(req, saverr); ++} ++ ++#ifdef HAVE_COPY_FILE_RANGE ++static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, ++ struct fuse_file_info *fi_in, ++ fuse_ino_t ino_out, off_t off_out, ++ struct fuse_file_info *fi_out, size_t len, ++ int flags) ++{ ++ ssize_t res; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, size=%zd, flags=0x%x)\n", ++ ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, ++ len, flags); ++ ++ res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, ++ flags); ++ if (res < 0) ++ fuse_reply_err(req, -errno); ++ else ++ fuse_reply_write(req, res); ++} ++#endif ++ ++static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, ++ struct fuse_file_info *fi) ++{ ++ off_t res; ++ ++ (void)ino; ++ res = lseek(fi->fh, off, whence); ++ if (res != -1) ++ fuse_reply_lseek(req, res); ++ else ++ fuse_reply_err(req, errno); ++} ++ ++static struct fuse_lowlevel_ops lo_oper = { ++ .init = lo_init, ++ .lookup = lo_lookup, ++ .mkdir = lo_mkdir, ++ .mknod = lo_mknod, ++ .symlink = lo_symlink, ++ .link = lo_link, ++ .unlink = lo_unlink, ++ .rmdir = lo_rmdir, ++ .rename = lo_rename, ++ .forget = lo_forget, ++ .forget_multi = lo_forget_multi, ++ .getattr = lo_getattr, ++ .setattr = lo_setattr, ++ .readlink = lo_readlink, ++ .opendir = lo_opendir, ++ .readdir = lo_readdir, ++ .readdirplus = lo_readdirplus, ++ .releasedir = lo_releasedir, ++ .fsyncdir = lo_fsyncdir, ++ .create = lo_create, ++ .open = lo_open, ++ .release = lo_release, ++ .flush = lo_flush, ++ .fsync = lo_fsync, ++ .read = lo_read, ++ .write_buf = lo_write_buf, ++ .statfs = lo_statfs, ++ .fallocate = lo_fallocate, ++ .flock = lo_flock, ++ .getxattr = lo_getxattr, ++ .listxattr = lo_listxattr, ++ .setxattr = lo_setxattr, ++ .removexattr = lo_removexattr, ++#ifdef HAVE_COPY_FILE_RANGE ++ .copy_file_range = lo_copy_file_range, ++#endif ++ .lseek = lo_lseek, ++}; ++ ++int main(int argc, char *argv[]) ++{ ++ struct fuse_args args = FUSE_ARGS_INIT(argc, argv); ++ struct fuse_session *se; ++ struct fuse_cmdline_opts opts; ++ struct lo_data lo = { .debug = 0, ++ .writeback = 0 }; ++ int ret = -1; ++ ++ /* Don't mask creation mode, kernel already did that */ ++ umask(0); ++ ++ pthread_mutex_init(&lo.mutex, NULL); ++ lo.root.next = lo.root.prev = &lo.root; ++ lo.root.fd = -1; ++ lo.cache = CACHE_NORMAL; ++ ++ if (fuse_parse_cmdline(&args, &opts) != 0) ++ return 1; ++ if (opts.show_help) { ++ printf("usage: %s [options] \n\n", argv[0]); ++ fuse_cmdline_help(); ++ fuse_lowlevel_help(); ++ ret = 0; ++ goto err_out1; ++ } else if (opts.show_version) { ++ printf("FUSE library version %s\n", fuse_pkgversion()); ++ fuse_lowlevel_version(); ++ ret = 0; ++ goto err_out1; ++ } ++ ++ if(opts.mountpoint == NULL) { ++ printf("usage: %s [options] \n", argv[0]); ++ printf(" %s --help\n", argv[0]); ++ ret = 1; ++ goto err_out1; ++ } ++ ++ if (fuse_opt_parse(&args, &lo, lo_opts, NULL)== -1) ++ return 1; ++ ++ lo.debug = opts.debug; ++ lo.root.refcount = 2; ++ if (lo.source) { ++ struct stat stat; ++ int res; ++ ++ res = lstat(lo.source, &stat); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", ++ lo.source); ++ exit(1); ++ } ++ if (!S_ISDIR(stat.st_mode)) { ++ fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); ++ exit(1); ++ } ++ ++ } else { ++ lo.source = "/"; ++ } ++ lo.root.is_symlink = false; ++ if (!lo.timeout_set) { ++ switch (lo.cache) { ++ case CACHE_NEVER: ++ lo.timeout = 0.0; ++ break; ++ ++ case CACHE_NORMAL: ++ lo.timeout = 1.0; ++ break; ++ ++ case CACHE_ALWAYS: ++ lo.timeout = 86400.0; ++ break; ++ } ++ } else if (lo.timeout < 0) { ++ fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", ++ lo.timeout); ++ exit(1); ++ } ++ ++ lo.root.fd = open(lo.source, O_PATH); ++ if (lo.root.fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", ++ lo.source); ++ exit(1); ++ } ++ ++ se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); ++ if (se == NULL) ++ goto err_out1; ++ ++ if (fuse_set_signal_handlers(se) != 0) ++ goto err_out2; ++ ++ if (fuse_session_mount(se, opts.mountpoint) != 0) ++ goto err_out3; ++ ++ fuse_daemonize(opts.foreground); ++ ++ /* Block until ctrl+c or fusermount -u */ ++ if (opts.singlethread) ++ ret = fuse_session_loop(se); ++ else ++ ret = fuse_session_loop_mt(se, opts.clone_fd); ++ ++ fuse_session_unmount(se); ++err_out3: ++ fuse_remove_signal_handlers(se); ++err_out2: ++ fuse_session_destroy(se); ++err_out1: ++ free(opts.mountpoint); ++ fuse_opt_free_args(&args); ++ ++ if (lo.root.fd >= 0) ++ close(lo.root.fd); ++ ++ return ret ? 1 : 0; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch b/kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch new file mode 100755 index 0000000000000000000000000000000000000000..cef537a22a982041dc20814f48391abfc89f7282 --- /dev/null +++ b/kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch @@ -0,0 +1,73 @@ +From 52e93f2dc499ead339bf808dac3480b369dfadd1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:39 +0100 +Subject: [PATCH 068/116] virtiofsd: Add timestamp to the log with + FUSE_LOG_DEBUG level +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-65-dgilbert@redhat.com> +Patchwork-id: 93517 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 064/112] virtiofsd: Add timestamp to the log with FUSE_LOG_DEBUG level +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +virtiofsd has some threads, so we see a lot of logs with debug option. +It would be useful for debugging if we can see the timestamp. + +Add nano second timestamp, which got by get_clock(), to the log with +FUSE_LOG_DEBUG level if the syslog option isn't set. + +The log is like as: + + # ./virtiofsd -d -o vhost_user_socket=/tmp/vhostqemu0 -o source=/tmp/share0 -o cache=auto + ... + [5365943125463727] [ID: 00000002] fv_queue_thread: Start for queue 0 kick_fd 9 + [5365943125568644] [ID: 00000002] fv_queue_thread: Waiting for Queue 0 event + [5365943125573561] [ID: 00000002] fv_queue_thread: Got queue event on Queue 0 + +Signed-off-by: Masayoshi Mizuma +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 50fb955aa0e6ede929422146936cf68bf1ca876f) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index f08324f..98114a3 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -36,6 +36,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/timer.h" + #include "fuse_virtio.h" + #include "fuse_log.h" + #include "fuse_lowlevel.h" +@@ -2276,7 +2277,13 @@ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + } + + if (current_log_level == FUSE_LOG_DEBUG) { +- localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), fmt); ++ if (!use_syslog) { ++ localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s", ++ get_clock(), syscall(__NR_gettid), fmt); ++ } else { ++ localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), ++ fmt); ++ } + fmt = localfmt; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Clean-up-inodes-on-destroy.patch b/kvm-virtiofsd-Clean-up-inodes-on-destroy.patch new file mode 100755 index 0000000000000000000000000000000000000000..4713a0d92f49ee3c69e33edbdeeef0412ed18a72 --- /dev/null +++ b/kvm-virtiofsd-Clean-up-inodes-on-destroy.patch @@ -0,0 +1,85 @@ +From 2b921f7162b53204051955228bf99bbed55d2457 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:53 +0100 +Subject: [PATCH 082/116] virtiofsd: Clean up inodes on destroy +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-79-dgilbert@redhat.com> +Patchwork-id: 93532 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 078/112] virtiofsd: Clean up inodes on destroy +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Clear out our inodes and fd's on a 'destroy' - so we get rid +of them if we reboot the guest. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 771b01eb76ff480fee984bd1d21727147cc3e702) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index b176a31..9ed77a1 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1169,6 +1169,25 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + } + ++static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) ++{ ++ struct lo_inode *inode = value; ++ struct lo_data *lo = user_data; ++ ++ inode->refcount = 0; ++ lo_map_remove(&lo->ino_map, inode->fuse_ino); ++ close(inode->fd); ++ ++ return TRUE; ++} ++ ++static void unref_all_inodes(struct lo_data *lo) ++{ ++ pthread_mutex_lock(&lo->mutex); ++ g_hash_table_foreach_remove(lo->inodes, unref_all_inodes_cb, lo); ++ pthread_mutex_unlock(&lo->mutex); ++} ++ + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +@@ -2035,6 +2054,12 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + } + } + ++static void lo_destroy(void *userdata) ++{ ++ struct lo_data *lo = (struct lo_data *)userdata; ++ unref_all_inodes(lo); ++} ++ + static struct fuse_lowlevel_ops lo_oper = { + .init = lo_init, + .lookup = lo_lookup, +@@ -2073,6 +2098,7 @@ static struct fuse_lowlevel_ops lo_oper = { + .copy_file_range = lo_copy_file_range, + #endif + .lseek = lo_lseek, ++ .destroy = lo_destroy, + }; + + /* Print vhost-user.json backend program capabilities */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch b/kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch new file mode 100755 index 0000000000000000000000000000000000000000..c4213657d2313a45237cfa6b4d67a02d02aa6147 --- /dev/null +++ b/kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch @@ -0,0 +1,112 @@ +From 24f91062f571ad2dd2ac22db3b7d456a2c8bd2cb Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:23 +0100 +Subject: [PATCH 112/116] virtiofsd: Convert lo_destroy to take the lo->mutex + lock itself +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-109-dgilbert@redhat.com> +Patchwork-id: 93563 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 108/112] virtiofsd: Convert lo_destroy to take the lo->mutex lock itself +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +lo_destroy was relying on some implicit knowledge of the locking; +we can avoid this if we create an unref_inode that doesn't take +the lock and then grab it for the whole of the lo_destroy. + +Suggested-by: Vivek Goyal +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit fe4c15798a48143dd6b1f58d2d3cad12206ce211) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 31 +++++++++++++++++-------------- + 1 file changed, 17 insertions(+), 14 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index eb001b9..fc15d61 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1344,14 +1344,13 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + lo_inode_put(lo, &inode); + } + +-static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, +- uint64_t n) ++/* To be called with lo->mutex held */ ++static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + { + if (!inode) { + return; + } + +- pthread_mutex_lock(&lo->mutex); + assert(inode->nlookup >= n); + inode->nlookup -= n; + if (!inode->nlookup) { +@@ -1362,15 +1361,24 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + g_hash_table_destroy(inode->posix_locks); + pthread_mutex_destroy(&inode->plock_mutex); +- pthread_mutex_unlock(&lo->mutex); + + /* Drop our refcount from lo_do_lookup() */ + lo_inode_put(lo, &inode); +- } else { +- pthread_mutex_unlock(&lo->mutex); + } + } + ++static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, ++ uint64_t n) ++{ ++ if (!inode) { ++ return; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ unref_inode(lo, inode, n); ++ pthread_mutex_unlock(&lo->mutex); ++} ++ + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +@@ -2458,13 +2466,7 @@ static void lo_destroy(void *userdata) + { + struct lo_data *lo = (struct lo_data *)userdata; + +- /* +- * Normally lo->mutex must be taken when traversing lo->inodes but +- * lo_destroy() is a serialized request so no races are possible here. +- * +- * In addition, we cannot acquire lo->mutex since unref_inode() takes it +- * too and this would result in a recursive lock. +- */ ++ pthread_mutex_lock(&lo->mutex); + while (true) { + GHashTableIter iter; + gpointer key, value; +@@ -2475,8 +2477,9 @@ static void lo_destroy(void *userdata) + } + + struct lo_inode *inode = value; +- unref_inode_lolocked(lo, inode, inode->nlookup); ++ unref_inode(lo, inode, inode->nlookup); + } ++ pthread_mutex_unlock(&lo->mutex); + } + + static struct fuse_lowlevel_ops lo_oper = { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch b/kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch new file mode 100755 index 0000000000000000000000000000000000000000..90b6b35ac0bcd6c70e37f0559ebab12baca1095a --- /dev/null +++ b/kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch @@ -0,0 +1,72 @@ +From 3ec945ba7c2649cca13cf6070c6365b1262ad1ec Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Fri, 6 Aug 2021 11:58:26 -0400 +Subject: [PATCH 1/2] virtiofsd: Disable remote posix locks by default +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20210806115827.740945-2-mreitz@redhat.com> +Patchwork-id: 101970 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/2] virtiofsd: Disable remote posix locks by default +Bugzilla: 1967496 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Vivek Goyal + +From: Vivek Goyal + +Right now we enable remote posix locks by default. That means when guest +does a posix lock it sends request to server (virtiofsd). But currently +we only support non-blocking posix lock and return -EOPNOTSUPP for +blocking version. + +This means that existing applications which are doing blocking posix +locks get -EOPNOTSUPP and fail. To avoid this, people have been +running virtiosd with option "-o no_posix_lock". For new users it +is still a surprise and trial and error takes them to this option. + +Given posix lock implementation is not complete in virtiofsd, disable +it by default. This means that posix locks will work with-in applications +in a guest but not across guests. Anyway we don't support sharing +filesystem among different guests yet in virtiofs so this should +not lead to any kind of surprise or regression and will make life +little easier for virtiofs users. + +Reported-by: Aa Aa +Suggested-by: Miklos Szeredi +Signed-off-by: Vivek Goyal +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 88fc107956a5812649e5918e0c092d3f78bb28ad) + +Conflicts: + docs/tools/virtiofsd.rst + We do not have virtiofsd.rst downstream (added upstream in + commit 6a7e2bbee5fa), so I dropped that hunk (which effectively + updated the default value in the man page). + +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index cb0992f2db..b47029da89 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -3001,7 +3001,7 @@ int main(int argc, char *argv[]) + struct lo_data lo = { + .debug = 0, + .writeback = 0, +- .posix_lock = 1, ++ .posix_lock = 0, + .proc_self_fd = -1, + }; + struct lo_map_elem *root_elem; +-- +2.27.0 + diff --git a/kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch b/kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch new file mode 100755 index 0000000000000000000000000000000000000000..9f198c2d801770c98c433a7338ed60def9cfbc30 --- /dev/null +++ b/kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch @@ -0,0 +1,176 @@ +From e217ab392e0d4c770ec18dbfbe986771773cb557 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:33 +0100 +Subject: [PATCH 062/116] virtiofsd: Drop CAP_FSETID if client asked for it +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-59-dgilbert@redhat.com> +Patchwork-id: 93513 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 058/112] virtiofsd: Drop CAP_FSETID if client asked for it +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +If client requested killing setuid/setgid bits on file being written, drop +CAP_FSETID capability so that setuid/setgid bits are cleared upon write +automatically. + +pjdfstest chown/12.t needs this. + +Signed-off-by: Vivek Goyal + dgilbert: reworked for libcap-ng +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit ee88465224b3aed2596049caa28f86cbe0d5a3d0) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 105 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 105 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 97e7c75..d53cb1e 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -201,6 +201,91 @@ static int load_capng(void) + return 0; + } + ++/* ++ * Helpers for dropping and regaining effective capabilities. Returns 0 ++ * on success, error otherwise ++ */ ++static int drop_effective_cap(const char *cap_name, bool *cap_dropped) ++{ ++ int cap, ret; ++ ++ cap = capng_name_to_capability(cap_name); ++ if (cap < 0) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", ++ cap_name, strerror(errno)); ++ goto out; ++ } ++ ++ if (load_capng()) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); ++ goto out; ++ } ++ ++ /* We dont have this capability in effective set already. */ ++ if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) { ++ ret = 0; ++ goto out; ++ } ++ ++ if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n"); ++ goto out; ++ } ++ ++ if (capng_apply(CAPNG_SELECT_CAPS)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n"); ++ goto out; ++ } ++ ++ ret = 0; ++ if (cap_dropped) { ++ *cap_dropped = true; ++ } ++ ++out: ++ return ret; ++} ++ ++static int gain_effective_cap(const char *cap_name) ++{ ++ int cap; ++ int ret = 0; ++ ++ cap = capng_name_to_capability(cap_name); ++ if (cap < 0) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", ++ cap_name, strerror(errno)); ++ goto out; ++ } ++ ++ if (load_capng()) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); ++ goto out; ++ } ++ ++ if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n"); ++ goto out; ++ } ++ ++ if (capng_apply(CAPNG_SELECT_CAPS)) { ++ ret = errno; ++ fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n"); ++ goto out; ++ } ++ ret = 0; ++ ++out: ++ return ret; ++} ++ + static void lo_map_init(struct lo_map *map) + { + map->elems = NULL; +@@ -1577,6 +1662,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + (void)ino; + ssize_t res; + struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); ++ bool cap_fsetid_dropped = false; + + out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + out_buf.buf[0].fd = lo_fi_fd(req, fi); +@@ -1588,12 +1674,31 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + out_buf.buf[0].size, (unsigned long)off); + } + ++ /* ++ * If kill_priv is set, drop CAP_FSETID which should lead to kernel ++ * clearing setuid/setgid on file. ++ */ ++ if (fi->kill_priv) { ++ res = drop_effective_cap("FSETID", &cap_fsetid_dropped); ++ if (res != 0) { ++ fuse_reply_err(req, res); ++ return; ++ } ++ } ++ + res = fuse_buf_copy(&out_buf, in_buf); + if (res < 0) { + fuse_reply_err(req, -res); + } else { + fuse_reply_write(req, (size_t)res); + } ++ ++ if (cap_fsetid_dropped) { ++ res = gain_effective_cap("FSETID"); ++ if (res) { ++ fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); ++ } ++ } + } + + static void lo_statfs(fuse_req_t req, fuse_ino_t ino) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch b/kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch new file mode 100755 index 0000000000000000000000000000000000000000..152032b2207a82ed9cd1190a6bd3633fc4609664 --- /dev/null +++ b/kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch @@ -0,0 +1,111 @@ +From 746e07f2d54908296dde64e97e12ea33a35063e0 Mon Sep 17 00:00:00 2001 +From: Vivek Goyal +Date: Tue, 25 Jan 2022 13:51:14 -0500 +Subject: [PATCH] virtiofsd: Drop membership of all supplementary groups + (CVE-2022-0358) + +RH-Author: Dr. David Alan Gilbert +RH-MergeRequest: 106: 8.5.0z non-av; virtiofsd security fix - drop secondary groups +RH-Commit: [1/1] e39df0b31f3c236675262395b94d5c10e8e3073f +RH-Bugzilla: 2048627 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Laszlo Ersek +RH-Acked-by: Vivek Goyal + +At the start, drop membership of all supplementary groups. This is +not required. + +If we have membership of "root" supplementary group and when we switch +uid/gid using setresuid/setsgid, we still retain membership of existing +supplemntary groups. And that can allow some operations which are not +normally allowed. + +For example, if root in guest creates a dir as follows. + +$ mkdir -m 03777 test_dir + +This sets SGID on dir as well as allows unprivileged users to write into +this dir. + +And now as unprivileged user open file as follows. + +$ su test +$ fd = open("test_dir/priviledge_id", O_RDWR|O_CREAT|O_EXCL, 02755); + +This will create SGID set executable in test_dir/. + +And that's a problem because now an unpriviliged user can execute it, +get egid=0 and get access to resources owned by "root" group. This is +privilege escalation. + +Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2044863 +Fixes: CVE-2022-0358 +Reported-by: JIETAO XIAO +Suggested-by: Miklos Szeredi +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Vivek Goyal +Message-Id: +Signed-off-by: Dr. David Alan Gilbert + dgilbert: Fixed missing {}'s style nit +(cherry picked from commit 449e8171f96a6a944d1f3b7d3627ae059eae21ca) + dgilbert: Minor fixup around #includes on backport +--- + tools/virtiofsd/passthrough_ll.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index b47029da89..578131179c 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + + #include "passthrough_helpers.h" + #include "seccomp.h" +@@ -1058,6 +1059,30 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + #define OURSYS_setresuid SYS_setresuid + #endif + ++static void drop_supplementary_groups(void) ++{ ++ int ret; ++ ++ ret = getgroups(0, NULL); ++ if (ret == -1) { ++ fuse_log(FUSE_LOG_ERR, "getgroups() failed with error=%d:%s\n", ++ errno, strerror(errno)); ++ exit(1); ++ } ++ ++ if (!ret) { ++ return; ++ } ++ ++ /* Drop all supplementary groups. We should not need it */ ++ ret = setgroups(0, NULL); ++ if (ret == -1) { ++ fuse_log(FUSE_LOG_ERR, "setgroups() failed with error=%d:%s\n", ++ errno, strerror(errno)); ++ exit(1); ++ } ++} ++ + /* + * Change to uid/gid of caller so that file is created with + * ownership of caller. +@@ -3010,6 +3035,8 @@ int main(int argc, char *argv[]) + /* Don't mask creation mode, kernel already did that */ + umask(0); + ++ drop_supplementary_groups(); ++ + pthread_mutex_init(&lo.mutex, NULL); + lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); + lo.root.fd = -1; +-- +2.27.0 + diff --git a/kvm-virtiofsd-Fast-path-for-virtio-read.patch b/kvm-virtiofsd-Fast-path-for-virtio-read.patch new file mode 100755 index 0000000000000000000000000000000000000000..03874ce356317cabb2d93fb72d381ddbdc4931d0 --- /dev/null +++ b/kvm-virtiofsd-Fast-path-for-virtio-read.patch @@ -0,0 +1,240 @@ +From 7d2efc3e4af15eff57b0c38cff7c81b371a98303 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:06 +0100 +Subject: [PATCH 035/116] virtiofsd: Fast path for virtio read +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-32-dgilbert@redhat.com> +Patchwork-id: 93480 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 031/112] virtiofsd: Fast path for virtio read +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Readv the data straight into the guests buffer. + +Signed-off-by: Dr. David Alan Gilbert +With fix by: +Signed-off-by: Eryu Guan +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit eb49d187ef5134483a34c970bbfece28aaa686a7) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 5 ++ + tools/virtiofsd/fuse_virtio.c | 162 ++++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_virtio.h | 4 + + 3 files changed, 171 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 380d93b..4f4684d 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -475,6 +475,11 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + return fuse_send_msg(se, ch, iov, iov_count); + } + ++ if (fuse_lowlevel_is_virtio(se) && buf->count == 1 && ++ buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) { ++ return virtio_send_data_iov(se, ch, iov, iov_count, buf, len); ++ } ++ + abort(); /* Will have taken vhost path */ + return 0; + } +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index f1adeb6..7e2711b 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -230,6 +230,168 @@ err: + return ret; + } + ++/* ++ * Callback from fuse_send_data_iov_* when it's virtio and the buffer ++ * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK ++ * We need send the iov and then the buffer. ++ * Return 0 on success ++ */ ++int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count, struct fuse_bufvec *buf, ++ size_t len) ++{ ++ int ret = 0; ++ VuVirtqElement *elem; ++ VuVirtq *q; ++ ++ assert(count >= 1); ++ assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); ++ ++ struct fuse_out_header *out = iov[0].iov_base; ++ /* TODO: Endianness! */ ++ ++ size_t iov_len = iov_size(iov, count); ++ size_t tosend_len = iov_len + len; ++ ++ out->len = tosend_len; ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__, ++ count, len, iov_len); ++ ++ /* unique == 0 is notification which we don't support */ ++ assert(out->unique); ++ ++ /* For virtio we always have ch */ ++ assert(ch); ++ assert(!ch->qi->reply_sent); ++ elem = ch->qi->qe; ++ q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ ++ /* The 'in' part of the elem is to qemu */ ++ unsigned int in_num = elem->in_num; ++ struct iovec *in_sg = elem->in_sg; ++ size_t in_len = iov_size(in_sg, in_num); ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", ++ __func__, elem->index, in_num, in_len); ++ ++ /* ++ * The elem should have room for a 'fuse_out_header' (out from fuse) ++ * plus the data based on the len in the header. ++ */ ++ if (in_len < sizeof(struct fuse_out_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", ++ __func__, elem->index); ++ ret = E2BIG; ++ goto err; ++ } ++ if (in_len < tosend_len) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", ++ __func__, elem->index, tosend_len); ++ ret = E2BIG; ++ goto err; ++ } ++ ++ /* TODO: Limit to 'len' */ ++ ++ /* First copy the header data from iov->in_sg */ ++ copy_iov(iov, count, in_sg, in_num, iov_len); ++ ++ /* ++ * Build a copy of the the in_sg iov so we can skip bits in it, ++ * including changing the offsets ++ */ ++ struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num); ++ assert(in_sg_cpy); ++ memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num); ++ /* These get updated as we skip */ ++ struct iovec *in_sg_ptr = in_sg_cpy; ++ int in_sg_cpy_count = in_num; ++ ++ /* skip over parts of in_sg that contained the header iov */ ++ size_t skip_size = iov_len; ++ ++ size_t in_sg_left = 0; ++ do { ++ while (skip_size != 0 && in_sg_cpy_count) { ++ if (skip_size >= in_sg_ptr[0].iov_len) { ++ skip_size -= in_sg_ptr[0].iov_len; ++ in_sg_ptr++; ++ in_sg_cpy_count--; ++ } else { ++ in_sg_ptr[0].iov_len -= skip_size; ++ in_sg_ptr[0].iov_base += skip_size; ++ break; ++ } ++ } ++ ++ int i; ++ for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) { ++ in_sg_left += in_sg_ptr[i].iov_len; ++ } ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: after skip skip_size=%zd in_sg_cpy_count=%d " ++ "in_sg_left=%zd\n", ++ __func__, skip_size, in_sg_cpy_count, in_sg_left); ++ ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count, ++ buf->buf[0].pos); ++ ++ if (ret == -1) { ++ ret = errno; ++ fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n", ++ __func__, len); ++ free(in_sg_cpy); ++ goto err; ++ } ++ fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__, ++ ret, len); ++ if (ret < len && ret) { ++ fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__); ++ /* Skip over this much next time around */ ++ skip_size = ret; ++ buf->buf[0].pos += ret; ++ len -= ret; ++ ++ /* Lets do another read */ ++ continue; ++ } ++ if (!ret) { ++ /* EOF case? */ ++ fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__, ++ in_sg_left); ++ break; ++ } ++ if (ret != len) { ++ fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__); ++ ret = EIO; ++ free(in_sg_cpy); ++ goto err; ++ } ++ in_sg_left -= ret; ++ len -= ret; ++ } while (in_sg_left); ++ free(in_sg_cpy); ++ ++ /* Need to fix out->len on EOF */ ++ if (len) { ++ struct fuse_out_header *out_sg = in_sg[0].iov_base; ++ ++ tosend_len -= len; ++ out_sg->len = tosend_len; ++ } ++ ++ ret = 0; ++ ++ vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); ++ vu_queue_notify(&se->virtio_dev->dev, q); ++ ++err: ++ if (ret == 0) { ++ ch->qi->reply_sent = true; ++ } ++ ++ return ret; ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index 135a148..cc676b9 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -26,4 +26,8 @@ int virtio_loop(struct fuse_session *se); + int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count); + ++int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count, ++ struct fuse_bufvec *buf, size_t len); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch b/kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch new file mode 100755 index 0000000000000000000000000000000000000000..12bb9a268015d08d6b2b7941f5f79e58bcf50caf --- /dev/null +++ b/kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch @@ -0,0 +1,164 @@ +From 6d41fc549198e140f38fddcb02975098df040ae1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:50 +0100 +Subject: [PATCH 019/116] virtiofsd: Fix common header and define for QEMU + builds +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-16-dgilbert@redhat.com> +Patchwork-id: 93470 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 015/112] virtiofsd: Fix common header and define for QEMU builds +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +All of the fuse files include config.h and define GNU_SOURCE +where we don't have either under our build - remove them. +Fixup path to the kernel's fuse.h in the QEMUs world. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 09863ebc7e32a107235b3c815ad54d26cc64f07a) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 4 +--- + tools/virtiofsd/fuse_i.h | 3 +++ + tools/virtiofsd/fuse_log.c | 1 + + tools/virtiofsd/fuse_lowlevel.c | 6 ++---- + tools/virtiofsd/fuse_opt.c | 2 +- + tools/virtiofsd/fuse_signals.c | 2 +- + tools/virtiofsd/helper.c | 1 + + tools/virtiofsd/passthrough_ll.c | 8 ++------ + 8 files changed, 12 insertions(+), 15 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 4d507f3..772efa9 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -9,9 +9,7 @@ + * See the file COPYING.LIB + */ + +-#define _GNU_SOURCE +- +-#include "config.h" ++#include "qemu/osdep.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + #include +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index e63cb58..bae0699 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -6,6 +6,9 @@ + * See the file COPYING.LIB + */ + ++#define FUSE_USE_VERSION 31 ++ ++ + #include "fuse.h" + #include "fuse_lowlevel.h" + +diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c +index 11345f9..c301ff6 100644 +--- a/tools/virtiofsd/fuse_log.c ++++ b/tools/virtiofsd/fuse_log.c +@@ -8,6 +8,7 @@ + * See the file COPYING.LIB + */ + ++#include "qemu/osdep.h" + #include "fuse_log.h" + + #include +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 3da80de..07fb8a6 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -9,11 +9,9 @@ + * See the file COPYING.LIB + */ + +-#define _GNU_SOURCE +- +-#include "config.h" ++#include "qemu/osdep.h" + #include "fuse_i.h" +-#include "fuse_kernel.h" ++#include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" + +diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c +index edd36f4..2892236 100644 +--- a/tools/virtiofsd/fuse_opt.c ++++ b/tools/virtiofsd/fuse_opt.c +@@ -9,8 +9,8 @@ + * See the file COPYING.LIB + */ + ++#include "qemu/osdep.h" + #include "fuse_opt.h" +-#include "config.h" + #include "fuse_i.h" + #include "fuse_misc.h" + +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +index 19d6791..dc7c8ac 100644 +--- a/tools/virtiofsd/fuse_signals.c ++++ b/tools/virtiofsd/fuse_signals.c +@@ -8,7 +8,7 @@ + * See the file COPYING.LIB + */ + +-#include "config.h" ++#include "qemu/osdep.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index d9227d7..9333691 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -10,6 +10,7 @@ + * See the file COPYING.LIB. + */ + ++#include "qemu/osdep.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + #include "fuse_misc.h" +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 126a56c..322a889 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -35,15 +35,11 @@ + * \include passthrough_ll.c + */ + +-#define _GNU_SOURCE +-#define FUSE_USE_VERSION 31 +- +-#include "config.h" +- ++#include "qemu/osdep.h" ++#include "fuse_lowlevel.h" + #include + #include + #include +-#include + #include + #include + #include +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch b/kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch new file mode 100755 index 0000000000000000000000000000000000000000..f929bab7708fe6bd93b617a13d1f725d1e15da06 --- /dev/null +++ b/kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch @@ -0,0 +1,136 @@ +From 9b5fbc95a287b2ce9448142194b161d8360d5e4e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:15 +0100 +Subject: [PATCH 104/116] virtiofsd: Fix data corruption with O_APPEND write in + writeback mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-101-dgilbert@redhat.com> +Patchwork-id: 93556 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 100/112] virtiofsd: Fix data corruption with O_APPEND write in writeback mode +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Misono Tomohiro + +When writeback mode is enabled (-o writeback), O_APPEND handling is +done in kernel. Therefore virtiofsd clears O_APPEND flag when open. +Otherwise O_APPEND flag takes precedence over pwrite() and write +data may corrupt. + +Currently clearing O_APPEND flag is done in lo_open(), but we also +need the same operation in lo_create(). So, factor out the flag +update operation in lo_open() to update_open_flags() and call it +in both lo_open() and lo_create(). + +This fixes the failure of xfstest generic/069 in writeback mode +(which tests O_APPEND write data integrity). + +Signed-off-by: Misono Tomohiro +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8e4e41e39eac5ee5f378d66f069a2f70a1734317) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 66 ++++++++++++++++++++-------------------- + 1 file changed, 33 insertions(+), 33 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 948cb19..4c61ac5 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1692,6 +1692,37 @@ static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + fuse_reply_err(req, 0); + } + ++static void update_open_flags(int writeback, struct fuse_file_info *fi) ++{ ++ /* ++ * With writeback cache, kernel may send read requests even ++ * when userspace opened write-only ++ */ ++ if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { ++ fi->flags &= ~O_ACCMODE; ++ fi->flags |= O_RDWR; ++ } ++ ++ /* ++ * With writeback cache, O_APPEND is handled by the kernel. ++ * This breaks atomicity (since the file may change in the ++ * underlying filesystem, so that the kernel's idea of the ++ * end of the file isn't accurate anymore). In this example, ++ * we just accept that. A more rigorous filesystem may want ++ * to return an error here ++ */ ++ if (writeback && (fi->flags & O_APPEND)) { ++ fi->flags &= ~O_APPEND; ++ } ++ ++ /* ++ * O_DIRECT in guest should not necessarily mean bypassing page ++ * cache on host as well. If somebody needs that behavior, it ++ * probably should be a configuration knob in daemon. ++ */ ++ fi->flags &= ~O_DIRECT; ++} ++ + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) + { +@@ -1721,12 +1752,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out; + } + +- /* +- * O_DIRECT in guest should not necessarily mean bypassing page +- * cache on host as well. If somebody needs that behavior, it +- * probably should be a configuration knob in daemon. +- */ +- fi->flags &= ~O_DIRECT; ++ update_open_flags(lo->writeback, fi); + + fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); +@@ -1936,33 +1962,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, + fi->flags); + +- /* +- * With writeback cache, kernel may send read requests even +- * when userspace opened write-only +- */ +- if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { +- fi->flags &= ~O_ACCMODE; +- fi->flags |= O_RDWR; +- } +- +- /* +- * With writeback cache, O_APPEND is handled by the kernel. +- * This breaks atomicity (since the file may change in the +- * underlying filesystem, so that the kernel's idea of the +- * end of the file isn't accurate anymore). In this example, +- * we just accept that. A more rigorous filesystem may want +- * to return an error here +- */ +- if (lo->writeback && (fi->flags & O_APPEND)) { +- fi->flags &= ~O_APPEND; +- } +- +- /* +- * O_DIRECT in guest should not necessarily mean bypassing page +- * cache on host as well. If somebody needs that behavior, it +- * probably should be a configuration knob in daemon. +- */ +- fi->flags &= ~O_DIRECT; ++ update_open_flags(lo->writeback, fi); + + sprintf(buf, "%i", lo_fd(req, ino)); + fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch b/kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch new file mode 100755 index 0000000000000000000000000000000000000000..306c183f97eabaa7b7a2fc2972fdca29998e48b3 --- /dev/null +++ b/kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch @@ -0,0 +1,120 @@ +From 9f726593bc3acbc247876dcc4d79fbf046958003 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:49 +0100 +Subject: [PATCH 018/116] virtiofsd: Fix fuse_daemonize ignored return values +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-15-dgilbert@redhat.com> +Patchwork-id: 93469 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 014/112] virtiofsd: Fix fuse_daemonize ignored return values +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +QEMU's compiler enables warnings/errors for ignored values +and the (void) trick used in the fuse code isn't enough. +Turn all the return values into a return value on the function. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 30d8e49760712d65697ea517c53671bd1d214fc7) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 33 ++++++++++++++++++++++----------- + 1 file changed, 22 insertions(+), 11 deletions(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5e6f205..d9227d7 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -10,12 +10,10 @@ + * See the file COPYING.LIB. + */ + +-#include "config.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" + #include "fuse_misc.h" + #include "fuse_opt.h" +-#include "mount_util.h" + + #include + #include +@@ -171,6 +169,7 @@ int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + + int fuse_daemonize(int foreground) + { ++ int ret = 0, rett; + if (!foreground) { + int nullfd; + int waiter[2]; +@@ -192,8 +191,8 @@ int fuse_daemonize(int foreground) + case 0: + break; + default: +- (void)read(waiter[0], &completed, sizeof(completed)); +- _exit(0); ++ _exit(read(waiter[0], &completed, ++ sizeof(completed) != sizeof(completed))); + } + + if (setsid() == -1) { +@@ -201,13 +200,22 @@ int fuse_daemonize(int foreground) + return -1; + } + +- (void)chdir("/"); ++ ret = chdir("/"); + + nullfd = open("/dev/null", O_RDWR, 0); + if (nullfd != -1) { +- (void)dup2(nullfd, 0); +- (void)dup2(nullfd, 1); +- (void)dup2(nullfd, 2); ++ rett = dup2(nullfd, 0); ++ if (!ret) { ++ ret = rett; ++ } ++ rett = dup2(nullfd, 1); ++ if (!ret) { ++ ret = rett; ++ } ++ rett = dup2(nullfd, 2); ++ if (!ret) { ++ ret = rett; ++ } + if (nullfd > 2) { + close(nullfd); + } +@@ -215,13 +223,16 @@ int fuse_daemonize(int foreground) + + /* Propagate completion of daemon initialization */ + completed = 1; +- (void)write(waiter[1], &completed, sizeof(completed)); ++ rett = write(waiter[1], &completed, sizeof(completed)); ++ if (!ret) { ++ ret = rett; ++ } + close(waiter[0]); + close(waiter[1]); + } else { +- (void)chdir("/"); ++ ret = chdir("/"); + } +- return 0; ++ return ret; + } + + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch b/kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch new file mode 100755 index 0000000000000000000000000000000000000000..98907a5a841bc8c546ab938e97ec0eb0a98d592e --- /dev/null +++ b/kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch @@ -0,0 +1,51 @@ +From 6abfb7b3c37015ff901d11f178bc6900deec2acf Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Fri, 6 Aug 2021 11:58:27 -0400 +Subject: [PATCH 2/2] virtiofsd: Fix the help message of posix lock +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20210806115827.740945-3-mreitz@redhat.com> +Patchwork-id: 101969 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 2/2] virtiofsd: Fix the help message of posix lock +Bugzilla: 1967496 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Vivek Goyal + +From: Jiachen Zhang + +The commit 88fc107956a5812649e5918e0c092d3f78bb28ad disabled remote +posix locks by default. But the --help message still says it is enabled +by default. So fix it to output no_posix_lock. + +Signed-off-by: Jiachen Zhang +Message-Id: <20201027081558.29904-1-zhangjiachen.jaycee@bytedance.com> +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0429eaf518be1d4742356056e6c886b7f9bc9712) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/helper.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5b222ea49b..813d9490e5 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -163,7 +163,7 @@ void fuse_cmdline_help(void) + " default: false\n" + " -o posix_lock|no_posix_lock\n" + " enable/disable remote posix lock\n" +- " default: posix_lock\n" ++ " default: no_posix_lock\n" + " -o readdirplus|no_readdirplus\n" + " enable/disable readirplus\n" + " default: readdirplus except with " +-- +2.27.0 + diff --git a/kvm-virtiofsd-Fix-xattr-operations.patch b/kvm-virtiofsd-Fix-xattr-operations.patch new file mode 100755 index 0000000000000000000000000000000000000000..532948fc9f7ab3161dad4ce514da86541f93b8bd --- /dev/null +++ b/kvm-virtiofsd-Fix-xattr-operations.patch @@ -0,0 +1,327 @@ +From 8721796f22a8a61d82974088e542377ee6db209e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:14 +0000 +Subject: [PATCH 18/18] virtiofsd: Fix xattr operations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-8-dgilbert@redhat.com> +Patchwork-id: 94123 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 7/7] virtiofsd: Fix xattr operations +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: Misono Tomohiro + +Current virtiofsd has problems about xattr operations and +they does not work properly for directory/symlink/special file. + +The fundamental cause is that virtiofsd uses openat() + f...xattr() +systemcalls for xattr operation but we should not open symlink/special +file in the daemon. Therefore the function is restricted. + +Fix this problem by: + 1. during setup of each thread, call unshare(CLONE_FS) + 2. in xattr operations (i.e. lo_getxattr), if inode is not a regular + file or directory, use fchdir(proc_loot_fd) + ...xattr() + + fchdir(root.fd) instead of openat() + f...xattr() + + (Note: for a regular file/directory openat() + f...xattr() + is still used for performance reason) + +With this patch, xfstests generic/062 passes on virtiofs. + +This fix is suggested by Miklos Szeredi and Stefan Hajnoczi. +The original discussion can be found here: + https://www.redhat.com/archives/virtio-fs/2019-October/msg00046.html + +Signed-off-by: Misono Tomohiro +Message-Id: <20200227055927.24566-3-misono.tomohiro@jp.fujitsu.com> +Acked-by: Vivek Goyal +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit bdfd66788349acc43cd3f1298718ad491663cfcc) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_virtio.c | 13 +++++ + tools/virtiofsd/passthrough_ll.c | 105 +++++++++++++++++++++------------------ + tools/virtiofsd/seccomp.c | 6 +++ + 3 files changed, 77 insertions(+), 47 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index dd1c605..3b6d16a 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -426,6 +426,8 @@ err: + return ret; + } + ++static __thread bool clone_fs_called; ++ + /* Process one FVRequest in a thread pool */ + static void fv_queue_worker(gpointer data, gpointer user_data) + { +@@ -441,6 +443,17 @@ static void fv_queue_worker(gpointer data, gpointer user_data) + + assert(se->bufsize > sizeof(struct fuse_in_header)); + ++ if (!clone_fs_called) { ++ int ret; ++ ++ /* unshare FS for xattr operation */ ++ ret = unshare(CLONE_FS); ++ /* should not fail */ ++ assert(ret == 0); ++ ++ clone_fs_called = true; ++ } ++ + /* + * An element contains one request and the space to send our response + * They're spread over multiple descriptors in a scatter/gather set +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 50c7273..9cba3f1 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -123,7 +123,7 @@ struct lo_inode { + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ + +- bool is_symlink; ++ mode_t filetype; + }; + + struct lo_cred { +@@ -695,7 +695,7 @@ static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode, + struct lo_inode *parent; + char path[PATH_MAX]; + +- if (inode->is_symlink) { ++ if (S_ISLNK(inode->filetype)) { + res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH); + if (res == -1 && errno == EINVAL) { + /* Sorry, no race free way to set times on symlink. */ +@@ -929,7 +929,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out_err; + } + +- inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ /* cache only filetype */ ++ inode->filetype = (e->attr.st_mode & S_IFMT); + + /* + * One for the caller and one for nlookup (released in +@@ -1139,7 +1140,7 @@ static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode, + struct lo_inode *parent; + char path[PATH_MAX]; + +- if (inode->is_symlink) { ++ if (S_ISLNK(inode->filetype)) { + res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); + if (res == -1 && (errno == ENOENT || errno == EINVAL)) { + /* Sorry, no race free way to hard-link a symlink. */ +@@ -2193,12 +2194,6 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", + ino, name, size); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to getxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- + if (size) { + value = malloc(size); + if (!value) { +@@ -2207,12 +2202,25 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + } + + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDONLY); +- if (fd < 0) { +- goto out_err; ++ /* ++ * It is not safe to open() non-regular/non-dir files in file server ++ * unless O_PATH is used, so use that method for regular files/dir ++ * only (as it seems giving less performance overhead). ++ * Otherwise, call fchdir() to avoid open(). ++ */ ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } ++ ret = fgetxattr(fd, name, value, size); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = getxattr(procname, name, value, size); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = fgetxattr(fd, name, value, size); + if (ret == -1) { + goto out_err; + } +@@ -2266,12 +2274,6 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, + size); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to listxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- + if (size) { + value = malloc(size); + if (!value) { +@@ -2280,12 +2282,19 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + } + + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDONLY); +- if (fd < 0) { +- goto out_err; ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } ++ ret = flistxattr(fd, value, size); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = listxattr(procname, value, size); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = flistxattr(fd, value, size); + if (ret == -1) { + goto out_err; + } +@@ -2339,20 +2348,21 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 + ", name=%s value=%s size=%zd)\n", ino, name, value, size); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDWR); +- if (fd < 0) { +- saverr = errno; +- goto out; ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } ++ ret = fsetxattr(fd, name, value, size, flags); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = setxattr(procname, name, value, size, flags); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = fsetxattr(fd, name, value, size, flags); + saverr = ret == -1 ? errno : 0; + + out: +@@ -2387,20 +2397,21 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, + name); + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- + sprintf(procname, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, procname, O_RDWR); +- if (fd < 0) { +- saverr = errno; +- goto out; ++ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } ++ ret = fremovexattr(fd, name); ++ } else { ++ /* fchdir should not fail here */ ++ assert(fchdir(lo->proc_self_fd) == 0); ++ ret = removexattr(procname, name); ++ assert(fchdir(lo->root.fd) == 0); + } + +- ret = fremovexattr(fd, name); + saverr = ret == -1 ? errno : 0; + + out: +@@ -2800,7 +2811,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) + exit(1); + } + +- root->is_symlink = false; ++ root->filetype = S_IFDIR; + root->fd = fd; + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +index 2d9d4a7..bd9e7b0 100644 +--- a/tools/virtiofsd/seccomp.c ++++ b/tools/virtiofsd/seccomp.c +@@ -41,6 +41,7 @@ static const int syscall_whitelist[] = { + SCMP_SYS(exit), + SCMP_SYS(exit_group), + SCMP_SYS(fallocate), ++ SCMP_SYS(fchdir), + SCMP_SYS(fchmodat), + SCMP_SYS(fchownat), + SCMP_SYS(fcntl), +@@ -62,7 +63,9 @@ static const int syscall_whitelist[] = { + SCMP_SYS(getpid), + SCMP_SYS(gettid), + SCMP_SYS(gettimeofday), ++ SCMP_SYS(getxattr), + SCMP_SYS(linkat), ++ SCMP_SYS(listxattr), + SCMP_SYS(lseek), + SCMP_SYS(madvise), + SCMP_SYS(mkdirat), +@@ -85,6 +88,7 @@ static const int syscall_whitelist[] = { + SCMP_SYS(recvmsg), + SCMP_SYS(renameat), + SCMP_SYS(renameat2), ++ SCMP_SYS(removexattr), + SCMP_SYS(rt_sigaction), + SCMP_SYS(rt_sigprocmask), + SCMP_SYS(rt_sigreturn), +@@ -98,10 +102,12 @@ static const int syscall_whitelist[] = { + SCMP_SYS(setresuid32), + #endif + SCMP_SYS(set_robust_list), ++ SCMP_SYS(setxattr), + SCMP_SYS(symlinkat), + SCMP_SYS(time), /* Rarely needed, except on static builds */ + SCMP_SYS(tgkill), + SCMP_SYS(unlinkat), ++ SCMP_SYS(unshare), + SCMP_SYS(utimensat), + SCMP_SYS(write), + SCMP_SYS(writev), +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Format-imported-files-to-qemu-style.patch b/kvm-virtiofsd-Format-imported-files-to-qemu-style.patch new file mode 100755 index 0000000000000000000000000000000000000000..5593a338a38489ab7f03d3a11271aebdb1996e3b --- /dev/null +++ b/kvm-virtiofsd-Format-imported-files-to-qemu-style.patch @@ -0,0 +1,14743 @@ +From e313ab94af558bbc133e7a93b0a6dbff706dd1d8 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:45 +0100 +Subject: [PATCH 014/116] virtiofsd: Format imported files to qemu style +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-11-dgilbert@redhat.com> +Patchwork-id: 93464 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 010/112] virtiofsd: Format imported files to qemu style +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Mostly using a set like: + +indent -nut -i 4 -nlp -br -cs -ce --no-space-after-function-call-names file +clang-format -style=file -i -- file +clang-tidy -fix-errors -checks=readability-braces-around-statements file +clang-format -style=file -i -- file + +With manual cleanups. + +The .clang-format used is below. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed by: Aleksandar Markovic + +Language: Cpp +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false # although we like it, it creates churn +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: false # churn +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None # AlwaysBreakAfterDefinitionReturnType is taken into account +AlwaysBreakBeforeMultilineStrings: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterStruct: false + AfterUnion: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: false +BreakStringLiterals: true +ColumnLimit: 80 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: [ + 'CPU_FOREACH', + 'CPU_FOREACH_REVERSE', + 'CPU_FOREACH_SAFE', + 'IOMMU_NOTIFIER_FOREACH', + 'QLIST_FOREACH', + 'QLIST_FOREACH_ENTRY', + 'QLIST_FOREACH_RCU', + 'QLIST_FOREACH_SAFE', + 'QLIST_FOREACH_SAFE_RCU', + 'QSIMPLEQ_FOREACH', + 'QSIMPLEQ_FOREACH_SAFE', + 'QSLIST_FOREACH', + 'QSLIST_FOREACH_SAFE', + 'QTAILQ_FOREACH', + 'QTAILQ_FOREACH_REVERSE', + 'QTAILQ_FOREACH_SAFE', + 'QTAILQ_RAW_FOREACH', + 'RAMBLOCK_FOREACH' +] +IncludeCategories: + - Regex: '^"qemu/osdep.h' + Priority: -3 + - Regex: '^"(block|chardev|crypto|disas|exec|fpu|hw|io|libdecnumber|migration|monitor|net|qapi|qemu|qom|standard-headers|sysemu|ui)/' + Priority: -2 + - Regex: '^"(elf.h|qemu-common.h|glib-compat.h|qemu-io.h|trace-tcg.h)' + Priority: -1 + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '$' +IndentCaseLabels: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '.*_BEGIN$' # only PREC_BEGIN ? +MacroBlockEnd: '.*_END$' +MaxEmptyLinesToKeep: 2 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInContainerLiterals: true +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +UseTab: Never +... + +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 7387863d033e8028aa09a815736617a7c4490827) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 434 ++-- + tools/virtiofsd/fuse.h | 1572 +++++++------- + tools/virtiofsd/fuse_common.h | 730 +++---- + tools/virtiofsd/fuse_i.h | 121 +- + tools/virtiofsd/fuse_log.c | 38 +- + tools/virtiofsd/fuse_log.h | 32 +- + tools/virtiofsd/fuse_lowlevel.c | 3638 +++++++++++++++++---------------- + tools/virtiofsd/fuse_lowlevel.h | 2392 +++++++++++----------- + tools/virtiofsd/fuse_misc.h | 30 +- + tools/virtiofsd/fuse_opt.c | 659 +++--- + tools/virtiofsd/fuse_opt.h | 79 +- + tools/virtiofsd/fuse_signals.c | 118 +- + tools/virtiofsd/helper.c | 506 ++--- + tools/virtiofsd/passthrough_helpers.h | 33 +- + tools/virtiofsd/passthrough_ll.c | 2061 ++++++++++--------- + 15 files changed, 6382 insertions(+), 6061 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index aefb7db..5df946c 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -1,252 +1,272 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2010 Miklos Szeredi +- +- Functions for dealing with `struct fuse_buf` and `struct +- fuse_bufvec`. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2010 Miklos Szeredi ++ * ++ * Functions for dealing with `struct fuse_buf` and `struct ++ * fuse_bufvec`. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #define _GNU_SOURCE + + #include "config.h" + #include "fuse_i.h" + #include "fuse_lowlevel.h" ++#include ++#include + #include + #include +-#include +-#include + + size_t fuse_buf_size(const struct fuse_bufvec *bufv) + { +- size_t i; +- size_t size = 0; +- +- for (i = 0; i < bufv->count; i++) { +- if (bufv->buf[i].size == SIZE_MAX) +- size = SIZE_MAX; +- else +- size += bufv->buf[i].size; +- } +- +- return size; ++ size_t i; ++ size_t size = 0; ++ ++ for (i = 0; i < bufv->count; i++) { ++ if (bufv->buf[i].size == SIZE_MAX) { ++ size = SIZE_MAX; ++ } else { ++ size += bufv->buf[i].size; ++ } ++ } ++ ++ return size; + } + + static size_t min_size(size_t s1, size_t s2) + { +- return s1 < s2 ? s1 : s2; ++ return s1 < s2 ? s1 : s2; + } + + static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) + { +- ssize_t res = 0; +- size_t copied = 0; +- +- while (len) { +- if (dst->flags & FUSE_BUF_FD_SEEK) { +- res = pwrite(dst->fd, (char *)src->mem + src_off, len, +- dst->pos + dst_off); +- } else { +- res = write(dst->fd, (char *)src->mem + src_off, len); +- } +- if (res == -1) { +- if (!copied) +- return -errno; +- break; +- } +- if (res == 0) +- break; +- +- copied += res; +- if (!(dst->flags & FUSE_BUF_FD_RETRY)) +- break; +- +- src_off += res; +- dst_off += res; +- len -= res; +- } +- +- return copied; ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (dst->flags & FUSE_BUF_FD_SEEK) { ++ res = pwrite(dst->fd, (char *)src->mem + src_off, len, ++ dst->pos + dst_off); ++ } else { ++ res = write(dst->fd, (char *)src->mem + src_off, len); ++ } ++ if (res == -1) { ++ if (!copied) { ++ return -errno; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ copied += res; ++ if (!(dst->flags & FUSE_BUF_FD_RETRY)) { ++ break; ++ } ++ ++ src_off += res; ++ dst_off += res; ++ len -= res; ++ } ++ ++ return copied; + } + + static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) + { +- ssize_t res = 0; +- size_t copied = 0; +- +- while (len) { +- if (src->flags & FUSE_BUF_FD_SEEK) { +- res = pread(src->fd, (char *)dst->mem + dst_off, len, +- src->pos + src_off); +- } else { +- res = read(src->fd, (char *)dst->mem + dst_off, len); +- } +- if (res == -1) { +- if (!copied) +- return -errno; +- break; +- } +- if (res == 0) +- break; +- +- copied += res; +- if (!(src->flags & FUSE_BUF_FD_RETRY)) +- break; +- +- dst_off += res; +- src_off += res; +- len -= res; +- } +- +- return copied; ++ ssize_t res = 0; ++ size_t copied = 0; ++ ++ while (len) { ++ if (src->flags & FUSE_BUF_FD_SEEK) { ++ res = pread(src->fd, (char *)dst->mem + dst_off, len, ++ src->pos + src_off); ++ } else { ++ res = read(src->fd, (char *)dst->mem + dst_off, len); ++ } ++ if (res == -1) { ++ if (!copied) { ++ return -errno; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ copied += res; ++ if (!(src->flags & FUSE_BUF_FD_RETRY)) { ++ break; ++ } ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; + } + + static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len) + { +- char buf[4096]; +- struct fuse_buf tmp = { +- .size = sizeof(buf), +- .flags = 0, +- }; +- ssize_t res; +- size_t copied = 0; +- +- tmp.mem = buf; +- +- while (len) { +- size_t this_len = min_size(tmp.size, len); +- size_t read_len; +- +- res = fuse_buf_read(&tmp, 0, src, src_off, this_len); +- if (res < 0) { +- if (!copied) +- return res; +- break; +- } +- if (res == 0) +- break; +- +- read_len = res; +- res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); +- if (res < 0) { +- if (!copied) +- return res; +- break; +- } +- if (res == 0) +- break; +- +- copied += res; +- +- if (res < this_len) +- break; +- +- dst_off += res; +- src_off += res; +- len -= res; +- } +- +- return copied; ++ char buf[4096]; ++ struct fuse_buf tmp = { ++ .size = sizeof(buf), ++ .flags = 0, ++ }; ++ ssize_t res; ++ size_t copied = 0; ++ ++ tmp.mem = buf; ++ ++ while (len) { ++ size_t this_len = min_size(tmp.size, len); ++ size_t read_len; ++ ++ res = fuse_buf_read(&tmp, 0, src, src_off, this_len); ++ if (res < 0) { ++ if (!copied) { ++ return res; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ read_len = res; ++ res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); ++ if (res < 0) { ++ if (!copied) { ++ return res; ++ } ++ break; ++ } ++ if (res == 0) { ++ break; ++ } ++ ++ copied += res; ++ ++ if (res < this_len) { ++ break; ++ } ++ ++ dst_off += res; ++ src_off += res; ++ len -= res; ++ } ++ ++ return copied; + } + + static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) ++ const struct fuse_buf *src, size_t src_off, ++ size_t len, enum fuse_buf_copy_flags flags) + { +- int src_is_fd = src->flags & FUSE_BUF_IS_FD; +- int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; +- +- if (!src_is_fd && !dst_is_fd) { +- char *dstmem = (char *)dst->mem + dst_off; +- char *srcmem = (char *)src->mem + src_off; +- +- if (dstmem != srcmem) { +- if (dstmem + len <= srcmem || srcmem + len <= dstmem) +- memcpy(dstmem, srcmem, len); +- else +- memmove(dstmem, srcmem, len); +- } +- +- return len; +- } else if (!src_is_fd) { +- return fuse_buf_write(dst, dst_off, src, src_off, len); +- } else if (!dst_is_fd) { +- return fuse_buf_read(dst, dst_off, src, src_off, len); +- } else { +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); +- } ++ int src_is_fd = src->flags & FUSE_BUF_IS_FD; ++ int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; ++ ++ if (!src_is_fd && !dst_is_fd) { ++ char *dstmem = (char *)dst->mem + dst_off; ++ char *srcmem = (char *)src->mem + src_off; ++ ++ if (dstmem != srcmem) { ++ if (dstmem + len <= srcmem || srcmem + len <= dstmem) { ++ memcpy(dstmem, srcmem, len); ++ } else { ++ memmove(dstmem, srcmem, len); ++ } ++ } ++ ++ return len; ++ } else if (!src_is_fd) { ++ return fuse_buf_write(dst, dst_off, src, src_off, len); ++ } else if (!dst_is_fd) { ++ return fuse_buf_read(dst, dst_off, src, src_off, len); ++ } else { ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); ++ } + } + + static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv) + { +- if (bufv->idx < bufv->count) +- return &bufv->buf[bufv->idx]; +- else +- return NULL; ++ if (bufv->idx < bufv->count) { ++ return &bufv->buf[bufv->idx]; ++ } else { ++ return NULL; ++ } + } + + static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) + { +- const struct fuse_buf *buf = fuse_bufvec_current(bufv); +- +- bufv->off += len; +- assert(bufv->off <= buf->size); +- if (bufv->off == buf->size) { +- assert(bufv->idx < bufv->count); +- bufv->idx++; +- if (bufv->idx == bufv->count) +- return 0; +- bufv->off = 0; +- } +- return 1; ++ const struct fuse_buf *buf = fuse_bufvec_current(bufv); ++ ++ bufv->off += len; ++ assert(bufv->off <= buf->size); ++ if (bufv->off == buf->size) { ++ assert(bufv->idx < bufv->count); ++ bufv->idx++; ++ if (bufv->idx == bufv->count) { ++ return 0; ++ } ++ bufv->off = 0; ++ } ++ return 1; + } + + ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, +- enum fuse_buf_copy_flags flags) ++ enum fuse_buf_copy_flags flags) + { +- size_t copied = 0; +- +- if (dstv == srcv) +- return fuse_buf_size(dstv); +- +- for (;;) { +- const struct fuse_buf *src = fuse_bufvec_current(srcv); +- const struct fuse_buf *dst = fuse_bufvec_current(dstv); +- size_t src_len; +- size_t dst_len; +- size_t len; +- ssize_t res; +- +- if (src == NULL || dst == NULL) +- break; +- +- src_len = src->size - srcv->off; +- dst_len = dst->size - dstv->off; +- len = min_size(src_len, dst_len); +- +- res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); +- if (res < 0) { +- if (!copied) +- return res; +- break; +- } +- copied += res; +- +- if (!fuse_bufvec_advance(srcv, res) || +- !fuse_bufvec_advance(dstv, res)) +- break; +- +- if (res < len) +- break; +- } +- +- return copied; ++ size_t copied = 0; ++ ++ if (dstv == srcv) { ++ return fuse_buf_size(dstv); ++ } ++ ++ for (;;) { ++ const struct fuse_buf *src = fuse_bufvec_current(srcv); ++ const struct fuse_buf *dst = fuse_bufvec_current(dstv); ++ size_t src_len; ++ size_t dst_len; ++ size_t len; ++ ssize_t res; ++ ++ if (src == NULL || dst == NULL) { ++ break; ++ } ++ ++ src_len = src->size - srcv->off; ++ dst_len = dst->size - dstv->off; ++ len = min_size(src_len, dst_len); ++ ++ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); ++ if (res < 0) { ++ if (!copied) { ++ return res; ++ } ++ break; ++ } ++ copied += res; ++ ++ if (!fuse_bufvec_advance(srcv, res) || ++ !fuse_bufvec_advance(dstv, res)) { ++ break; ++ } ++ ++ if (res < len) { ++ break; ++ } ++ } ++ ++ return copied; + } +diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +index 3202fba..7a4c713 100644 +--- a/tools/virtiofsd/fuse.h ++++ b/tools/virtiofsd/fuse.h +@@ -1,15 +1,15 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #ifndef FUSE_H_ + #define FUSE_H_ + +-/** @file ++/* + * + * This file defines the library interface of FUSE + * +@@ -19,15 +19,15 @@ + #include "fuse_common.h" + + #include +-#include +-#include + #include + #include ++#include + #include ++#include + +-/* ----------------------------------------------------------- * +- * Basic FUSE API * +- * ----------------------------------------------------------- */ ++/* ++ * Basic FUSE API ++ */ + + /** Handle for a FUSE filesystem */ + struct fuse; +@@ -36,38 +36,39 @@ struct fuse; + * Readdir flags, passed to ->readdir() + */ + enum fuse_readdir_flags { +- /** +- * "Plus" mode. +- * +- * The kernel wants to prefill the inode cache during readdir. The +- * filesystem may honour this by filling in the attributes and setting +- * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also +- * just ignore this flag completely. +- */ +- FUSE_READDIR_PLUS = (1 << 0), ++ /** ++ * "Plus" mode. ++ * ++ * The kernel wants to prefill the inode cache during readdir. The ++ * filesystem may honour this by filling in the attributes and setting ++ * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also ++ * just ignore this flag completely. ++ */ ++ FUSE_READDIR_PLUS = (1 << 0), + }; + + enum fuse_fill_dir_flags { +- /** +- * "Plus" mode: all file attributes are valid +- * +- * The attributes are used by the kernel to prefill the inode cache +- * during a readdir. +- * +- * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set +- * and vice versa. +- */ +- FUSE_FILL_DIR_PLUS = (1 << 1), ++ /** ++ * "Plus" mode: all file attributes are valid ++ * ++ * The attributes are used by the kernel to prefill the inode cache ++ * during a readdir. ++ * ++ * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set ++ * and vice versa. ++ */ ++ FUSE_FILL_DIR_PLUS = (1 << 1), + }; + +-/** Function to add an entry in a readdir() operation ++/** ++ * Function to add an entry in a readdir() operation + * + * The *off* parameter can be any non-zero value that enables the + * filesystem to identify the current point in the directory + * stream. It does not need to be the actual physical position. A + * value of zero is reserved to indicate that seeking in directories + * is not supported. +- * ++ * + * @param buf the buffer passed to the readdir() operation + * @param name the file name of the directory entry + * @param stat file attributes, can be NULL +@@ -75,9 +76,9 @@ enum fuse_fill_dir_flags { + * @param flags fill flags + * @return 1 if buffer is full, zero otherwise + */ +-typedef int (*fuse_fill_dir_t) (void *buf, const char *name, +- const struct stat *stbuf, off_t off, +- enum fuse_fill_dir_flags flags); ++typedef int (*fuse_fill_dir_t)(void *buf, const char *name, ++ const struct stat *stbuf, off_t off, ++ enum fuse_fill_dir_flags flags); + /** + * Configuration of the high-level API + * +@@ -87,186 +88,186 @@ typedef int (*fuse_fill_dir_t) (void *buf, const char *name, + * file system implementation. + */ + struct fuse_config { +- /** +- * If `set_gid` is non-zero, the st_gid attribute of each file +- * is overwritten with the value of `gid`. +- */ +- int set_gid; +- unsigned int gid; +- +- /** +- * If `set_uid` is non-zero, the st_uid attribute of each file +- * is overwritten with the value of `uid`. +- */ +- int set_uid; +- unsigned int uid; +- +- /** +- * If `set_mode` is non-zero, the any permissions bits set in +- * `umask` are unset in the st_mode attribute of each file. +- */ +- int set_mode; +- unsigned int umask; +- +- /** +- * The timeout in seconds for which name lookups will be +- * cached. +- */ +- double entry_timeout; +- +- /** +- * The timeout in seconds for which a negative lookup will be +- * cached. This means, that if file did not exist (lookup +- * retuned ENOENT), the lookup will only be redone after the +- * timeout, and the file/directory will be assumed to not +- * exist until then. A value of zero means that negative +- * lookups are not cached. +- */ +- double negative_timeout; +- +- /** +- * The timeout in seconds for which file/directory attributes +- * (as returned by e.g. the `getattr` handler) are cached. +- */ +- double attr_timeout; +- +- /** +- * Allow requests to be interrupted +- */ +- int intr; +- +- /** +- * Specify which signal number to send to the filesystem when +- * a request is interrupted. The default is hardcoded to +- * USR1. +- */ +- int intr_signal; +- +- /** +- * Normally, FUSE assigns inodes to paths only for as long as +- * the kernel is aware of them. With this option inodes are +- * instead remembered for at least this many seconds. This +- * will require more memory, but may be necessary when using +- * applications that make use of inode numbers. +- * +- * A number of -1 means that inodes will be remembered for the +- * entire life-time of the file-system process. +- */ +- int remember; +- +- /** +- * The default behavior is that if an open file is deleted, +- * the file is renamed to a hidden file (.fuse_hiddenXXX), and +- * only removed when the file is finally released. This +- * relieves the filesystem implementation of having to deal +- * with this problem. This option disables the hiding +- * behavior, and files are removed immediately in an unlink +- * operation (or in a rename operation which overwrites an +- * existing file). +- * +- * It is recommended that you not use the hard_remove +- * option. When hard_remove is set, the following libc +- * functions fail on unlinked files (returning errno of +- * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2), +- * ftruncate(2), fstat(2), fchmod(2), fchown(2) +- */ +- int hard_remove; +- +- /** +- * Honor the st_ino field in the functions getattr() and +- * fill_dir(). This value is used to fill in the st_ino field +- * in the stat(2), lstat(2), fstat(2) functions and the d_ino +- * field in the readdir(2) function. The filesystem does not +- * have to guarantee uniqueness, however some applications +- * rely on this value being unique for the whole filesystem. +- * +- * Note that this does *not* affect the inode that libfuse +- * and the kernel use internally (also called the "nodeid"). +- */ +- int use_ino; +- +- /** +- * If use_ino option is not given, still try to fill in the +- * d_ino field in readdir(2). If the name was previously +- * looked up, and is still in the cache, the inode number +- * found there will be used. Otherwise it will be set to -1. +- * If use_ino option is given, this option is ignored. +- */ +- int readdir_ino; +- +- /** +- * This option disables the use of page cache (file content cache) +- * in the kernel for this filesystem. This has several affects: +- * +- * 1. Each read(2) or write(2) system call will initiate one +- * or more read or write operations, data will not be +- * cached in the kernel. +- * +- * 2. The return value of the read() and write() system calls +- * will correspond to the return values of the read and +- * write operations. This is useful for example if the +- * file size is not known in advance (before reading it). +- * +- * Internally, enabling this option causes fuse to set the +- * `direct_io` field of `struct fuse_file_info` - overwriting +- * any value that was put there by the file system. +- */ +- int direct_io; +- +- /** +- * This option disables flushing the cache of the file +- * contents on every open(2). This should only be enabled on +- * filesystems where the file data is never changed +- * externally (not through the mounted FUSE filesystem). Thus +- * it is not suitable for network filesystems and other +- * intermediate filesystems. +- * +- * NOTE: if this option is not specified (and neither +- * direct_io) data is still cached after the open(2), so a +- * read(2) system call will not always initiate a read +- * operation. +- * +- * Internally, enabling this option causes fuse to set the +- * `keep_cache` field of `struct fuse_file_info` - overwriting +- * any value that was put there by the file system. +- */ +- int kernel_cache; +- +- /** +- * This option is an alternative to `kernel_cache`. Instead of +- * unconditionally keeping cached data, the cached data is +- * invalidated on open(2) if if the modification time or the +- * size of the file has changed since it was last opened. +- */ +- int auto_cache; +- +- /** +- * The timeout in seconds for which file attributes are cached +- * for the purpose of checking if auto_cache should flush the +- * file data on open. +- */ +- int ac_attr_timeout_set; +- double ac_attr_timeout; +- +- /** +- * If this option is given the file-system handlers for the +- * following operations will not receive path information: +- * read, write, flush, release, fsync, readdir, releasedir, +- * fsyncdir, lock, ioctl and poll. +- * +- * For the truncate, getattr, chmod, chown and utimens +- * operations the path will be provided only if the struct +- * fuse_file_info argument is NULL. +- */ +- int nullpath_ok; +- +- /** +- * The remaining options are used by libfuse internally and +- * should not be touched. +- */ +- int show_help; +- char *modules; +- int debug; ++ /** ++ * If `set_gid` is non-zero, the st_gid attribute of each file ++ * is overwritten with the value of `gid`. ++ */ ++ int set_gid; ++ unsigned int gid; ++ ++ /** ++ * If `set_uid` is non-zero, the st_uid attribute of each file ++ * is overwritten with the value of `uid`. ++ */ ++ int set_uid; ++ unsigned int uid; ++ ++ /** ++ * If `set_mode` is non-zero, the any permissions bits set in ++ * `umask` are unset in the st_mode attribute of each file. ++ */ ++ int set_mode; ++ unsigned int umask; ++ ++ /** ++ * The timeout in seconds for which name lookups will be ++ * cached. ++ */ ++ double entry_timeout; ++ ++ /** ++ * The timeout in seconds for which a negative lookup will be ++ * cached. This means, that if file did not exist (lookup ++ * retuned ENOENT), the lookup will only be redone after the ++ * timeout, and the file/directory will be assumed to not ++ * exist until then. A value of zero means that negative ++ * lookups are not cached. ++ */ ++ double negative_timeout; ++ ++ /** ++ * The timeout in seconds for which file/directory attributes ++ * (as returned by e.g. the `getattr` handler) are cached. ++ */ ++ double attr_timeout; ++ ++ /** ++ * Allow requests to be interrupted ++ */ ++ int intr; ++ ++ /** ++ * Specify which signal number to send to the filesystem when ++ * a request is interrupted. The default is hardcoded to ++ * USR1. ++ */ ++ int intr_signal; ++ ++ /** ++ * Normally, FUSE assigns inodes to paths only for as long as ++ * the kernel is aware of them. With this option inodes are ++ * instead remembered for at least this many seconds. This ++ * will require more memory, but may be necessary when using ++ * applications that make use of inode numbers. ++ * ++ * A number of -1 means that inodes will be remembered for the ++ * entire life-time of the file-system process. ++ */ ++ int remember; ++ ++ /** ++ * The default behavior is that if an open file is deleted, ++ * the file is renamed to a hidden file (.fuse_hiddenXXX), and ++ * only removed when the file is finally released. This ++ * relieves the filesystem implementation of having to deal ++ * with this problem. This option disables the hiding ++ * behavior, and files are removed immediately in an unlink ++ * operation (or in a rename operation which overwrites an ++ * existing file). ++ * ++ * It is recommended that you not use the hard_remove ++ * option. When hard_remove is set, the following libc ++ * functions fail on unlinked files (returning errno of ++ * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2), ++ * ftruncate(2), fstat(2), fchmod(2), fchown(2) ++ */ ++ int hard_remove; ++ ++ /** ++ * Honor the st_ino field in the functions getattr() and ++ * fill_dir(). This value is used to fill in the st_ino field ++ * in the stat(2), lstat(2), fstat(2) functions and the d_ino ++ * field in the readdir(2) function. The filesystem does not ++ * have to guarantee uniqueness, however some applications ++ * rely on this value being unique for the whole filesystem. ++ * ++ * Note that this does *not* affect the inode that libfuse ++ * and the kernel use internally (also called the "nodeid"). ++ */ ++ int use_ino; ++ ++ /** ++ * If use_ino option is not given, still try to fill in the ++ * d_ino field in readdir(2). If the name was previously ++ * looked up, and is still in the cache, the inode number ++ * found there will be used. Otherwise it will be set to -1. ++ * If use_ino option is given, this option is ignored. ++ */ ++ int readdir_ino; ++ ++ /** ++ * This option disables the use of page cache (file content cache) ++ * in the kernel for this filesystem. This has several affects: ++ * ++ * 1. Each read(2) or write(2) system call will initiate one ++ * or more read or write operations, data will not be ++ * cached in the kernel. ++ * ++ * 2. The return value of the read() and write() system calls ++ * will correspond to the return values of the read and ++ * write operations. This is useful for example if the ++ * file size is not known in advance (before reading it). ++ * ++ * Internally, enabling this option causes fuse to set the ++ * `direct_io` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int direct_io; ++ ++ /** ++ * This option disables flushing the cache of the file ++ * contents on every open(2). This should only be enabled on ++ * filesystems where the file data is never changed ++ * externally (not through the mounted FUSE filesystem). Thus ++ * it is not suitable for network filesystems and other ++ * intermediate filesystems. ++ * ++ * NOTE: if this option is not specified (and neither ++ * direct_io) data is still cached after the open(2), so a ++ * read(2) system call will not always initiate a read ++ * operation. ++ * ++ * Internally, enabling this option causes fuse to set the ++ * `keep_cache` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int kernel_cache; ++ ++ /** ++ * This option is an alternative to `kernel_cache`. Instead of ++ * unconditionally keeping cached data, the cached data is ++ * invalidated on open(2) if if the modification time or the ++ * size of the file has changed since it was last opened. ++ */ ++ int auto_cache; ++ ++ /** ++ * The timeout in seconds for which file attributes are cached ++ * for the purpose of checking if auto_cache should flush the ++ * file data on open. ++ */ ++ int ac_attr_timeout_set; ++ double ac_attr_timeout; ++ ++ /** ++ * If this option is given the file-system handlers for the ++ * following operations will not receive path information: ++ * read, write, flush, release, fsync, readdir, releasedir, ++ * fsyncdir, lock, ioctl and poll. ++ * ++ * For the truncate, getattr, chmod, chown and utimens ++ * operations the path will be provided only if the struct ++ * fuse_file_info argument is NULL. ++ */ ++ int nullpath_ok; ++ ++ /** ++ * The remaining options are used by libfuse internally and ++ * should not be touched. ++ */ ++ int show_help; ++ char *modules; ++ int debug; + }; + + +@@ -293,515 +294,535 @@ struct fuse_config { + * Almost all operations take a path which can be of any length. + */ + struct fuse_operations { +- /** Get file attributes. +- * +- * Similar to stat(). The 'st_dev' and 'st_blksize' fields are +- * ignored. The 'st_ino' field is ignored except if the 'use_ino' +- * mount option is given. In that case it is passed to userspace, +- * but libfuse and the kernel will still assign a different +- * inode for internal use (called the "nodeid"). +- * +- * `fi` will always be NULL if the file is not currently open, but +- * may also be NULL if the file is open. +- */ +- int (*getattr) (const char *, struct stat *, struct fuse_file_info *fi); +- +- /** Read the target of a symbolic link +- * +- * The buffer should be filled with a null terminated string. The +- * buffer size argument includes the space for the terminating +- * null character. If the linkname is too long to fit in the +- * buffer, it should be truncated. The return value should be 0 +- * for success. +- */ +- int (*readlink) (const char *, char *, size_t); +- +- /** Create a file node +- * +- * This is called for creation of all non-directory, non-symlink +- * nodes. If the filesystem defines a create() method, then for +- * regular files that will be called instead. +- */ +- int (*mknod) (const char *, mode_t, dev_t); +- +- /** Create a directory +- * +- * Note that the mode argument may not have the type specification +- * bits set, i.e. S_ISDIR(mode) can be false. To obtain the +- * correct directory type bits use mode|S_IFDIR +- * */ +- int (*mkdir) (const char *, mode_t); +- +- /** Remove a file */ +- int (*unlink) (const char *); +- +- /** Remove a directory */ +- int (*rmdir) (const char *); +- +- /** Create a symbolic link */ +- int (*symlink) (const char *, const char *); +- +- /** Rename a file +- * +- * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If +- * RENAME_NOREPLACE is specified, the filesystem must not +- * overwrite *newname* if it exists and return an error +- * instead. If `RENAME_EXCHANGE` is specified, the filesystem +- * must atomically exchange the two files, i.e. both must +- * exist and neither may be deleted. +- */ +- int (*rename) (const char *, const char *, unsigned int flags); +- +- /** Create a hard link to a file */ +- int (*link) (const char *, const char *); +- +- /** Change the permission bits of a file +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- */ +- int (*chmod) (const char *, mode_t, struct fuse_file_info *fi); +- +- /** Change the owner and group of a file +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*chown) (const char *, uid_t, gid_t, struct fuse_file_info *fi); +- +- /** Change the size of a file +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*truncate) (const char *, off_t, struct fuse_file_info *fi); +- +- /** Open a file +- * +- * Open flags are available in fi->flags. The following rules +- * apply. +- * +- * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be +- * filtered out / handled by the kernel. +- * +- * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH) +- * should be used by the filesystem to check if the operation is +- * permitted. If the ``-o default_permissions`` mount option is +- * given, this check is already done by the kernel before calling +- * open() and may thus be omitted by the filesystem. +- * +- * - When writeback caching is enabled, the kernel may send +- * read requests even for files opened with O_WRONLY. The +- * filesystem should be prepared to handle this. +- * +- * - When writeback caching is disabled, the filesystem is +- * expected to properly handle the O_APPEND flag and ensure +- * that each write is appending to the end of the file. +- * +- * - When writeback caching is enabled, the kernel will +- * handle O_APPEND. However, unless all changes to the file +- * come through the kernel this will not work reliably. The +- * filesystem should thus either ignore the O_APPEND flag +- * (and let the kernel handle it), or return an error +- * (indicating that reliably O_APPEND is not available). +- * +- * Filesystem may store an arbitrary file handle (pointer, +- * index, etc) in fi->fh, and use this in other all other file +- * operations (read, write, flush, release, fsync). +- * +- * Filesystem may also implement stateless file I/O and not store +- * anything in fi->fh. +- * +- * There are also some flags (direct_io, keep_cache) which the +- * filesystem may set in fi, to change the way the file is opened. +- * See fuse_file_info structure in for more details. +- * +- * If this request is answered with an error code of ENOSYS +- * and FUSE_CAP_NO_OPEN_SUPPORT is set in +- * `fuse_conn_info.capable`, this is treated as success and +- * future calls to open will also succeed without being send +- * to the filesystem process. +- * +- */ +- int (*open) (const char *, struct fuse_file_info *); +- +- /** Read data from an open file +- * +- * Read should return exactly the number of bytes requested except +- * on EOF or error, otherwise the rest of the data will be +- * substituted with zeroes. An exception to this is when the +- * 'direct_io' mount option is specified, in which case the return +- * value of the read system call will reflect the return value of +- * this operation. +- */ +- int (*read) (const char *, char *, size_t, off_t, +- struct fuse_file_info *); +- +- /** Write data to an open file +- * +- * Write should return exactly the number of bytes requested +- * except on error. An exception to this is when the 'direct_io' +- * mount option is specified (see read operation). +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*write) (const char *, const char *, size_t, off_t, +- struct fuse_file_info *); +- +- /** Get file system statistics +- * +- * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored +- */ +- int (*statfs) (const char *, struct statvfs *); +- +- /** Possibly flush cached data +- * +- * BIG NOTE: This is not equivalent to fsync(). It's not a +- * request to sync dirty data. +- * +- * Flush is called on each close() of a file descriptor, as opposed to +- * release which is called on the close of the last file descriptor for +- * a file. Under Linux, errors returned by flush() will be passed to +- * userspace as errors from close(), so flush() is a good place to write +- * back any cached dirty data. However, many applications ignore errors +- * on close(), and on non-Linux systems, close() may succeed even if flush() +- * returns an error. For these reasons, filesystems should not assume +- * that errors returned by flush will ever be noticed or even +- * delivered. +- * +- * NOTE: The flush() method may be called more than once for each +- * open(). This happens if more than one file descriptor refers to an +- * open file handle, e.g. due to dup(), dup2() or fork() calls. It is +- * not possible to determine if a flush is final, so each flush should +- * be treated equally. Multiple write-flush sequences are relatively +- * rare, so this shouldn't be a problem. +- * +- * Filesystems shouldn't assume that flush will be called at any +- * particular point. It may be called more times than expected, or not +- * at all. +- * +- * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html +- */ +- int (*flush) (const char *, struct fuse_file_info *); +- +- /** Release an open file +- * +- * Release is called when there are no more references to an open +- * file: all file descriptors are closed and all memory mappings +- * are unmapped. +- * +- * For every open() call there will be exactly one release() call +- * with the same flags and file handle. It is possible to +- * have a file opened more than once, in which case only the last +- * release will mean, that no more reads/writes will happen on the +- * file. The return value of release is ignored. +- */ +- int (*release) (const char *, struct fuse_file_info *); +- +- /** Synchronize file contents +- * +- * If the datasync parameter is non-zero, then only the user data +- * should be flushed, not the meta data. +- */ +- int (*fsync) (const char *, int, struct fuse_file_info *); +- +- /** Set extended attributes */ +- int (*setxattr) (const char *, const char *, const char *, size_t, int); +- +- /** Get extended attributes */ +- int (*getxattr) (const char *, const char *, char *, size_t); +- +- /** List extended attributes */ +- int (*listxattr) (const char *, char *, size_t); +- +- /** Remove extended attributes */ +- int (*removexattr) (const char *, const char *); +- +- /** Open directory +- * +- * Unless the 'default_permissions' mount option is given, +- * this method should check if opendir is permitted for this +- * directory. Optionally opendir may also return an arbitrary +- * filehandle in the fuse_file_info structure, which will be +- * passed to readdir, releasedir and fsyncdir. +- */ +- int (*opendir) (const char *, struct fuse_file_info *); +- +- /** Read directory +- * +- * The filesystem may choose between two modes of operation: +- * +- * 1) The readdir implementation ignores the offset parameter, and +- * passes zero to the filler function's offset. The filler +- * function will not return '1' (unless an error happens), so the +- * whole directory is read in a single readdir operation. +- * +- * 2) The readdir implementation keeps track of the offsets of the +- * directory entries. It uses the offset parameter and always +- * passes non-zero offset to the filler function. When the buffer +- * is full (or an error happens) the filler function will return +- * '1'. +- */ +- int (*readdir) (const char *, void *, fuse_fill_dir_t, off_t, +- struct fuse_file_info *, enum fuse_readdir_flags); +- +- /** Release directory +- */ +- int (*releasedir) (const char *, struct fuse_file_info *); +- +- /** Synchronize directory contents +- * +- * If the datasync parameter is non-zero, then only the user data +- * should be flushed, not the meta data +- */ +- int (*fsyncdir) (const char *, int, struct fuse_file_info *); +- +- /** +- * Initialize filesystem +- * +- * The return value will passed in the `private_data` field of +- * `struct fuse_context` to all file operations, and as a +- * parameter to the destroy() method. It overrides the initial +- * value provided to fuse_main() / fuse_new(). +- */ +- void *(*init) (struct fuse_conn_info *conn, +- struct fuse_config *cfg); +- +- /** +- * Clean up filesystem +- * +- * Called on filesystem exit. +- */ +- void (*destroy) (void *private_data); +- +- /** +- * Check file access permissions +- * +- * This will be called for the access() system call. If the +- * 'default_permissions' mount option is given, this method is not +- * called. +- * +- * This method is not called under Linux kernel versions 2.4.x +- */ +- int (*access) (const char *, int); +- +- /** +- * Create and open a file +- * +- * If the file does not exist, first create it with the specified +- * mode, and then open it. +- * +- * If this method is not implemented or under Linux kernel +- * versions earlier than 2.6.15, the mknod() and open() methods +- * will be called instead. +- */ +- int (*create) (const char *, mode_t, struct fuse_file_info *); +- +- /** +- * Perform POSIX file locking operation +- * +- * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW. +- * +- * For the meaning of fields in 'struct flock' see the man page +- * for fcntl(2). The l_whence field will always be set to +- * SEEK_SET. +- * +- * For checking lock ownership, the 'fuse_file_info->owner' +- * argument must be used. +- * +- * For F_GETLK operation, the library will first check currently +- * held locks, and if a conflicting lock is found it will return +- * information without calling this method. This ensures, that +- * for local locks the l_pid field is correctly filled in. The +- * results may not be accurate in case of race conditions and in +- * the presence of hard links, but it's unlikely that an +- * application would rely on accurate GETLK results in these +- * cases. If a conflicting lock is not found, this method will be +- * called, and the filesystem may fill out l_pid by a meaningful +- * value, or it may leave this field zero. +- * +- * For F_SETLK and F_SETLKW the l_pid field will be set to the pid +- * of the process performing the locking operation. +- * +- * Note: if this method is not implemented, the kernel will still +- * allow file locking to work locally. Hence it is only +- * interesting for network filesystems and similar. +- */ +- int (*lock) (const char *, struct fuse_file_info *, int cmd, +- struct flock *); +- +- /** +- * Change the access and modification times of a file with +- * nanosecond resolution +- * +- * This supersedes the old utime() interface. New applications +- * should use this. +- * +- * `fi` will always be NULL if the file is not currenlty open, but +- * may also be NULL if the file is open. +- * +- * See the utimensat(2) man page for details. +- */ +- int (*utimens) (const char *, const struct timespec tv[2], +- struct fuse_file_info *fi); +- +- /** +- * Map block index within file to block index within device +- * +- * Note: This makes sense only for block device backed filesystems +- * mounted with the 'blkdev' option +- */ +- int (*bmap) (const char *, size_t blocksize, uint64_t *idx); +- +- /** +- * Ioctl +- * +- * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in +- * 64bit environment. The size and direction of data is +- * determined by _IOC_*() decoding of cmd. For _IOC_NONE, +- * data will be NULL, for _IOC_WRITE data is out area, for +- * _IOC_READ in area and if both are set in/out area. In all +- * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes. +- * +- * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a +- * directory file handle. +- * +- * Note : the unsigned long request submitted by the application +- * is truncated to 32 bits. +- */ +- int (*ioctl) (const char *, unsigned int cmd, void *arg, +- struct fuse_file_info *, unsigned int flags, void *data); +- +- /** +- * Poll for IO readiness events +- * +- * Note: If ph is non-NULL, the client should notify +- * when IO readiness events occur by calling +- * fuse_notify_poll() with the specified ph. +- * +- * Regardless of the number of times poll with a non-NULL ph +- * is received, single notification is enough to clear all. +- * Notifying more times incurs overhead but doesn't harm +- * correctness. +- * +- * The callee is responsible for destroying ph with +- * fuse_pollhandle_destroy() when no longer in use. +- */ +- int (*poll) (const char *, struct fuse_file_info *, +- struct fuse_pollhandle *ph, unsigned *reventsp); +- +- /** Write contents of buffer to an open file +- * +- * Similar to the write() method, but data is supplied in a +- * generic buffer. Use fuse_buf_copy() to transfer data to +- * the destination. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- */ +- int (*write_buf) (const char *, struct fuse_bufvec *buf, off_t off, +- struct fuse_file_info *); +- +- /** Store data from an open file in a buffer +- * +- * Similar to the read() method, but data is stored and +- * returned in a generic buffer. +- * +- * No actual copying of data has to take place, the source +- * file descriptor may simply be stored in the buffer for +- * later data transfer. +- * +- * The buffer must be allocated dynamically and stored at the +- * location pointed to by bufp. If the buffer contains memory +- * regions, they too must be allocated using malloc(). The +- * allocated memory will be freed by the caller. +- */ +- int (*read_buf) (const char *, struct fuse_bufvec **bufp, +- size_t size, off_t off, struct fuse_file_info *); +- /** +- * Perform BSD file locking operation +- * +- * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN +- * +- * Nonblocking requests will be indicated by ORing LOCK_NB to +- * the above operations +- * +- * For more information see the flock(2) manual page. +- * +- * Additionally fi->owner will be set to a value unique to +- * this open file. This same value will be supplied to +- * ->release() when the file is released. +- * +- * Note: if this method is not implemented, the kernel will still +- * allow file locking to work locally. Hence it is only +- * interesting for network filesystems and similar. +- */ +- int (*flock) (const char *, struct fuse_file_info *, int op); +- +- /** +- * Allocates space for an open file +- * +- * This function ensures that required space is allocated for specified +- * file. If this function returns success then any subsequent write +- * request to specified range is guaranteed not to fail because of lack +- * of space on the file system media. +- */ +- int (*fallocate) (const char *, int, off_t, off_t, +- struct fuse_file_info *); +- +- /** +- * Copy a range of data from one file to another +- * +- * Performs an optimized copy between two file descriptors without the +- * additional cost of transferring data through the FUSE kernel module +- * to user space (glibc) and then back into the FUSE filesystem again. +- * +- * In case this method is not implemented, glibc falls back to reading +- * data from the source and writing to the destination. Effectively +- * doing an inefficient copy of the data. +- */ +- ssize_t (*copy_file_range) (const char *path_in, +- struct fuse_file_info *fi_in, +- off_t offset_in, const char *path_out, +- struct fuse_file_info *fi_out, +- off_t offset_out, size_t size, int flags); +- +- /** +- * Find next data or hole after the specified offset +- */ +- off_t (*lseek) (const char *, off_t off, int whence, struct fuse_file_info *); ++ /** ++ * Get file attributes. ++ * ++ * Similar to stat(). The 'st_dev' and 'st_blksize' fields are ++ * ignored. The 'st_ino' field is ignored except if the 'use_ino' ++ * mount option is given. In that case it is passed to userspace, ++ * but libfuse and the kernel will still assign a different ++ * inode for internal use (called the "nodeid"). ++ * ++ * `fi` will always be NULL if the file is not currently open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*getattr)(const char *, struct stat *, struct fuse_file_info *fi); ++ ++ /** ++ * Read the target of a symbolic link ++ * ++ * The buffer should be filled with a null terminated string. The ++ * buffer size argument includes the space for the terminating ++ * null character. If the linkname is too long to fit in the ++ * buffer, it should be truncated. The return value should be 0 ++ * for success. ++ */ ++ int (*readlink)(const char *, char *, size_t); ++ ++ /** ++ * Create a file node ++ * ++ * This is called for creation of all non-directory, non-symlink ++ * nodes. If the filesystem defines a create() method, then for ++ * regular files that will be called instead. ++ */ ++ int (*mknod)(const char *, mode_t, dev_t); ++ ++ /** ++ * Create a directory ++ * ++ * Note that the mode argument may not have the type specification ++ * bits set, i.e. S_ISDIR(mode) can be false. To obtain the ++ * correct directory type bits use mode|S_IFDIR ++ */ ++ int (*mkdir)(const char *, mode_t); ++ ++ /** Remove a file */ ++ int (*unlink)(const char *); ++ ++ /** Remove a directory */ ++ int (*rmdir)(const char *); ++ ++ /** Create a symbolic link */ ++ int (*symlink)(const char *, const char *); ++ ++ /** ++ * Rename a file ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e. both must ++ * exist and neither may be deleted. ++ */ ++ int (*rename)(const char *, const char *, unsigned int flags); ++ ++ /** Create a hard link to a file */ ++ int (*link)(const char *, const char *); ++ ++ /** ++ * Change the permission bits of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*chmod)(const char *, mode_t, struct fuse_file_info *fi); ++ ++ /** ++ * Change the owner and group of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*chown)(const char *, uid_t, gid_t, struct fuse_file_info *fi); ++ ++ /** ++ * Change the size of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*truncate)(const char *, off_t, struct fuse_file_info *fi); ++ ++ /** ++ * Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH) ++ * should be used by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount option is ++ * given, this check is already done by the kernel before calling ++ * open() and may thus be omitted by the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in other all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. ++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in for more details. ++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open will also succeed without being send ++ * to the filesystem process. ++ * ++ */ ++ int (*open)(const char *, struct fuse_file_info *); ++ ++ /** ++ * Read data from an open file ++ * ++ * Read should return exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the ++ * 'direct_io' mount option is specified, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ */ ++ int (*read)(const char *, char *, size_t, off_t, struct fuse_file_info *); ++ ++ /** ++ * Write data to an open file ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the 'direct_io' ++ * mount option is specified (see read operation). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write)(const char *, const char *, size_t, off_t, ++ struct fuse_file_info *); ++ ++ /** ++ * Get file system statistics ++ * ++ * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored ++ */ ++ int (*statfs)(const char *, struct statvfs *); ++ ++ /** ++ * Possibly flush cached data ++ * ++ * BIG NOTE: This is not equivalent to fsync(). It's not a ++ * request to sync dirty data. ++ * ++ * Flush is called on each close() of a file descriptor, as opposed to ++ * release which is called on the close of the last file descriptor for ++ * a file. Under Linux, errors returned by flush() will be passed to ++ * userspace as errors from close(), so flush() is a good place to write ++ * back any cached dirty data. However, many applications ignore errors ++ * on close(), and on non-Linux systems, close() may succeed even if flush() ++ * returns an error. For these reasons, filesystems should not assume ++ * that errors returned by flush will ever be noticed or even ++ * delivered. ++ * ++ * NOTE: The flush() method may be called more than once for each ++ * open(). This happens if more than one file descriptor refers to an ++ * open file handle, e.g. due to dup(), dup2() or fork() calls. It is ++ * not possible to determine if a flush is final, so each flush should ++ * be treated equally. Multiple write-flush sequences are relatively ++ * rare, so this shouldn't be a problem. ++ * ++ * Filesystems shouldn't assume that flush will be called at any ++ * particular point. It may be called more times than expected, or not ++ * at all. ++ * ++ * [close]: ++ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ int (*flush)(const char *, struct fuse_file_info *); ++ ++ /** ++ * Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open() call there will be exactly one release() call ++ * with the same flags and file handle. It is possible to ++ * have a file opened more than once, in which case only the last ++ * release will mean, that no more reads/writes will happen on the ++ * file. The return value of release is ignored. ++ */ ++ int (*release)(const char *, struct fuse_file_info *); ++ ++ /* ++ * Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data. ++ */ ++ int (*fsync)(const char *, int, struct fuse_file_info *); ++ ++ /** Set extended attributes */ ++ int (*setxattr)(const char *, const char *, const char *, size_t, int); ++ ++ /** Get extended attributes */ ++ int (*getxattr)(const char *, const char *, char *, size_t); ++ ++ /** List extended attributes */ ++ int (*listxattr)(const char *, char *, size_t); ++ ++ /** Remove extended attributes */ ++ int (*removexattr)(const char *, const char *); ++ ++ /* ++ * Open directory ++ * ++ * Unless the 'default_permissions' mount option is given, ++ * this method should check if opendir is permitted for this ++ * directory. Optionally opendir may also return an arbitrary ++ * filehandle in the fuse_file_info structure, which will be ++ * passed to readdir, releasedir and fsyncdir. ++ */ ++ int (*opendir)(const char *, struct fuse_file_info *); ++ ++ /* ++ * Read directory ++ * ++ * The filesystem may choose between two modes of operation: ++ * ++ * 1) The readdir implementation ignores the offset parameter, and ++ * passes zero to the filler function's offset. The filler ++ * function will not return '1' (unless an error happens), so the ++ * whole directory is read in a single readdir operation. ++ * ++ * 2) The readdir implementation keeps track of the offsets of the ++ * directory entries. It uses the offset parameter and always ++ * passes non-zero offset to the filler function. When the buffer ++ * is full (or an error happens) the filler function will return ++ * '1'. ++ */ ++ int (*readdir)(const char *, void *, fuse_fill_dir_t, off_t, ++ struct fuse_file_info *, enum fuse_readdir_flags); ++ ++ /** ++ * Release directory ++ */ ++ int (*releasedir)(const char *, struct fuse_file_info *); ++ ++ /** ++ * Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data ++ */ ++ int (*fsyncdir)(const char *, int, struct fuse_file_info *); ++ ++ /** ++ * Initialize filesystem ++ * ++ * The return value will passed in the `private_data` field of ++ * `struct fuse_context` to all file operations, and as a ++ * parameter to the destroy() method. It overrides the initial ++ * value provided to fuse_main() / fuse_new(). ++ */ ++ void *(*init)(struct fuse_conn_info *conn, struct fuse_config *cfg); ++ ++ /** ++ * Clean up filesystem ++ * ++ * Called on filesystem exit. ++ */ ++ void (*destroy)(void *private_data); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() system call. If the ++ * 'default_permissions' mount option is given, this method is not ++ * called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ */ ++ int (*access)(const char *, int); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ */ ++ int (*create)(const char *, mode_t, struct fuse_file_info *); ++ ++ /** ++ * Perform POSIX file locking operation ++ * ++ * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW. ++ * ++ * For the meaning of fields in 'struct flock' see the man page ++ * for fcntl(2). The l_whence field will always be set to ++ * SEEK_SET. ++ * ++ * For checking lock ownership, the 'fuse_file_info->owner' ++ * argument must be used. ++ * ++ * For F_GETLK operation, the library will first check currently ++ * held locks, and if a conflicting lock is found it will return ++ * information without calling this method. This ensures, that ++ * for local locks the l_pid field is correctly filled in. The ++ * results may not be accurate in case of race conditions and in ++ * the presence of hard links, but it's unlikely that an ++ * application would rely on accurate GETLK results in these ++ * cases. If a conflicting lock is not found, this method will be ++ * called, and the filesystem may fill out l_pid by a meaningful ++ * value, or it may leave this field zero. ++ * ++ * For F_SETLK and F_SETLKW the l_pid field will be set to the pid ++ * of the process performing the locking operation. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*lock)(const char *, struct fuse_file_info *, int cmd, struct flock *); ++ ++ /** ++ * Change the access and modification times of a file with ++ * nanosecond resolution ++ * ++ * This supersedes the old utime() interface. New applications ++ * should use this. ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * See the utimensat(2) man page for details. ++ */ ++ int (*utimens)(const char *, const struct timespec tv[2], ++ struct fuse_file_info *fi); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ */ ++ int (*bmap)(const char *, size_t blocksize, uint64_t *idx); ++ ++ /** ++ * Ioctl ++ * ++ * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in ++ * 64bit environment. The size and direction of data is ++ * determined by _IOC_*() decoding of cmd. For _IOC_NONE, ++ * data will be NULL, for _IOC_WRITE data is out area, for ++ * _IOC_READ in area and if both are set in/out area. In all ++ * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes. ++ * ++ * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a ++ * directory file handle. ++ * ++ * Note : the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ int (*ioctl)(const char *, unsigned int cmd, void *arg, ++ struct fuse_file_info *, unsigned int flags, void *data); ++ ++ /** ++ * Poll for IO readiness events ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness. ++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ */ ++ int (*poll)(const char *, struct fuse_file_info *, ++ struct fuse_pollhandle *ph, unsigned *reventsp); ++ ++ /* ++ * Write contents of buffer to an open file ++ * ++ * Similar to the write() method, but data is supplied in a ++ * generic buffer. Use fuse_buf_copy() to transfer data to ++ * the destination. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write_buf)(const char *, struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *); ++ ++ /* ++ * Store data from an open file in a buffer ++ * ++ * Similar to the read() method, but data is stored and ++ * returned in a generic buffer. ++ * ++ * No actual copying of data has to take place, the source ++ * file descriptor may simply be stored in the buffer for ++ * later data transfer. ++ * ++ * The buffer must be allocated dynamically and stored at the ++ * location pointed to by bufp. If the buffer contains memory ++ * regions, they too must be allocated using malloc(). The ++ * allocated memory will be freed by the caller. ++ */ ++ int (*read_buf)(const char *, struct fuse_bufvec **bufp, size_t size, ++ off_t off, struct fuse_file_info *); ++ /** ++ * Perform BSD file locking operation ++ * ++ * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN ++ * ++ * Nonblocking requests will be indicated by ORing LOCK_NB to ++ * the above operations ++ * ++ * For more information see the flock(2) manual page. ++ * ++ * Additionally fi->owner will be set to a value unique to ++ * this open file. This same value will be supplied to ++ * ->release() when the file is released. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*flock)(const char *, struct fuse_file_info *, int op); ++ ++ /** ++ * Allocates space for an open file ++ * ++ * This function ensures that required space is allocated for specified ++ * file. If this function returns success then any subsequent write ++ * request to specified range is guaranteed not to fail because of lack ++ * of space on the file system media. ++ */ ++ int (*fallocate)(const char *, int, off_t, off_t, struct fuse_file_info *); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again. ++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. ++ */ ++ ssize_t (*copy_file_range)(const char *path_in, ++ struct fuse_file_info *fi_in, off_t offset_in, ++ const char *path_out, ++ struct fuse_file_info *fi_out, off_t offset_out, ++ size_t size, int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ */ ++ off_t (*lseek)(const char *, off_t off, int whence, ++ struct fuse_file_info *); + }; + +-/** Extra context that may be needed by some filesystems ++/* ++ * Extra context that may be needed by some filesystems + * + * The uid, gid and pid fields are not filled in case of a writepage + * operation. + */ + struct fuse_context { +- /** Pointer to the fuse object */ +- struct fuse *fuse; ++ /** Pointer to the fuse object */ ++ struct fuse *fuse; + +- /** User ID of the calling process */ +- uid_t uid; ++ /** User ID of the calling process */ ++ uid_t uid; + +- /** Group ID of the calling process */ +- gid_t gid; ++ /** Group ID of the calling process */ ++ gid_t gid; + +- /** Process ID of the calling thread */ +- pid_t pid; ++ /** Process ID of the calling thread */ ++ pid_t pid; + +- /** Private filesystem data */ +- void *private_data; ++ /** Private filesystem data */ ++ void *private_data; + +- /** Umask of the calling process */ +- mode_t umask; ++ /** Umask of the calling process */ ++ mode_t umask; + }; + + /** +@@ -859,15 +880,15 @@ struct fuse_context { + * Example usage, see hello.c + */ + /* +- int fuse_main(int argc, char *argv[], const struct fuse_operations *op, +- void *private_data); +-*/ +-#define fuse_main(argc, argv, op, private_data) \ +- fuse_main_real(argc, argv, op, sizeof(*(op)), private_data) ++ * int fuse_main(int argc, char *argv[], const struct fuse_operations *op, ++ * void *private_data); ++ */ ++#define fuse_main(argc, argv, op, private_data) \ ++ fuse_main_real(argc, argv, op, sizeof(*(op)), private_data) + +-/* ----------------------------------------------------------- * +- * More detailed API * +- * ----------------------------------------------------------- */ ++/* ++ * More detailed API ++ */ + + /** + * Print available options (high- and low-level) to stdout. This is +@@ -910,12 +931,13 @@ void fuse_lib_help(struct fuse_args *args); + * @return the created FUSE handle + */ + #if FUSE_USE_VERSION == 30 +-struct fuse *fuse_new_30(struct fuse_args *args, const struct fuse_operations *op, +- size_t op_size, void *private_data); ++struct fuse *fuse_new_30(struct fuse_args *args, ++ const struct fuse_operations *op, size_t op_size, ++ void *private_data); + #define fuse_new(args, op, size, data) fuse_new_30(args, op, size, data) + #else + struct fuse *fuse_new(struct fuse_args *args, const struct fuse_operations *op, +- size_t op_size, void *private_data); ++ size_t op_size, void *private_data); + #endif + + /** +@@ -940,7 +962,7 @@ void fuse_unmount(struct fuse *f); + /** + * Destroy the FUSE handle. + * +- * NOTE: This function does not unmount the filesystem. If this is ++ * NOTE: This function does not unmount the filesystem. If this is + * needed, call fuse_unmount() before calling this function. + * + * @param f the FUSE handle +@@ -1030,7 +1052,7 @@ int fuse_invalidate_path(struct fuse *f, const char *path); + * Do not call this directly, use fuse_main() + */ + int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, +- size_t op_size, void *private_data); ++ size_t op_size, void *private_data); + + /** + * Start the cleanup thread when using option "remember". +@@ -1081,89 +1103,87 @@ struct fuse_fs; + */ + + int fuse_fs_getattr(struct fuse_fs *fs, const char *path, struct stat *buf, +- struct fuse_file_info *fi); +-int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, +- const char *newpath, unsigned int flags); ++ struct fuse_file_info *fi); ++int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, const char *newpath, ++ unsigned int flags); + int fuse_fs_unlink(struct fuse_fs *fs, const char *path); + int fuse_fs_rmdir(struct fuse_fs *fs, const char *path); +-int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, +- const char *path); ++int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, const char *path); + int fuse_fs_link(struct fuse_fs *fs, const char *oldpath, const char *newpath); +-int fuse_fs_release(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++int fuse_fs_release(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); + int fuse_fs_open(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_read(struct fuse_fs *fs, const char *path, char *buf, size_t size, +- off_t off, struct fuse_file_info *fi); ++ off_t off, struct fuse_file_info *fi); + int fuse_fs_read_buf(struct fuse_fs *fs, const char *path, +- struct fuse_bufvec **bufp, size_t size, off_t off, +- struct fuse_file_info *fi); ++ struct fuse_bufvec **bufp, size_t size, off_t off, ++ struct fuse_file_info *fi); + int fuse_fs_write(struct fuse_fs *fs, const char *path, const char *buf, +- size_t size, off_t off, struct fuse_file_info *fi); ++ size_t size, off_t off, struct fuse_file_info *fi); + int fuse_fs_write_buf(struct fuse_fs *fs, const char *path, +- struct fuse_bufvec *buf, off_t off, +- struct fuse_file_info *fi); ++ struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *fi); + int fuse_fs_fsync(struct fuse_fs *fs, const char *path, int datasync, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_flush(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_statfs(struct fuse_fs *fs, const char *path, struct statvfs *buf); + int fuse_fs_opendir(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_readdir(struct fuse_fs *fs, const char *path, void *buf, +- fuse_fill_dir_t filler, off_t off, +- struct fuse_file_info *fi, enum fuse_readdir_flags flags); ++ fuse_fill_dir_t filler, off_t off, ++ struct fuse_file_info *fi, enum fuse_readdir_flags flags); + int fuse_fs_fsyncdir(struct fuse_fs *fs, const char *path, int datasync, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_releasedir(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_create(struct fuse_fs *fs, const char *path, mode_t mode, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_lock(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi, int cmd, struct flock *lock); ++ struct fuse_file_info *fi, int cmd, struct flock *lock); + int fuse_fs_flock(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi, int op); ++ struct fuse_file_info *fi, int op); + int fuse_fs_chmod(struct fuse_fs *fs, const char *path, mode_t mode, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_chown(struct fuse_fs *fs, const char *path, uid_t uid, gid_t gid, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_truncate(struct fuse_fs *fs, const char *path, off_t size, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + int fuse_fs_utimens(struct fuse_fs *fs, const char *path, +- const struct timespec tv[2], struct fuse_file_info *fi); ++ const struct timespec tv[2], struct fuse_file_info *fi); + int fuse_fs_access(struct fuse_fs *fs, const char *path, int mask); + int fuse_fs_readlink(struct fuse_fs *fs, const char *path, char *buf, +- size_t len); ++ size_t len); + int fuse_fs_mknod(struct fuse_fs *fs, const char *path, mode_t mode, +- dev_t rdev); ++ dev_t rdev); + int fuse_fs_mkdir(struct fuse_fs *fs, const char *path, mode_t mode); + int fuse_fs_setxattr(struct fuse_fs *fs, const char *path, const char *name, +- const char *value, size_t size, int flags); ++ const char *value, size_t size, int flags); + int fuse_fs_getxattr(struct fuse_fs *fs, const char *path, const char *name, +- char *value, size_t size); ++ char *value, size_t size); + int fuse_fs_listxattr(struct fuse_fs *fs, const char *path, char *list, +- size_t size); +-int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, +- const char *name); ++ size_t size); ++int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, const char *name); + int fuse_fs_bmap(struct fuse_fs *fs, const char *path, size_t blocksize, +- uint64_t *idx); ++ uint64_t *idx); + int fuse_fs_ioctl(struct fuse_fs *fs, const char *path, unsigned int cmd, +- void *arg, struct fuse_file_info *fi, unsigned int flags, +- void *data); ++ void *arg, struct fuse_file_info *fi, unsigned int flags, ++ void *data); + int fuse_fs_poll(struct fuse_fs *fs, const char *path, +- struct fuse_file_info *fi, struct fuse_pollhandle *ph, +- unsigned *reventsp); ++ struct fuse_file_info *fi, struct fuse_pollhandle *ph, ++ unsigned *reventsp); + int fuse_fs_fallocate(struct fuse_fs *fs, const char *path, int mode, +- off_t offset, off_t length, struct fuse_file_info *fi); ++ off_t offset, off_t length, struct fuse_file_info *fi); + ssize_t fuse_fs_copy_file_range(struct fuse_fs *fs, const char *path_in, +- struct fuse_file_info *fi_in, off_t off_in, +- const char *path_out, +- struct fuse_file_info *fi_out, off_t off_out, +- size_t len, int flags); ++ struct fuse_file_info *fi_in, off_t off_in, ++ const char *path_out, ++ struct fuse_file_info *fi_out, off_t off_out, ++ size_t len, int flags); + off_t fuse_fs_lseek(struct fuse_fs *fs, const char *path, off_t off, int whence, +- struct fuse_file_info *fi); ++ struct fuse_file_info *fi); + void fuse_fs_init(struct fuse_fs *fs, struct fuse_conn_info *conn, +- struct fuse_config *cfg); ++ struct fuse_config *cfg); + void fuse_fs_destroy(struct fuse_fs *fs); + + int fuse_notify_poll(struct fuse_pollhandle *ph); +@@ -1182,7 +1202,7 @@ int fuse_notify_poll(struct fuse_pollhandle *ph); + * @return a new filesystem object + */ + struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size, +- void *private_data); ++ void *private_data); + + /** + * Factory for creating filesystem objects +@@ -1199,7 +1219,7 @@ struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size, + * @return the new filesystem object + */ + typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args, +- struct fuse_fs *fs[]); ++ struct fuse_fs *fs[]); + /** + * Register filesystem module + * +@@ -1211,7 +1231,7 @@ typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args, + * @param factory_ the factory function for this filesystem module + */ + #define FUSE_REGISTER_MODULE(name_, factory_) \ +- fuse_module_factory_t fuse_module_ ## name_ ## _factory = factory_ ++ fuse_module_factory_t fuse_module_##name_##_factory = factory_ + + /** Get session from fuse object */ + struct fuse_session *fuse_get_session(struct fuse *f); +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index bf8f8cc..bd9bf86 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -1,21 +1,23 @@ +-/* FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++/* ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + /** @file */ + + #if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_) +-#error "Never include directly; use or instead." ++#error \ ++ "Never include directly; use or instead." + #endif + + #ifndef FUSE_COMMON_H_ + #define FUSE_COMMON_H_ + +-#include "fuse_opt.h" + #include "fuse_log.h" ++#include "fuse_opt.h" + #include + #include + +@@ -25,7 +27,7 @@ + /** Minor version of FUSE library interface */ + #define FUSE_MINOR_VERSION 2 + +-#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) ++#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) + #define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) + + /** +@@ -38,67 +40,83 @@ + * descriptors can share a single file handle. + */ + struct fuse_file_info { +- /** Open flags. Available in open() and release() */ +- int flags; +- +- /** In case of a write operation indicates if this was caused +- by a delayed write from the page cache. If so, then the +- context's pid, uid, and gid fields will not be valid, and +- the *fh* value may not match the *fh* value that would +- have been sent with the corresponding individual write +- requests if write caching had been disabled. */ +- unsigned int writepage : 1; +- +- /** Can be filled in by open, to use direct I/O on this file. */ +- unsigned int direct_io : 1; +- +- /** Can be filled in by open. It signals the kernel that any +- currently cached file data (ie., data that the filesystem +- provided the last time the file was open) need not be +- invalidated. Has no effect when set in other contexts (in +- particular it does nothing when set by opendir()). */ +- unsigned int keep_cache : 1; +- +- /** Indicates a flush operation. Set in flush operation, also +- maybe set in highlevel lock operation and lowlevel release +- operation. */ +- unsigned int flush : 1; +- +- /** Can be filled in by open, to indicate that the file is not +- seekable. */ +- unsigned int nonseekable : 1; +- +- /* Indicates that flock locks for this file should be +- released. If set, lock_owner shall contain a valid value. +- May only be set in ->release(). */ +- unsigned int flock_release : 1; +- +- /** Can be filled in by opendir. It signals the kernel to +- enable caching of entries returned by readdir(). Has no +- effect when set in other contexts (in particular it does +- nothing when set by open()). */ +- unsigned int cache_readdir : 1; +- +- /** Padding. Reserved for future use*/ +- unsigned int padding : 25; +- unsigned int padding2 : 32; +- +- /** File handle id. May be filled in by filesystem in create, +- * open, and opendir(). Available in most other file operations on the +- * same file handle. */ +- uint64_t fh; +- +- /** Lock owner id. Available in locking operations and flush */ +- uint64_t lock_owner; +- +- /** Requested poll events. Available in ->poll. Only set on kernels +- which support it. If unsupported, this field is set to zero. */ +- uint32_t poll_events; ++ /** Open flags. Available in open() and release() */ ++ int flags; ++ ++ /* ++ * In case of a write operation indicates if this was caused ++ * by a delayed write from the page cache. If so, then the ++ * context's pid, uid, and gid fields will not be valid, and ++ * the *fh* value may not match the *fh* value that would ++ * have been sent with the corresponding individual write ++ * requests if write caching had been disabled. ++ */ ++ unsigned int writepage:1; ++ ++ /** Can be filled in by open, to use direct I/O on this file. */ ++ unsigned int direct_io:1; ++ ++ /* ++ * Can be filled in by open. It signals the kernel that any ++ * currently cached file data (ie., data that the filesystem ++ * provided the last time the file was open) need not be ++ * invalidated. Has no effect when set in other contexts (in ++ * particular it does nothing when set by opendir()). ++ */ ++ unsigned int keep_cache:1; ++ ++ /* ++ * Indicates a flush operation. Set in flush operation, also ++ * maybe set in highlevel lock operation and lowlevel release ++ * operation. ++ */ ++ unsigned int flush:1; ++ ++ /* ++ * Can be filled in by open, to indicate that the file is not ++ * seekable. ++ */ ++ unsigned int nonseekable:1; ++ ++ /* ++ * Indicates that flock locks for this file should be ++ * released. If set, lock_owner shall contain a valid value. ++ * May only be set in ->release(). ++ */ ++ unsigned int flock_release:1; ++ ++ /* ++ * Can be filled in by opendir. It signals the kernel to ++ * enable caching of entries returned by readdir(). Has no ++ * effect when set in other contexts (in particular it does ++ * nothing when set by open()). ++ */ ++ unsigned int cache_readdir:1; ++ ++ /** Padding. Reserved for future use*/ ++ unsigned int padding:25; ++ unsigned int padding2:32; ++ ++ /* ++ * File handle id. May be filled in by filesystem in create, ++ * open, and opendir(). Available in most other file operations on the ++ * same file handle. ++ */ ++ uint64_t fh; ++ ++ /** Lock owner id. Available in locking operations and flush */ ++ uint64_t lock_owner; ++ ++ /* ++ * Requested poll events. Available in ->poll. Only set on kernels ++ * which support it. If unsupported, this field is set to zero. ++ */ ++ uint32_t poll_events; + }; + +-/************************************************************************** +- * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' * +- **************************************************************************/ ++/* ++ * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' ++ */ + + /** + * Indicates that the filesystem supports asynchronous read requests. +@@ -110,7 +128,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_ASYNC_READ (1 << 0) ++#define FUSE_CAP_ASYNC_READ (1 << 0) + + /** + * Indicates that the filesystem supports "remote" locking. +@@ -118,7 +136,7 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel, + * and if getlk() and setlk() handlers are implemented. + */ +-#define FUSE_CAP_POSIX_LOCKS (1 << 1) ++#define FUSE_CAP_POSIX_LOCKS (1 << 1) + + /** + * Indicates that the filesystem supports the O_TRUNC open flag. If +@@ -127,14 +145,14 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) ++#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) + + /** + * Indicates that the filesystem supports lookups of "." and "..". + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) ++#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) + + /** + * Indicates that the kernel should not apply the umask to the +@@ -142,7 +160,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_DONT_MASK (1 << 6) ++#define FUSE_CAP_DONT_MASK (1 << 6) + + /** + * Indicates that libfuse should try to use splice() when writing to +@@ -150,7 +168,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_SPLICE_WRITE (1 << 7) ++#define FUSE_CAP_SPLICE_WRITE (1 << 7) + + /** + * Indicates that libfuse should try to move pages instead of copying when +@@ -158,7 +176,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_SPLICE_MOVE (1 << 8) ++#define FUSE_CAP_SPLICE_MOVE (1 << 8) + + /** + * Indicates that libfuse should try to use splice() when reading from +@@ -167,7 +185,7 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a write_buf() handler. + */ +-#define FUSE_CAP_SPLICE_READ (1 << 9) ++#define FUSE_CAP_SPLICE_READ (1 << 9) + + /** + * If set, the calls to flock(2) will be emulated using POSIX locks and must +@@ -180,14 +198,14 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a flock() handler. + */ +-#define FUSE_CAP_FLOCK_LOCKS (1 << 10) ++#define FUSE_CAP_FLOCK_LOCKS (1 << 10) + + /** + * Indicates that the filesystem supports ioctl's on directories. + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_IOCTL_DIR (1 << 11) ++#define FUSE_CAP_IOCTL_DIR (1 << 11) + + /** + * Traditionally, while a file is open the FUSE kernel module only +@@ -209,7 +227,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) ++#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) + + /** + * Indicates that the filesystem supports readdirplus. +@@ -217,7 +235,7 @@ struct fuse_file_info { + * This feature is enabled by default when supported by the kernel and if the + * filesystem implements a readdirplus() handler. + */ +-#define FUSE_CAP_READDIRPLUS (1 << 13) ++#define FUSE_CAP_READDIRPLUS (1 << 13) + + /** + * Indicates that the filesystem supports adaptive readdirplus. +@@ -245,7 +263,7 @@ struct fuse_file_info { + * if the filesystem implements both a readdirplus() and a readdir() + * handler. + */ +-#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) ++#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) + + /** + * Indicates that the filesystem supports asynchronous direct I/O submission. +@@ -256,7 +274,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_ASYNC_DIO (1 << 15) ++#define FUSE_CAP_ASYNC_DIO (1 << 15) + + /** + * Indicates that writeback caching should be enabled. This means that +@@ -265,7 +283,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) ++#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) + + /** + * Indicates support for zero-message opens. If this flag is set in +@@ -278,7 +296,7 @@ struct fuse_file_info { + * Setting (or unsetting) this flag in the `want` field has *no + * effect*. + */ +-#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) ++#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) + + /** + * Indicates support for parallel directory operations. If this flag +@@ -288,7 +306,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) ++#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) + + /** + * Indicates support for POSIX ACLs. +@@ -307,7 +325,7 @@ struct fuse_file_info { + * + * This feature is disabled by default. + */ +-#define FUSE_CAP_POSIX_ACL (1 << 19) ++#define FUSE_CAP_POSIX_ACL (1 << 19) + + /** + * Indicates that the filesystem is responsible for unsetting +@@ -316,7 +334,7 @@ struct fuse_file_info { + * + * This feature is enabled by default when supported by the kernel. + */ +-#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) ++#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) + + /** + * Indicates support for zero-message opendirs. If this flag is set in +@@ -328,7 +346,7 @@ struct fuse_file_info { + * + * Setting (or unsetting) this flag in the `want` field has *no effect*. + */ +-#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) ++#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) + + /** + * Ioctl flags +@@ -340,12 +358,12 @@ struct fuse_file_info { + * + * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs + */ +-#define FUSE_IOCTL_COMPAT (1 << 0) +-#define FUSE_IOCTL_UNRESTRICTED (1 << 1) +-#define FUSE_IOCTL_RETRY (1 << 2) +-#define FUSE_IOCTL_DIR (1 << 4) ++#define FUSE_IOCTL_COMPAT (1 << 0) ++#define FUSE_IOCTL_UNRESTRICTED (1 << 1) ++#define FUSE_IOCTL_RETRY (1 << 2) ++#define FUSE_IOCTL_DIR (1 << 4) + +-#define FUSE_IOCTL_MAX_IOV 256 ++#define FUSE_IOCTL_MAX_IOV 256 + + /** + * Connection information, passed to the ->init() method +@@ -355,114 +373,114 @@ struct fuse_file_info { + * value must usually be smaller than the indicated value. + */ + struct fuse_conn_info { +- /** +- * Major version of the protocol (read-only) +- */ +- unsigned proto_major; +- +- /** +- * Minor version of the protocol (read-only) +- */ +- unsigned proto_minor; +- +- /** +- * Maximum size of the write buffer +- */ +- unsigned max_write; +- +- /** +- * Maximum size of read requests. A value of zero indicates no +- * limit. However, even if the filesystem does not specify a +- * limit, the maximum size of read requests will still be +- * limited by the kernel. +- * +- * NOTE: For the time being, the maximum size of read requests +- * must be set both here *and* passed to fuse_session_new() +- * using the ``-o max_read=`` mount option. At some point +- * in the future, specifying the mount option will no longer +- * be necessary. +- */ +- unsigned max_read; +- +- /** +- * Maximum readahead +- */ +- unsigned max_readahead; +- +- /** +- * Capability flags that the kernel supports (read-only) +- */ +- unsigned capable; +- +- /** +- * Capability flags that the filesystem wants to enable. +- * +- * libfuse attempts to initialize this field with +- * reasonable default values before calling the init() handler. +- */ +- unsigned want; +- +- /** +- * Maximum number of pending "background" requests. A +- * background request is any type of request for which the +- * total number is not limited by other means. As of kernel +- * 4.8, only two types of requests fall into this category: +- * +- * 1. Read-ahead requests +- * 2. Asynchronous direct I/O requests +- * +- * Read-ahead requests are generated (if max_readahead is +- * non-zero) by the kernel to preemptively fill its caches +- * when it anticipates that userspace will soon read more +- * data. +- * +- * Asynchronous direct I/O requests are generated if +- * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large +- * direct I/O request. In this case the kernel will internally +- * split it up into multiple smaller requests and submit them +- * to the filesystem concurrently. +- * +- * Note that the following requests are *not* background +- * requests: writeback requests (limited by the kernel's +- * flusher algorithm), regular (i.e., synchronous and +- * buffered) userspace read/write requests (limited to one per +- * thread), asynchronous read requests (Linux's io_submit(2) +- * call actually blocks, so these are also limited to one per +- * thread). +- */ +- unsigned max_background; +- +- /** +- * Kernel congestion threshold parameter. If the number of pending +- * background requests exceeds this number, the FUSE kernel module will +- * mark the filesystem as "congested". This instructs the kernel to +- * expect that queued requests will take some time to complete, and to +- * adjust its algorithms accordingly (e.g. by putting a waiting thread +- * to sleep instead of using a busy-loop). +- */ +- unsigned congestion_threshold; +- +- /** +- * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible +- * for updating mtime and ctime when write requests are received. The +- * updated values are passed to the filesystem with setattr() requests. +- * However, if the filesystem does not support the full resolution of +- * the kernel timestamps (nanoseconds), the mtime and ctime values used +- * by kernel and filesystem will differ (and result in an apparent +- * change of times after a cache flush). +- * +- * To prevent this problem, this variable can be used to inform the +- * kernel about the timestamp granularity supported by the file-system. +- * The value should be power of 10. The default is 1, i.e. full +- * nano-second resolution. Filesystems supporting only second resolution +- * should set this to 1000000000. +- */ +- unsigned time_gran; +- +- /** +- * For future use. +- */ +- unsigned reserved[22]; ++ /** ++ * Major version of the protocol (read-only) ++ */ ++ unsigned proto_major; ++ ++ /** ++ * Minor version of the protocol (read-only) ++ */ ++ unsigned proto_minor; ++ ++ /** ++ * Maximum size of the write buffer ++ */ ++ unsigned max_write; ++ ++ /** ++ * Maximum size of read requests. A value of zero indicates no ++ * limit. However, even if the filesystem does not specify a ++ * limit, the maximum size of read requests will still be ++ * limited by the kernel. ++ * ++ * NOTE: For the time being, the maximum size of read requests ++ * must be set both here *and* passed to fuse_session_new() ++ * using the ``-o max_read=`` mount option. At some point ++ * in the future, specifying the mount option will no longer ++ * be necessary. ++ */ ++ unsigned max_read; ++ ++ /** ++ * Maximum readahead ++ */ ++ unsigned max_readahead; ++ ++ /** ++ * Capability flags that the kernel supports (read-only) ++ */ ++ unsigned capable; ++ ++ /** ++ * Capability flags that the filesystem wants to enable. ++ * ++ * libfuse attempts to initialize this field with ++ * reasonable default values before calling the init() handler. ++ */ ++ unsigned want; ++ ++ /** ++ * Maximum number of pending "background" requests. A ++ * background request is any type of request for which the ++ * total number is not limited by other means. As of kernel ++ * 4.8, only two types of requests fall into this category: ++ * ++ * 1. Read-ahead requests ++ * 2. Asynchronous direct I/O requests ++ * ++ * Read-ahead requests are generated (if max_readahead is ++ * non-zero) by the kernel to preemptively fill its caches ++ * when it anticipates that userspace will soon read more ++ * data. ++ * ++ * Asynchronous direct I/O requests are generated if ++ * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large ++ * direct I/O request. In this case the kernel will internally ++ * split it up into multiple smaller requests and submit them ++ * to the filesystem concurrently. ++ * ++ * Note that the following requests are *not* background ++ * requests: writeback requests (limited by the kernel's ++ * flusher algorithm), regular (i.e., synchronous and ++ * buffered) userspace read/write requests (limited to one per ++ * thread), asynchronous read requests (Linux's io_submit(2) ++ * call actually blocks, so these are also limited to one per ++ * thread). ++ */ ++ unsigned max_background; ++ ++ /** ++ * Kernel congestion threshold parameter. If the number of pending ++ * background requests exceeds this number, the FUSE kernel module will ++ * mark the filesystem as "congested". This instructs the kernel to ++ * expect that queued requests will take some time to complete, and to ++ * adjust its algorithms accordingly (e.g. by putting a waiting thread ++ * to sleep instead of using a busy-loop). ++ */ ++ unsigned congestion_threshold; ++ ++ /** ++ * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible ++ * for updating mtime and ctime when write requests are received. The ++ * updated values are passed to the filesystem with setattr() requests. ++ * However, if the filesystem does not support the full resolution of ++ * the kernel timestamps (nanoseconds), the mtime and ctime values used ++ * by kernel and filesystem will differ (and result in an apparent ++ * change of times after a cache flush). ++ * ++ * To prevent this problem, this variable can be used to inform the ++ * kernel about the timestamp granularity supported by the file-system. ++ * The value should be power of 10. The default is 1, i.e. full ++ * nano-second resolution. Filesystems supporting only second resolution ++ * should set this to 1000000000. ++ */ ++ unsigned time_gran; ++ ++ /** ++ * For future use. ++ */ ++ unsigned reserved[22]; + }; + + struct fuse_session; +@@ -489,21 +507,20 @@ struct fuse_conn_info_opts; + * -o async_read sets FUSE_CAP_ASYNC_READ in conn->want + * -o sync_read unsets FUSE_CAP_ASYNC_READ in conn->want + * -o atomic_o_trunc sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want +- * -o no_remote_lock Equivalent to -o no_remote_flock,no_remote_posix_lock +- * -o no_remote_flock Unsets FUSE_CAP_FLOCK_LOCKS in conn->want +- * -o no_remote_posix_lock Unsets FUSE_CAP_POSIX_LOCKS in conn->want +- * -o [no_]splice_write (un-)sets FUSE_CAP_SPLICE_WRITE in conn->want +- * -o [no_]splice_move (un-)sets FUSE_CAP_SPLICE_MOVE in conn->want +- * -o [no_]splice_read (un-)sets FUSE_CAP_SPLICE_READ in conn->want +- * -o [no_]auto_inval_data (un-)sets FUSE_CAP_AUTO_INVAL_DATA in conn->want +- * -o readdirplus=no unsets FUSE_CAP_READDIRPLUS in conn->want +- * -o readdirplus=yes sets FUSE_CAP_READDIRPLUS and unsets +- * FUSE_CAP_READDIRPLUS_AUTO in conn->want +- * -o readdirplus=auto sets FUSE_CAP_READDIRPLUS and +- * FUSE_CAP_READDIRPLUS_AUTO in conn->want +- * -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in conn->want +- * -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in conn->want +- * -o time_gran=N sets conn->time_gran ++ * -o no_remote_lock Equivalent to -o ++ *no_remote_flock,no_remote_posix_lock -o no_remote_flock Unsets ++ *FUSE_CAP_FLOCK_LOCKS in conn->want -o no_remote_posix_lock Unsets ++ *FUSE_CAP_POSIX_LOCKS in conn->want -o [no_]splice_write (un-)sets ++ *FUSE_CAP_SPLICE_WRITE in conn->want -o [no_]splice_move (un-)sets ++ *FUSE_CAP_SPLICE_MOVE in conn->want -o [no_]splice_read (un-)sets ++ *FUSE_CAP_SPLICE_READ in conn->want -o [no_]auto_inval_data (un-)sets ++ *FUSE_CAP_AUTO_INVAL_DATA in conn->want -o readdirplus=no unsets ++ *FUSE_CAP_READDIRPLUS in conn->want -o readdirplus=yes sets ++ *FUSE_CAP_READDIRPLUS and unsets FUSE_CAP_READDIRPLUS_AUTO in conn->want -o ++ *readdirplus=auto sets FUSE_CAP_READDIRPLUS and FUSE_CAP_READDIRPLUS_AUTO ++ *in conn->want -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in ++ *conn->want -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in ++ *conn->want -o time_gran=N sets conn->time_gran + * + * Known options will be removed from *args*, unknown options will be + * passed through unchanged. +@@ -511,7 +528,7 @@ struct fuse_conn_info_opts; + * @param args argument vector (input+output) + * @return parsed options + **/ +-struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args); ++struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args); + + /** + * This function applies the (parsed) parameters in *opts* to the +@@ -521,7 +538,7 @@ struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args); + * option has been explicitly set. + */ + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, +- struct fuse_conn_info *conn); ++ struct fuse_conn_info *conn); + + /** + * Go into the background +@@ -552,81 +569,81 @@ const char *fuse_pkgversion(void); + */ + void fuse_pollhandle_destroy(struct fuse_pollhandle *ph); + +-/* ----------------------------------------------------------- * +- * Data buffer * +- * ----------------------------------------------------------- */ ++/* ++ * Data buffer ++ */ + + /** + * Buffer flags + */ + enum fuse_buf_flags { +- /** +- * Buffer contains a file descriptor +- * +- * If this flag is set, the .fd field is valid, otherwise the +- * .mem fields is valid. +- */ +- FUSE_BUF_IS_FD = (1 << 1), +- +- /** +- * Seek on the file descriptor +- * +- * If this flag is set then the .pos field is valid and is +- * used to seek to the given offset before performing +- * operation on file descriptor. +- */ +- FUSE_BUF_FD_SEEK = (1 << 2), +- +- /** +- * Retry operation on file descriptor +- * +- * If this flag is set then retry operation on file descriptor +- * until .size bytes have been copied or an error or EOF is +- * detected. +- */ +- FUSE_BUF_FD_RETRY = (1 << 3), ++ /** ++ * Buffer contains a file descriptor ++ * ++ * If this flag is set, the .fd field is valid, otherwise the ++ * .mem fields is valid. ++ */ ++ FUSE_BUF_IS_FD = (1 << 1), ++ ++ /** ++ * Seek on the file descriptor ++ * ++ * If this flag is set then the .pos field is valid and is ++ * used to seek to the given offset before performing ++ * operation on file descriptor. ++ */ ++ FUSE_BUF_FD_SEEK = (1 << 2), ++ ++ /** ++ * Retry operation on file descriptor ++ * ++ * If this flag is set then retry operation on file descriptor ++ * until .size bytes have been copied or an error or EOF is ++ * detected. ++ */ ++ FUSE_BUF_FD_RETRY = (1 << 3), + }; + + /** + * Buffer copy flags + */ + enum fuse_buf_copy_flags { +- /** +- * Don't use splice(2) +- * +- * Always fall back to using read and write instead of +- * splice(2) to copy data from one file descriptor to another. +- * +- * If this flag is not set, then only fall back if splice is +- * unavailable. +- */ +- FUSE_BUF_NO_SPLICE = (1 << 1), +- +- /** +- * Force splice +- * +- * Always use splice(2) to copy data from one file descriptor +- * to another. If splice is not available, return -EINVAL. +- */ +- FUSE_BUF_FORCE_SPLICE = (1 << 2), +- +- /** +- * Try to move data with splice. +- * +- * If splice is used, try to move pages from the source to the +- * destination instead of copying. See documentation of +- * SPLICE_F_MOVE in splice(2) man page. +- */ +- FUSE_BUF_SPLICE_MOVE = (1 << 3), +- +- /** +- * Don't block on the pipe when copying data with splice +- * +- * Makes the operations on the pipe non-blocking (if the pipe +- * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) +- * man page. +- */ +- FUSE_BUF_SPLICE_NONBLOCK= (1 << 4), ++ /** ++ * Don't use splice(2) ++ * ++ * Always fall back to using read and write instead of ++ * splice(2) to copy data from one file descriptor to another. ++ * ++ * If this flag is not set, then only fall back if splice is ++ * unavailable. ++ */ ++ FUSE_BUF_NO_SPLICE = (1 << 1), ++ ++ /** ++ * Force splice ++ * ++ * Always use splice(2) to copy data from one file descriptor ++ * to another. If splice is not available, return -EINVAL. ++ */ ++ FUSE_BUF_FORCE_SPLICE = (1 << 2), ++ ++ /** ++ * Try to move data with splice. ++ * ++ * If splice is used, try to move pages from the source to the ++ * destination instead of copying. See documentation of ++ * SPLICE_F_MOVE in splice(2) man page. ++ */ ++ FUSE_BUF_SPLICE_MOVE = (1 << 3), ++ ++ /** ++ * Don't block on the pipe when copying data with splice ++ * ++ * Makes the operations on the pipe non-blocking (if the pipe ++ * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) ++ * man page. ++ */ ++ FUSE_BUF_SPLICE_NONBLOCK = (1 << 4), + }; + + /** +@@ -636,36 +653,36 @@ enum fuse_buf_copy_flags { + * be supplied as a memory pointer or as a file descriptor + */ + struct fuse_buf { +- /** +- * Size of data in bytes +- */ +- size_t size; +- +- /** +- * Buffer flags +- */ +- enum fuse_buf_flags flags; +- +- /** +- * Memory pointer +- * +- * Used unless FUSE_BUF_IS_FD flag is set. +- */ +- void *mem; +- +- /** +- * File descriptor +- * +- * Used if FUSE_BUF_IS_FD flag is set. +- */ +- int fd; +- +- /** +- * File position +- * +- * Used if FUSE_BUF_FD_SEEK flag is set. +- */ +- off_t pos; ++ /** ++ * Size of data in bytes ++ */ ++ size_t size; ++ ++ /** ++ * Buffer flags ++ */ ++ enum fuse_buf_flags flags; ++ ++ /** ++ * Memory pointer ++ * ++ * Used unless FUSE_BUF_IS_FD flag is set. ++ */ ++ void *mem; ++ ++ /** ++ * File descriptor ++ * ++ * Used if FUSE_BUF_IS_FD flag is set. ++ */ ++ int fd; ++ ++ /** ++ * File position ++ * ++ * Used if FUSE_BUF_FD_SEEK flag is set. ++ */ ++ off_t pos; + }; + + /** +@@ -677,41 +694,39 @@ struct fuse_buf { + * Allocate dynamically to add more than one buffer. + */ + struct fuse_bufvec { +- /** +- * Number of buffers in the array +- */ +- size_t count; +- +- /** +- * Index of current buffer within the array +- */ +- size_t idx; +- +- /** +- * Current offset within the current buffer +- */ +- size_t off; +- +- /** +- * Array of buffers +- */ +- struct fuse_buf buf[1]; ++ /** ++ * Number of buffers in the array ++ */ ++ size_t count; ++ ++ /** ++ * Index of current buffer within the array ++ */ ++ size_t idx; ++ ++ /** ++ * Current offset within the current buffer ++ */ ++ size_t off; ++ ++ /** ++ * Array of buffers ++ */ ++ struct fuse_buf buf[1]; + }; + + /* Initialize bufvec with a single buffer of given size */ +-#define FUSE_BUFVEC_INIT(size__) \ +- ((struct fuse_bufvec) { \ +- /* .count= */ 1, \ +- /* .idx = */ 0, \ +- /* .off = */ 0, \ +- /* .buf = */ { /* [0] = */ { \ +- /* .size = */ (size__), \ +- /* .flags = */ (enum fuse_buf_flags) 0, \ +- /* .mem = */ NULL, \ +- /* .fd = */ -1, \ +- /* .pos = */ 0, \ +- } } \ +- } ) ++#define FUSE_BUFVEC_INIT(size__) \ ++ ((struct fuse_bufvec){ /* .count= */ 1, \ ++ /* .idx = */ 0, \ ++ /* .off = */ 0, /* .buf = */ \ ++ { /* [0] = */ { \ ++ /* .size = */ (size__), \ ++ /* .flags = */ (enum fuse_buf_flags)0, \ ++ /* .mem = */ NULL, \ ++ /* .fd = */ -1, \ ++ /* .pos = */ 0, \ ++ } } }) + + /** + * Get total size of data in a fuse buffer vector +@@ -730,16 +745,16 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv); + * @return actual number of bytes copied or -errno on error + */ + ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src, +- enum fuse_buf_copy_flags flags); ++ enum fuse_buf_copy_flags flags); + +-/* ----------------------------------------------------------- * +- * Signal handling * +- * ----------------------------------------------------------- */ ++/* ++ * Signal handling ++ */ + + /** + * Exit session on HUP, TERM and INT signals and ignore PIPE signal + * +- * Stores session in a global variable. May only be called once per ++ * Stores session in a global variable. May only be called once per + * process until fuse_remove_signal_handlers() is called. + * + * Once either of the POSIX signals arrives, the signal handler calls +@@ -766,12 +781,12 @@ int fuse_set_signal_handlers(struct fuse_session *se); + */ + void fuse_remove_signal_handlers(struct fuse_session *se); + +-/* ----------------------------------------------------------- * +- * Compatibility stuff * +- * ----------------------------------------------------------- */ ++/* ++ * Compatibility stuff ++ */ + + #if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30 +-# error only API version 30 or greater is supported ++#error only API version 30 or greater is supported + #endif + + +@@ -781,11 +796,14 @@ void fuse_remove_signal_handlers(struct fuse_session *se); + * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags! + */ + +-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++#if defined(__GNUC__) && \ ++ (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \ ++ !defined __cplusplus + _Static_assert(sizeof(off_t) == 8, "fuse: off_t must be 64bit"); + #else +-struct _fuse_off_t_must_be_64bit_dummy_struct \ +- { unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1); }; ++struct _fuse_off_t_must_be_64bit_dummy_struct { ++ unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1); ++}; + #endif + + #endif /* FUSE_COMMON_H_ */ +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index b39522e..e63cb58 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -1,71 +1,71 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #include "fuse.h" + #include "fuse_lowlevel.h" + + struct fuse_req { +- struct fuse_session *se; +- uint64_t unique; +- int ctr; +- pthread_mutex_t lock; +- struct fuse_ctx ctx; +- struct fuse_chan *ch; +- int interrupted; +- unsigned int ioctl_64bit : 1; +- union { +- struct { +- uint64_t unique; +- } i; +- struct { +- fuse_interrupt_func_t func; +- void *data; +- } ni; +- } u; +- struct fuse_req *next; +- struct fuse_req *prev; ++ struct fuse_session *se; ++ uint64_t unique; ++ int ctr; ++ pthread_mutex_t lock; ++ struct fuse_ctx ctx; ++ struct fuse_chan *ch; ++ int interrupted; ++ unsigned int ioctl_64bit:1; ++ union { ++ struct { ++ uint64_t unique; ++ } i; ++ struct { ++ fuse_interrupt_func_t func; ++ void *data; ++ } ni; ++ } u; ++ struct fuse_req *next; ++ struct fuse_req *prev; + }; + + struct fuse_notify_req { +- uint64_t unique; +- void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, +- const void *, const struct fuse_buf *); +- struct fuse_notify_req *next; +- struct fuse_notify_req *prev; ++ uint64_t unique; ++ void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, ++ const void *, const struct fuse_buf *); ++ struct fuse_notify_req *next; ++ struct fuse_notify_req *prev; + }; + + struct fuse_session { +- char *mountpoint; +- volatile int exited; +- int fd; +- int debug; +- int deny_others; +- struct fuse_lowlevel_ops op; +- int got_init; +- struct cuse_data *cuse_data; +- void *userdata; +- uid_t owner; +- struct fuse_conn_info conn; +- struct fuse_req list; +- struct fuse_req interrupts; +- pthread_mutex_t lock; +- int got_destroy; +- int broken_splice_nonblock; +- uint64_t notify_ctr; +- struct fuse_notify_req notify_list; +- size_t bufsize; +- int error; ++ char *mountpoint; ++ volatile int exited; ++ int fd; ++ int debug; ++ int deny_others; ++ struct fuse_lowlevel_ops op; ++ int got_init; ++ struct cuse_data *cuse_data; ++ void *userdata; ++ uid_t owner; ++ struct fuse_conn_info conn; ++ struct fuse_req list; ++ struct fuse_req interrupts; ++ pthread_mutex_t lock; ++ int got_destroy; ++ int broken_splice_nonblock; ++ uint64_t notify_ctr; ++ struct fuse_notify_req notify_list; ++ size_t bufsize; ++ int error; + }; + + struct fuse_chan { +- pthread_mutex_t lock; +- int ctr; +- int fd; ++ pthread_mutex_t lock; ++ int ctr; ++ int fd; + }; + + /** +@@ -76,19 +76,20 @@ struct fuse_chan { + * + */ + struct fuse_module { +- char *name; +- fuse_module_factory_t factory; +- struct fuse_module *next; +- struct fusemod_so *so; +- int ctr; ++ char *name; ++ fuse_module_factory_t factory; ++ struct fuse_module *next; ++ struct fusemod_so *so; ++ int ctr; + }; + + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, +- int count); ++ int count); + void fuse_free_req(fuse_req_t req); + + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, struct fuse_chan *ch); ++ const struct fuse_buf *buf, ++ struct fuse_chan *ch); + + + #define FUSE_MAX_MAX_PAGES 256 +diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c +index 0d268ab..11345f9 100644 +--- a/tools/virtiofsd/fuse_log.c ++++ b/tools/virtiofsd/fuse_log.c +@@ -1,40 +1,40 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2019 Red Hat, Inc. +- +- Logging API. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * Logging API. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #include "fuse_log.h" + + #include + #include + +-static void default_log_func( +- __attribute__(( unused )) enum fuse_log_level level, +- const char *fmt, va_list ap) ++static void default_log_func(__attribute__((unused)) enum fuse_log_level level, ++ const char *fmt, va_list ap) + { +- vfprintf(stderr, fmt, ap); ++ vfprintf(stderr, fmt, ap); + } + + static fuse_log_func_t log_func = default_log_func; + + void fuse_set_log_func(fuse_log_func_t func) + { +- if (!func) +- func = default_log_func; ++ if (!func) { ++ func = default_log_func; ++ } + +- log_func = func; ++ log_func = func; + } + + void fuse_log(enum fuse_log_level level, const char *fmt, ...) + { +- va_list ap; ++ va_list ap; + +- va_start(ap, fmt); +- log_func(level, fmt, ap); +- va_end(ap); ++ va_start(ap, fmt); ++ log_func(level, fmt, ap); ++ va_end(ap); + } +diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h +index 0af700d..bf6c11f 100644 +--- a/tools/virtiofsd/fuse_log.h ++++ b/tools/virtiofsd/fuse_log.h +@@ -1,10 +1,10 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2019 Red Hat, Inc. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #ifndef FUSE_LOG_H_ + #define FUSE_LOG_H_ +@@ -22,14 +22,14 @@ + * These levels correspond to syslog(2) log levels since they are widely used. + */ + enum fuse_log_level { +- FUSE_LOG_EMERG, +- FUSE_LOG_ALERT, +- FUSE_LOG_CRIT, +- FUSE_LOG_ERR, +- FUSE_LOG_WARNING, +- FUSE_LOG_NOTICE, +- FUSE_LOG_INFO, +- FUSE_LOG_DEBUG ++ FUSE_LOG_EMERG, ++ FUSE_LOG_ALERT, ++ FUSE_LOG_CRIT, ++ FUSE_LOG_ERR, ++ FUSE_LOG_WARNING, ++ FUSE_LOG_NOTICE, ++ FUSE_LOG_INFO, ++ FUSE_LOG_DEBUG + }; + + /** +@@ -45,8 +45,8 @@ enum fuse_log_level { + * @param fmt sprintf-style format string including newline + * @param ap format string arguments + */ +-typedef void (*fuse_log_func_t)(enum fuse_log_level level, +- const char *fmt, va_list ap); ++typedef void (*fuse_log_func_t)(enum fuse_log_level level, const char *fmt, ++ va_list ap); + + /** + * Install a custom log handler function. +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index e6fa247..5c9cb52 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1,2380 +1,2515 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- Implementation of (most of) the low-level FUSE API. The session loop +- functions are implemented in separate files. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Implementation of (most of) the low-level FUSE API. The session loop ++ * functions are implemented in separate files. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #define _GNU_SOURCE + + #include "config.h" + #include "fuse_i.h" + #include "fuse_kernel.h" +-#include "fuse_opt.h" + #include "fuse_misc.h" ++#include "fuse_opt.h" + ++#include ++#include ++#include ++#include + #include + #include +-#include + #include +-#include +-#include +-#include +-#include + #include +- ++#include + + + #define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) + #define OFFSET_MAX 0x7fffffffffffffffLL + +-#define container_of(ptr, type, member) ({ \ +- const typeof( ((type *)0)->member ) *__mptr = (ptr); \ +- (type *)( (char *)__mptr - offsetof(type,member) );}) ++#define container_of(ptr, type, member) \ ++ ({ \ ++ const typeof(((type *)0)->member) *__mptr = (ptr); \ ++ (type *)((char *)__mptr - offsetof(type, member)); \ ++ }) + + struct fuse_pollhandle { +- uint64_t kh; +- struct fuse_session *se; ++ uint64_t kh; ++ struct fuse_session *se; + }; + + static size_t pagesize; + + static __attribute__((constructor)) void fuse_ll_init_pagesize(void) + { +- pagesize = getpagesize(); ++ pagesize = getpagesize(); + } + + static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) + { +- attr->ino = stbuf->st_ino; +- attr->mode = stbuf->st_mode; +- attr->nlink = stbuf->st_nlink; +- attr->uid = stbuf->st_uid; +- attr->gid = stbuf->st_gid; +- attr->rdev = stbuf->st_rdev; +- attr->size = stbuf->st_size; +- attr->blksize = stbuf->st_blksize; +- attr->blocks = stbuf->st_blocks; +- attr->atime = stbuf->st_atime; +- attr->mtime = stbuf->st_mtime; +- attr->ctime = stbuf->st_ctime; +- attr->atimensec = ST_ATIM_NSEC(stbuf); +- attr->mtimensec = ST_MTIM_NSEC(stbuf); +- attr->ctimensec = ST_CTIM_NSEC(stbuf); ++ attr->ino = stbuf->st_ino; ++ attr->mode = stbuf->st_mode; ++ attr->nlink = stbuf->st_nlink; ++ attr->uid = stbuf->st_uid; ++ attr->gid = stbuf->st_gid; ++ attr->rdev = stbuf->st_rdev; ++ attr->size = stbuf->st_size; ++ attr->blksize = stbuf->st_blksize; ++ attr->blocks = stbuf->st_blocks; ++ attr->atime = stbuf->st_atime; ++ attr->mtime = stbuf->st_mtime; ++ attr->ctime = stbuf->st_ctime; ++ attr->atimensec = ST_ATIM_NSEC(stbuf); ++ attr->mtimensec = ST_MTIM_NSEC(stbuf); ++ attr->ctimensec = ST_CTIM_NSEC(stbuf); + } + + static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) + { +- stbuf->st_mode = attr->mode; +- stbuf->st_uid = attr->uid; +- stbuf->st_gid = attr->gid; +- stbuf->st_size = attr->size; +- stbuf->st_atime = attr->atime; +- stbuf->st_mtime = attr->mtime; +- stbuf->st_ctime = attr->ctime; +- ST_ATIM_NSEC_SET(stbuf, attr->atimensec); +- ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); +- ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); ++ stbuf->st_mode = attr->mode; ++ stbuf->st_uid = attr->uid; ++ stbuf->st_gid = attr->gid; ++ stbuf->st_size = attr->size; ++ stbuf->st_atime = attr->atime; ++ stbuf->st_mtime = attr->mtime; ++ stbuf->st_ctime = attr->ctime; ++ ST_ATIM_NSEC_SET(stbuf, attr->atimensec); ++ ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); ++ ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); + } + +-static size_t iov_length(const struct iovec *iov, size_t count) ++static size_t iov_length(const struct iovec *iov, size_t count) + { +- size_t seg; +- size_t ret = 0; ++ size_t seg; ++ size_t ret = 0; + +- for (seg = 0; seg < count; seg++) +- ret += iov[seg].iov_len; +- return ret; ++ for (seg = 0; seg < count; seg++) { ++ ret += iov[seg].iov_len; ++ } ++ return ret; + } + + static void list_init_req(struct fuse_req *req) + { +- req->next = req; +- req->prev = req; ++ req->next = req; ++ req->prev = req; + } + + static void list_del_req(struct fuse_req *req) + { +- struct fuse_req *prev = req->prev; +- struct fuse_req *next = req->next; +- prev->next = next; +- next->prev = prev; ++ struct fuse_req *prev = req->prev; ++ struct fuse_req *next = req->next; ++ prev->next = next; ++ next->prev = prev; + } + + static void list_add_req(struct fuse_req *req, struct fuse_req *next) + { +- struct fuse_req *prev = next->prev; +- req->next = next; +- req->prev = prev; +- prev->next = req; +- next->prev = req; ++ struct fuse_req *prev = next->prev; ++ req->next = next; ++ req->prev = prev; ++ prev->next = req; ++ next->prev = req; + } + + static void destroy_req(fuse_req_t req) + { +- pthread_mutex_destroy(&req->lock); +- free(req); ++ pthread_mutex_destroy(&req->lock); ++ free(req); + } + + void fuse_free_req(fuse_req_t req) + { +- int ctr; +- struct fuse_session *se = req->se; ++ int ctr; ++ struct fuse_session *se = req->se; + +- pthread_mutex_lock(&se->lock); +- req->u.ni.func = NULL; +- req->u.ni.data = NULL; +- list_del_req(req); +- ctr = --req->ctr; +- req->ch = NULL; +- pthread_mutex_unlock(&se->lock); +- if (!ctr) +- destroy_req(req); ++ pthread_mutex_lock(&se->lock); ++ req->u.ni.func = NULL; ++ req->u.ni.data = NULL; ++ list_del_req(req); ++ ctr = --req->ctr; ++ req->ch = NULL; ++ pthread_mutex_unlock(&se->lock); ++ if (!ctr) { ++ destroy_req(req); ++ } + } + + static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se) + { +- struct fuse_req *req; ++ struct fuse_req *req; + +- req = (struct fuse_req *) calloc(1, sizeof(struct fuse_req)); +- if (req == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); +- } else { +- req->se = se; +- req->ctr = 1; +- list_init_req(req); +- fuse_mutex_init(&req->lock); +- } ++ req = (struct fuse_req *)calloc(1, sizeof(struct fuse_req)); ++ if (req == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); ++ } else { ++ req->se = se; ++ req->ctr = 1; ++ list_init_req(req); ++ fuse_mutex_init(&req->lock); ++ } + +- return req; ++ return req; + } + + /* Send data. If *ch* is NULL, send via session master fd */ + static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, +- struct iovec *iov, int count) ++ struct iovec *iov, int count) + { +- struct fuse_out_header *out = iov[0].iov_base; ++ struct fuse_out_header *out = iov[0].iov_base; + +- out->len = iov_length(iov, count); +- if (se->debug) { +- if (out->unique == 0) { +- fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", +- out->error, out->len); +- } else if (out->error) { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, error: %i (%s), outsize: %i\n", +- (unsigned long long) out->unique, out->error, +- strerror(-out->error), out->len); +- } else { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, success, outsize: %i\n", +- (unsigned long long) out->unique, out->len); +- } +- } ++ out->len = iov_length(iov, count); ++ if (se->debug) { ++ if (out->unique == 0) { ++ fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, ++ out->len); ++ } else if (out->error) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, error: %i (%s), outsize: %i\n", ++ (unsigned long long)out->unique, out->error, ++ strerror(-out->error), out->len); ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", ++ (unsigned long long)out->unique, out->len); ++ } ++ } + +- abort(); /* virtio should have taken it before here */ +- return 0; ++ abort(); /* virtio should have taken it before here */ ++ return 0; + } + + + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, +- int count) ++ int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out; + +- if (error <= -1000 || error > 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); +- error = -ERANGE; +- } ++ if (error <= -1000 || error > 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); ++ error = -ERANGE; ++ } + +- out.unique = req->unique; +- out.error = error; ++ out.unique = req->unique; ++ out.error = error; + +- iov[0].iov_base = &out; +- iov[0].iov_len = sizeof(struct fuse_out_header); ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); + +- return fuse_send_msg(req->se, req->ch, iov, count); ++ return fuse_send_msg(req->se, req->ch, iov, count); + } + + static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov, +- int count) ++ int count) + { +- int res; ++ int res; + +- res = fuse_send_reply_iov_nofree(req, error, iov, count); +- fuse_free_req(req); +- return res; ++ res = fuse_send_reply_iov_nofree(req, error, iov, count); ++ fuse_free_req(req); ++ return res; + } + + static int send_reply(fuse_req_t req, int error, const void *arg, +- size_t argsize) ++ size_t argsize) + { +- struct iovec iov[2]; +- int count = 1; +- if (argsize) { +- iov[1].iov_base = (void *) arg; +- iov[1].iov_len = argsize; +- count++; +- } +- return send_reply_iov(req, error, iov, count); ++ struct iovec iov[2]; ++ int count = 1; ++ if (argsize) { ++ iov[1].iov_base = (void *)arg; ++ iov[1].iov_len = argsize; ++ count++; ++ } ++ return send_reply_iov(req, error, iov, count); + } + + int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count) + { +- int res; +- struct iovec *padded_iov; ++ int res; ++ struct iovec *padded_iov; + +- padded_iov = malloc((count + 1) * sizeof(struct iovec)); +- if (padded_iov == NULL) +- return fuse_reply_err(req, ENOMEM); ++ padded_iov = malloc((count + 1) * sizeof(struct iovec)); ++ if (padded_iov == NULL) { ++ return fuse_reply_err(req, ENOMEM); ++ } + +- memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); +- count++; ++ memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); ++ count++; + +- res = send_reply_iov(req, 0, padded_iov, count); +- free(padded_iov); ++ res = send_reply_iov(req, 0, padded_iov, count); ++ free(padded_iov); + +- return res; ++ return res; + } + + +-/* `buf` is allowed to be empty so that the proper size may be +- allocated by the caller */ ++/* ++ * 'buf` is allowed to be empty so that the proper size may be ++ * allocated by the caller ++ */ + size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, const struct stat *stbuf, off_t off) ++ const char *name, const struct stat *stbuf, off_t off) + { +- (void)req; +- size_t namelen; +- size_t entlen; +- size_t entlen_padded; +- struct fuse_dirent *dirent; ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ struct fuse_dirent *dirent; + +- namelen = strlen(name); +- entlen = FUSE_NAME_OFFSET + namelen; +- entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); + +- if ((buf == NULL) || (entlen_padded > bufsize)) +- return entlen_padded; ++ if ((buf == NULL) || (entlen_padded > bufsize)) { ++ return entlen_padded; ++ } + +- dirent = (struct fuse_dirent*) buf; +- dirent->ino = stbuf->st_ino; +- dirent->off = off; +- dirent->namelen = namelen; +- dirent->type = (stbuf->st_mode & S_IFMT) >> 12; +- memcpy(dirent->name, name, namelen); +- memset(dirent->name + namelen, 0, entlen_padded - entlen); ++ dirent = (struct fuse_dirent *)buf; ++ dirent->ino = stbuf->st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (stbuf->st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); + +- return entlen_padded; ++ return entlen_padded; + } + + static void convert_statfs(const struct statvfs *stbuf, +- struct fuse_kstatfs *kstatfs) ++ struct fuse_kstatfs *kstatfs) + { +- kstatfs->bsize = stbuf->f_bsize; +- kstatfs->frsize = stbuf->f_frsize; +- kstatfs->blocks = stbuf->f_blocks; +- kstatfs->bfree = stbuf->f_bfree; +- kstatfs->bavail = stbuf->f_bavail; +- kstatfs->files = stbuf->f_files; +- kstatfs->ffree = stbuf->f_ffree; +- kstatfs->namelen = stbuf->f_namemax; ++ kstatfs->bsize = stbuf->f_bsize; ++ kstatfs->frsize = stbuf->f_frsize; ++ kstatfs->blocks = stbuf->f_blocks; ++ kstatfs->bfree = stbuf->f_bfree; ++ kstatfs->bavail = stbuf->f_bavail; ++ kstatfs->files = stbuf->f_files; ++ kstatfs->ffree = stbuf->f_ffree; ++ kstatfs->namelen = stbuf->f_namemax; + } + + static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) + { +- return send_reply(req, 0, arg, argsize); ++ return send_reply(req, 0, arg, argsize); + } + + int fuse_reply_err(fuse_req_t req, int err) + { +- return send_reply(req, -err, NULL, 0); ++ return send_reply(req, -err, NULL, 0); + } + + void fuse_reply_none(fuse_req_t req) + { +- fuse_free_req(req); ++ fuse_free_req(req); + } + + static unsigned long calc_timeout_sec(double t) + { +- if (t > (double) ULONG_MAX) +- return ULONG_MAX; +- else if (t < 0.0) +- return 0; +- else +- return (unsigned long) t; ++ if (t > (double)ULONG_MAX) { ++ return ULONG_MAX; ++ } else if (t < 0.0) { ++ return 0; ++ } else { ++ return (unsigned long)t; ++ } + } + + static unsigned int calc_timeout_nsec(double t) + { +- double f = t - (double) calc_timeout_sec(t); +- if (f < 0.0) +- return 0; +- else if (f >= 0.999999999) +- return 999999999; +- else +- return (unsigned int) (f * 1.0e9); ++ double f = t - (double)calc_timeout_sec(t); ++ if (f < 0.0) { ++ return 0; ++ } else if (f >= 0.999999999) { ++ return 999999999; ++ } else { ++ return (unsigned int)(f * 1.0e9); ++ } + } + + static void fill_entry(struct fuse_entry_out *arg, +- const struct fuse_entry_param *e) ++ const struct fuse_entry_param *e) + { +- arg->nodeid = e->ino; +- arg->generation = e->generation; +- arg->entry_valid = calc_timeout_sec(e->entry_timeout); +- arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout); +- arg->attr_valid = calc_timeout_sec(e->attr_timeout); +- arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); +- convert_stat(&e->attr, &arg->attr); ++ arg->nodeid = e->ino; ++ arg->generation = e->generation; ++ arg->entry_valid = calc_timeout_sec(e->entry_timeout); ++ arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout); ++ arg->attr_valid = calc_timeout_sec(e->attr_timeout); ++ arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); ++ convert_stat(&e->attr, &arg->attr); + } + +-/* `buf` is allowed to be empty so that the proper size may be +- allocated by the caller */ ++/* ++ * `buf` is allowed to be empty so that the proper size may be ++ * allocated by the caller ++ */ + size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, +- const struct fuse_entry_param *e, off_t off) +-{ +- (void)req; +- size_t namelen; +- size_t entlen; +- size_t entlen_padded; +- +- namelen = strlen(name); +- entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; +- entlen_padded = FUSE_DIRENT_ALIGN(entlen); +- if ((buf == NULL) || (entlen_padded > bufsize)) +- return entlen_padded; +- +- struct fuse_direntplus *dp = (struct fuse_direntplus *) buf; +- memset(&dp->entry_out, 0, sizeof(dp->entry_out)); +- fill_entry(&dp->entry_out, e); +- +- struct fuse_dirent *dirent = &dp->dirent; +- dirent->ino = e->attr.st_ino; +- dirent->off = off; +- dirent->namelen = namelen; +- dirent->type = (e->attr.st_mode & S_IFMT) >> 12; +- memcpy(dirent->name, name, namelen); +- memset(dirent->name + namelen, 0, entlen_padded - entlen); +- +- return entlen_padded; +-} +- +-static void fill_open(struct fuse_open_out *arg, +- const struct fuse_file_info *f) +-{ +- arg->fh = f->fh; +- if (f->direct_io) +- arg->open_flags |= FOPEN_DIRECT_IO; +- if (f->keep_cache) +- arg->open_flags |= FOPEN_KEEP_CACHE; +- if (f->cache_readdir) +- arg->open_flags |= FOPEN_CACHE_DIR; +- if (f->nonseekable) +- arg->open_flags |= FOPEN_NONSEEKABLE; ++ const char *name, ++ const struct fuse_entry_param *e, off_t off) ++{ ++ (void)req; ++ size_t namelen; ++ size_t entlen; ++ size_t entlen_padded; ++ ++ namelen = strlen(name); ++ entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; ++ entlen_padded = FUSE_DIRENT_ALIGN(entlen); ++ if ((buf == NULL) || (entlen_padded > bufsize)) { ++ return entlen_padded; ++ } ++ ++ struct fuse_direntplus *dp = (struct fuse_direntplus *)buf; ++ memset(&dp->entry_out, 0, sizeof(dp->entry_out)); ++ fill_entry(&dp->entry_out, e); ++ ++ struct fuse_dirent *dirent = &dp->dirent; ++ dirent->ino = e->attr.st_ino; ++ dirent->off = off; ++ dirent->namelen = namelen; ++ dirent->type = (e->attr.st_mode & S_IFMT) >> 12; ++ memcpy(dirent->name, name, namelen); ++ memset(dirent->name + namelen, 0, entlen_padded - entlen); ++ ++ return entlen_padded; ++} ++ ++static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f) ++{ ++ arg->fh = f->fh; ++ if (f->direct_io) { ++ arg->open_flags |= FOPEN_DIRECT_IO; ++ } ++ if (f->keep_cache) { ++ arg->open_flags |= FOPEN_KEEP_CACHE; ++ } ++ if (f->cache_readdir) { ++ arg->open_flags |= FOPEN_CACHE_DIR; ++ } ++ if (f->nonseekable) { ++ arg->open_flags |= FOPEN_NONSEEKABLE; ++ } + } + + int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) + { +- struct fuse_entry_out arg; +- size_t size = req->se->conn.proto_minor < 9 ? +- FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(arg); ++ struct fuse_entry_out arg; ++ size_t size = req->se->conn.proto_minor < 9 ? FUSE_COMPAT_ENTRY_OUT_SIZE : ++ sizeof(arg); + +- /* before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant +- negative entry */ +- if (!e->ino && req->se->conn.proto_minor < 4) +- return fuse_reply_err(req, ENOENT); ++ /* ++ * before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant ++ * negative entry ++ */ ++ if (!e->ino && req->se->conn.proto_minor < 4) { ++ return fuse_reply_err(req, ENOENT); ++ } + +- memset(&arg, 0, sizeof(arg)); +- fill_entry(&arg, e); +- return send_reply_ok(req, &arg, size); ++ memset(&arg, 0, sizeof(arg)); ++ fill_entry(&arg, e); ++ return send_reply_ok(req, &arg, size); + } + + int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, +- const struct fuse_file_info *f) ++ const struct fuse_file_info *f) + { +- char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; +- size_t entrysize = req->se->conn.proto_minor < 9 ? +- FUSE_COMPAT_ENTRY_OUT_SIZE : sizeof(struct fuse_entry_out); +- struct fuse_entry_out *earg = (struct fuse_entry_out *) buf; +- struct fuse_open_out *oarg = (struct fuse_open_out *) (buf + entrysize); ++ char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; ++ size_t entrysize = req->se->conn.proto_minor < 9 ? ++ FUSE_COMPAT_ENTRY_OUT_SIZE : ++ sizeof(struct fuse_entry_out); ++ struct fuse_entry_out *earg = (struct fuse_entry_out *)buf; ++ struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize); + +- memset(buf, 0, sizeof(buf)); +- fill_entry(earg, e); +- fill_open(oarg, f); +- return send_reply_ok(req, buf, +- entrysize + sizeof(struct fuse_open_out)); ++ memset(buf, 0, sizeof(buf)); ++ fill_entry(earg, e); ++ fill_open(oarg, f); ++ return send_reply_ok(req, buf, entrysize + sizeof(struct fuse_open_out)); + } + + int fuse_reply_attr(fuse_req_t req, const struct stat *attr, +- double attr_timeout) ++ double attr_timeout) + { +- struct fuse_attr_out arg; +- size_t size = req->se->conn.proto_minor < 9 ? +- FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); ++ struct fuse_attr_out arg; ++ size_t size = ++ req->se->conn.proto_minor < 9 ? FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); + +- memset(&arg, 0, sizeof(arg)); +- arg.attr_valid = calc_timeout_sec(attr_timeout); +- arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); +- convert_stat(attr, &arg.attr); ++ memset(&arg, 0, sizeof(arg)); ++ arg.attr_valid = calc_timeout_sec(attr_timeout); ++ arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); ++ convert_stat(attr, &arg.attr); + +- return send_reply_ok(req, &arg, size); ++ return send_reply_ok(req, &arg, size); + } + + int fuse_reply_readlink(fuse_req_t req, const char *linkname) + { +- return send_reply_ok(req, linkname, strlen(linkname)); ++ return send_reply_ok(req, linkname, strlen(linkname)); + } + + int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f) + { +- struct fuse_open_out arg; ++ struct fuse_open_out arg; + +- memset(&arg, 0, sizeof(arg)); +- fill_open(&arg, f); +- return send_reply_ok(req, &arg, sizeof(arg)); ++ memset(&arg, 0, sizeof(arg)); ++ fill_open(&arg, f); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_write(fuse_req_t req, size_t count) + { +- struct fuse_write_out arg; ++ struct fuse_write_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.size = count; ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size) + { +- return send_reply_ok(req, buf, size); ++ return send_reply_ok(req, buf, size); + } + + static int fuse_send_data_iov_fallback(struct fuse_session *se, +- struct fuse_chan *ch, +- struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, +- size_t len) ++ struct fuse_chan *ch, struct iovec *iov, ++ int iov_count, struct fuse_bufvec *buf, ++ size_t len) + { +- /* Optimize common case */ +- if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && +- !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { +- /* FIXME: also avoid memory copy if there are multiple buffers +- but none of them contain an fd */ ++ /* Optimize common case */ ++ if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && ++ !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { ++ /* ++ * FIXME: also avoid memory copy if there are multiple buffers ++ * but none of them contain an fd ++ */ + +- iov[iov_count].iov_base = buf->buf[0].mem; +- iov[iov_count].iov_len = len; +- iov_count++; +- return fuse_send_msg(se, ch, iov, iov_count); +- } ++ iov[iov_count].iov_base = buf->buf[0].mem; ++ iov[iov_count].iov_len = len; ++ iov_count++; ++ return fuse_send_msg(se, ch, iov, iov_count); ++ } + +- abort(); /* Will have taken vhost path */ +- return 0; ++ abort(); /* Will have taken vhost path */ ++ return 0; + } + + static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, +- struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, unsigned int flags) ++ struct iovec *iov, int iov_count, ++ struct fuse_bufvec *buf, unsigned int flags) + { +- size_t len = fuse_buf_size(buf); +- (void) flags; ++ size_t len = fuse_buf_size(buf); ++ (void)flags; + +- return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); ++ return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); + } + + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++ enum fuse_buf_copy_flags flags) + { +- struct iovec iov[2]; +- struct fuse_out_header out; +- int res; ++ struct iovec iov[2]; ++ struct fuse_out_header out; ++ int res; + +- iov[0].iov_base = &out; +- iov[0].iov_len = sizeof(struct fuse_out_header); ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); + +- out.unique = req->unique; +- out.error = 0; ++ out.unique = req->unique; ++ out.error = 0; + +- res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); +- if (res <= 0) { +- fuse_free_req(req); +- return res; +- } else { +- return fuse_reply_err(req, res); +- } ++ res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); ++ if (res <= 0) { ++ fuse_free_req(req); ++ return res; ++ } else { ++ return fuse_reply_err(req, res); ++ } + } + + int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) + { +- struct fuse_statfs_out arg; +- size_t size = req->se->conn.proto_minor < 4 ? +- FUSE_COMPAT_STATFS_SIZE : sizeof(arg); ++ struct fuse_statfs_out arg; ++ size_t size = ++ req->se->conn.proto_minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(arg); + +- memset(&arg, 0, sizeof(arg)); +- convert_statfs(stbuf, &arg.st); ++ memset(&arg, 0, sizeof(arg)); ++ convert_statfs(stbuf, &arg.st); + +- return send_reply_ok(req, &arg, size); ++ return send_reply_ok(req, &arg, size); + } + + int fuse_reply_xattr(fuse_req_t req, size_t count) + { +- struct fuse_getxattr_out arg; ++ struct fuse_getxattr_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.size = count; ++ memset(&arg, 0, sizeof(arg)); ++ arg.size = count; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_lock(fuse_req_t req, const struct flock *lock) + { +- struct fuse_lk_out arg; ++ struct fuse_lk_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.lk.type = lock->l_type; +- if (lock->l_type != F_UNLCK) { +- arg.lk.start = lock->l_start; +- if (lock->l_len == 0) +- arg.lk.end = OFFSET_MAX; +- else +- arg.lk.end = lock->l_start + lock->l_len - 1; +- } +- arg.lk.pid = lock->l_pid; +- return send_reply_ok(req, &arg, sizeof(arg)); ++ memset(&arg, 0, sizeof(arg)); ++ arg.lk.type = lock->l_type; ++ if (lock->l_type != F_UNLCK) { ++ arg.lk.start = lock->l_start; ++ if (lock->l_len == 0) { ++ arg.lk.end = OFFSET_MAX; ++ } else { ++ arg.lk.end = lock->l_start + lock->l_len - 1; ++ } ++ } ++ arg.lk.pid = lock->l_pid; ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_bmap(fuse_req_t req, uint64_t idx) + { +- struct fuse_bmap_out arg; ++ struct fuse_bmap_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.block = idx; ++ memset(&arg, 0, sizeof(arg)); ++ arg.block = idx; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov, +- size_t count) +-{ +- struct fuse_ioctl_iovec *fiov; +- size_t i; +- +- fiov = malloc(sizeof(fiov[0]) * count); +- if (!fiov) +- return NULL; +- +- for (i = 0; i < count; i++) { +- fiov[i].base = (uintptr_t) iov[i].iov_base; +- fiov[i].len = iov[i].iov_len; +- } +- +- return fiov; +-} +- +-int fuse_reply_ioctl_retry(fuse_req_t req, +- const struct iovec *in_iov, size_t in_count, +- const struct iovec *out_iov, size_t out_count) +-{ +- struct fuse_ioctl_out arg; +- struct fuse_ioctl_iovec *in_fiov = NULL; +- struct fuse_ioctl_iovec *out_fiov = NULL; +- struct iovec iov[4]; +- size_t count = 1; +- int res; +- +- memset(&arg, 0, sizeof(arg)); +- arg.flags |= FUSE_IOCTL_RETRY; +- arg.in_iovs = in_count; +- arg.out_iovs = out_count; +- iov[count].iov_base = &arg; +- iov[count].iov_len = sizeof(arg); +- count++; +- +- if (req->se->conn.proto_minor < 16) { +- if (in_count) { +- iov[count].iov_base = (void *)in_iov; +- iov[count].iov_len = sizeof(in_iov[0]) * in_count; +- count++; +- } +- +- if (out_count) { +- iov[count].iov_base = (void *)out_iov; +- iov[count].iov_len = sizeof(out_iov[0]) * out_count; +- count++; +- } +- } else { +- /* Can't handle non-compat 64bit ioctls on 32bit */ +- if (sizeof(void *) == 4 && req->ioctl_64bit) { +- res = fuse_reply_err(req, EINVAL); +- goto out; +- } +- +- if (in_count) { +- in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); +- if (!in_fiov) +- goto enomem; +- +- iov[count].iov_base = (void *)in_fiov; +- iov[count].iov_len = sizeof(in_fiov[0]) * in_count; +- count++; +- } +- if (out_count) { +- out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); +- if (!out_fiov) +- goto enomem; +- +- iov[count].iov_base = (void *)out_fiov; +- iov[count].iov_len = sizeof(out_fiov[0]) * out_count; +- count++; +- } +- } +- +- res = send_reply_iov(req, 0, iov, count); ++ size_t count) ++{ ++ struct fuse_ioctl_iovec *fiov; ++ size_t i; ++ ++ fiov = malloc(sizeof(fiov[0]) * count); ++ if (!fiov) { ++ return NULL; ++ } ++ ++ for (i = 0; i < count; i++) { ++ fiov[i].base = (uintptr_t)iov[i].iov_base; ++ fiov[i].len = iov[i].iov_len; ++ } ++ ++ return fiov; ++} ++ ++int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, ++ size_t in_count, const struct iovec *out_iov, ++ size_t out_count) ++{ ++ struct fuse_ioctl_out arg; ++ struct fuse_ioctl_iovec *in_fiov = NULL; ++ struct fuse_ioctl_iovec *out_fiov = NULL; ++ struct iovec iov[4]; ++ size_t count = 1; ++ int res; ++ ++ memset(&arg, 0, sizeof(arg)); ++ arg.flags |= FUSE_IOCTL_RETRY; ++ arg.in_iovs = in_count; ++ arg.out_iovs = out_count; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; ++ ++ if (req->se->conn.proto_minor < 16) { ++ if (in_count) { ++ iov[count].iov_base = (void *)in_iov; ++ iov[count].iov_len = sizeof(in_iov[0]) * in_count; ++ count++; ++ } ++ ++ if (out_count) { ++ iov[count].iov_base = (void *)out_iov; ++ iov[count].iov_len = sizeof(out_iov[0]) * out_count; ++ count++; ++ } ++ } else { ++ /* Can't handle non-compat 64bit ioctls on 32bit */ ++ if (sizeof(void *) == 4 && req->ioctl_64bit) { ++ res = fuse_reply_err(req, EINVAL); ++ goto out; ++ } ++ ++ if (in_count) { ++ in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); ++ if (!in_fiov) { ++ goto enomem; ++ } ++ ++ iov[count].iov_base = (void *)in_fiov; ++ iov[count].iov_len = sizeof(in_fiov[0]) * in_count; ++ count++; ++ } ++ if (out_count) { ++ out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); ++ if (!out_fiov) { ++ goto enomem; ++ } ++ ++ iov[count].iov_base = (void *)out_fiov; ++ iov[count].iov_len = sizeof(out_fiov[0]) * out_count; ++ count++; ++ } ++ } ++ ++ res = send_reply_iov(req, 0, iov, count); + out: +- free(in_fiov); +- free(out_fiov); ++ free(in_fiov); ++ free(out_fiov); + +- return res; ++ return res; + + enomem: +- res = fuse_reply_err(req, ENOMEM); +- goto out; ++ res = fuse_reply_err(req, ENOMEM); ++ goto out; + } + + int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size) + { +- struct fuse_ioctl_out arg; +- struct iovec iov[3]; +- size_t count = 1; ++ struct fuse_ioctl_out arg; ++ struct iovec iov[3]; ++ size_t count = 1; + +- memset(&arg, 0, sizeof(arg)); +- arg.result = result; +- iov[count].iov_base = &arg; +- iov[count].iov_len = sizeof(arg); +- count++; ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ iov[count].iov_base = &arg; ++ iov[count].iov_len = sizeof(arg); ++ count++; + +- if (size) { +- iov[count].iov_base = (char *) buf; +- iov[count].iov_len = size; +- count++; +- } ++ if (size) { ++ iov[count].iov_base = (char *)buf; ++ iov[count].iov_len = size; ++ count++; ++ } + +- return send_reply_iov(req, 0, iov, count); ++ return send_reply_iov(req, 0, iov, count); + } + + int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, +- int count) ++ int count) + { +- struct iovec *padded_iov; +- struct fuse_ioctl_out arg; +- int res; ++ struct iovec *padded_iov; ++ struct fuse_ioctl_out arg; ++ int res; + +- padded_iov = malloc((count + 2) * sizeof(struct iovec)); +- if (padded_iov == NULL) +- return fuse_reply_err(req, ENOMEM); ++ padded_iov = malloc((count + 2) * sizeof(struct iovec)); ++ if (padded_iov == NULL) { ++ return fuse_reply_err(req, ENOMEM); ++ } + +- memset(&arg, 0, sizeof(arg)); +- arg.result = result; +- padded_iov[1].iov_base = &arg; +- padded_iov[1].iov_len = sizeof(arg); ++ memset(&arg, 0, sizeof(arg)); ++ arg.result = result; ++ padded_iov[1].iov_base = &arg; ++ padded_iov[1].iov_len = sizeof(arg); + +- memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); ++ memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); + +- res = send_reply_iov(req, 0, padded_iov, count + 2); +- free(padded_iov); ++ res = send_reply_iov(req, 0, padded_iov, count + 2); ++ free(padded_iov); + +- return res; ++ return res; + } + + int fuse_reply_poll(fuse_req_t req, unsigned revents) + { +- struct fuse_poll_out arg; ++ struct fuse_poll_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.revents = revents; ++ memset(&arg, 0, sizeof(arg)); ++ arg.revents = revents; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + int fuse_reply_lseek(fuse_req_t req, off_t off) + { +- struct fuse_lseek_out arg; ++ struct fuse_lseek_out arg; + +- memset(&arg, 0, sizeof(arg)); +- arg.offset = off; ++ memset(&arg, 0, sizeof(arg)); ++ arg.offset = off; + +- return send_reply_ok(req, &arg, sizeof(arg)); ++ return send_reply_ok(req, &arg, sizeof(arg)); + } + + static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.lookup) +- req->se->op.lookup(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.lookup) { ++ req->se->op.lookup(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_forget_in *arg = (struct fuse_forget_in *) inarg; ++ struct fuse_forget_in *arg = (struct fuse_forget_in *)inarg; + +- if (req->se->op.forget) +- req->se->op.forget(req, nodeid, arg->nlookup); +- else +- fuse_reply_none(req); ++ if (req->se->op.forget) { ++ req->se->op.forget(req, nodeid, arg->nlookup); ++ } else { ++ fuse_reply_none(req); ++ } + } + + static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg) ++ const void *inarg) + { +- struct fuse_batch_forget_in *arg = (void *) inarg; +- struct fuse_forget_one *param = (void *) PARAM(arg); +- unsigned int i; ++ struct fuse_batch_forget_in *arg = (void *)inarg; ++ struct fuse_forget_one *param = (void *)PARAM(arg); ++ unsigned int i; + +- (void) nodeid; ++ (void)nodeid; + +- if (req->se->op.forget_multi) { +- req->se->op.forget_multi(req, arg->count, +- (struct fuse_forget_data *) param); +- } else if (req->se->op.forget) { +- for (i = 0; i < arg->count; i++) { +- struct fuse_forget_one *forget = ¶m[i]; +- struct fuse_req *dummy_req; ++ if (req->se->op.forget_multi) { ++ req->se->op.forget_multi(req, arg->count, ++ (struct fuse_forget_data *)param); ++ } else if (req->se->op.forget) { ++ for (i = 0; i < arg->count; i++) { ++ struct fuse_forget_one *forget = ¶m[i]; ++ struct fuse_req *dummy_req; + +- dummy_req = fuse_ll_alloc_req(req->se); +- if (dummy_req == NULL) +- break; ++ dummy_req = fuse_ll_alloc_req(req->se); ++ if (dummy_req == NULL) { ++ break; ++ } + +- dummy_req->unique = req->unique; +- dummy_req->ctx = req->ctx; +- dummy_req->ch = NULL; ++ dummy_req->unique = req->unique; ++ dummy_req->ctx = req->ctx; ++ dummy_req->ch = NULL; + +- req->se->op.forget(dummy_req, forget->nodeid, +- forget->nlookup); +- } +- fuse_reply_none(req); +- } else { +- fuse_reply_none(req); +- } ++ req->se->op.forget(dummy_req, forget->nodeid, forget->nlookup); ++ } ++ fuse_reply_none(req); ++ } else { ++ fuse_reply_none(req); ++ } + } + + static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_file_info *fip = NULL; +- struct fuse_file_info fi; ++ struct fuse_file_info *fip = NULL; ++ struct fuse_file_info fi; + +- if (req->se->conn.proto_minor >= 9) { +- struct fuse_getattr_in *arg = (struct fuse_getattr_in *) inarg; ++ if (req->se->conn.proto_minor >= 9) { ++ struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; + +- if (arg->getattr_flags & FUSE_GETATTR_FH) { +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fip = &fi; +- } +- } ++ if (arg->getattr_flags & FUSE_GETATTR_FH) { ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fip = &fi; ++ } ++ } + +- if (req->se->op.getattr) +- req->se->op.getattr(req, nodeid, fip); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.getattr) { ++ req->se->op.getattr(req, nodeid, fip); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_setattr_in *arg = (struct fuse_setattr_in *) inarg; +- +- if (req->se->op.setattr) { +- struct fuse_file_info *fi = NULL; +- struct fuse_file_info fi_store; +- struct stat stbuf; +- memset(&stbuf, 0, sizeof(stbuf)); +- convert_attr(arg, &stbuf); +- if (arg->valid & FATTR_FH) { +- arg->valid &= ~FATTR_FH; +- memset(&fi_store, 0, sizeof(fi_store)); +- fi = &fi_store; +- fi->fh = arg->fh; +- } +- arg->valid &= +- FUSE_SET_ATTR_MODE | +- FUSE_SET_ATTR_UID | +- FUSE_SET_ATTR_GID | +- FUSE_SET_ATTR_SIZE | +- FUSE_SET_ATTR_ATIME | +- FUSE_SET_ATTR_MTIME | +- FUSE_SET_ATTR_ATIME_NOW | +- FUSE_SET_ATTR_MTIME_NOW | +- FUSE_SET_ATTR_CTIME; +- +- req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); +- } else +- fuse_reply_err(req, ENOSYS); ++ struct fuse_setattr_in *arg = (struct fuse_setattr_in *)inarg; ++ ++ if (req->se->op.setattr) { ++ struct fuse_file_info *fi = NULL; ++ struct fuse_file_info fi_store; ++ struct stat stbuf; ++ memset(&stbuf, 0, sizeof(stbuf)); ++ convert_attr(arg, &stbuf); ++ if (arg->valid & FATTR_FH) { ++ arg->valid &= ~FATTR_FH; ++ memset(&fi_store, 0, sizeof(fi_store)); ++ fi = &fi_store; ++ fi->fh = arg->fh; ++ } ++ arg->valid &= FUSE_SET_ATTR_MODE | FUSE_SET_ATTR_UID | ++ FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE | ++ FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME | ++ FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW | ++ FUSE_SET_ATTR_CTIME; ++ ++ req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_access_in *arg = (struct fuse_access_in *) inarg; ++ struct fuse_access_in *arg = (struct fuse_access_in *)inarg; + +- if (req->se->op.access) +- req->se->op.access(req, nodeid, arg->mask); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.access) { ++ req->se->op.access(req, nodeid, arg->mask); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- (void) inarg; ++ (void)inarg; + +- if (req->se->op.readlink) +- req->se->op.readlink(req, nodeid); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.readlink) { ++ req->se->op.readlink(req, nodeid); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_mknod_in *arg = (struct fuse_mknod_in *) inarg; +- char *name = PARAM(arg); ++ struct fuse_mknod_in *arg = (struct fuse_mknod_in *)inarg; ++ char *name = PARAM(arg); + +- if (req->se->conn.proto_minor >= 12) +- req->ctx.umask = arg->umask; +- else +- name = (char *) inarg + FUSE_COMPAT_MKNOD_IN_SIZE; ++ if (req->se->conn.proto_minor >= 12) { ++ req->ctx.umask = arg->umask; ++ } else { ++ name = (char *)inarg + FUSE_COMPAT_MKNOD_IN_SIZE; ++ } + +- if (req->se->op.mknod) +- req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.mknod) { ++ req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *) inarg; ++ struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *)inarg; + +- if (req->se->conn.proto_minor >= 12) +- req->ctx.umask = arg->umask; ++ if (req->se->conn.proto_minor >= 12) { ++ req->ctx.umask = arg->umask; ++ } + +- if (req->se->op.mkdir) +- req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.mkdir) { ++ req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.unlink) +- req->se->op.unlink(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.unlink) { ++ req->se->op.unlink(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.rmdir) +- req->se->op.rmdir(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.rmdir) { ++ req->se->op.rmdir(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; +- char *linkname = ((char *) inarg) + strlen((char *) inarg) + 1; ++ char *name = (char *)inarg; ++ char *linkname = ((char *)inarg) + strlen((char *)inarg) + 1; + +- if (req->se->op.symlink) +- req->se->op.symlink(req, linkname, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.symlink) { ++ req->se->op.symlink(req, linkname, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_rename_in *arg = (struct fuse_rename_in *) inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename_in *arg = (struct fuse_rename_in *)inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; + +- if (req->se->op.rename) +- req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, +- 0); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.rename) { ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, 0); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_rename2_in *arg = (struct fuse_rename2_in *) inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename2_in *arg = (struct fuse_rename2_in *)inarg; ++ char *oldname = PARAM(arg); ++ char *newname = oldname + strlen(oldname) + 1; + +- if (req->se->op.rename) +- req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, +- arg->flags); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.rename) { ++ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, ++ arg->flags); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_link(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_link_in *arg = (struct fuse_link_in *) inarg; ++ struct fuse_link_in *arg = (struct fuse_link_in *)inarg; + +- if (req->se->op.link) +- req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.link) { ++ req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_create_in *arg = (struct fuse_create_in *) inarg; ++ struct fuse_create_in *arg = (struct fuse_create_in *)inarg; + +- if (req->se->op.create) { +- struct fuse_file_info fi; +- char *name = PARAM(arg); ++ if (req->se->op.create) { ++ struct fuse_file_info fi; ++ char *name = PARAM(arg); + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; + +- if (req->se->conn.proto_minor >= 12) +- req->ctx.umask = arg->umask; +- else +- name = (char *) inarg + sizeof(struct fuse_open_in); ++ if (req->se->conn.proto_minor >= 12) { ++ req->ctx.umask = arg->umask; ++ } else { ++ name = (char *)inarg + sizeof(struct fuse_open_in); ++ } + +- req->se->op.create(req, nodeid, name, arg->mode, &fi); +- } else +- fuse_reply_err(req, ENOSYS); ++ req->se->op.create(req, nodeid, name, arg->mode, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_open_in *arg = (struct fuse_open_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_open_in *arg = (struct fuse_open_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; + +- if (req->se->op.open) +- req->se->op.open(req, nodeid, &fi); +- else +- fuse_reply_open(req, &fi); ++ if (req->se->op.open) { ++ req->se->op.open(req, nodeid, &fi); ++ } else { ++ fuse_reply_open(req, &fi); ++ } + } + + static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_read_in *arg = (struct fuse_read_in *) inarg; ++ struct fuse_read_in *arg = (struct fuse_read_in *)inarg; + +- if (req->se->op.read) { +- struct fuse_file_info fi; ++ if (req->se->op.read) { ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 9) { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- } +- req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); +- } else +- fuse_reply_err(req, ENOSYS); ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 9) { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ } ++ req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_write_in *arg = (struct fuse_write_in *) inarg; +- struct fuse_file_info fi; +- char *param; ++ struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_file_info fi; ++ char *param; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; + +- if (req->se->conn.proto_minor < 9) { +- param = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- param = PARAM(arg); +- } ++ if (req->se->conn.proto_minor < 9) { ++ param = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ param = PARAM(arg); ++ } + +- if (req->se->op.write) +- req->se->op.write(req, nodeid, param, arg->size, +- arg->offset, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.write) { ++ req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, +- const struct fuse_buf *ibuf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, +- .count = 1, +- }; +- struct fuse_write_in *arg = (struct fuse_write_in *) inarg; +- struct fuse_file_info fi; +- +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; +- +- if (se->conn.proto_minor < 9) { +- bufv.buf[0].mem = ((char *) arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- bufv.buf[0].size -= sizeof(struct fuse_in_header) + +- FUSE_COMPAT_WRITE_IN_SIZE; +- assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) +- bufv.buf[0].mem = PARAM(arg); +- +- bufv.buf[0].size -= sizeof(struct fuse_in_header) + +- sizeof(struct fuse_write_in); +- } +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); +- fuse_reply_err(req, EIO); +- return; +- } +- bufv.buf[0].size = arg->size; +- +- se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); ++ const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_file_info fi; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; ++ ++ if (se->conn.proto_minor < 9) { ++ bufv.buf[0].mem = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + FUSE_COMPAT_WRITE_IN_SIZE; ++ assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); ++ } else { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ bufv.buf[0].mem = PARAM(arg); ++ } ++ ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); ++ } ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); + } + + static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_flush_in *arg = (struct fuse_flush_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_flush_in *arg = (struct fuse_flush_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.flush = 1; +- if (req->se->conn.proto_minor >= 7) +- fi.lock_owner = arg->lock_owner; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.flush = 1; ++ if (req->se->conn.proto_minor >= 7) { ++ fi.lock_owner = arg->lock_owner; ++ } + +- if (req->se->op.flush) +- req->se->op.flush(req, nodeid, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.flush) { ++ req->se->op.flush(req, nodeid, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_release_in *arg = (struct fuse_release_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; +- fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 8) { +- fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; +- fi.lock_owner = arg->lock_owner; +- } +- if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { +- fi.flock_release = 1; +- fi.lock_owner = arg->lock_owner; +- } ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; ++ if (req->se->conn.proto_minor >= 8) { ++ fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; ++ fi.lock_owner = arg->lock_owner; ++ } ++ if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { ++ fi.flock_release = 1; ++ fi.lock_owner = arg->lock_owner; ++ } + +- if (req->se->op.release) +- req->se->op.release(req, nodeid, &fi); +- else +- fuse_reply_err(req, 0); ++ if (req->se->op.release) { ++ req->se->op.release(req, nodeid, &fi); ++ } else { ++ fuse_reply_err(req, 0); ++ } + } + + static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; +- struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.fsync) +- req->se->op.fsync(req, nodeid, datasync, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.fsync) { ++ req->se->op.fsync(req, nodeid, datasync, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_open_in *arg = (struct fuse_open_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_open_in *arg = (struct fuse_open_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; + +- if (req->se->op.opendir) +- req->se->op.opendir(req, nodeid, &fi); +- else +- fuse_reply_open(req, &fi); ++ if (req->se->op.opendir) { ++ req->se->op.opendir(req, nodeid, &fi); ++ } else { ++ fuse_reply_open(req, &fi); ++ } + } + + static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_read_in *arg = (struct fuse_read_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.readdir) +- req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.readdir) { ++ req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_read_in *arg = (struct fuse_read_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.readdirplus) +- req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.readdirplus) { ++ req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_release_in *arg = (struct fuse_release_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.flags = arg->flags; +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.flags = arg->flags; ++ fi.fh = arg->fh; + +- if (req->se->op.releasedir) +- req->se->op.releasedir(req, nodeid, &fi); +- else +- fuse_reply_err(req, 0); ++ if (req->se->op.releasedir) { ++ req->se->op.releasedir(req, nodeid, &fi); ++ } else { ++ fuse_reply_err(req, 0); ++ } + } + + static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; +- struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_file_info fi; ++ int datasync = arg->fsync_flags & 1; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.fsyncdir) +- req->se->op.fsyncdir(req, nodeid, datasync, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.fsyncdir) { ++ req->se->op.fsyncdir(req, nodeid, datasync, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- (void) nodeid; +- (void) inarg; ++ (void)nodeid; ++ (void)inarg; + +- if (req->se->op.statfs) +- req->se->op.statfs(req, nodeid); +- else { +- struct statvfs buf = { +- .f_namemax = 255, +- .f_bsize = 512, +- }; +- fuse_reply_statfs(req, &buf); +- } ++ if (req->se->op.statfs) { ++ req->se->op.statfs(req, nodeid); ++ } else { ++ struct statvfs buf = { ++ .f_namemax = 255, ++ .f_bsize = 512, ++ }; ++ fuse_reply_statfs(req, &buf); ++ } + } + + static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *) inarg; +- char *name = PARAM(arg); +- char *value = name + strlen(name) + 1; ++ struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *)inarg; ++ char *name = PARAM(arg); ++ char *value = name + strlen(name) + 1; + +- if (req->se->op.setxattr) +- req->se->op.setxattr(req, nodeid, name, value, arg->size, +- arg->flags); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.setxattr) { ++ req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; + +- if (req->se->op.getxattr) +- req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.getxattr) { ++ req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; ++ struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; + +- if (req->se->op.listxattr) +- req->se->op.listxattr(req, nodeid, arg->size); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.listxattr) { ++ req->se->op.listxattr(req, nodeid, arg->size); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- char *name = (char *) inarg; ++ char *name = (char *)inarg; + +- if (req->se->op.removexattr) +- req->se->op.removexattr(req, nodeid, name); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.removexattr) { ++ req->se->op.removexattr(req, nodeid, name); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void convert_fuse_file_lock(struct fuse_file_lock *fl, +- struct flock *flock) ++ struct flock *flock) + { +- memset(flock, 0, sizeof(struct flock)); +- flock->l_type = fl->type; +- flock->l_whence = SEEK_SET; +- flock->l_start = fl->start; +- if (fl->end == OFFSET_MAX) +- flock->l_len = 0; +- else +- flock->l_len = fl->end - fl->start + 1; +- flock->l_pid = fl->pid; ++ memset(flock, 0, sizeof(struct flock)); ++ flock->l_type = fl->type; ++ flock->l_whence = SEEK_SET; ++ flock->l_start = fl->start; ++ if (fl->end == OFFSET_MAX) { ++ flock->l_len = 0; ++ } else { ++ flock->l_len = fl->end - fl->start + 1; ++ } ++ flock->l_pid = fl->pid; + } + + static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; +- struct fuse_file_info fi; +- struct flock flock; ++ struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_file_info fi; ++ struct flock flock; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.lock_owner = arg->owner; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; + +- convert_fuse_file_lock(&arg->lk, &flock); +- if (req->se->op.getlk) +- req->se->op.getlk(req, nodeid, &fi, &flock); +- else +- fuse_reply_err(req, ENOSYS); ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.getlk) { ++ req->se->op.getlk(req, nodeid, &fi, &flock); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, int sleep) +-{ +- struct fuse_lk_in *arg = (struct fuse_lk_in *) inarg; +- struct fuse_file_info fi; +- struct flock flock; +- +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.lock_owner = arg->owner; +- +- if (arg->lk_flags & FUSE_LK_FLOCK) { +- int op = 0; +- +- switch (arg->lk.type) { +- case F_RDLCK: +- op = LOCK_SH; +- break; +- case F_WRLCK: +- op = LOCK_EX; +- break; +- case F_UNLCK: +- op = LOCK_UN; +- break; +- } +- if (!sleep) +- op |= LOCK_NB; +- +- if (req->se->op.flock) +- req->se->op.flock(req, nodeid, &fi, op); +- else +- fuse_reply_err(req, ENOSYS); +- } else { +- convert_fuse_file_lock(&arg->lk, &flock); +- if (req->se->op.setlk) +- req->se->op.setlk(req, nodeid, &fi, &flock, sleep); +- else +- fuse_reply_err(req, ENOSYS); +- } ++ const void *inarg, int sleep) ++{ ++ struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_file_info fi; ++ struct flock flock; ++ ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.lock_owner = arg->owner; ++ ++ if (arg->lk_flags & FUSE_LK_FLOCK) { ++ int op = 0; ++ ++ switch (arg->lk.type) { ++ case F_RDLCK: ++ op = LOCK_SH; ++ break; ++ case F_WRLCK: ++ op = LOCK_EX; ++ break; ++ case F_UNLCK: ++ op = LOCK_UN; ++ break; ++ } ++ if (!sleep) { ++ op |= LOCK_NB; ++ } ++ ++ if (req->se->op.flock) { ++ req->se->op.flock(req, nodeid, &fi, op); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } ++ } else { ++ convert_fuse_file_lock(&arg->lk, &flock); ++ if (req->se->op.setlk) { ++ req->se->op.setlk(req, nodeid, &fi, &flock, sleep); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } ++ } + } + + static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- do_setlk_common(req, nodeid, inarg, 0); ++ do_setlk_common(req, nodeid, inarg, 0); + } + + static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- do_setlk_common(req, nodeid, inarg, 1); ++ do_setlk_common(req, nodeid, inarg, 1); + } + + static int find_interrupted(struct fuse_session *se, struct fuse_req *req) + { +- struct fuse_req *curr; +- +- for (curr = se->list.next; curr != &se->list; curr = curr->next) { +- if (curr->unique == req->u.i.unique) { +- fuse_interrupt_func_t func; +- void *data; +- +- curr->ctr++; +- pthread_mutex_unlock(&se->lock); +- +- /* Ugh, ugly locking */ +- pthread_mutex_lock(&curr->lock); +- pthread_mutex_lock(&se->lock); +- curr->interrupted = 1; +- func = curr->u.ni.func; +- data = curr->u.ni.data; +- pthread_mutex_unlock(&se->lock); +- if (func) +- func(curr, data); +- pthread_mutex_unlock(&curr->lock); +- +- pthread_mutex_lock(&se->lock); +- curr->ctr--; +- if (!curr->ctr) +- destroy_req(curr); +- +- return 1; +- } +- } +- for (curr = se->interrupts.next; curr != &se->interrupts; +- curr = curr->next) { +- if (curr->u.i.unique == req->u.i.unique) +- return 1; +- } +- return 0; ++ struct fuse_req *curr; ++ ++ for (curr = se->list.next; curr != &se->list; curr = curr->next) { ++ if (curr->unique == req->u.i.unique) { ++ fuse_interrupt_func_t func; ++ void *data; ++ ++ curr->ctr++; ++ pthread_mutex_unlock(&se->lock); ++ ++ /* Ugh, ugly locking */ ++ pthread_mutex_lock(&curr->lock); ++ pthread_mutex_lock(&se->lock); ++ curr->interrupted = 1; ++ func = curr->u.ni.func; ++ data = curr->u.ni.data; ++ pthread_mutex_unlock(&se->lock); ++ if (func) { ++ func(curr, data); ++ } ++ pthread_mutex_unlock(&curr->lock); ++ ++ pthread_mutex_lock(&se->lock); ++ curr->ctr--; ++ if (!curr->ctr) { ++ destroy_req(curr); ++ } ++ ++ return 1; ++ } ++ } ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->u.i.unique) { ++ return 1; ++ } ++ } ++ return 0; + } + + static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *) inarg; +- struct fuse_session *se = req->se; ++ struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *)inarg; ++ struct fuse_session *se = req->se; + +- (void) nodeid; +- if (se->debug) +- fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", +- (unsigned long long) arg->unique); ++ (void)nodeid; ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", ++ (unsigned long long)arg->unique); ++ } + +- req->u.i.unique = arg->unique; ++ req->u.i.unique = arg->unique; + +- pthread_mutex_lock(&se->lock); +- if (find_interrupted(se, req)) +- destroy_req(req); +- else +- list_add_req(req, &se->interrupts); +- pthread_mutex_unlock(&se->lock); ++ pthread_mutex_lock(&se->lock); ++ if (find_interrupted(se, req)) { ++ destroy_req(req); ++ } else { ++ list_add_req(req, &se->interrupts); ++ } ++ pthread_mutex_unlock(&se->lock); + } + + static struct fuse_req *check_interrupt(struct fuse_session *se, +- struct fuse_req *req) +-{ +- struct fuse_req *curr; +- +- for (curr = se->interrupts.next; curr != &se->interrupts; +- curr = curr->next) { +- if (curr->u.i.unique == req->unique) { +- req->interrupted = 1; +- list_del_req(curr); +- free(curr); +- return NULL; +- } +- } +- curr = se->interrupts.next; +- if (curr != &se->interrupts) { +- list_del_req(curr); +- list_init_req(curr); +- return curr; +- } else +- return NULL; ++ struct fuse_req *req) ++{ ++ struct fuse_req *curr; ++ ++ for (curr = se->interrupts.next; curr != &se->interrupts; ++ curr = curr->next) { ++ if (curr->u.i.unique == req->unique) { ++ req->interrupted = 1; ++ list_del_req(curr); ++ free(curr); ++ return NULL; ++ } ++ } ++ curr = se->interrupts.next; ++ if (curr != &se->interrupts) { ++ list_del_req(curr); ++ list_init_req(curr); ++ return curr; ++ } else { ++ return NULL; ++ } + } + + static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_bmap_in *arg = (struct fuse_bmap_in *) inarg; ++ struct fuse_bmap_in *arg = (struct fuse_bmap_in *)inarg; + +- if (req->se->op.bmap) +- req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.bmap) { ++ req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *) inarg; +- unsigned int flags = arg->flags; +- void *in_buf = arg->in_size ? PARAM(arg) : NULL; +- struct fuse_file_info fi; ++ struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *)inarg; ++ unsigned int flags = arg->flags; ++ void *in_buf = arg->in_size ? PARAM(arg) : NULL; ++ struct fuse_file_info fi; + +- if (flags & FUSE_IOCTL_DIR && +- !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { +- fuse_reply_err(req, ENOTTY); +- return; +- } ++ if (flags & FUSE_IOCTL_DIR && !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { ++ fuse_reply_err(req, ENOTTY); ++ return; ++ } + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && +- !(flags & FUSE_IOCTL_32BIT)) { +- req->ioctl_64bit = 1; +- } ++ if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && ++ !(flags & FUSE_IOCTL_32BIT)) { ++ req->ioctl_64bit = 1; ++ } + +- if (req->se->op.ioctl) +- req->se->op.ioctl(req, nodeid, arg->cmd, +- (void *)(uintptr_t)arg->arg, &fi, flags, +- in_buf, arg->in_size, arg->out_size); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.ioctl) { ++ req->se->op.ioctl(req, nodeid, arg->cmd, (void *)(uintptr_t)arg->arg, ++ &fi, flags, in_buf, arg->in_size, arg->out_size); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) + { +- free(ph); ++ free(ph); + } + + static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_poll_in *arg = (struct fuse_poll_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_poll_in *arg = (struct fuse_poll_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fi.poll_events = arg->events; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fi.poll_events = arg->events; + +- if (req->se->op.poll) { +- struct fuse_pollhandle *ph = NULL; ++ if (req->se->op.poll) { ++ struct fuse_pollhandle *ph = NULL; + +- if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { +- ph = malloc(sizeof(struct fuse_pollhandle)); +- if (ph == NULL) { +- fuse_reply_err(req, ENOMEM); +- return; +- } +- ph->kh = arg->kh; +- ph->se = req->se; +- } ++ if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { ++ ph = malloc(sizeof(struct fuse_pollhandle)); ++ if (ph == NULL) { ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ph->kh = arg->kh; ++ ph->se = req->se; ++ } + +- req->se->op.poll(req, nodeid, &fi, ph); +- } else { +- fuse_reply_err(req, ENOSYS); +- } ++ req->se->op.poll(req, nodeid, &fi, ph); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.fallocate) +- req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.fallocate) { ++ req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, ++ &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + +-static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, const void *inarg) ++static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, ++ const void *inarg) + { +- struct fuse_copy_file_range_in *arg = (struct fuse_copy_file_range_in *) inarg; +- struct fuse_file_info fi_in, fi_out; ++ struct fuse_copy_file_range_in *arg = ++ (struct fuse_copy_file_range_in *)inarg; ++ struct fuse_file_info fi_in, fi_out; + +- memset(&fi_in, 0, sizeof(fi_in)); +- fi_in.fh = arg->fh_in; ++ memset(&fi_in, 0, sizeof(fi_in)); ++ fi_in.fh = arg->fh_in; + +- memset(&fi_out, 0, sizeof(fi_out)); +- fi_out.fh = arg->fh_out; ++ memset(&fi_out, 0, sizeof(fi_out)); ++ fi_out.fh = arg->fh_out; + + +- if (req->se->op.copy_file_range) +- req->se->op.copy_file_range(req, nodeid_in, arg->off_in, +- &fi_in, arg->nodeid_out, +- arg->off_out, &fi_out, arg->len, +- arg->flags); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.copy_file_range) { ++ req->se->op.copy_file_range(req, nodeid_in, arg->off_in, &fi_in, ++ arg->nodeid_out, arg->off_out, &fi_out, ++ arg->len, arg->flags); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_lseek_in *arg = (struct fuse_lseek_in *) inarg; +- struct fuse_file_info fi; ++ struct fuse_lseek_in *arg = (struct fuse_lseek_in *)inarg; ++ struct fuse_file_info fi; + +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; + +- if (req->se->op.lseek) +- req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); +- else +- fuse_reply_err(req, ENOSYS); ++ if (req->se->op.lseek) { ++ req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); ++ } else { ++ fuse_reply_err(req, ENOSYS); ++ } + } + + static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_init_in *arg = (struct fuse_init_in *) inarg; +- struct fuse_init_out outarg; +- struct fuse_session *se = req->se; +- size_t bufsize = se->bufsize; +- size_t outargsize = sizeof(outarg); +- +- (void) nodeid; +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); +- if (arg->major == 7 && arg->minor >= 6) { +- fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); +- fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", +- arg->max_readahead); +- } +- } +- se->conn.proto_major = arg->major; +- se->conn.proto_minor = arg->minor; +- se->conn.capable = 0; +- se->conn.want = 0; +- +- memset(&outarg, 0, sizeof(outarg)); +- outarg.major = FUSE_KERNEL_VERSION; +- outarg.minor = FUSE_KERNEL_MINOR_VERSION; +- +- if (arg->major < 7) { +- fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", +- arg->major, arg->minor); +- fuse_reply_err(req, EPROTO); +- return; +- } +- +- if (arg->major > 7) { +- /* Wait for a second INIT request with a 7.X version */ +- send_reply_ok(req, &outarg, sizeof(outarg)); +- return; +- } +- +- if (arg->minor >= 6) { +- if (arg->max_readahead < se->conn.max_readahead) +- se->conn.max_readahead = arg->max_readahead; +- if (arg->flags & FUSE_ASYNC_READ) +- se->conn.capable |= FUSE_CAP_ASYNC_READ; +- if (arg->flags & FUSE_POSIX_LOCKS) +- se->conn.capable |= FUSE_CAP_POSIX_LOCKS; +- if (arg->flags & FUSE_ATOMIC_O_TRUNC) +- se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; +- if (arg->flags & FUSE_EXPORT_SUPPORT) +- se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; +- if (arg->flags & FUSE_DONT_MASK) +- se->conn.capable |= FUSE_CAP_DONT_MASK; +- if (arg->flags & FUSE_FLOCK_LOCKS) +- se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; +- if (arg->flags & FUSE_AUTO_INVAL_DATA) +- se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; +- if (arg->flags & FUSE_DO_READDIRPLUS) +- se->conn.capable |= FUSE_CAP_READDIRPLUS; +- if (arg->flags & FUSE_READDIRPLUS_AUTO) +- se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; +- if (arg->flags & FUSE_ASYNC_DIO) +- se->conn.capable |= FUSE_CAP_ASYNC_DIO; +- if (arg->flags & FUSE_WRITEBACK_CACHE) +- se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; +- if (arg->flags & FUSE_NO_OPEN_SUPPORT) +- se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; +- if (arg->flags & FUSE_PARALLEL_DIROPS) +- se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; +- if (arg->flags & FUSE_POSIX_ACL) +- se->conn.capable |= FUSE_CAP_POSIX_ACL; +- if (arg->flags & FUSE_HANDLE_KILLPRIV) +- se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; +- if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) +- se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; +- if (!(arg->flags & FUSE_MAX_PAGES)) { +- size_t max_bufsize = +- FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() +- + FUSE_BUFFER_HEADER_SIZE; +- if (bufsize > max_bufsize) { +- bufsize = max_bufsize; +- } +- } +- } else { +- se->conn.max_readahead = 0; +- } +- +- if (se->conn.proto_minor >= 14) { ++ struct fuse_init_in *arg = (struct fuse_init_in *)inarg; ++ struct fuse_init_out outarg; ++ struct fuse_session *se = req->se; ++ size_t bufsize = se->bufsize; ++ size_t outargsize = sizeof(outarg); ++ ++ (void)nodeid; ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); ++ if (arg->major == 7 && arg->minor >= 6) { ++ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); ++ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", ++ arg->max_readahead); ++ } ++ } ++ se->conn.proto_major = arg->major; ++ se->conn.proto_minor = arg->minor; ++ se->conn.capable = 0; ++ se->conn.want = 0; ++ ++ memset(&outarg, 0, sizeof(outarg)); ++ outarg.major = FUSE_KERNEL_VERSION; ++ outarg.minor = FUSE_KERNEL_MINOR_VERSION; ++ ++ if (arg->major < 7) { ++ fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", ++ arg->major, arg->minor); ++ fuse_reply_err(req, EPROTO); ++ return; ++ } ++ ++ if (arg->major > 7) { ++ /* Wait for a second INIT request with a 7.X version */ ++ send_reply_ok(req, &outarg, sizeof(outarg)); ++ return; ++ } ++ ++ if (arg->minor >= 6) { ++ if (arg->max_readahead < se->conn.max_readahead) { ++ se->conn.max_readahead = arg->max_readahead; ++ } ++ if (arg->flags & FUSE_ASYNC_READ) { ++ se->conn.capable |= FUSE_CAP_ASYNC_READ; ++ } ++ if (arg->flags & FUSE_POSIX_LOCKS) { ++ se->conn.capable |= FUSE_CAP_POSIX_LOCKS; ++ } ++ if (arg->flags & FUSE_ATOMIC_O_TRUNC) { ++ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; ++ } ++ if (arg->flags & FUSE_EXPORT_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; ++ } ++ if (arg->flags & FUSE_DONT_MASK) { ++ se->conn.capable |= FUSE_CAP_DONT_MASK; ++ } ++ if (arg->flags & FUSE_FLOCK_LOCKS) { ++ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; ++ } ++ if (arg->flags & FUSE_AUTO_INVAL_DATA) { ++ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; ++ } ++ if (arg->flags & FUSE_DO_READDIRPLUS) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS; ++ } ++ if (arg->flags & FUSE_READDIRPLUS_AUTO) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; ++ } ++ if (arg->flags & FUSE_ASYNC_DIO) { ++ se->conn.capable |= FUSE_CAP_ASYNC_DIO; ++ } ++ if (arg->flags & FUSE_WRITEBACK_CACHE) { ++ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (arg->flags & FUSE_NO_OPEN_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; ++ } ++ if (arg->flags & FUSE_PARALLEL_DIROPS) { ++ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; ++ } ++ if (arg->flags & FUSE_POSIX_ACL) { ++ se->conn.capable |= FUSE_CAP_POSIX_ACL; ++ } ++ if (arg->flags & FUSE_HANDLE_KILLPRIV) { ++ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; ++ } ++ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; ++ } ++ if (!(arg->flags & FUSE_MAX_PAGES)) { ++ size_t max_bufsize = ++ FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + ++ FUSE_BUFFER_HEADER_SIZE; ++ if (bufsize > max_bufsize) { ++ bufsize = max_bufsize; ++ } ++ } ++ } else { ++ se->conn.max_readahead = 0; ++ } ++ ++ if (se->conn.proto_minor >= 14) { + #ifdef HAVE_SPLICE + #ifdef HAVE_VMSPLICE +- se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; ++ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; + #endif +- se->conn.capable |= FUSE_CAP_SPLICE_READ; ++ se->conn.capable |= FUSE_CAP_SPLICE_READ; + #endif +- } +- if (se->conn.proto_minor >= 18) +- se->conn.capable |= FUSE_CAP_IOCTL_DIR; +- +- /* Default settings for modern filesystems. +- * +- * Most of these capabilities were disabled by default in +- * libfuse2 for backwards compatibility reasons. In libfuse3, +- * we can finally enable them by default (as long as they're +- * supported by the kernel). +- */ +-#define LL_SET_DEFAULT(cond, cap) \ +- if ((cond) && (se->conn.capable & (cap))) \ +- se->conn.want |= (cap) +- LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); +- LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); +- LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); +- LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); +- LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); +- LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); +- LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); +- LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); +- LL_SET_DEFAULT(se->op.getlk && se->op.setlk, +- FUSE_CAP_POSIX_LOCKS); +- LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); +- LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); +- LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, +- FUSE_CAP_READDIRPLUS_AUTO); +- se->conn.time_gran = 1; +- +- if (bufsize < FUSE_MIN_READ_BUFFER) { +- fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", +- bufsize); +- bufsize = FUSE_MIN_READ_BUFFER; +- } +- se->bufsize = bufsize; +- +- if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) +- se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; +- +- se->got_init = 1; +- if (se->op.init) +- se->op.init(se->userdata, &se->conn); +- +- if (se->conn.want & (~se->conn.capable)) { +- fuse_log(FUSE_LOG_ERR, "fuse: error: filesystem requested capabilities " +- "0x%x that are not supported by kernel, aborting.\n", +- se->conn.want & (~se->conn.capable)); +- fuse_reply_err(req, EPROTO); +- se->error = -EPROTO; +- fuse_session_exit(se); +- return; +- } +- +- if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { +- se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; +- } +- if (arg->flags & FUSE_MAX_PAGES) { +- outarg.flags |= FUSE_MAX_PAGES; +- outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; +- } +- +- /* Always enable big writes, this is superseded +- by the max_write option */ +- outarg.flags |= FUSE_BIG_WRITES; +- +- if (se->conn.want & FUSE_CAP_ASYNC_READ) +- outarg.flags |= FUSE_ASYNC_READ; +- if (se->conn.want & FUSE_CAP_POSIX_LOCKS) +- outarg.flags |= FUSE_POSIX_LOCKS; +- if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) +- outarg.flags |= FUSE_ATOMIC_O_TRUNC; +- if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) +- outarg.flags |= FUSE_EXPORT_SUPPORT; +- if (se->conn.want & FUSE_CAP_DONT_MASK) +- outarg.flags |= FUSE_DONT_MASK; +- if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) +- outarg.flags |= FUSE_FLOCK_LOCKS; +- if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) +- outarg.flags |= FUSE_AUTO_INVAL_DATA; +- if (se->conn.want & FUSE_CAP_READDIRPLUS) +- outarg.flags |= FUSE_DO_READDIRPLUS; +- if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) +- outarg.flags |= FUSE_READDIRPLUS_AUTO; +- if (se->conn.want & FUSE_CAP_ASYNC_DIO) +- outarg.flags |= FUSE_ASYNC_DIO; +- if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) +- outarg.flags |= FUSE_WRITEBACK_CACHE; +- if (se->conn.want & FUSE_CAP_POSIX_ACL) +- outarg.flags |= FUSE_POSIX_ACL; +- outarg.max_readahead = se->conn.max_readahead; +- outarg.max_write = se->conn.max_write; +- if (se->conn.proto_minor >= 13) { +- if (se->conn.max_background >= (1 << 16)) +- se->conn.max_background = (1 << 16) - 1; +- if (se->conn.congestion_threshold > se->conn.max_background) +- se->conn.congestion_threshold = se->conn.max_background; +- if (!se->conn.congestion_threshold) { +- se->conn.congestion_threshold = +- se->conn.max_background * 3 / 4; +- } +- +- outarg.max_background = se->conn.max_background; +- outarg.congestion_threshold = se->conn.congestion_threshold; +- } +- if (se->conn.proto_minor >= 23) +- outarg.time_gran = se->conn.time_gran; +- +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); +- fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); +- fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", +- outarg.max_readahead); +- fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); +- fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", +- outarg.max_background); +- fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", +- outarg.congestion_threshold); +- fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", +- outarg.time_gran); +- } +- if (arg->minor < 5) +- outargsize = FUSE_COMPAT_INIT_OUT_SIZE; +- else if (arg->minor < 23) +- outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; +- +- send_reply_ok(req, &outarg, outargsize); ++ } ++ if (se->conn.proto_minor >= 18) { ++ se->conn.capable |= FUSE_CAP_IOCTL_DIR; ++ } ++ ++ /* ++ * Default settings for modern filesystems. ++ * ++ * Most of these capabilities were disabled by default in ++ * libfuse2 for backwards compatibility reasons. In libfuse3, ++ * we can finally enable them by default (as long as they're ++ * supported by the kernel). ++ */ ++#define LL_SET_DEFAULT(cond, cap) \ ++ if ((cond) && (se->conn.capable & (cap))) \ ++ se->conn.want |= (cap) ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); ++ LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); ++ LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); ++ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); ++ LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); ++ LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); ++ LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); ++ LL_SET_DEFAULT(se->op.getlk && se->op.setlk, FUSE_CAP_POSIX_LOCKS); ++ LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); ++ LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, ++ FUSE_CAP_READDIRPLUS_AUTO); ++ se->conn.time_gran = 1; ++ ++ if (bufsize < FUSE_MIN_READ_BUFFER) { ++ fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", ++ bufsize); ++ bufsize = FUSE_MIN_READ_BUFFER; ++ } ++ se->bufsize = bufsize; ++ ++ if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) { ++ se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; ++ } ++ ++ se->got_init = 1; ++ if (se->op.init) { ++ se->op.init(se->userdata, &se->conn); ++ } ++ ++ if (se->conn.want & (~se->conn.capable)) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: error: filesystem requested capabilities " ++ "0x%x that are not supported by kernel, aborting.\n", ++ se->conn.want & (~se->conn.capable)); ++ fuse_reply_err(req, EPROTO); ++ se->error = -EPROTO; ++ fuse_session_exit(se); ++ return; ++ } ++ ++ if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { ++ se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; ++ } ++ if (arg->flags & FUSE_MAX_PAGES) { ++ outarg.flags |= FUSE_MAX_PAGES; ++ outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; ++ } ++ ++ /* ++ * Always enable big writes, this is superseded ++ * by the max_write option ++ */ ++ outarg.flags |= FUSE_BIG_WRITES; ++ ++ if (se->conn.want & FUSE_CAP_ASYNC_READ) { ++ outarg.flags |= FUSE_ASYNC_READ; ++ } ++ if (se->conn.want & FUSE_CAP_POSIX_LOCKS) { ++ outarg.flags |= FUSE_POSIX_LOCKS; ++ } ++ if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) { ++ outarg.flags |= FUSE_ATOMIC_O_TRUNC; ++ } ++ if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) { ++ outarg.flags |= FUSE_EXPORT_SUPPORT; ++ } ++ if (se->conn.want & FUSE_CAP_DONT_MASK) { ++ outarg.flags |= FUSE_DONT_MASK; ++ } ++ if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) { ++ outarg.flags |= FUSE_FLOCK_LOCKS; ++ } ++ if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) { ++ outarg.flags |= FUSE_AUTO_INVAL_DATA; ++ } ++ if (se->conn.want & FUSE_CAP_READDIRPLUS) { ++ outarg.flags |= FUSE_DO_READDIRPLUS; ++ } ++ if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) { ++ outarg.flags |= FUSE_READDIRPLUS_AUTO; ++ } ++ if (se->conn.want & FUSE_CAP_ASYNC_DIO) { ++ outarg.flags |= FUSE_ASYNC_DIO; ++ } ++ if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) { ++ outarg.flags |= FUSE_WRITEBACK_CACHE; ++ } ++ if (se->conn.want & FUSE_CAP_POSIX_ACL) { ++ outarg.flags |= FUSE_POSIX_ACL; ++ } ++ outarg.max_readahead = se->conn.max_readahead; ++ outarg.max_write = se->conn.max_write; ++ if (se->conn.proto_minor >= 13) { ++ if (se->conn.max_background >= (1 << 16)) { ++ se->conn.max_background = (1 << 16) - 1; ++ } ++ if (se->conn.congestion_threshold > se->conn.max_background) { ++ se->conn.congestion_threshold = se->conn.max_background; ++ } ++ if (!se->conn.congestion_threshold) { ++ se->conn.congestion_threshold = se->conn.max_background * 3 / 4; ++ } ++ ++ outarg.max_background = se->conn.max_background; ++ outarg.congestion_threshold = se->conn.congestion_threshold; ++ } ++ if (se->conn.proto_minor >= 23) { ++ outarg.time_gran = se->conn.time_gran; ++ } ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, ++ outarg.minor); ++ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); ++ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", ++ outarg.max_readahead); ++ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); ++ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", ++ outarg.max_background); ++ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", ++ outarg.congestion_threshold); ++ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); ++ } ++ if (arg->minor < 5) { ++ outargsize = FUSE_COMPAT_INIT_OUT_SIZE; ++ } else if (arg->minor < 23) { ++ outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; ++ } ++ ++ send_reply_ok(req, &outarg, outargsize); + } + + static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { +- struct fuse_session *se = req->se; ++ struct fuse_session *se = req->se; + +- (void) nodeid; +- (void) inarg; ++ (void)nodeid; ++ (void)inarg; + +- se->got_destroy = 1; +- if (se->op.destroy) +- se->op.destroy(se->userdata); ++ se->got_destroy = 1; ++ if (se->op.destroy) { ++ se->op.destroy(se->userdata); ++ } + +- send_reply_ok(req, NULL, 0); ++ send_reply_ok(req, NULL, 0); + } + + static void list_del_nreq(struct fuse_notify_req *nreq) + { +- struct fuse_notify_req *prev = nreq->prev; +- struct fuse_notify_req *next = nreq->next; +- prev->next = next; +- next->prev = prev; ++ struct fuse_notify_req *prev = nreq->prev; ++ struct fuse_notify_req *next = nreq->next; ++ prev->next = next; ++ next->prev = prev; + } + + static void list_add_nreq(struct fuse_notify_req *nreq, +- struct fuse_notify_req *next) ++ struct fuse_notify_req *next) + { +- struct fuse_notify_req *prev = next->prev; +- nreq->next = next; +- nreq->prev = prev; +- prev->next = nreq; +- next->prev = nreq; ++ struct fuse_notify_req *prev = next->prev; ++ nreq->next = next; ++ nreq->prev = prev; ++ prev->next = nreq; ++ next->prev = nreq; + } + + static void list_init_nreq(struct fuse_notify_req *nreq) + { +- nreq->next = nreq; +- nreq->prev = nreq; ++ nreq->next = nreq; ++ nreq->prev = nreq; + } + + static void do_notify_reply(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, const struct fuse_buf *buf) ++ const void *inarg, const struct fuse_buf *buf) + { +- struct fuse_session *se = req->se; +- struct fuse_notify_req *nreq; +- struct fuse_notify_req *head; ++ struct fuse_session *se = req->se; ++ struct fuse_notify_req *nreq; ++ struct fuse_notify_req *head; + +- pthread_mutex_lock(&se->lock); +- head = &se->notify_list; +- for (nreq = head->next; nreq != head; nreq = nreq->next) { +- if (nreq->unique == req->unique) { +- list_del_nreq(nreq); +- break; +- } +- } +- pthread_mutex_unlock(&se->lock); ++ pthread_mutex_lock(&se->lock); ++ head = &se->notify_list; ++ for (nreq = head->next; nreq != head; nreq = nreq->next) { ++ if (nreq->unique == req->unique) { ++ list_del_nreq(nreq); ++ break; ++ } ++ } ++ pthread_mutex_unlock(&se->lock); + +- if (nreq != head) +- nreq->reply(nreq, req, nodeid, inarg, buf); ++ if (nreq != head) { ++ nreq->reply(nreq, req, nodeid, inarg, buf); ++ } + } + + static int send_notify_iov(struct fuse_session *se, int notify_code, +- struct iovec *iov, int count) ++ struct iovec *iov, int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out; + +- if (!se->got_init) +- return -ENOTCONN; ++ if (!se->got_init) { ++ return -ENOTCONN; ++ } + +- out.unique = 0; +- out.error = notify_code; +- iov[0].iov_base = &out; +- iov[0].iov_len = sizeof(struct fuse_out_header); ++ out.unique = 0; ++ out.error = notify_code; ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(struct fuse_out_header); + +- return fuse_send_msg(se, NULL, iov, count); ++ return fuse_send_msg(se, NULL, iov, count); + } + + int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) + { +- if (ph != NULL) { +- struct fuse_notify_poll_wakeup_out outarg; +- struct iovec iov[2]; ++ if (ph != NULL) { ++ struct fuse_notify_poll_wakeup_out outarg; ++ struct iovec iov[2]; + +- outarg.kh = ph->kh; ++ outarg.kh = ph->kh; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2); +- } else { +- return 0; +- } ++ return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2); ++ } else { ++ return 0; ++ } + } + + int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, +- off_t off, off_t len) ++ off_t off, off_t len) + { +- struct fuse_notify_inval_inode_out outarg; +- struct iovec iov[2]; ++ struct fuse_notify_inval_inode_out outarg; ++ struct iovec iov[2]; ++ ++ if (!se) { ++ return -EINVAL; ++ } + +- if (!se) +- return -EINVAL; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { ++ return -ENOSYS; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) +- return -ENOSYS; +- +- outarg.ino = ino; +- outarg.off = off; +- outarg.len = len; ++ outarg.ino = ino; ++ outarg.off = off; ++ outarg.len = len; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2); ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2); + } + + int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, +- const char *name, size_t namelen) ++ const char *name, size_t namelen) + { +- struct fuse_notify_inval_entry_out outarg; +- struct iovec iov[3]; ++ struct fuse_notify_inval_entry_out outarg; ++ struct iovec iov[3]; ++ ++ if (!se) { ++ return -EINVAL; ++ } + +- if (!se) +- return -EINVAL; +- +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { ++ return -ENOSYS; ++ } + +- outarg.parent = parent; +- outarg.namelen = namelen; +- outarg.padding = 0; ++ outarg.parent = parent; ++ outarg.namelen = namelen; ++ outarg.padding = 0; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); +- iov[2].iov_base = (void *)name; +- iov[2].iov_len = namelen + 1; ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = (void *)name; ++ iov[2].iov_len = namelen + 1; + +- return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3); ++ return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3); + } + +-int fuse_lowlevel_notify_delete(struct fuse_session *se, +- fuse_ino_t parent, fuse_ino_t child, +- const char *name, size_t namelen) ++int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, ++ fuse_ino_t child, const char *name, ++ size_t namelen) + { +- struct fuse_notify_delete_out outarg; +- struct iovec iov[3]; ++ struct fuse_notify_delete_out outarg; ++ struct iovec iov[3]; + +- if (!se) +- return -EINVAL; ++ if (!se) { ++ return -EINVAL; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) { ++ return -ENOSYS; ++ } + +- outarg.parent = parent; +- outarg.child = child; +- outarg.namelen = namelen; +- outarg.padding = 0; ++ outarg.parent = parent; ++ outarg.child = child; ++ outarg.namelen = namelen; ++ outarg.padding = 0; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); +- iov[2].iov_base = (void *)name; +- iov[2].iov_len = namelen + 1; ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); ++ iov[2].iov_base = (void *)name; ++ iov[2].iov_len = namelen + 1; + +- return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3); ++ return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3); + } + + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags) + { +- struct fuse_out_header out; +- struct fuse_notify_store_out outarg; +- struct iovec iov[3]; +- size_t size = fuse_buf_size(bufv); +- int res; ++ struct fuse_out_header out; ++ struct fuse_notify_store_out outarg; ++ struct iovec iov[3]; ++ size_t size = fuse_buf_size(bufv); ++ int res; + +- if (!se) +- return -EINVAL; ++ if (!se) { ++ return -EINVAL; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { ++ return -ENOSYS; ++ } + +- out.unique = 0; +- out.error = FUSE_NOTIFY_STORE; ++ out.unique = 0; ++ out.error = FUSE_NOTIFY_STORE; + +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; + +- iov[0].iov_base = &out; +- iov[0].iov_len = sizeof(out); +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[0].iov_base = &out; ++ iov[0].iov_len = sizeof(out); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); +- if (res > 0) +- res = -res; ++ res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); ++ if (res > 0) { ++ res = -res; ++ } + +- return res; ++ return res; + } + + struct fuse_retrieve_req { +- struct fuse_notify_req nreq; +- void *cookie; ++ struct fuse_notify_req nreq; ++ void *cookie; + }; + +-static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, +- fuse_req_t req, fuse_ino_t ino, +- const void *inarg, +- const struct fuse_buf *ibuf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_retrieve_req *rreq = +- container_of(nreq, struct fuse_retrieve_req, nreq); +- const struct fuse_notify_retrieve_in *arg = inarg; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, +- .count = 1, +- }; +- +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) +- bufv.buf[0].mem = PARAM(arg); +- +- bufv.buf[0].size -= sizeof(struct fuse_in_header) + +- sizeof(struct fuse_notify_retrieve_in); +- +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); +- fuse_reply_none(req); +- goto out; +- } +- bufv.buf[0].size = arg->size; +- +- if (se->op.retrieve_reply) { +- se->op.retrieve_reply(req, rreq->cookie, ino, +- arg->offset, &bufv); +- } else { +- fuse_reply_none(req); +- } ++static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, fuse_req_t req, ++ fuse_ino_t ino, const void *inarg, ++ const struct fuse_buf *ibuf) ++{ ++ struct fuse_session *se = req->se; ++ struct fuse_retrieve_req *rreq = ++ container_of(nreq, struct fuse_retrieve_req, nreq); ++ const struct fuse_notify_retrieve_in *arg = inarg; ++ struct fuse_bufvec bufv = { ++ .buf[0] = *ibuf, ++ .count = 1, ++ }; ++ ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ bufv.buf[0].mem = PARAM(arg); ++ } ++ ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_notify_retrieve_in); ++ ++ if (bufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); ++ fuse_reply_none(req); ++ goto out; ++ } ++ bufv.buf[0].size = arg->size; ++ ++ if (se->op.retrieve_reply) { ++ se->op.retrieve_reply(req, rreq->cookie, ino, arg->offset, &bufv); ++ } else { ++ fuse_reply_none(req); ++ } + out: +- free(rreq); ++ free(rreq); + } + + int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie) ++ size_t size, off_t offset, void *cookie) + { +- struct fuse_notify_retrieve_out outarg; +- struct iovec iov[2]; +- struct fuse_retrieve_req *rreq; +- int err; ++ struct fuse_notify_retrieve_out outarg; ++ struct iovec iov[2]; ++ struct fuse_retrieve_req *rreq; ++ int err; + +- if (!se) +- return -EINVAL; ++ if (!se) { ++ return -EINVAL; ++ } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) +- return -ENOSYS; ++ if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { ++ return -ENOSYS; ++ } + +- rreq = malloc(sizeof(*rreq)); +- if (rreq == NULL) +- return -ENOMEM; ++ rreq = malloc(sizeof(*rreq)); ++ if (rreq == NULL) { ++ return -ENOMEM; ++ } + +- pthread_mutex_lock(&se->lock); +- rreq->cookie = cookie; +- rreq->nreq.unique = se->notify_ctr++; +- rreq->nreq.reply = fuse_ll_retrieve_reply; +- list_add_nreq(&rreq->nreq, &se->notify_list); +- pthread_mutex_unlock(&se->lock); ++ pthread_mutex_lock(&se->lock); ++ rreq->cookie = cookie; ++ rreq->nreq.unique = se->notify_ctr++; ++ rreq->nreq.reply = fuse_ll_retrieve_reply; ++ list_add_nreq(&rreq->nreq, &se->notify_list); ++ pthread_mutex_unlock(&se->lock); + +- outarg.notify_unique = rreq->nreq.unique; +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; ++ outarg.notify_unique = rreq->nreq.unique; ++ outarg.nodeid = ino; ++ outarg.offset = offset; ++ outarg.size = size; ++ outarg.padding = 0; + +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); ++ iov[1].iov_base = &outarg; ++ iov[1].iov_len = sizeof(outarg); + +- err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); +- if (err) { +- pthread_mutex_lock(&se->lock); +- list_del_nreq(&rreq->nreq); +- pthread_mutex_unlock(&se->lock); +- free(rreq); +- } ++ err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); ++ if (err) { ++ pthread_mutex_lock(&se->lock); ++ list_del_nreq(&rreq->nreq); ++ pthread_mutex_unlock(&se->lock); ++ free(rreq); ++ } + +- return err; ++ return err; + } + + void *fuse_req_userdata(fuse_req_t req) + { +- return req->se->userdata; ++ return req->se->userdata; + } + + const struct fuse_ctx *fuse_req_ctx(fuse_req_t req) + { +- return &req->ctx; ++ return &req->ctx; + } + + void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, +- void *data) ++ void *data) + { +- pthread_mutex_lock(&req->lock); +- pthread_mutex_lock(&req->se->lock); +- req->u.ni.func = func; +- req->u.ni.data = data; +- pthread_mutex_unlock(&req->se->lock); +- if (req->interrupted && func) +- func(req, data); +- pthread_mutex_unlock(&req->lock); ++ pthread_mutex_lock(&req->lock); ++ pthread_mutex_lock(&req->se->lock); ++ req->u.ni.func = func; ++ req->u.ni.data = data; ++ pthread_mutex_unlock(&req->se->lock); ++ if (req->interrupted && func) { ++ func(req, data); ++ } ++ pthread_mutex_unlock(&req->lock); + } + + int fuse_req_interrupted(fuse_req_t req) + { +- int interrupted; ++ int interrupted; + +- pthread_mutex_lock(&req->se->lock); +- interrupted = req->interrupted; +- pthread_mutex_unlock(&req->se->lock); ++ pthread_mutex_lock(&req->se->lock); ++ interrupted = req->interrupted; ++ pthread_mutex_unlock(&req->se->lock); + +- return interrupted; ++ return interrupted; + } + + static struct { +- void (*func)(fuse_req_t, fuse_ino_t, const void *); +- const char *name; ++ void (*func)(fuse_req_t, fuse_ino_t, const void *); ++ const char *name; + } fuse_ll_ops[] = { +- [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, +- [FUSE_FORGET] = { do_forget, "FORGET" }, +- [FUSE_GETATTR] = { do_getattr, "GETATTR" }, +- [FUSE_SETATTR] = { do_setattr, "SETATTR" }, +- [FUSE_READLINK] = { do_readlink, "READLINK" }, +- [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, +- [FUSE_MKNOD] = { do_mknod, "MKNOD" }, +- [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, +- [FUSE_UNLINK] = { do_unlink, "UNLINK" }, +- [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, +- [FUSE_RENAME] = { do_rename, "RENAME" }, +- [FUSE_LINK] = { do_link, "LINK" }, +- [FUSE_OPEN] = { do_open, "OPEN" }, +- [FUSE_READ] = { do_read, "READ" }, +- [FUSE_WRITE] = { do_write, "WRITE" }, +- [FUSE_STATFS] = { do_statfs, "STATFS" }, +- [FUSE_RELEASE] = { do_release, "RELEASE" }, +- [FUSE_FSYNC] = { do_fsync, "FSYNC" }, +- [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, +- [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, +- [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, +- [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, +- [FUSE_FLUSH] = { do_flush, "FLUSH" }, +- [FUSE_INIT] = { do_init, "INIT" }, +- [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, +- [FUSE_READDIR] = { do_readdir, "READDIR" }, +- [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, +- [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, +- [FUSE_GETLK] = { do_getlk, "GETLK" }, +- [FUSE_SETLK] = { do_setlk, "SETLK" }, +- [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, +- [FUSE_ACCESS] = { do_access, "ACCESS" }, +- [FUSE_CREATE] = { do_create, "CREATE" }, +- [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, +- [FUSE_BMAP] = { do_bmap, "BMAP" }, +- [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, +- [FUSE_POLL] = { do_poll, "POLL" }, +- [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, +- [FUSE_DESTROY] = { do_destroy, "DESTROY" }, +- [FUSE_NOTIFY_REPLY] = { (void *) 1, "NOTIFY_REPLY" }, +- [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, +- [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS"}, +- [FUSE_RENAME2] = { do_rename2, "RENAME2" }, +- [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, +- [FUSE_LSEEK] = { do_lseek, "LSEEK" }, ++ [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, ++ [FUSE_FORGET] = { do_forget, "FORGET" }, ++ [FUSE_GETATTR] = { do_getattr, "GETATTR" }, ++ [FUSE_SETATTR] = { do_setattr, "SETATTR" }, ++ [FUSE_READLINK] = { do_readlink, "READLINK" }, ++ [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, ++ [FUSE_MKNOD] = { do_mknod, "MKNOD" }, ++ [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, ++ [FUSE_UNLINK] = { do_unlink, "UNLINK" }, ++ [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, ++ [FUSE_RENAME] = { do_rename, "RENAME" }, ++ [FUSE_LINK] = { do_link, "LINK" }, ++ [FUSE_OPEN] = { do_open, "OPEN" }, ++ [FUSE_READ] = { do_read, "READ" }, ++ [FUSE_WRITE] = { do_write, "WRITE" }, ++ [FUSE_STATFS] = { do_statfs, "STATFS" }, ++ [FUSE_RELEASE] = { do_release, "RELEASE" }, ++ [FUSE_FSYNC] = { do_fsync, "FSYNC" }, ++ [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, ++ [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, ++ [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, ++ [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, ++ [FUSE_FLUSH] = { do_flush, "FLUSH" }, ++ [FUSE_INIT] = { do_init, "INIT" }, ++ [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, ++ [FUSE_READDIR] = { do_readdir, "READDIR" }, ++ [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, ++ [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, ++ [FUSE_GETLK] = { do_getlk, "GETLK" }, ++ [FUSE_SETLK] = { do_setlk, "SETLK" }, ++ [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, ++ [FUSE_ACCESS] = { do_access, "ACCESS" }, ++ [FUSE_CREATE] = { do_create, "CREATE" }, ++ [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, ++ [FUSE_BMAP] = { do_bmap, "BMAP" }, ++ [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, ++ [FUSE_POLL] = { do_poll, "POLL" }, ++ [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, ++ [FUSE_DESTROY] = { do_destroy, "DESTROY" }, ++ [FUSE_NOTIFY_REPLY] = { (void *)1, "NOTIFY_REPLY" }, ++ [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, ++ [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS" }, ++ [FUSE_RENAME2] = { do_rename2, "RENAME2" }, ++ [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, ++ [FUSE_LSEEK] = { do_lseek, "LSEEK" }, + }; + + #define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) + + static const char *opname(enum fuse_opcode opcode) + { +- if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) +- return "???"; +- else +- return fuse_ll_ops[opcode].name; ++ if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) { ++ return "???"; ++ } else { ++ return fuse_ll_ops[opcode].name; ++ } + } + + void fuse_session_process_buf(struct fuse_session *se, +- const struct fuse_buf *buf) ++ const struct fuse_buf *buf) + { +- fuse_session_process_buf_int(se, buf, NULL); ++ fuse_session_process_buf_int(se, buf, NULL); + } + + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, struct fuse_chan *ch) +-{ +- struct fuse_in_header *in; +- const void *inarg; +- struct fuse_req *req; +- int err; +- +- in = buf->mem; +- +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, +- "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", +- (unsigned long long) in->unique, +- opname((enum fuse_opcode) in->opcode), in->opcode, +- (unsigned long long) in->nodeid, buf->size, in->pid); +- } +- +- req = fuse_ll_alloc_req(se); +- if (req == NULL) { +- struct fuse_out_header out = { +- .unique = in->unique, +- .error = -ENOMEM, +- }; +- struct iovec iov = { +- .iov_base = &out, +- .iov_len = sizeof(struct fuse_out_header), +- }; +- +- fuse_send_msg(se, ch, &iov, 1); +- return; +- } +- +- req->unique = in->unique; +- req->ctx.uid = in->uid; +- req->ctx.gid = in->gid; +- req->ctx.pid = in->pid; +- req->ch = ch; +- +- err = EIO; +- if (!se->got_init) { +- enum fuse_opcode expected; +- +- expected = se->cuse_data ? CUSE_INIT : FUSE_INIT; +- if (in->opcode != expected) +- goto reply_err; +- } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) +- goto reply_err; +- +- err = EACCES; +- /* Implement -o allow_root */ +- if (se->deny_others && in->uid != se->owner && in->uid != 0 && +- in->opcode != FUSE_INIT && in->opcode != FUSE_READ && +- in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && +- in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && +- in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && +- in->opcode != FUSE_NOTIFY_REPLY && +- in->opcode != FUSE_READDIRPLUS) +- goto reply_err; +- +- err = ENOSYS; +- if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) +- goto reply_err; +- if (in->opcode != FUSE_INTERRUPT) { +- struct fuse_req *intr; +- pthread_mutex_lock(&se->lock); +- intr = check_interrupt(se, req); +- list_add_req(req, &se->list); +- pthread_mutex_unlock(&se->lock); +- if (intr) +- fuse_reply_err(intr, EAGAIN); +- } +- +- inarg = (void *) &in[1]; +- if (in->opcode == FUSE_WRITE && se->op.write_buf) +- do_write_buf(req, in->nodeid, inarg, buf); +- else if (in->opcode == FUSE_NOTIFY_REPLY) +- do_notify_reply(req, in->nodeid, inarg, buf); +- else +- fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); +- +- return; ++ const struct fuse_buf *buf, ++ struct fuse_chan *ch) ++{ ++ struct fuse_in_header *in; ++ const void *inarg; ++ struct fuse_req *req; ++ int err; ++ ++ in = buf->mem; ++ ++ if (se->debug) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, " ++ "pid: %u\n", ++ (unsigned long long)in->unique, ++ opname((enum fuse_opcode)in->opcode), in->opcode, ++ (unsigned long long)in->nodeid, buf->size, in->pid); ++ } ++ ++ req = fuse_ll_alloc_req(se); ++ if (req == NULL) { ++ struct fuse_out_header out = { ++ .unique = in->unique, ++ .error = -ENOMEM, ++ }; ++ struct iovec iov = { ++ .iov_base = &out, ++ .iov_len = sizeof(struct fuse_out_header), ++ }; ++ ++ fuse_send_msg(se, ch, &iov, 1); ++ return; ++ } ++ ++ req->unique = in->unique; ++ req->ctx.uid = in->uid; ++ req->ctx.gid = in->gid; ++ req->ctx.pid = in->pid; ++ req->ch = ch; ++ ++ err = EIO; ++ if (!se->got_init) { ++ enum fuse_opcode expected; ++ ++ expected = se->cuse_data ? CUSE_INIT : FUSE_INIT; ++ if (in->opcode != expected) { ++ goto reply_err; ++ } ++ } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) { ++ goto reply_err; ++ } ++ ++ err = EACCES; ++ /* Implement -o allow_root */ ++ if (se->deny_others && in->uid != se->owner && in->uid != 0 && ++ in->opcode != FUSE_INIT && in->opcode != FUSE_READ && ++ in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && ++ in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && ++ in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && ++ in->opcode != FUSE_NOTIFY_REPLY && in->opcode != FUSE_READDIRPLUS) { ++ goto reply_err; ++ } ++ ++ err = ENOSYS; ++ if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) { ++ goto reply_err; ++ } ++ if (in->opcode != FUSE_INTERRUPT) { ++ struct fuse_req *intr; ++ pthread_mutex_lock(&se->lock); ++ intr = check_interrupt(se, req); ++ list_add_req(req, &se->list); ++ pthread_mutex_unlock(&se->lock); ++ if (intr) { ++ fuse_reply_err(intr, EAGAIN); ++ } ++ } ++ ++ inarg = (void *)&in[1]; ++ if (in->opcode == FUSE_WRITE && se->op.write_buf) { ++ do_write_buf(req, in->nodeid, inarg, buf); ++ } else if (in->opcode == FUSE_NOTIFY_REPLY) { ++ do_notify_reply(req, in->nodeid, inarg, buf); ++ } else { ++ fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); ++ } ++ ++ return; + + reply_err: +- fuse_reply_err(req, err); ++ fuse_reply_err(req, err); + } + +-#define LL_OPTION(n,o,v) \ +- { n, offsetof(struct fuse_session, o), v } ++#define LL_OPTION(n, o, v) \ ++ { \ ++ n, offsetof(struct fuse_session, o), v \ ++ } + + static const struct fuse_opt fuse_ll_opts[] = { +- LL_OPTION("debug", debug, 1), +- LL_OPTION("-d", debug, 1), +- LL_OPTION("--debug", debug, 1), +- LL_OPTION("allow_root", deny_others, 1), +- FUSE_OPT_END ++ LL_OPTION("debug", debug, 1), LL_OPTION("-d", debug, 1), ++ LL_OPTION("--debug", debug, 1), LL_OPTION("allow_root", deny_others, 1), ++ FUSE_OPT_END + }; + + void fuse_lowlevel_version(void) + { +- printf("using FUSE kernel interface version %i.%i\n", +- FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); ++ printf("using FUSE kernel interface version %i.%i\n", FUSE_KERNEL_VERSION, ++ FUSE_KERNEL_MINOR_VERSION); + } + + void fuse_lowlevel_help(void) + { +- /* These are not all options, but the ones that are +- potentially of interest to an end-user */ +- printf( +-" -o allow_root allow access by root\n" +-); ++ /* ++ * These are not all options, but the ones that are ++ * potentially of interest to an end-user ++ */ ++ printf(" -o allow_root allow access by root\n"); + } + + void fuse_session_destroy(struct fuse_session *se) + { +- if (se->got_init && !se->got_destroy) { +- if (se->op.destroy) +- se->op.destroy(se->userdata); +- } +- pthread_mutex_destroy(&se->lock); +- free(se->cuse_data); +- if (se->fd != -1) +- close(se->fd); +- free(se); ++ if (se->got_init && !se->got_destroy) { ++ if (se->op.destroy) { ++ se->op.destroy(se->userdata); ++ } ++ } ++ pthread_mutex_destroy(&se->lock); ++ free(se->cuse_data); ++ if (se->fd != -1) { ++ close(se->fd); ++ } ++ free(se); + } + + + struct fuse_session *fuse_session_new(struct fuse_args *args, +- const struct fuse_lowlevel_ops *op, +- size_t op_size, void *userdata) +-{ +- struct fuse_session *se; +- +- if (sizeof(struct fuse_lowlevel_ops) < op_size) { +- fuse_log(FUSE_LOG_ERR, "fuse: warning: library too old, some operations may not work\n"); +- op_size = sizeof(struct fuse_lowlevel_ops); +- } +- +- if (args->argc == 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: empty argv passed to fuse_session_new().\n"); +- return NULL; +- } +- +- se = (struct fuse_session *) calloc(1, sizeof(struct fuse_session)); +- if (se == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n"); +- goto out1; +- } +- se->fd = -1; +- se->conn.max_write = UINT_MAX; +- se->conn.max_readahead = UINT_MAX; +- +- /* Parse options */ +- if(fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) +- goto out2; +- if(args->argc == 1 && +- args->argv[0][0] == '-') { +- fuse_log(FUSE_LOG_ERR, "fuse: warning: argv[0] looks like an option, but " +- "will be ignored\n"); +- } else if (args->argc != 1) { +- int i; +- fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); +- for(i = 1; i < args->argc-1; i++) +- fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); +- fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); +- goto out4; +- } +- +- se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + +- FUSE_BUFFER_HEADER_SIZE; +- +- list_init_req(&se->list); +- list_init_req(&se->interrupts); +- list_init_nreq(&se->notify_list); +- se->notify_ctr = 1; +- fuse_mutex_init(&se->lock); +- +- memcpy(&se->op, op, op_size); +- se->owner = getuid(); +- se->userdata = userdata; +- +- return se; ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata) ++{ ++ struct fuse_session *se; ++ ++ if (sizeof(struct fuse_lowlevel_ops) < op_size) { ++ fuse_log( ++ FUSE_LOG_ERR, ++ "fuse: warning: library too old, some operations may not work\n"); ++ op_size = sizeof(struct fuse_lowlevel_ops); ++ } ++ ++ if (args->argc == 0) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: empty argv passed to fuse_session_new().\n"); ++ return NULL; ++ } ++ ++ se = (struct fuse_session *)calloc(1, sizeof(struct fuse_session)); ++ if (se == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n"); ++ goto out1; ++ } ++ se->fd = -1; ++ se->conn.max_write = UINT_MAX; ++ se->conn.max_readahead = UINT_MAX; ++ ++ /* Parse options */ ++ if (fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) { ++ goto out2; ++ } ++ if (args->argc == 1 && args->argv[0][0] == '-') { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: warning: argv[0] looks like an option, but " ++ "will be ignored\n"); ++ } else if (args->argc != 1) { ++ int i; ++ fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); ++ for (i = 1; i < args->argc - 1; i++) { ++ fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); ++ } ++ fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); ++ goto out4; ++ } ++ ++ se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE; ++ ++ list_init_req(&se->list); ++ list_init_req(&se->interrupts); ++ list_init_nreq(&se->notify_list); ++ se->notify_ctr = 1; ++ fuse_mutex_init(&se->lock); ++ ++ memcpy(&se->op, op, op_size); ++ se->owner = getuid(); ++ se->userdata = userdata; ++ ++ return se; + + out4: +- fuse_opt_free_args(args); ++ fuse_opt_free_args(args); + out2: +- free(se); ++ free(se); + out1: +- return NULL; ++ return NULL; + } + + int fuse_session_mount(struct fuse_session *se, const char *mountpoint) + { +- int fd; +- +- /* +- * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos +- * would ensue. +- */ +- do { +- fd = open("/dev/null", O_RDWR); +- if (fd > 2) +- close(fd); +- } while (fd >= 0 && fd <= 2); +- +- /* +- * To allow FUSE daemons to run without privileges, the caller may open +- * /dev/fuse before launching the file system and pass on the file +- * descriptor by specifying /dev/fd/N as the mount point. Note that the +- * parent process takes care of performing the mount in this case. +- */ +- fd = fuse_mnt_parse_fuse_fd(mountpoint); +- if (fd != -1) { +- if (fcntl(fd, F_GETFD) == -1) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: Invalid file descriptor /dev/fd/%u\n", +- fd); +- return -1; +- } +- se->fd = fd; +- return 0; +- } +- +- /* Open channel */ +- fd = fuse_kern_mount(mountpoint, se->mo); +- if (fd == -1) +- return -1; +- se->fd = fd; +- +- /* Save mountpoint */ +- se->mountpoint = strdup(mountpoint); +- if (se->mountpoint == NULL) +- goto error_out; +- +- return 0; ++ int fd; ++ ++ /* ++ * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos ++ * would ensue. ++ */ ++ do { ++ fd = open("/dev/null", O_RDWR); ++ if (fd > 2) { ++ close(fd); ++ } ++ } while (fd >= 0 && fd <= 2); ++ ++ /* ++ * To allow FUSE daemons to run without privileges, the caller may open ++ * /dev/fuse before launching the file system and pass on the file ++ * descriptor by specifying /dev/fd/N as the mount point. Note that the ++ * parent process takes care of performing the mount in this case. ++ */ ++ fd = fuse_mnt_parse_fuse_fd(mountpoint); ++ if (fd != -1) { ++ if (fcntl(fd, F_GETFD) == -1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: Invalid file descriptor /dev/fd/%u\n", ++ fd); ++ return -1; ++ } ++ se->fd = fd; ++ return 0; ++ } ++ ++ /* Open channel */ ++ fd = fuse_kern_mount(mountpoint, se->mo); ++ if (fd == -1) { ++ return -1; ++ } ++ se->fd = fd; ++ ++ /* Save mountpoint */ ++ se->mountpoint = strdup(mountpoint); ++ if (se->mountpoint == NULL) { ++ goto error_out; ++ } ++ ++ return 0; + + error_out: +- fuse_kern_unmount(mountpoint, fd); +- return -1; ++ fuse_kern_unmount(mountpoint, fd); ++ return -1; + } + + int fuse_session_fd(struct fuse_session *se) + { +- return se->fd; ++ return se->fd; + } + + void fuse_session_unmount(struct fuse_session *se) +@@ -2384,61 +2519,66 @@ void fuse_session_unmount(struct fuse_session *se) + #ifdef linux + int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) + { +- char *buf; +- size_t bufsize = 1024; +- char path[128]; +- int ret; +- int fd; +- unsigned long pid = req->ctx.pid; +- char *s; ++ char *buf; ++ size_t bufsize = 1024; ++ char path[128]; ++ int ret; ++ int fd; ++ unsigned long pid = req->ctx.pid; ++ char *s; + +- sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); ++ sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); + + retry: +- buf = malloc(bufsize); +- if (buf == NULL) +- return -ENOMEM; +- +- ret = -EIO; +- fd = open(path, O_RDONLY); +- if (fd == -1) +- goto out_free; +- +- ret = read(fd, buf, bufsize); +- close(fd); +- if (ret < 0) { +- ret = -EIO; +- goto out_free; +- } +- +- if ((size_t)ret == bufsize) { +- free(buf); +- bufsize *= 4; +- goto retry; +- } +- +- ret = -EIO; +- s = strstr(buf, "\nGroups:"); +- if (s == NULL) +- goto out_free; +- +- s += 8; +- ret = 0; +- while (1) { +- char *end; +- unsigned long val = strtoul(s, &end, 0); +- if (end == s) +- break; +- +- s = end; +- if (ret < size) +- list[ret] = val; +- ret++; +- } ++ buf = malloc(bufsize); ++ if (buf == NULL) { ++ return -ENOMEM; ++ } ++ ++ ret = -EIO; ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ goto out_free; ++ } ++ ++ ret = read(fd, buf, bufsize); ++ close(fd); ++ if (ret < 0) { ++ ret = -EIO; ++ goto out_free; ++ } ++ ++ if ((size_t)ret == bufsize) { ++ free(buf); ++ bufsize *= 4; ++ goto retry; ++ } ++ ++ ret = -EIO; ++ s = strstr(buf, "\nGroups:"); ++ if (s == NULL) { ++ goto out_free; ++ } ++ ++ s += 8; ++ ret = 0; ++ while (1) { ++ char *end; ++ unsigned long val = strtoul(s, &end, 0); ++ if (end == s) { ++ break; ++ } ++ ++ s = end; ++ if (ret < size) { ++ list[ret] = val; ++ } ++ ret++; ++ } + + out_free: +- free(buf); +- return ret; ++ free(buf); ++ return ret; + } + #else /* linux */ + /* +@@ -2446,23 +2586,25 @@ out_free: + */ + int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) + { +- (void) req; (void) size; (void) list; +- return -ENOSYS; ++ (void)req; ++ (void)size; ++ (void)list; ++ return -ENOSYS; + } + #endif + + void fuse_session_exit(struct fuse_session *se) + { +- se->exited = 1; ++ se->exited = 1; + } + + void fuse_session_reset(struct fuse_session *se) + { +- se->exited = 0; +- se->error = 0; ++ se->exited = 0; ++ se->error = 0; + } + + int fuse_session_exited(struct fuse_session *se) + { +- return se->exited; ++ return se->exited; + } +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 6b1adfc..adb9054 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1,15 +1,16 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #ifndef FUSE_LOWLEVEL_H_ + #define FUSE_LOWLEVEL_H_ + +-/** @file ++/** ++ * @file + * + * Low level API + * +@@ -24,16 +25,16 @@ + + #include "fuse_common.h" + +-#include + #include +-#include + #include + #include ++#include + #include ++#include + +-/* ----------------------------------------------------------- * +- * Miscellaneous definitions * +- * ----------------------------------------------------------- */ ++/* ++ * Miscellaneous definitions ++ */ + + /** The node ID of the root inode */ + #define FUSE_ROOT_ID 1 +@@ -53,47 +54,54 @@ struct fuse_session; + + /** Directory entry parameters supplied to fuse_reply_entry() */ + struct fuse_entry_param { +- /** Unique inode number +- * +- * In lookup, zero means negative entry (from version 2.5) +- * Returning ENOENT also means negative entry, but by setting zero +- * ino the kernel may cache negative entries for entry_timeout +- * seconds. +- */ +- fuse_ino_t ino; +- +- /** Generation number for this entry. +- * +- * If the file system will be exported over NFS, the +- * ino/generation pairs need to be unique over the file +- * system's lifetime (rather than just the mount time). So if +- * the file system reuses an inode after it has been deleted, +- * it must assign a new, previously unused generation number +- * to the inode at the same time. +- * +- */ +- uint64_t generation; +- +- /** Inode attributes. +- * +- * Even if attr_timeout == 0, attr must be correct. For example, +- * for open(), FUSE uses attr.st_size from lookup() to determine +- * how many bytes to request. If this value is not correct, +- * incorrect data will be returned. +- */ +- struct stat attr; +- +- /** Validity timeout (in seconds) for inode attributes. If +- attributes only change as a result of requests that come +- through the kernel, this should be set to a very large +- value. */ +- double attr_timeout; +- +- /** Validity timeout (in seconds) for the name. If directory +- entries are changed/deleted only as a result of requests +- that come through the kernel, this should be set to a very +- large value. */ +- double entry_timeout; ++ /** ++ * Unique inode number ++ * ++ * In lookup, zero means negative entry (from version 2.5) ++ * Returning ENOENT also means negative entry, but by setting zero ++ * ino the kernel may cache negative entries for entry_timeout ++ * seconds. ++ */ ++ fuse_ino_t ino; ++ ++ /** ++ * Generation number for this entry. ++ * ++ * If the file system will be exported over NFS, the ++ * ino/generation pairs need to be unique over the file ++ * system's lifetime (rather than just the mount time). So if ++ * the file system reuses an inode after it has been deleted, ++ * it must assign a new, previously unused generation number ++ * to the inode at the same time. ++ * ++ */ ++ uint64_t generation; ++ ++ /** ++ * Inode attributes. ++ * ++ * Even if attr_timeout == 0, attr must be correct. For example, ++ * for open(), FUSE uses attr.st_size from lookup() to determine ++ * how many bytes to request. If this value is not correct, ++ * incorrect data will be returned. ++ */ ++ struct stat attr; ++ ++ /** ++ * Validity timeout (in seconds) for inode attributes. If ++ * attributes only change as a result of requests that come ++ * through the kernel, this should be set to a very large ++ * value. ++ */ ++ double attr_timeout; ++ ++ /** ++ * Validity timeout (in seconds) for the name. If directory ++ * entries are changed/deleted only as a result of requests ++ * that come through the kernel, this should be set to a very ++ * large value. ++ */ ++ double entry_timeout; + }; + + /** +@@ -105,38 +113,38 @@ struct fuse_entry_param { + * there is no valid uid/pid/gid that could be reported. + */ + struct fuse_ctx { +- /** User ID of the calling process */ +- uid_t uid; ++ /** User ID of the calling process */ ++ uid_t uid; + +- /** Group ID of the calling process */ +- gid_t gid; ++ /** Group ID of the calling process */ ++ gid_t gid; + +- /** Thread ID of the calling process */ +- pid_t pid; ++ /** Thread ID of the calling process */ ++ pid_t pid; + +- /** Umask of the calling process */ +- mode_t umask; ++ /** Umask of the calling process */ ++ mode_t umask; + }; + + struct fuse_forget_data { +- fuse_ino_t ino; +- uint64_t nlookup; ++ fuse_ino_t ino; ++ uint64_t nlookup; + }; + + /* 'to_set' flags in setattr */ +-#define FUSE_SET_ATTR_MODE (1 << 0) +-#define FUSE_SET_ATTR_UID (1 << 1) +-#define FUSE_SET_ATTR_GID (1 << 2) +-#define FUSE_SET_ATTR_SIZE (1 << 3) +-#define FUSE_SET_ATTR_ATIME (1 << 4) +-#define FUSE_SET_ATTR_MTIME (1 << 5) +-#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) +-#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) +-#define FUSE_SET_ATTR_CTIME (1 << 10) +- +-/* ----------------------------------------------------------- * +- * Request methods and replies * +- * ----------------------------------------------------------- */ ++#define FUSE_SET_ATTR_MODE (1 << 0) ++#define FUSE_SET_ATTR_UID (1 << 1) ++#define FUSE_SET_ATTR_GID (1 << 2) ++#define FUSE_SET_ATTR_SIZE (1 << 3) ++#define FUSE_SET_ATTR_ATIME (1 << 4) ++#define FUSE_SET_ATTR_MTIME (1 << 5) ++#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) ++#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) ++#define FUSE_SET_ATTR_CTIME (1 << 10) ++ ++/* ++ * Request methods and replies ++ */ + + /** + * Low level filesystem operations +@@ -166,1075 +174,1069 @@ struct fuse_forget_data { + * this file will not be called. + */ + struct fuse_lowlevel_ops { +- /** +- * Initialize filesystem +- * +- * This function is called when libfuse establishes +- * communication with the FUSE kernel module. The file system +- * should use this module to inspect and/or modify the +- * connection parameters provided in the `conn` structure. +- * +- * Note that some parameters may be overwritten by options +- * passed to fuse_session_new() which take precedence over the +- * values set in this handler. +- * +- * There's no reply to this function +- * +- * @param userdata the user data passed to fuse_session_new() +- */ +- void (*init) (void *userdata, struct fuse_conn_info *conn); +- +- /** +- * Clean up filesystem. +- * +- * Called on filesystem exit. When this method is called, the +- * connection to the kernel may be gone already, so that eg. calls +- * to fuse_lowlevel_notify_* will fail. +- * +- * There's no reply to this function +- * +- * @param userdata the user data passed to fuse_session_new() +- */ +- void (*destroy) (void *userdata); +- +- /** +- * Look up a directory entry by name and get its attributes. +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name the name to look up +- */ +- void (*lookup) (fuse_req_t req, fuse_ino_t parent, const char *name); +- +- /** +- * Forget about an inode +- * +- * This function is called when the kernel removes an inode +- * from its internal caches. +- * +- * The inode's lookup count increases by one for every call to +- * fuse_reply_entry and fuse_reply_create. The nlookup parameter +- * indicates by how much the lookup count should be decreased. +- * +- * Inodes with a non-zero lookup count may receive request from +- * the kernel even after calls to unlink, rmdir or (when +- * overwriting an existing file) rename. Filesystems must handle +- * such requests properly and it is recommended to defer removal +- * of the inode until the lookup count reaches zero. Calls to +- * unlink, rmdir or rename will be followed closely by forget +- * unless the file or directory is open, in which case the +- * kernel issues forget only after the release or releasedir +- * calls. +- * +- * Note that if a file system will be exported over NFS the +- * inodes lifetime must extend even beyond forget. See the +- * generation field in struct fuse_entry_param above. +- * +- * On unmount the lookup count for all inodes implicitly drops +- * to zero. It is not guaranteed that the file system will +- * receive corresponding forget messages for the affected +- * inodes. +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- * @param ino the inode number +- * @param nlookup the number of lookups to forget +- */ +- void (*forget) (fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); +- +- /** +- * Get file attributes. +- * +- * If writeback caching is enabled, the kernel may have a +- * better idea of a file's length than the FUSE file system +- * (eg if there has been a write that extended the file size, +- * but that has not yet been passed to the filesystem.n +- * +- * In this case, the st_size value provided by the file system +- * will be ignored. +- * +- * Valid replies: +- * fuse_reply_attr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi for future use, currently always NULL +- */ +- void (*getattr) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Set file attributes +- * +- * In the 'attr' argument only members indicated by the 'to_set' +- * bitmask contain valid values. Other members contain undefined +- * values. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits if the file +- * size or owner is being changed. +- * +- * If the setattr was invoked from the ftruncate() system call +- * under Linux kernel versions 2.6.15 or later, the fi->fh will +- * contain the value set by the open method or will be undefined +- * if the open method didn't set any value. Otherwise (not +- * ftruncate call, or kernel version earlier than 2.6.15) the fi +- * parameter will be NULL. +- * +- * Valid replies: +- * fuse_reply_attr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param attr the attributes +- * @param to_set bit mask of attributes which should be set +- * @param fi file information, or NULL +- */ +- void (*setattr) (fuse_req_t req, fuse_ino_t ino, struct stat *attr, +- int to_set, struct fuse_file_info *fi); +- +- /** +- * Read symbolic link +- * +- * Valid replies: +- * fuse_reply_readlink +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- */ +- void (*readlink) (fuse_req_t req, fuse_ino_t ino); +- +- /** +- * Create file node +- * +- * Create a regular file, character device, block device, fifo or +- * socket node. +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to create +- * @param mode file type and mode with which to create the new file +- * @param rdev the device number (only valid if created file is a device) +- */ +- void (*mknod) (fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode, dev_t rdev); +- +- /** +- * Create a directory +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to create +- * @param mode with which to create the new file +- */ +- void (*mkdir) (fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode); +- +- /** +- * Remove a file +- * +- * If the file's inode's lookup count is non-zero, the file +- * system is expected to postpone any removal of the inode +- * until the lookup count reaches zero (see description of the +- * forget function). +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to remove +- */ +- void (*unlink) (fuse_req_t req, fuse_ino_t parent, const char *name); +- +- /** +- * Remove a directory +- * +- * If the directory's inode's lookup count is non-zero, the +- * file system is expected to postpone any removal of the +- * inode until the lookup count reaches zero (see description +- * of the forget function). +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to remove +- */ +- void (*rmdir) (fuse_req_t req, fuse_ino_t parent, const char *name); +- +- /** +- * Create a symbolic link +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param link the contents of the symbolic link +- * @param parent inode number of the parent directory +- * @param name to create +- */ +- void (*symlink) (fuse_req_t req, const char *link, fuse_ino_t parent, +- const char *name); +- +- /** Rename a file +- * +- * If the target exists it should be atomically replaced. If +- * the target's inode's lookup count is non-zero, the file +- * system is expected to postpone any removal of the inode +- * until the lookup count reaches zero (see description of the +- * forget function). +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EINVAL, i.e. all +- * future bmap requests will fail with EINVAL without being +- * send to the filesystem process. +- * +- * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If +- * RENAME_NOREPLACE is specified, the filesystem must not +- * overwrite *newname* if it exists and return an error +- * instead. If `RENAME_EXCHANGE` is specified, the filesystem +- * must atomically exchange the two files, i.e. both must +- * exist and neither may be deleted. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the old parent directory +- * @param name old name +- * @param newparent inode number of the new parent directory +- * @param newname new name +- */ +- void (*rename) (fuse_req_t req, fuse_ino_t parent, const char *name, +- fuse_ino_t newparent, const char *newname, +- unsigned int flags); +- +- /** +- * Create a hard link +- * +- * Valid replies: +- * fuse_reply_entry +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the old inode number +- * @param newparent inode number of the new parent directory +- * @param newname new name to create +- */ +- void (*link) (fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, +- const char *newname); +- +- /** +- * Open a file +- * +- * Open flags are available in fi->flags. The following rules +- * apply. +- * +- * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be +- * filtered out / handled by the kernel. +- * +- * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used +- * by the filesystem to check if the operation is +- * permitted. If the ``-o default_permissions`` mount +- * option is given, this check is already done by the +- * kernel before calling open() and may thus be omitted by +- * the filesystem. +- * +- * - When writeback caching is enabled, the kernel may send +- * read requests even for files opened with O_WRONLY. The +- * filesystem should be prepared to handle this. +- * +- * - When writeback caching is disabled, the filesystem is +- * expected to properly handle the O_APPEND flag and ensure +- * that each write is appending to the end of the file. +- * +- * - When writeback caching is enabled, the kernel will +- * handle O_APPEND. However, unless all changes to the file +- * come through the kernel this will not work reliably. The +- * filesystem should thus either ignore the O_APPEND flag +- * (and let the kernel handle it), or return an error +- * (indicating that reliably O_APPEND is not available). +- * +- * Filesystem may store an arbitrary file handle (pointer, +- * index, etc) in fi->fh, and use this in other all other file +- * operations (read, write, flush, release, fsync). +- * +- * Filesystem may also implement stateless file I/O and not store +- * anything in fi->fh. +- * +- * There are also some flags (direct_io, keep_cache) which the +- * filesystem may set in fi, to change the way the file is opened. +- * See fuse_file_info structure in for more details. +- * +- * If this request is answered with an error code of ENOSYS +- * and FUSE_CAP_NO_OPEN_SUPPORT is set in +- * `fuse_conn_info.capable`, this is treated as success and +- * future calls to open and release will also succeed without being +- * sent to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_open +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*open) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Read data +- * +- * Read should send exactly the number of bytes requested except +- * on EOF or error, otherwise the rest of the data will be +- * substituted with zeroes. An exception to this is when the file +- * has been opened in 'direct_io' mode, in which case the return +- * value of the read system call will reflect the return value of +- * this operation. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_iov +- * fuse_reply_data +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size number of bytes to read +- * @param off offset to read from +- * @param fi file information +- */ +- void (*read) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Write data +- * +- * Write should return exactly the number of bytes requested +- * except on error. An exception to this is when the file has +- * been opened in 'direct_io' mode, in which case the return value +- * of the write system call will reflect the return value of this +- * operation. +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. +- * +- * Valid replies: +- * fuse_reply_write +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param buf data to write +- * @param size number of bytes to write +- * @param off offset to write to +- * @param fi file information +- */ +- void (*write) (fuse_req_t req, fuse_ino_t ino, const char *buf, +- size_t size, off_t off, struct fuse_file_info *fi); +- +- /** +- * Flush method +- * +- * This is called on each close() of the opened file. +- * +- * Since file descriptors can be duplicated (dup, dup2, fork), for +- * one open call there may be many flush calls. +- * +- * Filesystems shouldn't assume that flush will always be called +- * after some writes, or that if will be called at all. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. +- * +- * NOTE: the name of the method is misleading, since (unlike +- * fsync) the filesystem is not forced to flush pending writes. +- * One reason to flush data is if the filesystem wants to return +- * write errors during close. However, such use is non-portable +- * because POSIX does not require [close] to wait for delayed I/O to +- * complete. +- * +- * If the filesystem supports file locking operations (setlk, +- * getlk) it should remove all locks belonging to 'fi->owner'. +- * +- * If this request is answered with an error code of ENOSYS, +- * this is treated as success and future calls to flush() will +- * succeed automatically without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * +- * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html +- */ +- void (*flush) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Release an open file +- * +- * Release is called when there are no more references to an open +- * file: all file descriptors are closed and all memory mappings +- * are unmapped. +- * +- * For every open call there will be exactly one release call (unless +- * the filesystem is force-unmounted). +- * +- * The filesystem may reply with an error, but error values are +- * not returned to close() or munmap() which triggered the +- * release. +- * +- * fi->fh will contain the value set by the open method, or will +- * be undefined if the open method didn't set any value. +- * fi->flags will contain the same flags as for open. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*release) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Synchronize file contents +- * +- * If the datasync parameter is non-zero, then only the user data +- * should be flushed, not the meta data. +- * +- * If this request is answered with an error code of ENOSYS, +- * this is treated as success and future calls to fsync() will +- * succeed automatically without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param datasync flag indicating if only data should be flushed +- * @param fi file information +- */ +- void (*fsync) (fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi); +- +- /** +- * Open a directory +- * +- * Filesystem may store an arbitrary file handle (pointer, index, +- * etc) in fi->fh, and use this in other all other directory +- * stream operations (readdir, releasedir, fsyncdir). +- * +- * If this request is answered with an error code of ENOSYS and +- * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, +- * this is treated as success and future calls to opendir and +- * releasedir will also succeed without being sent to the filesystem +- * process. In addition, the kernel will cache readdir results +- * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. +- * +- * Valid replies: +- * fuse_reply_open +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*opendir) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Read directory +- * +- * Send a buffer filled using fuse_add_direntry(), with size not +- * exceeding the requested size. Send an empty buffer on end of +- * stream. +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. +- * +- * Returning a directory entry from readdir() does not affect +- * its lookup count. +- * +- * If off_t is non-zero, then it will correspond to one of the off_t +- * values that was previously returned by readdir() for the same +- * directory handle. In this case, readdir() should skip over entries +- * coming before the position defined by the off_t value. If entries +- * are added or removed while the directory handle is open, they filesystem +- * may still include the entries that have been removed, and may not +- * report the entries that have been created. However, addition or +- * removal of entries must never cause readdir() to skip over unrelated +- * entries or to report them more than once. This means +- * that off_t can not be a simple index that enumerates the entries +- * that have been returned but must contain sufficient information to +- * uniquely determine the next directory entry to return even when the +- * set of entries is changing. +- * +- * The function does not have to report the '.' and '..' +- * entries, but is allowed to do so. Note that, if readdir does +- * not return '.' or '..', they will not be implicitly returned, +- * and this behavior is observable by the caller. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size maximum number of bytes to send +- * @param off offset to continue reading the directory stream +- * @param fi file information +- */ +- void (*readdir) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Release an open directory +- * +- * For every opendir call there will be exactly one releasedir +- * call (unless the filesystem is force-unmounted). +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- */ +- void (*releasedir) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi); +- +- /** +- * Synchronize directory contents +- * +- * If the datasync parameter is non-zero, then only the directory +- * contents should be flushed, not the meta data. +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. +- * +- * If this request is answered with an error code of ENOSYS, +- * this is treated as success and future calls to fsyncdir() will +- * succeed automatically without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param datasync flag indicating if only data should be flushed +- * @param fi file information +- */ +- void (*fsyncdir) (fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi); +- +- /** +- * Get file system statistics +- * +- * Valid replies: +- * fuse_reply_statfs +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number, zero means "undefined" +- */ +- void (*statfs) (fuse_req_t req, fuse_ino_t ino); +- +- /** +- * Set an extended attribute +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future setxattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- */ +- void (*setxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, +- const char *value, size_t size, int flags); +- +- /** +- * Get an extended attribute +- * +- * If size is zero, the size of the value should be sent with +- * fuse_reply_xattr. +- * +- * If the size is non-zero, and the value fits in the buffer, the +- * value should be sent with fuse_reply_buf. +- * +- * If the size is too small for the value, the ERANGE error should +- * be sent. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future getxattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_xattr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param name of the extended attribute +- * @param size maximum size of the value to send +- */ +- void (*getxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, +- size_t size); +- +- /** +- * List extended attribute names +- * +- * If size is zero, the total size of the attribute list should be +- * sent with fuse_reply_xattr. +- * +- * If the size is non-zero, and the null character separated +- * attribute list fits in the buffer, the list should be sent with +- * fuse_reply_buf. +- * +- * If the size is too small for the list, the ERANGE error should +- * be sent. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future listxattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_xattr +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size maximum size of the list to send +- */ +- void (*listxattr) (fuse_req_t req, fuse_ino_t ino, size_t size); +- +- /** +- * Remove an extended attribute +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future removexattr() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param name of the extended attribute +- */ +- void (*removexattr) (fuse_req_t req, fuse_ino_t ino, const char *name); +- +- /** +- * Check file access permissions +- * +- * This will be called for the access() and chdir() system +- * calls. If the 'default_permissions' mount option is given, +- * this method is not called. +- * +- * This method is not called under Linux kernel versions 2.4.x +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent success, i.e. this and all future access() +- * requests will succeed without being send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param mask requested access mode +- */ +- void (*access) (fuse_req_t req, fuse_ino_t ino, int mask); +- +- /** +- * Create and open a file +- * +- * If the file does not exist, first create it with the specified +- * mode, and then open it. +- * +- * See the description of the open handler for more +- * information. +- * +- * If this method is not implemented or under Linux kernel +- * versions earlier than 2.6.15, the mknod() and open() methods +- * will be called instead. +- * +- * If this request is answered with an error code of ENOSYS, the handler +- * is treated as not implemented (i.e., for this and future requests the +- * mknod() and open() handlers will be called instead). +- * +- * Valid replies: +- * fuse_reply_create +- * fuse_reply_err +- * +- * @param req request handle +- * @param parent inode number of the parent directory +- * @param name to create +- * @param mode file type and mode with which to create the new file +- * @param fi file information +- */ +- void (*create) (fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode, struct fuse_file_info *fi); +- +- /** +- * Test for a POSIX file lock +- * +- * Valid replies: +- * fuse_reply_lock +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param lock the region/type to test +- */ +- void (*getlk) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi, struct flock *lock); +- +- /** +- * Acquire, modify or release a POSIX file lock +- * +- * For POSIX threads (NPTL) there's a 1-1 relation between pid and +- * owner, but otherwise this is not always the case. For checking +- * lock ownership, 'fi->owner' must be used. The l_pid field in +- * 'struct flock' should only be used to fill in this field in +- * getlk(). +- * +- * Note: if the locking methods are not implemented, the kernel +- * will still allow file locking to work locally. Hence these are +- * only interesting for network filesystems and similar. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param lock the region/type to set +- * @param sleep locking operation may sleep +- */ +- void (*setlk) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi, +- struct flock *lock, int sleep); +- +- /** +- * Map block index within file to block index within device +- * +- * Note: This makes sense only for block device backed filesystems +- * mounted with the 'blkdev' option +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure, i.e. all future bmap() requests will +- * fail with the same error code without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_bmap +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param blocksize unit of block index +- * @param idx block index within file +- */ +- void (*bmap) (fuse_req_t req, fuse_ino_t ino, size_t blocksize, +- uint64_t idx); +- +- /** +- * Ioctl +- * +- * Note: For unrestricted ioctls (not allowed for FUSE +- * servers), data in and out areas can be discovered by giving +- * iovs and setting FUSE_IOCTL_RETRY in *flags*. For +- * restricted ioctls, kernel prepares in/out data area +- * according to the information encoded in cmd. +- * +- * Valid replies: +- * fuse_reply_ioctl_retry +- * fuse_reply_ioctl +- * fuse_reply_ioctl_iov +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param cmd ioctl command +- * @param arg ioctl argument +- * @param fi file information +- * @param flags for FUSE_IOCTL_* flags +- * @param in_buf data fetched from the caller +- * @param in_bufsz number of fetched bytes +- * @param out_bufsz maximum size of output data +- * +- * Note : the unsigned long request submitted by the application +- * is truncated to 32 bits. +- */ +- void (*ioctl) (fuse_req_t req, fuse_ino_t ino, unsigned int cmd, +- void *arg, struct fuse_file_info *fi, unsigned flags, +- const void *in_buf, size_t in_bufsz, size_t out_bufsz); +- +- /** +- * Poll for IO readiness +- * +- * Note: If ph is non-NULL, the client should notify +- * when IO readiness events occur by calling +- * fuse_lowlevel_notify_poll() with the specified ph. +- * +- * Regardless of the number of times poll with a non-NULL ph +- * is received, single notification is enough to clear all. +- * Notifying more times incurs overhead but doesn't harm +- * correctness. +- * +- * The callee is responsible for destroying ph with +- * fuse_pollhandle_destroy() when no longer in use. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as success (with a kernel-defined default poll-mask) and +- * future calls to pull() will succeed the same way without being send +- * to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_poll +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param ph poll handle to be used for notification +- */ +- void (*poll) (fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, +- struct fuse_pollhandle *ph); +- +- /** +- * Write data made available in a buffer +- * +- * This is a more generic version of the ->write() method. If +- * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the +- * kernel supports splicing from the fuse device, then the +- * data will be made available in pipe for supporting zero +- * copy data transfer. +- * +- * buf->count is guaranteed to be one (and thus buf->idx is +- * always zero). The write_buf handler must ensure that +- * bufv->off is correctly updated (reflecting the number of +- * bytes read from bufv->buf[0]). +- * +- * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is +- * expected to reset the setuid and setgid bits. +- * +- * Valid replies: +- * fuse_reply_write +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param bufv buffer containing the data +- * @param off offset to write to +- * @param fi file information +- */ +- void (*write_buf) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_bufvec *bufv, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Callback function for the retrieve request +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() +- * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() +- * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() +- * @param bufv the buffer containing the returned data +- */ +- void (*retrieve_reply) (fuse_req_t req, void *cookie, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv); +- +- /** +- * Forget about multiple inodes +- * +- * See description of the forget function for more +- * information. +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- */ +- void (*forget_multi) (fuse_req_t req, size_t count, +- struct fuse_forget_data *forgets); +- +- /** +- * Acquire, modify or release a BSD file lock +- * +- * Note: if the locking methods are not implemented, the kernel +- * will still allow file locking to work locally. Hence these are +- * only interesting for network filesystems and similar. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param fi file information +- * @param op the locking operation, see flock(2) +- */ +- void (*flock) (fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi, int op); +- +- /** +- * Allocate requested space. If this function returns success then +- * subsequent writes to the specified range shall not fail due to the lack +- * of free space on the file system storage media. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future fallocate() requests will fail with EOPNOTSUPP without being +- * send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param offset starting point for allocated region +- * @param length size of allocated region +- * @param mode determines the operation to be performed on the given range, +- * see fallocate(2) +- */ +- void (*fallocate) (fuse_req_t req, fuse_ino_t ino, int mode, +- off_t offset, off_t length, struct fuse_file_info *fi); +- +- /** +- * Read directory with attributes +- * +- * Send a buffer filled using fuse_add_direntry_plus(), with size not +- * exceeding the requested size. Send an empty buffer on end of +- * stream. +- * +- * fi->fh will contain the value set by the opendir method, or +- * will be undefined if the opendir method didn't set any value. +- * +- * In contrast to readdir() (which does not affect the lookup counts), +- * the lookup count of every entry returned by readdirplus(), except "." +- * and "..", is incremented by one. +- * +- * Valid replies: +- * fuse_reply_buf +- * fuse_reply_data +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param size maximum number of bytes to send +- * @param off offset to continue reading the directory stream +- * @param fi file information +- */ +- void (*readdirplus) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, +- struct fuse_file_info *fi); +- +- /** +- * Copy a range of data from one file to another +- * +- * Performs an optimized copy between two file descriptors without the +- * additional cost of transferring data through the FUSE kernel module +- * to user space (glibc) and then back into the FUSE filesystem again. +- * +- * In case this method is not implemented, glibc falls back to reading +- * data from the source and writing to the destination. Effectively +- * doing an inefficient copy of the data. +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure with error code EOPNOTSUPP, i.e. all +- * future copy_file_range() requests will fail with EOPNOTSUPP without +- * being send to the filesystem process. +- * +- * Valid replies: +- * fuse_reply_write +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino_in the inode number or the source file +- * @param off_in starting point from were the data should be read +- * @param fi_in file information of the source file +- * @param ino_out the inode number or the destination file +- * @param off_out starting point where the data should be written +- * @param fi_out file information of the destination file +- * @param len maximum size of the data to copy +- * @param flags passed along with the copy_file_range() syscall +- */ +- void (*copy_file_range) (fuse_req_t req, fuse_ino_t ino_in, +- off_t off_in, struct fuse_file_info *fi_in, +- fuse_ino_t ino_out, off_t off_out, +- struct fuse_file_info *fi_out, size_t len, +- int flags); +- +- /** +- * Find next data or hole after the specified offset +- * +- * If this request is answered with an error code of ENOSYS, this is +- * treated as a permanent failure, i.e. all future lseek() requests will +- * fail with the same error code without being send to the filesystem +- * process. +- * +- * Valid replies: +- * fuse_reply_lseek +- * fuse_reply_err +- * +- * @param req request handle +- * @param ino the inode number +- * @param off offset to start search from +- * @param whence either SEEK_DATA or SEEK_HOLE +- * @param fi file information +- */ +- void (*lseek) (fuse_req_t req, fuse_ino_t ino, off_t off, int whence, +- struct fuse_file_info *fi); ++ /** ++ * Initialize filesystem ++ * ++ * This function is called when libfuse establishes ++ * communication with the FUSE kernel module. The file system ++ * should use this module to inspect and/or modify the ++ * connection parameters provided in the `conn` structure. ++ * ++ * Note that some parameters may be overwritten by options ++ * passed to fuse_session_new() which take precedence over the ++ * values set in this handler. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*init)(void *userdata, struct fuse_conn_info *conn); ++ ++ /** ++ * Clean up filesystem. ++ * ++ * Called on filesystem exit. When this method is called, the ++ * connection to the kernel may be gone already, so that eg. calls ++ * to fuse_lowlevel_notify_* will fail. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*destroy)(void *userdata); ++ ++ /** ++ * Look up a directory entry by name and get its attributes. ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name the name to look up ++ */ ++ void (*lookup)(fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Forget about an inode ++ * ++ * This function is called when the kernel removes an inode ++ * from its internal caches. ++ * ++ * The inode's lookup count increases by one for every call to ++ * fuse_reply_entry and fuse_reply_create. The nlookup parameter ++ * indicates by how much the lookup count should be decreased. ++ * ++ * Inodes with a non-zero lookup count may receive request from ++ * the kernel even after calls to unlink, rmdir or (when ++ * overwriting an existing file) rename. Filesystems must handle ++ * such requests properly and it is recommended to defer removal ++ * of the inode until the lookup count reaches zero. Calls to ++ * unlink, rmdir or rename will be followed closely by forget ++ * unless the file or directory is open, in which case the ++ * kernel issues forget only after the release or releasedir ++ * calls. ++ * ++ * Note that if a file system will be exported over NFS the ++ * inodes lifetime must extend even beyond forget. See the ++ * generation field in struct fuse_entry_param above. ++ * ++ * On unmount the lookup count for all inodes implicitly drops ++ * to zero. It is not guaranteed that the file system will ++ * receive corresponding forget messages for the affected ++ * inodes. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param nlookup the number of lookups to forget ++ */ ++ void (*forget)(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); ++ ++ /** ++ * Get file attributes. ++ * ++ * If writeback caching is enabled, the kernel may have a ++ * better idea of a file's length than the FUSE file system ++ * (eg if there has been a write that extended the file size, ++ * but that has not yet been passed to the filesystem.n ++ * ++ * In this case, the st_size value provided by the file system ++ * will be ignored. ++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi for future use, currently always NULL ++ */ ++ void (*getattr)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Set file attributes ++ * ++ * In the 'attr' argument only members indicated by the 'to_set' ++ * bitmask contain valid values. Other members contain undefined ++ * values. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits if the file ++ * size or owner is being changed. ++ * ++ * If the setattr was invoked from the ftruncate() system call ++ * under Linux kernel versions 2.6.15 or later, the fi->fh will ++ * contain the value set by the open method or will be undefined ++ * if the open method didn't set any value. Otherwise (not ++ * ftruncate call, or kernel version earlier than 2.6.15) the fi ++ * parameter will be NULL. ++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param attr the attributes ++ * @param to_set bit mask of attributes which should be set ++ * @param fi file information, or NULL ++ */ ++ void (*setattr)(fuse_req_t req, fuse_ino_t ino, struct stat *attr, ++ int to_set, struct fuse_file_info *fi); ++ ++ /** ++ * Read symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_readlink ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ */ ++ void (*readlink)(fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Create file node ++ * ++ * Create a regular file, character device, block device, fifo or ++ * socket node. ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param rdev the device number (only valid if created file is a device) ++ */ ++ void (*mknod)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, dev_t rdev); ++ ++ /** ++ * Create a directory ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode with which to create the new file ++ */ ++ void (*mkdir)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode); ++ ++ /** ++ * Remove a file ++ * ++ * If the file's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*unlink)(fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Remove a directory ++ * ++ * If the directory's inode's lookup count is non-zero, the ++ * file system is expected to postpone any removal of the ++ * inode until the lookup count reaches zero (see description ++ * of the forget function). ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*rmdir)(fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Create a symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param link the contents of the symbolic link ++ * @param parent inode number of the parent directory ++ * @param name to create ++ */ ++ void (*symlink)(fuse_req_t req, const char *link, fuse_ino_t parent, ++ const char *name); ++ ++ /** ++ * Rename a file ++ * ++ * If the target exists it should be atomically replaced. If ++ * the target's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EINVAL, i.e. all ++ * future bmap requests will fail with EINVAL without being ++ * send to the filesystem process. ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e. both must ++ * exist and neither may be deleted. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the old parent directory ++ * @param name old name ++ * @param newparent inode number of the new parent directory ++ * @param newname new name ++ */ ++ void (*rename)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags); ++ ++ /** ++ * Create a hard link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the old inode number ++ * @param newparent inode number of the new parent directory ++ * @param newname new name to create ++ */ ++ void (*link)(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, ++ const char *newname); ++ ++ /** ++ * Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used ++ * by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount ++ * option is given, this check is already done by the ++ * kernel before calling open() and may thus be omitted by ++ * the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in other all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. ++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in for more details. ++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open and release will also succeed without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*open)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Read data ++ * ++ * Read should send exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the file ++ * has been opened in 'direct_io' mode, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_iov ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size number of bytes to read ++ * @param off offset to read from ++ * @param fi file information ++ */ ++ void (*read)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Write data ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the file has ++ * been opened in 'direct_io' mode, in which case the return value ++ * of the write system call will reflect the return value of this ++ * operation. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param buf data to write ++ * @param size number of bytes to write ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write)(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, ++ off_t off, struct fuse_file_info *fi); ++ ++ /** ++ * Flush method ++ * ++ * This is called on each close() of the opened file. ++ * ++ * Since file descriptors can be duplicated (dup, dup2, fork), for ++ * one open call there may be many flush calls. ++ * ++ * Filesystems shouldn't assume that flush will always be called ++ * after some writes, or that if will be called at all. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * NOTE: the name of the method is misleading, since (unlike ++ * fsync) the filesystem is not forced to flush pending writes. ++ * One reason to flush data is if the filesystem wants to return ++ * write errors during close. However, such use is non-portable ++ * because POSIX does not require [close] to wait for delayed I/O to ++ * complete. ++ * ++ * If the filesystem supports file locking operations (setlk, ++ * getlk) it should remove all locks belonging to 'fi->owner'. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to flush() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * ++ * [close]: ++ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ void (*flush)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open call there will be exactly one release call (unless ++ * the filesystem is force-unmounted). ++ * ++ * The filesystem may reply with an error, but error values are ++ * not returned to close() or munmap() which triggered the ++ * release. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * fi->flags will contain the same flags as for open. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*release)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsync() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsync)(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Open a directory ++ * ++ * Filesystem may store an arbitrary file handle (pointer, index, ++ * etc) in fi->fh, and use this in other all other directory ++ * stream operations (readdir, releasedir, fsyncdir). ++ * ++ * If this request is answered with an error code of ENOSYS and ++ * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, ++ * this is treated as success and future calls to opendir and ++ * releasedir will also succeed without being sent to the filesystem ++ * process. In addition, the kernel will cache readdir results ++ * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*opendir)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); ++ ++ /** ++ * Read directory ++ * ++ * Send a buffer filled using fuse_add_direntry(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * Returning a directory entry from readdir() does not affect ++ * its lookup count. ++ * ++ * If off_t is non-zero, then it will correspond to one of the off_t ++ * values that was previously returned by readdir() for the same ++ * directory handle. In this case, readdir() should skip over entries ++ * coming before the position defined by the off_t value. If entries ++ * are added or removed while the directory handle is open, they filesystem ++ * may still include the entries that have been removed, and may not ++ * report the entries that have been created. However, addition or ++ * removal of entries must never cause readdir() to skip over unrelated ++ * entries or to report them more than once. This means ++ * that off_t can not be a simple index that enumerates the entries ++ * that have been returned but must contain sufficient information to ++ * uniquely determine the next directory entry to return even when the ++ * set of entries is changing. ++ * ++ * The function does not have to report the '.' and '..' ++ * entries, but is allowed to do so. Note that, if readdir does ++ * not return '.' or '..', they will not be implicitly returned, ++ * and this behavior is observable by the caller. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdir)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Release an open directory ++ * ++ * For every opendir call there will be exactly one releasedir ++ * call (unless the filesystem is force-unmounted). ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*releasedir)(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the directory ++ * contents should be flushed, not the meta data. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsyncdir() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsyncdir)(fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Get file system statistics ++ * ++ * Valid replies: ++ * fuse_reply_statfs ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number, zero means "undefined" ++ */ ++ void (*statfs)(fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Set an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future setxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ */ ++ void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, ++ const char *value, size_t size, int flags); ++ ++ /** ++ * Get an extended attribute ++ * ++ * If size is zero, the size of the value should be sent with ++ * fuse_reply_xattr. ++ * ++ * If the size is non-zero, and the value fits in the buffer, the ++ * value should be sent with fuse_reply_buf. ++ * ++ * If the size is too small for the value, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future getxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ * @param size maximum size of the value to send ++ */ ++ void (*getxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, ++ size_t size); ++ ++ /** ++ * List extended attribute names ++ * ++ * If size is zero, the total size of the attribute list should be ++ * sent with fuse_reply_xattr. ++ * ++ * If the size is non-zero, and the null character separated ++ * attribute list fits in the buffer, the list should be sent with ++ * fuse_reply_buf. ++ * ++ * If the size is too small for the list, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future listxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum size of the list to send ++ */ ++ void (*listxattr)(fuse_req_t req, fuse_ino_t ino, size_t size); ++ ++ /** ++ * Remove an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future removexattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ */ ++ void (*removexattr)(fuse_req_t req, fuse_ino_t ino, const char *name); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() and chdir() system ++ * calls. If the 'default_permissions' mount option is given, ++ * this method is not called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent success, i.e. this and all future access() ++ * requests will succeed without being send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param mask requested access mode ++ */ ++ void (*access)(fuse_req_t req, fuse_ino_t ino, int mask); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * See the description of the open handler for more ++ * information. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ * ++ * If this request is answered with an error code of ENOSYS, the handler ++ * is treated as not implemented (i.e., for this and future requests the ++ * mknod() and open() handlers will be called instead). ++ * ++ * Valid replies: ++ * fuse_reply_create ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param fi file information ++ */ ++ void (*create)(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, struct fuse_file_info *fi); ++ ++ /** ++ * Test for a POSIX file lock ++ * ++ * Valid replies: ++ * fuse_reply_lock ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to test ++ */ ++ void (*getlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock); ++ ++ /** ++ * Acquire, modify or release a POSIX file lock ++ * ++ * For POSIX threads (NPTL) there's a 1-1 relation between pid and ++ * owner, but otherwise this is not always the case. For checking ++ * lock ownership, 'fi->owner' must be used. The l_pid field in ++ * 'struct flock' should only be used to fill in this field in ++ * getlk(). ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to set ++ * @param sleep locking operation may sleep ++ */ ++ void (*setlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock, int sleep); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. all future bmap() requests will ++ * fail with the same error code without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_bmap ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param blocksize unit of block index ++ * @param idx block index within file ++ */ ++ void (*bmap)(fuse_req_t req, fuse_ino_t ino, size_t blocksize, ++ uint64_t idx); ++ ++ /** ++ * Ioctl ++ * ++ * Note: For unrestricted ioctls (not allowed for FUSE ++ * servers), data in and out areas can be discovered by giving ++ * iovs and setting FUSE_IOCTL_RETRY in *flags*. For ++ * restricted ioctls, kernel prepares in/out data area ++ * according to the information encoded in cmd. ++ * ++ * Valid replies: ++ * fuse_reply_ioctl_retry ++ * fuse_reply_ioctl ++ * fuse_reply_ioctl_iov ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param cmd ioctl command ++ * @param arg ioctl argument ++ * @param fi file information ++ * @param flags for FUSE_IOCTL_* flags ++ * @param in_buf data fetched from the caller ++ * @param in_bufsz number of fetched bytes ++ * @param out_bufsz maximum size of output data ++ * ++ * Note : the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ void (*ioctl)(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg, ++ struct fuse_file_info *fi, unsigned flags, const void *in_buf, ++ size_t in_bufsz, size_t out_bufsz); ++ ++ /** ++ * Poll for IO readiness ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_lowlevel_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness. ++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as success (with a kernel-defined default poll-mask) and ++ * future calls to pull() will succeed the same way without being send ++ * to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_poll ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param ph poll handle to be used for notification ++ */ ++ void (*poll)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct fuse_pollhandle *ph); ++ ++ /** ++ * Write data made available in a buffer ++ * ++ * This is a more generic version of the ->write() method. If ++ * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the ++ * kernel supports splicing from the fuse device, then the ++ * data will be made available in pipe for supporting zero ++ * copy data transfer. ++ * ++ * buf->count is guaranteed to be one (and thus buf->idx is ++ * always zero). The write_buf handler must ensure that ++ * bufv->off is correctly updated (reflecting the number of ++ * bytes read from bufv->buf[0]). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param bufv buffer containing the data ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write_buf)(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, ++ off_t off, struct fuse_file_info *fi); ++ ++ /** ++ * Callback function for the retrieve request ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() ++ * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() ++ * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() ++ * @param bufv the buffer containing the returned data ++ */ ++ void (*retrieve_reply)(fuse_req_t req, void *cookie, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv); ++ ++ /** ++ * Forget about multiple inodes ++ * ++ * See description of the forget function for more ++ * information. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ */ ++ void (*forget_multi)(fuse_req_t req, size_t count, ++ struct fuse_forget_data *forgets); ++ ++ /** ++ * Acquire, modify or release a BSD file lock ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param op the locking operation, see flock(2) ++ */ ++ void (*flock)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ int op); ++ ++ /** ++ * Allocate requested space. If this function returns success then ++ * subsequent writes to the specified range shall not fail due to the lack ++ * of free space on the file system storage media. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future fallocate() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param offset starting point for allocated region ++ * @param length size of allocated region ++ * @param mode determines the operation to be performed on the given range, ++ * see fallocate(2) ++ */ ++ void (*fallocate)(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, ++ off_t length, struct fuse_file_info *fi); ++ ++ /** ++ * Read directory with attributes ++ * ++ * Send a buffer filled using fuse_add_direntry_plus(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * In contrast to readdir() (which does not affect the lookup counts), ++ * the lookup count of every entry returned by readdirplus(), except "." ++ * and "..", is incremented by one. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdirplus)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again. ++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future copy_file_range() requests will fail with EOPNOTSUPP without ++ * being send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino_in the inode number or the source file ++ * @param off_in starting point from were the data should be read ++ * @param fi_in file information of the source file ++ * @param ino_out the inode number or the destination file ++ * @param off_out starting point where the data should be written ++ * @param fi_out file information of the destination file ++ * @param len maximum size of the data to copy ++ * @param flags passed along with the copy_file_range() syscall ++ */ ++ void (*copy_file_range)(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, ++ struct fuse_file_info *fi_in, fuse_ino_t ino_out, ++ off_t off_out, struct fuse_file_info *fi_out, ++ size_t len, int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. all future lseek() requests will ++ * fail with the same error code without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_lseek ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param off offset to start search from ++ * @param whence either SEEK_DATA or SEEK_HOLE ++ * @param fi file information ++ */ ++ void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, ++ struct fuse_file_info *fi); + }; + + /** +@@ -1305,7 +1307,7 @@ int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e); + * @return zero for success, -errno for failure to send reply + */ + int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, +- const struct fuse_file_info *fi); ++ const struct fuse_file_info *fi); + + /** + * Reply with attributes +@@ -1315,11 +1317,11 @@ int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + * + * @param req request handle + * @param attr the attributes +- * @param attr_timeout validity timeout (in seconds) for the attributes ++ * @param attr_timeout validity timeout (in seconds) for the attributes + * @return zero for success, -errno for failure to send reply + */ + int fuse_reply_attr(fuse_req_t req, const struct stat *attr, +- double attr_timeout); ++ double attr_timeout); + + /** + * Reply with the contents of a symbolic link +@@ -1417,7 +1419,7 @@ int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + * @return zero for success, -errno for failure to send reply + */ + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++ enum fuse_buf_copy_flags flags); + + /** + * Reply with data vector +@@ -1480,9 +1482,9 @@ int fuse_reply_lock(fuse_req_t req, const struct flock *lock); + */ + int fuse_reply_bmap(fuse_req_t req, uint64_t idx); + +-/* ----------------------------------------------------------- * +- * Filling a buffer in readdir * +- * ----------------------------------------------------------- */ ++/* ++ * Filling a buffer in readdir ++ */ + + /** + * Add a directory entry to the buffer +@@ -1512,8 +1514,7 @@ int fuse_reply_bmap(fuse_req_t req, uint64_t idx); + * @return the space needed for the entry + */ + size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, const struct stat *stbuf, +- off_t off); ++ const char *name, const struct stat *stbuf, off_t off); + + /** + * Add a directory entry to the buffer with the attributes +@@ -1529,8 +1530,8 @@ size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + * @return the space needed for the entry + */ + size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, +- const char *name, +- const struct fuse_entry_param *e, off_t off); ++ const char *name, ++ const struct fuse_entry_param *e, off_t off); + + /** + * Reply to ask for data fetch and output buffer preparation. ioctl +@@ -1547,9 +1548,9 @@ size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + * @param out_count number of entries in out_iov + * @return zero for success, -errno for failure to send reply + */ +-int fuse_reply_ioctl_retry(fuse_req_t req, +- const struct iovec *in_iov, size_t in_count, +- const struct iovec *out_iov, size_t out_count); ++int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, ++ size_t in_count, const struct iovec *out_iov, ++ size_t out_count); + + /** + * Reply to finish ioctl +@@ -1576,7 +1577,7 @@ int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size); + * @param count the size of vector + */ + int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, +- int count); ++ int count); + + /** + * Reply with poll result event mask +@@ -1598,9 +1599,9 @@ int fuse_reply_poll(fuse_req_t req, unsigned revents); + */ + int fuse_reply_lseek(fuse_req_t req, off_t off); + +-/* ----------------------------------------------------------- * +- * Notification * +- * ----------------------------------------------------------- */ ++/* ++ * Notification ++ */ + + /** + * Notify IO readiness event +@@ -1635,7 +1636,7 @@ int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph); + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, +- off_t off, off_t len); ++ off_t off, off_t len); + + /** + * Notify to invalidate parent attributes and the dentry matching +@@ -1663,7 +1664,7 @@ int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, +- const char *name, size_t namelen); ++ const char *name, size_t namelen); + + /** + * This function behaves like fuse_lowlevel_notify_inval_entry() with +@@ -1693,9 +1694,9 @@ int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + * @param namelen strlen() of file name + * @return zero for success, -errno for failure + */ +-int fuse_lowlevel_notify_delete(struct fuse_session *se, +- fuse_ino_t parent, fuse_ino_t child, +- const char *name, size_t namelen); ++int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, ++ fuse_ino_t child, const char *name, ++ size_t namelen); + + /** + * Store data to the kernel buffers +@@ -1723,8 +1724,8 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags); + /** + * Retrieve data from the kernel buffers + * +@@ -1755,12 +1756,12 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie); ++ size_t size, off_t offset, void *cookie); + + +-/* ----------------------------------------------------------- * +- * Utility functions * +- * ----------------------------------------------------------- */ ++/* ++ * Utility functions ++ */ + + /** + * Get the userdata from the request +@@ -1822,7 +1823,7 @@ typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data); + * @param data user data passed to the callback function + */ + void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, +- void *data); ++ void *data); + + /** + * Check if a request has already been interrupted +@@ -1833,9 +1834,9 @@ void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + int fuse_req_interrupted(fuse_req_t req); + + +-/* ----------------------------------------------------------- * +- * Inquiry functions * +- * ----------------------------------------------------------- */ ++/* ++ * Inquiry functions ++ */ + + /** + * Print low-level version information to stdout. +@@ -1854,18 +1855,18 @@ void fuse_lowlevel_help(void); + */ + void fuse_cmdline_help(void); + +-/* ----------------------------------------------------------- * +- * Filesystem setup & teardown * +- * ----------------------------------------------------------- */ ++/* ++ * Filesystem setup & teardown ++ */ + + struct fuse_cmdline_opts { +- int foreground; +- int debug; +- int nodefault_subtype; +- char *mountpoint; +- int show_version; +- int show_help; +- unsigned int max_idle_threads; ++ int foreground; ++ int debug; ++ int nodefault_subtype; ++ char *mountpoint; ++ int show_version; ++ int show_help; ++ unsigned int max_idle_threads; + }; + + /** +@@ -1886,8 +1887,7 @@ struct fuse_cmdline_opts { + * @param opts output argument for parsed options + * @return 0 on success, -1 on failure + */ +-int fuse_parse_cmdline(struct fuse_args *args, +- struct fuse_cmdline_opts *opts); ++int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts); + + /** + * Create a low level session. +@@ -1918,8 +1918,8 @@ int fuse_parse_cmdline(struct fuse_args *args, + * @return the fuse session on success, NULL on failure + **/ + struct fuse_session *fuse_session_new(struct fuse_args *args, +- const struct fuse_lowlevel_ops *op, +- size_t op_size, void *userdata); ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata); + + /** + * Mount a FUSE file system. +@@ -2014,9 +2014,9 @@ void fuse_session_unmount(struct fuse_session *se); + */ + void fuse_session_destroy(struct fuse_session *se); + +-/* ----------------------------------------------------------- * +- * Custom event loop support * +- * ----------------------------------------------------------- */ ++/* ++ * Custom event loop support ++ */ + + /** + * Return file descriptor for communication with kernel. +@@ -2043,7 +2043,7 @@ int fuse_session_fd(struct fuse_session *se); + * @param buf the fuse_buf containing the request + */ + void fuse_session_process_buf(struct fuse_session *se, +- const struct fuse_buf *buf); ++ const struct fuse_buf *buf); + + /** + * Read a raw request from the kernel into the supplied buffer. +diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h +index 2f6663e..f252baa 100644 +--- a/tools/virtiofsd/fuse_misc.h ++++ b/tools/virtiofsd/fuse_misc.h +@@ -1,18 +1,18 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #include + + /* +- Versioned symbols cannot be used in some cases because it +- - confuse the dynamic linker in uClibc +- - not supported on MacOSX (in MachO binary format) +-*/ ++ * Versioned symbols cannot be used in some cases because it ++ * - confuse the dynamic linker in uClibc ++ * - not supported on MacOSX (in MachO binary format) ++ */ + #if (!defined(__UCLIBC__) && !defined(__APPLE__)) + #define FUSE_SYMVER(x) __asm__(x) + #else +@@ -25,11 +25,11 @@ + /* Is this hack still needed? */ + static inline void fuse_mutex_init(pthread_mutex_t *mut) + { +- pthread_mutexattr_t attr; +- pthread_mutexattr_init(&attr); +- pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); +- pthread_mutex_init(mut, &attr); +- pthread_mutexattr_destroy(&attr); ++ pthread_mutexattr_t attr; ++ pthread_mutexattr_init(&attr); ++ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); ++ pthread_mutex_init(mut, &attr); ++ pthread_mutexattr_destroy(&attr); + } + #endif + +diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c +index 93066b9..edd36f4 100644 +--- a/tools/virtiofsd/fuse_opt.c ++++ b/tools/virtiofsd/fuse_opt.c +@@ -1,423 +1,450 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- Implementation of option parsing routines (dealing with `struct +- fuse_args`). +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Implementation of option parsing routines (dealing with `struct ++ * fuse_args`). ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + ++#include "fuse_opt.h" + #include "config.h" + #include "fuse_i.h" +-#include "fuse_opt.h" + #include "fuse_misc.h" + ++#include + #include + #include + #include +-#include + + struct fuse_opt_context { +- void *data; +- const struct fuse_opt *opt; +- fuse_opt_proc_t proc; +- int argctr; +- int argc; +- char **argv; +- struct fuse_args outargs; +- char *opts; +- int nonopt; ++ void *data; ++ const struct fuse_opt *opt; ++ fuse_opt_proc_t proc; ++ int argctr; ++ int argc; ++ char **argv; ++ struct fuse_args outargs; ++ char *opts; ++ int nonopt; + }; + + void fuse_opt_free_args(struct fuse_args *args) + { +- if (args) { +- if (args->argv && args->allocated) { +- int i; +- for (i = 0; i < args->argc; i++) +- free(args->argv[i]); +- free(args->argv); +- } +- args->argc = 0; +- args->argv = NULL; +- args->allocated = 0; +- } ++ if (args) { ++ if (args->argv && args->allocated) { ++ int i; ++ for (i = 0; i < args->argc; i++) { ++ free(args->argv[i]); ++ } ++ free(args->argv); ++ } ++ args->argc = 0; ++ args->argv = NULL; ++ args->allocated = 0; ++ } + } + + static int alloc_failed(void) + { +- fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); +- return -1; ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; + } + + int fuse_opt_add_arg(struct fuse_args *args, const char *arg) + { +- char **newargv; +- char *newarg; +- +- assert(!args->argv || args->allocated); +- +- newarg = strdup(arg); +- if (!newarg) +- return alloc_failed(); +- +- newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *)); +- if (!newargv) { +- free(newarg); +- return alloc_failed(); +- } +- +- args->argv = newargv; +- args->allocated = 1; +- args->argv[args->argc++] = newarg; +- args->argv[args->argc] = NULL; +- return 0; ++ char **newargv; ++ char *newarg; ++ ++ assert(!args->argv || args->allocated); ++ ++ newarg = strdup(arg); ++ if (!newarg) { ++ return alloc_failed(); ++ } ++ ++ newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *)); ++ if (!newargv) { ++ free(newarg); ++ return alloc_failed(); ++ } ++ ++ args->argv = newargv; ++ args->allocated = 1; ++ args->argv[args->argc++] = newarg; ++ args->argv[args->argc] = NULL; ++ return 0; + } + + static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos, +- const char *arg) ++ const char *arg) + { +- assert(pos <= args->argc); +- if (fuse_opt_add_arg(args, arg) == -1) +- return -1; +- +- if (pos != args->argc - 1) { +- char *newarg = args->argv[args->argc - 1]; +- memmove(&args->argv[pos + 1], &args->argv[pos], +- sizeof(char *) * (args->argc - pos - 1)); +- args->argv[pos] = newarg; +- } +- return 0; ++ assert(pos <= args->argc); ++ if (fuse_opt_add_arg(args, arg) == -1) { ++ return -1; ++ } ++ ++ if (pos != args->argc - 1) { ++ char *newarg = args->argv[args->argc - 1]; ++ memmove(&args->argv[pos + 1], &args->argv[pos], ++ sizeof(char *) * (args->argc - pos - 1)); ++ args->argv[pos] = newarg; ++ } ++ return 0; + } + + int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg) + { +- return fuse_opt_insert_arg_common(args, pos, arg); ++ return fuse_opt_insert_arg_common(args, pos, arg); + } + + static int next_arg(struct fuse_opt_context *ctx, const char *opt) + { +- if (ctx->argctr + 1 >= ctx->argc) { +- fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt); +- return -1; +- } +- ctx->argctr++; +- return 0; ++ if (ctx->argctr + 1 >= ctx->argc) { ++ fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt); ++ return -1; ++ } ++ ctx->argctr++; ++ return 0; + } + + static int add_arg(struct fuse_opt_context *ctx, const char *arg) + { +- return fuse_opt_add_arg(&ctx->outargs, arg); ++ return fuse_opt_add_arg(&ctx->outargs, arg); + } + + static int add_opt_common(char **opts, const char *opt, int esc) + { +- unsigned oldlen = *opts ? strlen(*opts) : 0; +- char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); +- +- if (!d) +- return alloc_failed(); +- +- *opts = d; +- if (oldlen) { +- d += oldlen; +- *d++ = ','; +- } +- +- for (; *opt; opt++) { +- if (esc && (*opt == ',' || *opt == '\\')) +- *d++ = '\\'; +- *d++ = *opt; +- } +- *d = '\0'; +- +- return 0; ++ unsigned oldlen = *opts ? strlen(*opts) : 0; ++ char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); ++ ++ if (!d) { ++ return alloc_failed(); ++ } ++ ++ *opts = d; ++ if (oldlen) { ++ d += oldlen; ++ *d++ = ','; ++ } ++ ++ for (; *opt; opt++) { ++ if (esc && (*opt == ',' || *opt == '\\')) { ++ *d++ = '\\'; ++ } ++ *d++ = *opt; ++ } ++ *d = '\0'; ++ ++ return 0; + } + + int fuse_opt_add_opt(char **opts, const char *opt) + { +- return add_opt_common(opts, opt, 0); ++ return add_opt_common(opts, opt, 0); + } + + int fuse_opt_add_opt_escaped(char **opts, const char *opt) + { +- return add_opt_common(opts, opt, 1); ++ return add_opt_common(opts, opt, 1); + } + + static int add_opt(struct fuse_opt_context *ctx, const char *opt) + { +- return add_opt_common(&ctx->opts, opt, 1); ++ return add_opt_common(&ctx->opts, opt, 1); + } + + static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key, +- int iso) ++ int iso) + { +- if (key == FUSE_OPT_KEY_DISCARD) +- return 0; +- +- if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { +- int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); +- if (res == -1 || !res) +- return res; +- } +- if (iso) +- return add_opt(ctx, arg); +- else +- return add_arg(ctx, arg); ++ if (key == FUSE_OPT_KEY_DISCARD) { ++ return 0; ++ } ++ ++ if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { ++ int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); ++ if (res == -1 || !res) { ++ return res; ++ } ++ } ++ if (iso) { ++ return add_opt(ctx, arg); ++ } else { ++ return add_arg(ctx, arg); ++ } + } + + static int match_template(const char *t, const char *arg, unsigned *sepp) + { +- int arglen = strlen(arg); +- const char *sep = strchr(t, '='); +- sep = sep ? sep : strchr(t, ' '); +- if (sep && (!sep[1] || sep[1] == '%')) { +- int tlen = sep - t; +- if (sep[0] == '=') +- tlen ++; +- if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { +- *sepp = sep - t; +- return 1; +- } +- } +- if (strcmp(t, arg) == 0) { +- *sepp = 0; +- return 1; +- } +- return 0; ++ int arglen = strlen(arg); ++ const char *sep = strchr(t, '='); ++ sep = sep ? sep : strchr(t, ' '); ++ if (sep && (!sep[1] || sep[1] == '%')) { ++ int tlen = sep - t; ++ if (sep[0] == '=') { ++ tlen++; ++ } ++ if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { ++ *sepp = sep - t; ++ return 1; ++ } ++ } ++ if (strcmp(t, arg) == 0) { ++ *sepp = 0; ++ return 1; ++ } ++ return 0; + } + + static const struct fuse_opt *find_opt(const struct fuse_opt *opt, +- const char *arg, unsigned *sepp) ++ const char *arg, unsigned *sepp) + { +- for (; opt && opt->templ; opt++) +- if (match_template(opt->templ, arg, sepp)) +- return opt; +- return NULL; ++ for (; opt && opt->templ; opt++) { ++ if (match_template(opt->templ, arg, sepp)) { ++ return opt; ++ } ++ } ++ return NULL; + } + + int fuse_opt_match(const struct fuse_opt *opts, const char *opt) + { +- unsigned dummy; +- return find_opt(opts, opt, &dummy) ? 1 : 0; ++ unsigned dummy; ++ return find_opt(opts, opt, &dummy) ? 1 : 0; + } + + static int process_opt_param(void *var, const char *format, const char *param, +- const char *arg) ++ const char *arg) + { +- assert(format[0] == '%'); +- if (format[1] == 's') { +- char **s = var; +- char *copy = strdup(param); +- if (!copy) +- return alloc_failed(); +- +- free(*s); +- *s = copy; +- } else { +- if (sscanf(param, format, var) != 1) { +- fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", arg); +- return -1; +- } +- } +- return 0; ++ assert(format[0] == '%'); ++ if (format[1] == 's') { ++ char **s = var; ++ char *copy = strdup(param); ++ if (!copy) { ++ return alloc_failed(); ++ } ++ ++ free(*s); ++ *s = copy; ++ } else { ++ if (sscanf(param, format, var) != 1) { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", ++ arg); ++ return -1; ++ } ++ } ++ return 0; + } + +-static int process_opt(struct fuse_opt_context *ctx, +- const struct fuse_opt *opt, unsigned sep, +- const char *arg, int iso) ++static int process_opt(struct fuse_opt_context *ctx, const struct fuse_opt *opt, ++ unsigned sep, const char *arg, int iso) + { +- if (opt->offset == -1U) { +- if (call_proc(ctx, arg, opt->value, iso) == -1) +- return -1; +- } else { +- void *var = (char *)ctx->data + opt->offset; +- if (sep && opt->templ[sep + 1]) { +- const char *param = arg + sep; +- if (opt->templ[sep] == '=') +- param ++; +- if (process_opt_param(var, opt->templ + sep + 1, +- param, arg) == -1) +- return -1; +- } else +- *(int *)var = opt->value; +- } +- return 0; ++ if (opt->offset == -1U) { ++ if (call_proc(ctx, arg, opt->value, iso) == -1) { ++ return -1; ++ } ++ } else { ++ void *var = (char *)ctx->data + opt->offset; ++ if (sep && opt->templ[sep + 1]) { ++ const char *param = arg + sep; ++ if (opt->templ[sep] == '=') { ++ param++; ++ } ++ if (process_opt_param(var, opt->templ + sep + 1, param, arg) == ++ -1) { ++ return -1; ++ } ++ } else { ++ *(int *)var = opt->value; ++ } ++ } ++ return 0; + } + + static int process_opt_sep_arg(struct fuse_opt_context *ctx, +- const struct fuse_opt *opt, unsigned sep, +- const char *arg, int iso) ++ const struct fuse_opt *opt, unsigned sep, ++ const char *arg, int iso) + { +- int res; +- char *newarg; +- char *param; +- +- if (next_arg(ctx, arg) == -1) +- return -1; +- +- param = ctx->argv[ctx->argctr]; +- newarg = malloc(sep + strlen(param) + 1); +- if (!newarg) +- return alloc_failed(); +- +- memcpy(newarg, arg, sep); +- strcpy(newarg + sep, param); +- res = process_opt(ctx, opt, sep, newarg, iso); +- free(newarg); +- +- return res; ++ int res; ++ char *newarg; ++ char *param; ++ ++ if (next_arg(ctx, arg) == -1) { ++ return -1; ++ } ++ ++ param = ctx->argv[ctx->argctr]; ++ newarg = malloc(sep + strlen(param) + 1); ++ if (!newarg) { ++ return alloc_failed(); ++ } ++ ++ memcpy(newarg, arg, sep); ++ strcpy(newarg + sep, param); ++ res = process_opt(ctx, opt, sep, newarg, iso); ++ free(newarg); ++ ++ return res; + } + + static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso) + { +- unsigned sep; +- const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); +- if (opt) { +- for (; opt; opt = find_opt(opt + 1, arg, &sep)) { +- int res; +- if (sep && opt->templ[sep] == ' ' && !arg[sep]) +- res = process_opt_sep_arg(ctx, opt, sep, arg, +- iso); +- else +- res = process_opt(ctx, opt, sep, arg, iso); +- if (res == -1) +- return -1; +- } +- return 0; +- } else +- return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); ++ unsigned sep; ++ const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); ++ if (opt) { ++ for (; opt; opt = find_opt(opt + 1, arg, &sep)) { ++ int res; ++ if (sep && opt->templ[sep] == ' ' && !arg[sep]) { ++ res = process_opt_sep_arg(ctx, opt, sep, arg, iso); ++ } else { ++ res = process_opt(ctx, opt, sep, arg, iso); ++ } ++ if (res == -1) { ++ return -1; ++ } ++ } ++ return 0; ++ } else { ++ return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); ++ } + } + + static int process_real_option_group(struct fuse_opt_context *ctx, char *opts) + { +- char *s = opts; +- char *d = s; +- int end = 0; +- +- while (!end) { +- if (*s == '\0') +- end = 1; +- if (*s == ',' || end) { +- int res; +- +- *d = '\0'; +- res = process_gopt(ctx, opts, 1); +- if (res == -1) +- return -1; +- d = opts; +- } else { +- if (s[0] == '\\' && s[1] != '\0') { +- s++; +- if (s[0] >= '0' && s[0] <= '3' && +- s[1] >= '0' && s[1] <= '7' && +- s[2] >= '0' && s[2] <= '7') { +- *d++ = (s[0] - '0') * 0100 + +- (s[1] - '0') * 0010 + +- (s[2] - '0'); +- s += 2; +- } else { +- *d++ = *s; +- } +- } else { +- *d++ = *s; +- } +- } +- s++; +- } +- +- return 0; ++ char *s = opts; ++ char *d = s; ++ int end = 0; ++ ++ while (!end) { ++ if (*s == '\0') { ++ end = 1; ++ } ++ if (*s == ',' || end) { ++ int res; ++ ++ *d = '\0'; ++ res = process_gopt(ctx, opts, 1); ++ if (res == -1) { ++ return -1; ++ } ++ d = opts; ++ } else { ++ if (s[0] == '\\' && s[1] != '\0') { ++ s++; ++ if (s[0] >= '0' && s[0] <= '3' && s[1] >= '0' && s[1] <= '7' && ++ s[2] >= '0' && s[2] <= '7') { ++ *d++ = (s[0] - '0') * 0100 + (s[1] - '0') * 0010 + ++ (s[2] - '0'); ++ s += 2; ++ } else { ++ *d++ = *s; ++ } ++ } else { ++ *d++ = *s; ++ } ++ } ++ s++; ++ } ++ ++ return 0; + } + + static int process_option_group(struct fuse_opt_context *ctx, const char *opts) + { +- int res; +- char *copy = strdup(opts); +- +- if (!copy) { +- fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); +- return -1; +- } +- res = process_real_option_group(ctx, copy); +- free(copy); +- return res; ++ int res; ++ char *copy = strdup(opts); ++ ++ if (!copy) { ++ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); ++ return -1; ++ } ++ res = process_real_option_group(ctx, copy); ++ free(copy); ++ return res; + } + + static int process_one(struct fuse_opt_context *ctx, const char *arg) + { +- if (ctx->nonopt || arg[0] != '-') +- return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); +- else if (arg[1] == 'o') { +- if (arg[2]) +- return process_option_group(ctx, arg + 2); +- else { +- if (next_arg(ctx, arg) == -1) +- return -1; +- +- return process_option_group(ctx, +- ctx->argv[ctx->argctr]); +- } +- } else if (arg[1] == '-' && !arg[2]) { +- if (add_arg(ctx, arg) == -1) +- return -1; +- ctx->nonopt = ctx->outargs.argc; +- return 0; +- } else +- return process_gopt(ctx, arg, 0); ++ if (ctx->nonopt || arg[0] != '-') { ++ return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); ++ } else if (arg[1] == 'o') { ++ if (arg[2]) { ++ return process_option_group(ctx, arg + 2); ++ } else { ++ if (next_arg(ctx, arg) == -1) { ++ return -1; ++ } ++ ++ return process_option_group(ctx, ctx->argv[ctx->argctr]); ++ } ++ } else if (arg[1] == '-' && !arg[2]) { ++ if (add_arg(ctx, arg) == -1) { ++ return -1; ++ } ++ ctx->nonopt = ctx->outargs.argc; ++ return 0; ++ } else { ++ return process_gopt(ctx, arg, 0); ++ } + } + + static int opt_parse(struct fuse_opt_context *ctx) + { +- if (ctx->argc) { +- if (add_arg(ctx, ctx->argv[0]) == -1) +- return -1; +- } +- +- for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) +- if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) +- return -1; +- +- if (ctx->opts) { +- if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || +- fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) +- return -1; +- } +- +- /* If option separator ("--") is the last argument, remove it */ +- if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && +- strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { +- free(ctx->outargs.argv[ctx->outargs.argc - 1]); +- ctx->outargs.argv[--ctx->outargs.argc] = NULL; +- } +- +- return 0; ++ if (ctx->argc) { ++ if (add_arg(ctx, ctx->argv[0]) == -1) { ++ return -1; ++ } ++ } ++ ++ for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) { ++ if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) { ++ return -1; ++ } ++ } ++ ++ if (ctx->opts) { ++ if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || ++ fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) { ++ return -1; ++ } ++ } ++ ++ /* If option separator ("--") is the last argument, remove it */ ++ if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && ++ strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { ++ free(ctx->outargs.argv[ctx->outargs.argc - 1]); ++ ctx->outargs.argv[--ctx->outargs.argc] = NULL; ++ } ++ ++ return 0; + } + + int fuse_opt_parse(struct fuse_args *args, void *data, +- const struct fuse_opt opts[], fuse_opt_proc_t proc) ++ const struct fuse_opt opts[], fuse_opt_proc_t proc) + { +- int res; +- struct fuse_opt_context ctx = { +- .data = data, +- .opt = opts, +- .proc = proc, +- }; +- +- if (!args || !args->argv || !args->argc) +- return 0; +- +- ctx.argc = args->argc; +- ctx.argv = args->argv; +- +- res = opt_parse(&ctx); +- if (res != -1) { +- struct fuse_args tmp = *args; +- *args = ctx.outargs; +- ctx.outargs = tmp; +- } +- free(ctx.opts); +- fuse_opt_free_args(&ctx.outargs); +- return res; ++ int res; ++ struct fuse_opt_context ctx = { ++ .data = data, ++ .opt = opts, ++ .proc = proc, ++ }; ++ ++ if (!args || !args->argv || !args->argc) { ++ return 0; ++ } ++ ++ ctx.argc = args->argc; ++ ctx.argv = args->argv; ++ ++ res = opt_parse(&ctx); ++ if (res != -1) { ++ struct fuse_args tmp = *args; ++ *args = ctx.outargs; ++ ctx.outargs = tmp; ++ } ++ free(ctx.opts); ++ fuse_opt_free_args(&ctx.outargs); ++ return res; + } +diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h +index 6910255..8f59b4d 100644 +--- a/tools/virtiofsd/fuse_opt.h ++++ b/tools/virtiofsd/fuse_opt.h +@@ -1,10 +1,10 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #ifndef FUSE_OPT_H_ + #define FUSE_OPT_H_ +@@ -37,7 +37,7 @@ + * + * - 'offsetof(struct foo, member)' actions i) and iii) + * +- * - -1 action ii) ++ * - -1 action ii) + * + * The 'offsetof()' macro is defined in the header. + * +@@ -48,7 +48,7 @@ + * + * The types of templates are: + * +- * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only ++ * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only + * themselves. Invalid values are "--" and anything beginning + * with "-o" + * +@@ -71,58 +71,67 @@ + * freed. + */ + struct fuse_opt { +- /** Matching template and optional parameter formatting */ +- const char *templ; ++ /** Matching template and optional parameter formatting */ ++ const char *templ; + +- /** +- * Offset of variable within 'data' parameter of fuse_opt_parse() +- * or -1 +- */ +- unsigned long offset; ++ /** ++ * Offset of variable within 'data' parameter of fuse_opt_parse() ++ * or -1 ++ */ ++ unsigned long offset; + +- /** +- * Value to set the variable to, or to be passed as 'key' to the +- * processing function. Ignored if template has a format +- */ +- int value; ++ /** ++ * Value to set the variable to, or to be passed as 'key' to the ++ * processing function. Ignored if template has a format ++ */ ++ int value; + }; + + /** +- * Key option. In case of a match, the processing function will be ++ * Key option. In case of a match, the processing function will be + * called with the specified key. + */ +-#define FUSE_OPT_KEY(templ, key) { templ, -1U, key } ++#define FUSE_OPT_KEY(templ, key) \ ++ { \ ++ templ, -1U, key \ ++ } + + /** +- * Last option. An array of 'struct fuse_opt' must end with a NULL ++ * Last option. An array of 'struct fuse_opt' must end with a NULL + * template value + */ +-#define FUSE_OPT_END { NULL, 0, 0 } ++#define FUSE_OPT_END \ ++ { \ ++ NULL, 0, 0 \ ++ } + + /** + * Argument list + */ + struct fuse_args { +- /** Argument count */ +- int argc; ++ /** Argument count */ ++ int argc; + +- /** Argument vector. NULL terminated */ +- char **argv; ++ /** Argument vector. NULL terminated */ ++ char **argv; + +- /** Is 'argv' allocated? */ +- int allocated; ++ /** Is 'argv' allocated? */ ++ int allocated; + }; + + /** + * Initializer for 'struct fuse_args' + */ +-#define FUSE_ARGS_INIT(argc, argv) { argc, argv, 0 } ++#define FUSE_ARGS_INIT(argc, argv) \ ++ { \ ++ argc, argv, 0 \ ++ } + + /** + * Key value passed to the processing function if an option did not + * match any template + */ +-#define FUSE_OPT_KEY_OPT -1 ++#define FUSE_OPT_KEY_OPT -1 + + /** + * Key value passed to the processing function for all non-options +@@ -130,7 +139,7 @@ struct fuse_args { + * Non-options are the arguments beginning with a character other than + * '-' or all arguments after the special '--' option + */ +-#define FUSE_OPT_KEY_NONOPT -2 ++#define FUSE_OPT_KEY_NONOPT -2 + + /** + * Special key value for options to keep +@@ -174,7 +183,7 @@ struct fuse_args { + * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept + */ + typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, +- struct fuse_args *outargs); ++ struct fuse_args *outargs); + + /** + * Option parsing function +@@ -197,7 +206,7 @@ typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, + * @return -1 on error, 0 on success + */ + int fuse_opt_parse(struct fuse_args *args, void *data, +- const struct fuse_opt opts[], fuse_opt_proc_t proc); ++ const struct fuse_opt opts[], fuse_opt_proc_t proc); + + /** + * Add an option to a comma separated option list +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +index 4271947..19d6791 100644 +--- a/tools/virtiofsd/fuse_signals.c ++++ b/tools/virtiofsd/fuse_signals.c +@@ -1,91 +1,95 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- Utility functions for setting signal handlers. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Utility functions for setting signal handlers. ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ + + #include "config.h" +-#include "fuse_lowlevel.h" + #include "fuse_i.h" ++#include "fuse_lowlevel.h" + +-#include +-#include + #include ++#include + #include ++#include + + static struct fuse_session *fuse_instance; + + static void exit_handler(int sig) + { +- if (fuse_instance) { +- fuse_session_exit(fuse_instance); +- if(sig <= 0) { +- fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); +- abort(); +- } +- fuse_instance->error = sig; +- } ++ if (fuse_instance) { ++ fuse_session_exit(fuse_instance); ++ if (sig <= 0) { ++ fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); ++ abort(); ++ } ++ fuse_instance->error = sig; ++ } + } + + static void do_nothing(int sig) + { +- (void) sig; ++ (void)sig; + } + + static int set_one_signal_handler(int sig, void (*handler)(int), int remove) + { +- struct sigaction sa; +- struct sigaction old_sa; ++ struct sigaction sa; ++ struct sigaction old_sa; + +- memset(&sa, 0, sizeof(struct sigaction)); +- sa.sa_handler = remove ? SIG_DFL : handler; +- sigemptyset(&(sa.sa_mask)); +- sa.sa_flags = 0; ++ memset(&sa, 0, sizeof(struct sigaction)); ++ sa.sa_handler = remove ? SIG_DFL : handler; ++ sigemptyset(&(sa.sa_mask)); ++ sa.sa_flags = 0; + +- if (sigaction(sig, NULL, &old_sa) == -1) { +- perror("fuse: cannot get old signal handler"); +- return -1; +- } ++ if (sigaction(sig, NULL, &old_sa) == -1) { ++ perror("fuse: cannot get old signal handler"); ++ return -1; ++ } + +- if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && +- sigaction(sig, &sa, NULL) == -1) { +- perror("fuse: cannot set signal handler"); +- return -1; +- } +- return 0; ++ if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && ++ sigaction(sig, &sa, NULL) == -1) { ++ perror("fuse: cannot set signal handler"); ++ return -1; ++ } ++ return 0; + } + + int fuse_set_signal_handlers(struct fuse_session *se) + { +- /* If we used SIG_IGN instead of the do_nothing function, +- then we would be unable to tell if we set SIG_IGN (and +- thus should reset to SIG_DFL in fuse_remove_signal_handlers) +- or if it was already set to SIG_IGN (and should be left +- untouched. */ +- if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || +- set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || +- set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || +- set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) +- return -1; ++ /* ++ * If we used SIG_IGN instead of the do_nothing function, ++ * then we would be unable to tell if we set SIG_IGN (and ++ * thus should reset to SIG_DFL in fuse_remove_signal_handlers) ++ * or if it was already set to SIG_IGN (and should be left ++ * untouched. ++ */ ++ if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || ++ set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) { ++ return -1; ++ } + +- fuse_instance = se; +- return 0; ++ fuse_instance = se; ++ return 0; + } + + void fuse_remove_signal_handlers(struct fuse_session *se) + { +- if (fuse_instance != se) +- fuse_log(FUSE_LOG_ERR, +- "fuse: fuse_remove_signal_handlers: unknown session\n"); +- else +- fuse_instance = NULL; ++ if (fuse_instance != se) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: fuse_remove_signal_handlers: unknown session\n"); ++ } else { ++ fuse_instance = NULL; ++ } + +- set_one_signal_handler(SIGHUP, exit_handler, 1); +- set_one_signal_handler(SIGINT, exit_handler, 1); +- set_one_signal_handler(SIGTERM, exit_handler, 1); +- set_one_signal_handler(SIGPIPE, do_nothing, 1); ++ set_one_signal_handler(SIGHUP, exit_handler, 1); ++ set_one_signal_handler(SIGINT, exit_handler, 1); ++ set_one_signal_handler(SIGTERM, exit_handler, 1); ++ set_one_signal_handler(SIGPIPE, do_nothing, 1); + } +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5a2e64c..5711dd2 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -1,297 +1,309 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * Helper functions to create (simple) standalone programs. With the ++ * aid of these functions it should be possible to create full FUSE ++ * file system by implementing nothing but the request handlers. + +- Helper functions to create (simple) standalone programs. With the +- aid of these functions it should be possible to create full FUSE +- file system by implementing nothing but the request handlers. +- +- This program can be distributed under the terms of the GNU LGPLv2. +- See the file COPYING.LIB. +-*/ ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB. ++ */ + + #include "config.h" + #include "fuse_i.h" ++#include "fuse_lowlevel.h" + #include "fuse_misc.h" + #include "fuse_opt.h" +-#include "fuse_lowlevel.h" + #include "mount_util.h" + ++#include ++#include ++#include + #include + #include +-#include +-#include + #include +-#include +-#include + #include ++#include + +-#define FUSE_HELPER_OPT(t, p) \ +- { t, offsetof(struct fuse_cmdline_opts, p), 1 } ++#define FUSE_HELPER_OPT(t, p) \ ++ { \ ++ t, offsetof(struct fuse_cmdline_opts, p), 1 \ ++ } + + static const struct fuse_opt fuse_helper_opts[] = { +- FUSE_HELPER_OPT("-h", show_help), +- FUSE_HELPER_OPT("--help", show_help), +- FUSE_HELPER_OPT("-V", show_version), +- FUSE_HELPER_OPT("--version", show_version), +- FUSE_HELPER_OPT("-d", debug), +- FUSE_HELPER_OPT("debug", debug), +- FUSE_HELPER_OPT("-d", foreground), +- FUSE_HELPER_OPT("debug", foreground), +- FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), +- FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), +- FUSE_HELPER_OPT("-f", foreground), +- FUSE_HELPER_OPT("fsname=", nodefault_subtype), +- FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), +- FUSE_HELPER_OPT("subtype=", nodefault_subtype), +- FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), +- FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), +- FUSE_OPT_END ++ FUSE_HELPER_OPT("-h", show_help), ++ FUSE_HELPER_OPT("--help", show_help), ++ FUSE_HELPER_OPT("-V", show_version), ++ FUSE_HELPER_OPT("--version", show_version), ++ FUSE_HELPER_OPT("-d", debug), ++ FUSE_HELPER_OPT("debug", debug), ++ FUSE_HELPER_OPT("-d", foreground), ++ FUSE_HELPER_OPT("debug", foreground), ++ FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), ++ FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("-f", foreground), ++ FUSE_HELPER_OPT("fsname=", nodefault_subtype), ++ FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("subtype=", nodefault_subtype), ++ FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), ++ FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_OPT_END + }; + + struct fuse_conn_info_opts { +- int atomic_o_trunc; +- int no_remote_posix_lock; +- int no_remote_flock; +- int splice_write; +- int splice_move; +- int splice_read; +- int no_splice_write; +- int no_splice_move; +- int no_splice_read; +- int auto_inval_data; +- int no_auto_inval_data; +- int no_readdirplus; +- int no_readdirplus_auto; +- int async_dio; +- int no_async_dio; +- int writeback_cache; +- int no_writeback_cache; +- int async_read; +- int sync_read; +- unsigned max_write; +- unsigned max_readahead; +- unsigned max_background; +- unsigned congestion_threshold; +- unsigned time_gran; +- int set_max_write; +- int set_max_readahead; +- int set_max_background; +- int set_congestion_threshold; +- int set_time_gran; ++ int atomic_o_trunc; ++ int no_remote_posix_lock; ++ int no_remote_flock; ++ int splice_write; ++ int splice_move; ++ int splice_read; ++ int no_splice_write; ++ int no_splice_move; ++ int no_splice_read; ++ int auto_inval_data; ++ int no_auto_inval_data; ++ int no_readdirplus; ++ int no_readdirplus_auto; ++ int async_dio; ++ int no_async_dio; ++ int writeback_cache; ++ int no_writeback_cache; ++ int async_read; ++ int sync_read; ++ unsigned max_write; ++ unsigned max_readahead; ++ unsigned max_background; ++ unsigned congestion_threshold; ++ unsigned time_gran; ++ int set_max_write; ++ int set_max_readahead; ++ int set_max_background; ++ int set_congestion_threshold; ++ int set_time_gran; + }; + +-#define CONN_OPTION(t, p, v) \ +- { t, offsetof(struct fuse_conn_info_opts, p), v } ++#define CONN_OPTION(t, p, v) \ ++ { \ ++ t, offsetof(struct fuse_conn_info_opts, p), v \ ++ } + static const struct fuse_opt conn_info_opt_spec[] = { +- CONN_OPTION("max_write=%u", max_write, 0), +- CONN_OPTION("max_write=", set_max_write, 1), +- CONN_OPTION("max_readahead=%u", max_readahead, 0), +- CONN_OPTION("max_readahead=", set_max_readahead, 1), +- CONN_OPTION("max_background=%u", max_background, 0), +- CONN_OPTION("max_background=", set_max_background, 1), +- CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), +- CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), +- CONN_OPTION("sync_read", sync_read, 1), +- CONN_OPTION("async_read", async_read, 1), +- CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), +- CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), +- CONN_OPTION("no_remote_lock", no_remote_flock, 1), +- CONN_OPTION("no_remote_flock", no_remote_flock, 1), +- CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), +- CONN_OPTION("splice_write", splice_write, 1), +- CONN_OPTION("no_splice_write", no_splice_write, 1), +- CONN_OPTION("splice_move", splice_move, 1), +- CONN_OPTION("no_splice_move", no_splice_move, 1), +- CONN_OPTION("splice_read", splice_read, 1), +- CONN_OPTION("no_splice_read", no_splice_read, 1), +- CONN_OPTION("auto_inval_data", auto_inval_data, 1), +- CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), +- CONN_OPTION("readdirplus=no", no_readdirplus, 1), +- CONN_OPTION("readdirplus=yes", no_readdirplus, 0), +- CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), +- CONN_OPTION("readdirplus=auto", no_readdirplus, 0), +- CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), +- CONN_OPTION("async_dio", async_dio, 1), +- CONN_OPTION("no_async_dio", no_async_dio, 1), +- CONN_OPTION("writeback_cache", writeback_cache, 1), +- CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), +- CONN_OPTION("time_gran=%u", time_gran, 0), +- CONN_OPTION("time_gran=", set_time_gran, 1), +- FUSE_OPT_END ++ CONN_OPTION("max_write=%u", max_write, 0), ++ CONN_OPTION("max_write=", set_max_write, 1), ++ CONN_OPTION("max_readahead=%u", max_readahead, 0), ++ CONN_OPTION("max_readahead=", set_max_readahead, 1), ++ CONN_OPTION("max_background=%u", max_background, 0), ++ CONN_OPTION("max_background=", set_max_background, 1), ++ CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), ++ CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), ++ CONN_OPTION("sync_read", sync_read, 1), ++ CONN_OPTION("async_read", async_read, 1), ++ CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), ++ CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("no_remote_lock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_flock", no_remote_flock, 1), ++ CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), ++ CONN_OPTION("splice_write", splice_write, 1), ++ CONN_OPTION("no_splice_write", no_splice_write, 1), ++ CONN_OPTION("splice_move", splice_move, 1), ++ CONN_OPTION("no_splice_move", no_splice_move, 1), ++ CONN_OPTION("splice_read", splice_read, 1), ++ CONN_OPTION("no_splice_read", no_splice_read, 1), ++ CONN_OPTION("auto_inval_data", auto_inval_data, 1), ++ CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), ++ CONN_OPTION("readdirplus=no", no_readdirplus, 1), ++ CONN_OPTION("readdirplus=yes", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), ++ CONN_OPTION("readdirplus=auto", no_readdirplus, 0), ++ CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), ++ CONN_OPTION("async_dio", async_dio, 1), ++ CONN_OPTION("no_async_dio", no_async_dio, 1), ++ CONN_OPTION("writeback_cache", writeback_cache, 1), ++ CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), ++ CONN_OPTION("time_gran=%u", time_gran, 0), ++ CONN_OPTION("time_gran=", set_time_gran, 1), ++ FUSE_OPT_END + }; + + + void fuse_cmdline_help(void) + { +- printf(" -h --help print help\n" +- " -V --version print version\n" +- " -d -o debug enable debug output (implies -f)\n" +- " -f foreground operation\n" +- " -o max_idle_threads the maximum number of idle worker threads\n" +- " allowed (default: 10)\n"); ++ printf( ++ " -h --help print help\n" ++ " -V --version print version\n" ++ " -d -o debug enable debug output (implies -f)\n" ++ " -f foreground operation\n" ++ " -o max_idle_threads the maximum number of idle worker threads\n" ++ " allowed (default: 10)\n"); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +- struct fuse_args *outargs) ++ struct fuse_args *outargs) + { +- (void) outargs; +- struct fuse_cmdline_opts *opts = data; +- +- switch (key) { +- case FUSE_OPT_KEY_NONOPT: +- if (!opts->mountpoint) { +- if (fuse_mnt_parse_fuse_fd(arg) != -1) { +- return fuse_opt_add_opt(&opts->mountpoint, arg); +- } +- +- char mountpoint[PATH_MAX] = ""; +- if (realpath(arg, mountpoint) == NULL) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: bad mount point `%s': %s\n", +- arg, strerror(errno)); +- return -1; +- } +- return fuse_opt_add_opt(&opts->mountpoint, mountpoint); +- } else { +- fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); +- return -1; +- } +- +- default: +- /* Pass through unknown options */ +- return 1; +- } ++ (void)outargs; ++ struct fuse_cmdline_opts *opts = data; ++ ++ switch (key) { ++ case FUSE_OPT_KEY_NONOPT: ++ if (!opts->mountpoint) { ++ if (fuse_mnt_parse_fuse_fd(arg) != -1) { ++ return fuse_opt_add_opt(&opts->mountpoint, arg); ++ } ++ ++ char mountpoint[PATH_MAX] = ""; ++ if (realpath(arg, mountpoint) == NULL) { ++ fuse_log(FUSE_LOG_ERR, "fuse: bad mount point `%s': %s\n", arg, ++ strerror(errno)); ++ return -1; ++ } ++ return fuse_opt_add_opt(&opts->mountpoint, mountpoint); ++ } else { ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); ++ return -1; ++ } ++ ++ default: ++ /* Pass through unknown options */ ++ return 1; ++ } + } + +-int fuse_parse_cmdline(struct fuse_args *args, +- struct fuse_cmdline_opts *opts) ++int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + { +- memset(opts, 0, sizeof(struct fuse_cmdline_opts)); ++ memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + +- opts->max_idle_threads = 10; ++ opts->max_idle_threads = 10; + +- if (fuse_opt_parse(args, opts, fuse_helper_opts, +- fuse_helper_opt_proc) == -1) +- return -1; ++ if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == ++ -1) { ++ return -1; ++ } + +- return 0; ++ return 0; + } + + + int fuse_daemonize(int foreground) + { +- if (!foreground) { +- int nullfd; +- int waiter[2]; +- char completed; +- +- if (pipe(waiter)) { +- perror("fuse_daemonize: pipe"); +- return -1; +- } +- +- /* +- * demonize current process by forking it and killing the +- * parent. This makes current process as a child of 'init'. +- */ +- switch(fork()) { +- case -1: +- perror("fuse_daemonize: fork"); +- return -1; +- case 0: +- break; +- default: +- (void) read(waiter[0], &completed, sizeof(completed)); +- _exit(0); +- } +- +- if (setsid() == -1) { +- perror("fuse_daemonize: setsid"); +- return -1; +- } +- +- (void) chdir("/"); +- +- nullfd = open("/dev/null", O_RDWR, 0); +- if (nullfd != -1) { +- (void) dup2(nullfd, 0); +- (void) dup2(nullfd, 1); +- (void) dup2(nullfd, 2); +- if (nullfd > 2) +- close(nullfd); +- } +- +- /* Propagate completion of daemon initialization */ +- completed = 1; +- (void) write(waiter[1], &completed, sizeof(completed)); +- close(waiter[0]); +- close(waiter[1]); +- } else { +- (void) chdir("/"); +- } +- return 0; ++ if (!foreground) { ++ int nullfd; ++ int waiter[2]; ++ char completed; ++ ++ if (pipe(waiter)) { ++ perror("fuse_daemonize: pipe"); ++ return -1; ++ } ++ ++ /* ++ * demonize current process by forking it and killing the ++ * parent. This makes current process as a child of 'init'. ++ */ ++ switch (fork()) { ++ case -1: ++ perror("fuse_daemonize: fork"); ++ return -1; ++ case 0: ++ break; ++ default: ++ (void)read(waiter[0], &completed, sizeof(completed)); ++ _exit(0); ++ } ++ ++ if (setsid() == -1) { ++ perror("fuse_daemonize: setsid"); ++ return -1; ++ } ++ ++ (void)chdir("/"); ++ ++ nullfd = open("/dev/null", O_RDWR, 0); ++ if (nullfd != -1) { ++ (void)dup2(nullfd, 0); ++ (void)dup2(nullfd, 1); ++ (void)dup2(nullfd, 2); ++ if (nullfd > 2) { ++ close(nullfd); ++ } ++ } ++ ++ /* Propagate completion of daemon initialization */ ++ completed = 1; ++ (void)write(waiter[1], &completed, sizeof(completed)); ++ close(waiter[0]); ++ close(waiter[1]); ++ } else { ++ (void)chdir("/"); ++ } ++ return 0; + } + + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, +- struct fuse_conn_info *conn) ++ struct fuse_conn_info *conn) + { +- if(opts->set_max_write) +- conn->max_write = opts->max_write; +- if(opts->set_max_background) +- conn->max_background = opts->max_background; +- if(opts->set_congestion_threshold) +- conn->congestion_threshold = opts->congestion_threshold; +- if(opts->set_time_gran) +- conn->time_gran = opts->time_gran; +- if(opts->set_max_readahead) +- conn->max_readahead = opts->max_readahead; +- +-#define LL_ENABLE(cond,cap) \ +- if (cond) conn->want |= (cap) +-#define LL_DISABLE(cond,cap) \ +- if (cond) conn->want &= ~(cap) +- +- LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); +- LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); +- +- LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); +- LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); +- +- LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); +- LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); +- +- LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); +- LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); +- +- LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); +- LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO); +- +- LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); +- LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); +- +- LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); +- LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); +- +- LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); +- LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); +- +- LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); +- LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); ++ if (opts->set_max_write) { ++ conn->max_write = opts->max_write; ++ } ++ if (opts->set_max_background) { ++ conn->max_background = opts->max_background; ++ } ++ if (opts->set_congestion_threshold) { ++ conn->congestion_threshold = opts->congestion_threshold; ++ } ++ if (opts->set_time_gran) { ++ conn->time_gran = opts->time_gran; ++ } ++ if (opts->set_max_readahead) { ++ conn->max_readahead = opts->max_readahead; ++ } ++ ++#define LL_ENABLE(cond, cap) \ ++ if (cond) \ ++ conn->want |= (cap) ++#define LL_DISABLE(cond, cap) \ ++ if (cond) \ ++ conn->want &= ~(cap) ++ ++ LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); ++ LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); ++ ++ LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); ++ LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); ++ ++ LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); ++ LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); ++ ++ LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); ++ ++ LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); ++ LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO); ++ ++ LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); ++ LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); ++ ++ LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); ++ ++ LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); ++ LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); ++ ++ LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); ++ LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); + } + +-struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args) ++struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args) + { +- struct fuse_conn_info_opts *opts; +- +- opts = calloc(1, sizeof(struct fuse_conn_info_opts)); +- if(opts == NULL) { +- fuse_log(FUSE_LOG_ERR, "calloc failed\n"); +- return NULL; +- } +- if(fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { +- free(opts); +- return NULL; +- } +- return opts; ++ struct fuse_conn_info_opts *opts; ++ ++ opts = calloc(1, sizeof(struct fuse_conn_info_opts)); ++ if (opts == NULL) { ++ fuse_log(FUSE_LOG_ERR, "calloc failed\n"); ++ return NULL; ++ } ++ if (fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { ++ free(opts); ++ return NULL; ++ } ++ return opts; + } +diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h +index 7c5f561..0b98275 100644 +--- a/tools/virtiofsd/passthrough_helpers.h ++++ b/tools/virtiofsd/passthrough_helpers.h +@@ -28,23 +28,24 @@ + * operation + */ + static int mknod_wrapper(int dirfd, const char *path, const char *link, +- int mode, dev_t rdev) ++ int mode, dev_t rdev) + { +- int res; ++ int res; + +- if (S_ISREG(mode)) { +- res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); +- if (res >= 0) +- res = close(res); +- } else if (S_ISDIR(mode)) { +- res = mkdirat(dirfd, path, mode); +- } else if (S_ISLNK(mode) && link != NULL) { +- res = symlinkat(link, dirfd, path); +- } else if (S_ISFIFO(mode)) { +- res = mkfifoat(dirfd, path, mode); +- } else { +- res = mknodat(dirfd, path, mode, rdev); +- } ++ if (S_ISREG(mode)) { ++ res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); ++ if (res >= 0) { ++ res = close(res); ++ } ++ } else if (S_ISDIR(mode)) { ++ res = mkdirat(dirfd, path, mode); ++ } else if (S_ISLNK(mode) && link != NULL) { ++ res = symlinkat(link, dirfd, path); ++ } else if (S_ISFIFO(mode)) { ++ res = mkfifoat(dirfd, path, mode); ++ } else { ++ res = mknodat(dirfd, path, mode, rdev); ++ } + +- return res; ++ return res; + } +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e5f7115..c5850ef 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1,12 +1,12 @@ + /* +- FUSE: Filesystem in Userspace +- Copyright (C) 2001-2007 Miklos Szeredi +- +- This program can be distributed under the terms of the GNU GPLv2. +- See the file COPYING. +-*/ ++ * FUSE: Filesystem in Userspace ++ * Copyright (C) 2001-2007 Miklos Szeredi ++ * ++ * This program can be distributed under the terms of the GNU GPLv2. ++ * See the file COPYING. ++ */ + +-/** @file ++/* + * + * This file system mirrors the existing file system hierarchy of the + * system, starting at the root file system. This is implemented by +@@ -28,7 +28,8 @@ + * + * Compile with: + * +- * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o passthrough_ll ++ * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o ++ * passthrough_ll + * + * ## Source code ## + * \include passthrough_ll.c +@@ -39,1299 +40,1365 @@ + + #include "config.h" + +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include + #include ++#include + #include ++#include + #include ++#include + #include ++#include ++#include ++#include ++#include ++#include + #include + #include ++#include + + #include "passthrough_helpers.h" + +-/* We are re-using pointers to our `struct lo_inode` and `struct +- lo_dirp` elements as inodes. This means that we must be able to +- store uintptr_t values in a fuse_ino_t variable. The following +- incantation checks this condition at compile time. */ +-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++/* ++ * We are re-using pointers to our `struct lo_inode` and `struct ++ * lo_dirp` elements as inodes. This means that we must be able to ++ * store uintptr_t values in a fuse_ino_t variable. The following ++ * incantation checks this condition at compile time. ++ */ ++#if defined(__GNUC__) && \ ++ (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \ ++ !defined __cplusplus + _Static_assert(sizeof(fuse_ino_t) >= sizeof(uintptr_t), +- "fuse_ino_t too small to hold uintptr_t values!"); ++ "fuse_ino_t too small to hold uintptr_t values!"); + #else +-struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct \ +- { unsigned _uintptr_to_must_hold_fuse_ino_t: +- ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 1 : -1); }; ++struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { ++ unsigned _uintptr_to_must_hold_fuse_ino_t ++ : ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 1 : -1); ++}; + #endif + + struct lo_inode { +- struct lo_inode *next; /* protected by lo->mutex */ +- struct lo_inode *prev; /* protected by lo->mutex */ +- int fd; +- bool is_symlink; +- ino_t ino; +- dev_t dev; +- uint64_t refcount; /* protected by lo->mutex */ ++ struct lo_inode *next; /* protected by lo->mutex */ ++ struct lo_inode *prev; /* protected by lo->mutex */ ++ int fd; ++ bool is_symlink; ++ ino_t ino; ++ dev_t dev; ++ uint64_t refcount; /* protected by lo->mutex */ + }; + + enum { +- CACHE_NEVER, +- CACHE_NORMAL, +- CACHE_ALWAYS, ++ CACHE_NEVER, ++ CACHE_NORMAL, ++ CACHE_ALWAYS, + }; + + struct lo_data { +- pthread_mutex_t mutex; +- int debug; +- int writeback; +- int flock; +- int xattr; +- const char *source; +- double timeout; +- int cache; +- int timeout_set; +- struct lo_inode root; /* protected by lo->mutex */ ++ pthread_mutex_t mutex; ++ int debug; ++ int writeback; ++ int flock; ++ int xattr; ++ const char *source; ++ double timeout; ++ int cache; ++ int timeout_set; ++ struct lo_inode root; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +- { "writeback", +- offsetof(struct lo_data, writeback), 1 }, +- { "no_writeback", +- offsetof(struct lo_data, writeback), 0 }, +- { "source=%s", +- offsetof(struct lo_data, source), 0 }, +- { "flock", +- offsetof(struct lo_data, flock), 1 }, +- { "no_flock", +- offsetof(struct lo_data, flock), 0 }, +- { "xattr", +- offsetof(struct lo_data, xattr), 1 }, +- { "no_xattr", +- offsetof(struct lo_data, xattr), 0 }, +- { "timeout=%lf", +- offsetof(struct lo_data, timeout), 0 }, +- { "timeout=", +- offsetof(struct lo_data, timeout_set), 1 }, +- { "cache=never", +- offsetof(struct lo_data, cache), CACHE_NEVER }, +- { "cache=auto", +- offsetof(struct lo_data, cache), CACHE_NORMAL }, +- { "cache=always", +- offsetof(struct lo_data, cache), CACHE_ALWAYS }, +- +- FUSE_OPT_END ++ { "writeback", offsetof(struct lo_data, writeback), 1 }, ++ { "no_writeback", offsetof(struct lo_data, writeback), 0 }, ++ { "source=%s", offsetof(struct lo_data, source), 0 }, ++ { "flock", offsetof(struct lo_data, flock), 1 }, ++ { "no_flock", offsetof(struct lo_data, flock), 0 }, ++ { "xattr", offsetof(struct lo_data, xattr), 1 }, ++ { "no_xattr", offsetof(struct lo_data, xattr), 0 }, ++ { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, ++ { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, ++ { "cache=never", offsetof(struct lo_data, cache), CACHE_NEVER }, ++ { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, ++ { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, ++ ++ FUSE_OPT_END + }; + + static struct lo_data *lo_data(fuse_req_t req) + { +- return (struct lo_data *) fuse_req_userdata(req); ++ return (struct lo_data *)fuse_req_userdata(req); + } + + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { +- if (ino == FUSE_ROOT_ID) +- return &lo_data(req)->root; +- else +- return (struct lo_inode *) (uintptr_t) ino; ++ if (ino == FUSE_ROOT_ID) { ++ return &lo_data(req)->root; ++ } else { ++ return (struct lo_inode *)(uintptr_t)ino; ++ } + } + + static int lo_fd(fuse_req_t req, fuse_ino_t ino) + { +- return lo_inode(req, ino)->fd; ++ return lo_inode(req, ino)->fd; + } + + static bool lo_debug(fuse_req_t req) + { +- return lo_data(req)->debug != 0; ++ return lo_data(req)->debug != 0; + } + +-static void lo_init(void *userdata, +- struct fuse_conn_info *conn) ++static void lo_init(void *userdata, struct fuse_conn_info *conn) + { +- struct lo_data *lo = (struct lo_data*) userdata; +- +- if(conn->capable & FUSE_CAP_EXPORT_SUPPORT) +- conn->want |= FUSE_CAP_EXPORT_SUPPORT; +- +- if (lo->writeback && +- conn->capable & FUSE_CAP_WRITEBACK_CACHE) { +- if (lo->debug) +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); +- conn->want |= FUSE_CAP_WRITEBACK_CACHE; +- } +- if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { +- if (lo->debug) +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); +- conn->want |= FUSE_CAP_FLOCK_LOCKS; +- } ++ struct lo_data *lo = (struct lo_data *)userdata; ++ ++ if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) { ++ conn->want |= FUSE_CAP_EXPORT_SUPPORT; ++ } ++ ++ if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { ++ if (lo->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); ++ } ++ conn->want |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { ++ if (lo->debug) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); ++ } ++ conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ } + } + + static void lo_getattr(fuse_req_t req, fuse_ino_t ino, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- int res; +- struct stat buf; +- struct lo_data *lo = lo_data(req); ++ int res; ++ struct stat buf; ++ struct lo_data *lo = lo_data(req); + +- (void) fi; ++ (void)fi; + +- res = fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- return (void) fuse_reply_err(req, errno); ++ res = ++ fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } + +- fuse_reply_attr(req, &buf, lo->timeout); ++ fuse_reply_attr(req, &buf, lo->timeout); + } + + static int utimensat_empty_nofollow(struct lo_inode *inode, +- const struct timespec *tv) ++ const struct timespec *tv) + { +- int res; +- char procname[64]; +- +- if (inode->is_symlink) { +- res = utimensat(inode->fd, "", tv, +- AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1 && errno == EINVAL) { +- /* Sorry, no race free way to set times on symlink. */ +- errno = EPERM; +- } +- return res; +- } +- sprintf(procname, "/proc/self/fd/%i", inode->fd); +- +- return utimensat(AT_FDCWD, procname, tv, 0); ++ int res; ++ char procname[64]; ++ ++ if (inode->is_symlink) { ++ res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1 && errno == EINVAL) { ++ /* Sorry, no race free way to set times on symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ return utimensat(AT_FDCWD, procname, tv, 0); + } + + static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, +- int valid, struct fuse_file_info *fi) ++ int valid, struct fuse_file_info *fi) + { +- int saverr; +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- int ifd = inode->fd; +- int res; +- +- if (valid & FUSE_SET_ATTR_MODE) { +- if (fi) { +- res = fchmod(fi->fh, attr->st_mode); +- } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = chmod(procname, attr->st_mode); +- } +- if (res == -1) +- goto out_err; +- } +- if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { +- uid_t uid = (valid & FUSE_SET_ATTR_UID) ? +- attr->st_uid : (uid_t) -1; +- gid_t gid = (valid & FUSE_SET_ATTR_GID) ? +- attr->st_gid : (gid_t) -1; +- +- res = fchownat(ifd, "", uid, gid, +- AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- goto out_err; +- } +- if (valid & FUSE_SET_ATTR_SIZE) { +- if (fi) { +- res = ftruncate(fi->fh, attr->st_size); +- } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = truncate(procname, attr->st_size); +- } +- if (res == -1) +- goto out_err; +- } +- if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { +- struct timespec tv[2]; +- +- tv[0].tv_sec = 0; +- tv[1].tv_sec = 0; +- tv[0].tv_nsec = UTIME_OMIT; +- tv[1].tv_nsec = UTIME_OMIT; +- +- if (valid & FUSE_SET_ATTR_ATIME_NOW) +- tv[0].tv_nsec = UTIME_NOW; +- else if (valid & FUSE_SET_ATTR_ATIME) +- tv[0] = attr->st_atim; +- +- if (valid & FUSE_SET_ATTR_MTIME_NOW) +- tv[1].tv_nsec = UTIME_NOW; +- else if (valid & FUSE_SET_ATTR_MTIME) +- tv[1] = attr->st_mtim; +- +- if (fi) +- res = futimens(fi->fh, tv); +- else +- res = utimensat_empty_nofollow(inode, tv); +- if (res == -1) +- goto out_err; +- } +- +- return lo_getattr(req, ino, fi); ++ int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ int ifd = inode->fd; ++ int res; ++ ++ if (valid & FUSE_SET_ATTR_MODE) { ++ if (fi) { ++ res = fchmod(fi->fh, attr->st_mode); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = chmod(procname, attr->st_mode); ++ } ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { ++ uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1; ++ gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1; ++ ++ res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ if (valid & FUSE_SET_ATTR_SIZE) { ++ if (fi) { ++ res = ftruncate(fi->fh, attr->st_size); ++ } else { ++ sprintf(procname, "/proc/self/fd/%i", ifd); ++ res = truncate(procname, attr->st_size); ++ } ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { ++ struct timespec tv[2]; ++ ++ tv[0].tv_sec = 0; ++ tv[1].tv_sec = 0; ++ tv[0].tv_nsec = UTIME_OMIT; ++ tv[1].tv_nsec = UTIME_OMIT; ++ ++ if (valid & FUSE_SET_ATTR_ATIME_NOW) { ++ tv[0].tv_nsec = UTIME_NOW; ++ } else if (valid & FUSE_SET_ATTR_ATIME) { ++ tv[0] = attr->st_atim; ++ } ++ ++ if (valid & FUSE_SET_ATTR_MTIME_NOW) { ++ tv[1].tv_nsec = UTIME_NOW; ++ } else if (valid & FUSE_SET_ATTR_MTIME) { ++ tv[1] = attr->st_mtim; ++ } ++ ++ if (fi) { ++ res = futimens(fi->fh, tv); ++ } else { ++ res = utimensat_empty_nofollow(inode, tv); ++ } ++ if (res == -1) { ++ goto out_err; ++ } ++ } ++ ++ return lo_getattr(req, ino, fi); + + out_err: +- saverr = errno; +- fuse_reply_err(req, saverr); ++ saverr = errno; ++ fuse_reply_err(req, saverr); + } + + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + { +- struct lo_inode *p; +- struct lo_inode *ret = NULL; +- +- pthread_mutex_lock(&lo->mutex); +- for (p = lo->root.next; p != &lo->root; p = p->next) { +- if (p->ino == st->st_ino && p->dev == st->st_dev) { +- assert(p->refcount > 0); +- ret = p; +- ret->refcount++; +- break; +- } +- } +- pthread_mutex_unlock(&lo->mutex); +- return ret; ++ struct lo_inode *p; ++ struct lo_inode *ret = NULL; ++ ++ pthread_mutex_lock(&lo->mutex); ++ for (p = lo->root.next; p != &lo->root; p = p->next) { ++ if (p->ino == st->st_ino && p->dev == st->st_dev) { ++ assert(p->refcount > 0); ++ ret = p; ++ ret->refcount++; ++ break; ++ } ++ } ++ pthread_mutex_unlock(&lo->mutex); ++ return ret; + } + + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, +- struct fuse_entry_param *e) ++ struct fuse_entry_param *e) + { +- int newfd; +- int res; +- int saverr; +- struct lo_data *lo = lo_data(req); +- struct lo_inode *inode; +- +- memset(e, 0, sizeof(*e)); +- e->attr_timeout = lo->timeout; +- e->entry_timeout = lo->timeout; +- +- newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); +- if (newfd == -1) +- goto out_err; +- +- res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- goto out_err; +- +- inode = lo_find(lo_data(req), &e->attr); +- if (inode) { +- close(newfd); +- newfd = -1; +- } else { +- struct lo_inode *prev, *next; +- +- saverr = ENOMEM; +- inode = calloc(1, sizeof(struct lo_inode)); +- if (!inode) +- goto out_err; +- +- inode->is_symlink = S_ISLNK(e->attr.st_mode); +- inode->refcount = 1; +- inode->fd = newfd; +- inode->ino = e->attr.st_ino; +- inode->dev = e->attr.st_dev; +- +- pthread_mutex_lock(&lo->mutex); +- prev = &lo->root; +- next = prev->next; +- next->prev = inode; +- inode->next = next; +- inode->prev = prev; +- prev->next = inode; +- pthread_mutex_unlock(&lo->mutex); +- } +- e->ino = (uintptr_t) inode; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long) parent, name, (unsigned long long) e->ino); +- +- return 0; ++ int newfd; ++ int res; ++ int saverr; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ ++ memset(e, 0, sizeof(*e)); ++ e->attr_timeout = lo->timeout; ++ e->entry_timeout = lo->timeout; ++ ++ newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); ++ if (newfd == -1) { ++ goto out_err; ++ } ++ ++ res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ goto out_err; ++ } ++ ++ inode = lo_find(lo_data(req), &e->attr); ++ if (inode) { ++ close(newfd); ++ newfd = -1; ++ } else { ++ struct lo_inode *prev, *next; ++ ++ saverr = ENOMEM; ++ inode = calloc(1, sizeof(struct lo_inode)); ++ if (!inode) { ++ goto out_err; ++ } ++ ++ inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ inode->refcount = 1; ++ inode->fd = newfd; ++ inode->ino = e->attr.st_ino; ++ inode->dev = e->attr.st_dev; ++ ++ pthread_mutex_lock(&lo->mutex); ++ prev = &lo->root; ++ next = prev->next; ++ next->prev = inode; ++ inode->next = next; ++ inode->prev = prev; ++ prev->next = inode; ++ pthread_mutex_unlock(&lo->mutex); ++ } ++ e->ino = (uintptr_t)inode; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long)parent, name, (unsigned long long)e->ino); ++ } ++ ++ return 0; + + out_err: +- saverr = errno; +- if (newfd != -1) +- close(newfd); +- return saverr; ++ saverr = errno; ++ if (newfd != -1) { ++ close(newfd); ++ } ++ return saverr; + } + + static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + { +- struct fuse_entry_param e; +- int err; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- +- err = lo_do_lookup(req, parent, name, &e); +- if (err) +- fuse_reply_err(req, err); +- else +- fuse_reply_entry(req, &e); ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ } ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_entry(req, &e); ++ } + } + + static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, +- const char *name, mode_t mode, dev_t rdev, +- const char *link) ++ const char *name, mode_t mode, dev_t rdev, ++ const char *link) + { +- int res; +- int saverr; +- struct lo_inode *dir = lo_inode(req, parent); +- struct fuse_entry_param e; ++ int res; ++ int saverr; ++ struct lo_inode *dir = lo_inode(req, parent); ++ struct fuse_entry_param e; + +- saverr = ENOMEM; ++ saverr = ENOMEM; + +- res = mknod_wrapper(dir->fd, name, link, mode, rdev); ++ res = mknod_wrapper(dir->fd, name, link, mode, rdev); + +- saverr = errno; +- if (res == -1) +- goto out; ++ saverr = errno; ++ if (res == -1) { ++ goto out; ++ } + +- saverr = lo_do_lookup(req, parent, name, &e); +- if (saverr) +- goto out; ++ saverr = lo_do_lookup(req, parent, name, &e); ++ if (saverr) { ++ goto out; ++ } + +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long) parent, name, (unsigned long long) e.ino); ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long)parent, name, (unsigned long long)e.ino); ++ } + +- fuse_reply_entry(req, &e); +- return; ++ fuse_reply_entry(req, &e); ++ return; + + out: +- fuse_reply_err(req, saverr); ++ fuse_reply_err(req, saverr); + } + +-static void lo_mknod(fuse_req_t req, fuse_ino_t parent, +- const char *name, mode_t mode, dev_t rdev) ++static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, dev_t rdev) + { +- lo_mknod_symlink(req, parent, name, mode, rdev, NULL); ++ lo_mknod_symlink(req, parent, name, mode, rdev, NULL); + } + + static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode) ++ mode_t mode) + { +- lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); ++ lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); + } + +-static void lo_symlink(fuse_req_t req, const char *link, +- fuse_ino_t parent, const char *name) ++static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, ++ const char *name) + { +- lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); ++ lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); + } + + static int linkat_empty_nofollow(struct lo_inode *inode, int dfd, +- const char *name) ++ const char *name) + { +- int res; +- char procname[64]; ++ int res; ++ char procname[64]; + +- if (inode->is_symlink) { +- res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); +- if (res == -1 && (errno == ENOENT || errno == EINVAL)) { +- /* Sorry, no race free way to hard-link a symlink. */ +- errno = EPERM; +- } +- return res; +- } ++ if (inode->is_symlink) { ++ res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); ++ if (res == -1 && (errno == ENOENT || errno == EINVAL)) { ++ /* Sorry, no race free way to hard-link a symlink. */ ++ errno = EPERM; ++ } ++ return res; ++ } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); + +- return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); ++ return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); + } + + static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, +- const char *name) ++ const char *name) + { +- int res; +- struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); +- struct fuse_entry_param e; +- int saverr; +- +- memset(&e, 0, sizeof(struct fuse_entry_param)); +- e.attr_timeout = lo->timeout; +- e.entry_timeout = lo->timeout; +- +- res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); +- if (res == -1) +- goto out_err; +- +- res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (res == -1) +- goto out_err; +- +- pthread_mutex_lock(&lo->mutex); +- inode->refcount++; +- pthread_mutex_unlock(&lo->mutex); +- e.ino = (uintptr_t) inode; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long) parent, name, +- (unsigned long long) e.ino); +- +- fuse_reply_entry(req, &e); +- return; ++ int res; ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ struct fuse_entry_param e; ++ int saverr; ++ ++ memset(&e, 0, sizeof(struct fuse_entry_param)); ++ e.attr_timeout = lo->timeout; ++ e.entry_timeout = lo->timeout; ++ ++ res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); ++ if (res == -1) { ++ goto out_err; ++ } ++ ++ res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ goto out_err; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ inode->refcount++; ++ pthread_mutex_unlock(&lo->mutex); ++ e.ino = (uintptr_t)inode; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", ++ (unsigned long long)parent, name, (unsigned long long)e.ino); ++ } ++ ++ fuse_reply_entry(req, &e); ++ return; + + out_err: +- saverr = errno; +- fuse_reply_err(req, saverr); ++ saverr = errno; ++ fuse_reply_err(req, saverr); + } + + static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + { +- int res; ++ int res; + +- res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); ++ res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, +- fuse_ino_t newparent, const char *newname, +- unsigned int flags) ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags) + { +- int res; ++ int res; + +- if (flags) { +- fuse_reply_err(req, EINVAL); +- return; +- } ++ if (flags) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + +- res = renameat(lo_fd(req, parent), name, +- lo_fd(req, newparent), newname); ++ res = renameat(lo_fd(req, parent), name, lo_fd(req, newparent), newname); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + { +- int res; ++ int res; + +- res = unlinkat(lo_fd(req, parent), name, 0); ++ res = unlinkat(lo_fd(req, parent), name, 0); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + { +- if (!inode) +- return; +- +- pthread_mutex_lock(&lo->mutex); +- assert(inode->refcount >= n); +- inode->refcount -= n; +- if (!inode->refcount) { +- struct lo_inode *prev, *next; +- +- prev = inode->prev; +- next = inode->next; +- next->prev = prev; +- prev->next = next; +- +- pthread_mutex_unlock(&lo->mutex); +- close(inode->fd); +- free(inode); +- +- } else { +- pthread_mutex_unlock(&lo->mutex); +- } ++ if (!inode) { ++ return; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ assert(inode->refcount >= n); ++ inode->refcount -= n; ++ if (!inode->refcount) { ++ struct lo_inode *prev, *next; ++ ++ prev = inode->prev; ++ next = inode->next; ++ next->prev = prev; ++ prev->next = next; ++ ++ pthread_mutex_unlock(&lo->mutex); ++ close(inode->fd); ++ free(inode); ++ ++ } else { ++ pthread_mutex_unlock(&lo->mutex); ++ } + } + + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { +- struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +- (unsigned long long) ino, +- (unsigned long long) inode->refcount, +- (unsigned long long) nlookup); +- } ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", ++ (unsigned long long)ino, (unsigned long long)inode->refcount, ++ (unsigned long long)nlookup); ++ } + +- unref_inode(lo, inode, nlookup); ++ unref_inode(lo, inode, nlookup); + } + + static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { +- lo_forget_one(req, ino, nlookup); +- fuse_reply_none(req); ++ lo_forget_one(req, ino, nlookup); ++ fuse_reply_none(req); + } + + static void lo_forget_multi(fuse_req_t req, size_t count, +- struct fuse_forget_data *forgets) ++ struct fuse_forget_data *forgets) + { +- int i; ++ int i; + +- for (i = 0; i < count; i++) +- lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); +- fuse_reply_none(req); ++ for (i = 0; i < count; i++) { ++ lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); ++ } ++ fuse_reply_none(req); + } + + static void lo_readlink(fuse_req_t req, fuse_ino_t ino) + { +- char buf[PATH_MAX + 1]; +- int res; ++ char buf[PATH_MAX + 1]; ++ int res; + +- res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); +- if (res == -1) +- return (void) fuse_reply_err(req, errno); ++ res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); ++ if (res == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } + +- if (res == sizeof(buf)) +- return (void) fuse_reply_err(req, ENAMETOOLONG); ++ if (res == sizeof(buf)) { ++ return (void)fuse_reply_err(req, ENAMETOOLONG); ++ } + +- buf[res] = '\0'; ++ buf[res] = '\0'; + +- fuse_reply_readlink(req, buf); ++ fuse_reply_readlink(req, buf); + } + + struct lo_dirp { +- DIR *dp; +- struct dirent *entry; +- off_t offset; ++ DIR *dp; ++ struct dirent *entry; ++ off_t offset; + }; + + static struct lo_dirp *lo_dirp(struct fuse_file_info *fi) + { +- return (struct lo_dirp *) (uintptr_t) fi->fh; ++ return (struct lo_dirp *)(uintptr_t)fi->fh; + } + +-static void lo_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++static void lo_opendir(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) + { +- int error = ENOMEM; +- struct lo_data *lo = lo_data(req); +- struct lo_dirp *d; +- int fd; +- +- d = calloc(1, sizeof(struct lo_dirp)); +- if (d == NULL) +- goto out_err; +- +- fd = openat(lo_fd(req, ino), ".", O_RDONLY); +- if (fd == -1) +- goto out_errno; +- +- d->dp = fdopendir(fd); +- if (d->dp == NULL) +- goto out_errno; +- +- d->offset = 0; +- d->entry = NULL; +- +- fi->fh = (uintptr_t) d; +- if (lo->cache == CACHE_ALWAYS) +- fi->keep_cache = 1; +- fuse_reply_open(req, fi); +- return; ++ int error = ENOMEM; ++ struct lo_data *lo = lo_data(req); ++ struct lo_dirp *d; ++ int fd; ++ ++ d = calloc(1, sizeof(struct lo_dirp)); ++ if (d == NULL) { ++ goto out_err; ++ } ++ ++ fd = openat(lo_fd(req, ino), ".", O_RDONLY); ++ if (fd == -1) { ++ goto out_errno; ++ } ++ ++ d->dp = fdopendir(fd); ++ if (d->dp == NULL) { ++ goto out_errno; ++ } ++ ++ d->offset = 0; ++ d->entry = NULL; ++ ++ fi->fh = (uintptr_t)d; ++ if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ fuse_reply_open(req, fi); ++ return; + + out_errno: +- error = errno; ++ error = errno; + out_err: +- if (d) { +- if (fd != -1) +- close(fd); +- free(d); +- } +- fuse_reply_err(req, error); ++ if (d) { ++ if (fd != -1) { ++ close(fd); ++ } ++ free(d); ++ } ++ fuse_reply_err(req, error); + } + + static int is_dot_or_dotdot(const char *name) + { +- return name[0] == '.' && (name[1] == '\0' || +- (name[1] == '.' && name[2] == '\0')); ++ return name[0] == '.' && ++ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); + } + + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi, int plus) ++ off_t offset, struct fuse_file_info *fi, int plus) + { +- struct lo_dirp *d = lo_dirp(fi); +- char *buf; +- char *p; +- size_t rem = size; +- int err; +- +- (void) ino; +- +- buf = calloc(1, size); +- if (!buf) { +- err = ENOMEM; +- goto error; +- } +- p = buf; +- +- if (offset != d->offset) { +- seekdir(d->dp, offset); +- d->entry = NULL; +- d->offset = offset; +- } +- while (1) { +- size_t entsize; +- off_t nextoff; +- const char *name; +- +- if (!d->entry) { +- errno = 0; +- d->entry = readdir(d->dp); +- if (!d->entry) { +- if (errno) { // Error +- err = errno; +- goto error; +- } else { // End of stream +- break; +- } +- } +- } +- nextoff = d->entry->d_off; +- name = d->entry->d_name; +- fuse_ino_t entry_ino = 0; +- if (plus) { +- struct fuse_entry_param e; +- if (is_dot_or_dotdot(name)) { +- e = (struct fuse_entry_param) { +- .attr.st_ino = d->entry->d_ino, +- .attr.st_mode = d->entry->d_type << 12, +- }; +- } else { +- err = lo_do_lookup(req, ino, name, &e); +- if (err) +- goto error; +- entry_ino = e.ino; +- } +- +- entsize = fuse_add_direntry_plus(req, p, rem, name, +- &e, nextoff); +- } else { +- struct stat st = { +- .st_ino = d->entry->d_ino, +- .st_mode = d->entry->d_type << 12, +- }; +- entsize = fuse_add_direntry(req, p, rem, name, +- &st, nextoff); +- } +- if (entsize > rem) { +- if (entry_ino != 0) +- lo_forget_one(req, entry_ino, 1); +- break; +- } +- +- p += entsize; +- rem -= entsize; +- +- d->entry = NULL; +- d->offset = nextoff; +- } ++ struct lo_dirp *d = lo_dirp(fi); ++ char *buf; ++ char *p; ++ size_t rem = size; ++ int err; ++ ++ (void)ino; ++ ++ buf = calloc(1, size); ++ if (!buf) { ++ err = ENOMEM; ++ goto error; ++ } ++ p = buf; ++ ++ if (offset != d->offset) { ++ seekdir(d->dp, offset); ++ d->entry = NULL; ++ d->offset = offset; ++ } ++ while (1) { ++ size_t entsize; ++ off_t nextoff; ++ const char *name; ++ ++ if (!d->entry) { ++ errno = 0; ++ d->entry = readdir(d->dp); ++ if (!d->entry) { ++ if (errno) { /* Error */ ++ err = errno; ++ goto error; ++ } else { /* End of stream */ ++ break; ++ } ++ } ++ } ++ nextoff = d->entry->d_off; ++ name = d->entry->d_name; ++ fuse_ino_t entry_ino = 0; ++ if (plus) { ++ struct fuse_entry_param e; ++ if (is_dot_or_dotdot(name)) { ++ e = (struct fuse_entry_param){ ++ .attr.st_ino = d->entry->d_ino, ++ .attr.st_mode = d->entry->d_type << 12, ++ }; ++ } else { ++ err = lo_do_lookup(req, ino, name, &e); ++ if (err) { ++ goto error; ++ } ++ entry_ino = e.ino; ++ } ++ ++ entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); ++ } else { ++ struct stat st = { ++ .st_ino = d->entry->d_ino, ++ .st_mode = d->entry->d_type << 12, ++ }; ++ entsize = fuse_add_direntry(req, p, rem, name, &st, nextoff); ++ } ++ if (entsize > rem) { ++ if (entry_ino != 0) { ++ lo_forget_one(req, entry_ino, 1); ++ } ++ break; ++ } ++ ++ p += entsize; ++ rem -= entsize; ++ ++ d->entry = NULL; ++ d->offset = nextoff; ++ } + + err = 0; + error: +- // If there's an error, we can only signal it if we haven't stored +- // any entries yet - otherwise we'd end up with wrong lookup +- // counts for the entries that are already in the buffer. So we +- // return what we've collected until that point. +- if (err && rem == size) +- fuse_reply_err(req, err); +- else +- fuse_reply_buf(req, buf, size - rem); ++ /* ++ * If there's an error, we can only signal it if we haven't stored ++ * any entries yet - otherwise we'd end up with wrong lookup ++ * counts for the entries that are already in the buffer. So we ++ * return what we've collected until that point. ++ */ ++ if (err && rem == size) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_buf(req, buf, size - rem); ++ } + free(buf); + } + + static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi) ++ off_t offset, struct fuse_file_info *fi) + { +- lo_do_readdir(req, ino, size, offset, fi, 0); ++ lo_do_readdir(req, ino, size, offset, fi, 0); + } + + static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi) ++ off_t offset, struct fuse_file_info *fi) + { +- lo_do_readdir(req, ino, size, offset, fi, 1); ++ lo_do_readdir(req, ino, size, offset, fi, 1); + } + +-static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) + { +- struct lo_dirp *d = lo_dirp(fi); +- (void) ino; +- closedir(d->dp); +- free(d); +- fuse_reply_err(req, 0); ++ struct lo_dirp *d = lo_dirp(fi); ++ (void)ino; ++ closedir(d->dp); ++ free(d); ++ fuse_reply_err(req, 0); + } + + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, +- mode_t mode, struct fuse_file_info *fi) ++ mode_t mode, struct fuse_file_info *fi) + { +- int fd; +- struct lo_data *lo = lo_data(req); +- struct fuse_entry_param e; +- int err; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- +- fd = openat(lo_fd(req, parent), name, +- (fi->flags | O_CREAT) & ~O_NOFOLLOW, mode); +- if (fd == -1) +- return (void) fuse_reply_err(req, errno); +- +- fi->fh = fd; +- if (lo->cache == CACHE_NEVER) +- fi->direct_io = 1; +- else if (lo->cache == CACHE_ALWAYS) +- fi->keep_cache = 1; +- +- err = lo_do_lookup(req, parent, name, &e); +- if (err) +- fuse_reply_err(req, err); +- else +- fuse_reply_create(req, &e, fi); ++ int fd; ++ struct lo_data *lo = lo_data(req); ++ struct fuse_entry_param e; ++ int err; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", ++ parent, name); ++ } ++ ++ fd = openat(lo_fd(req, parent), name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, ++ mode); ++ if (fd == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) { ++ fi->direct_io = 1; ++ } else if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ ++ err = lo_do_lookup(req, parent, name, &e); ++ if (err) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_create(req, &e, fi); ++ } + } + + static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- int res; +- int fd = dirfd(lo_dirp(fi)->dp); +- (void) ino; +- if (datasync) +- res = fdatasync(fd); +- else +- res = fsync(fd); +- fuse_reply_err(req, res == -1 ? errno : 0); ++ int res; ++ int fd = dirfd(lo_dirp(fi)->dp); ++ (void)ino; ++ if (datasync) { ++ res = fdatasync(fd); ++ } else { ++ res = fsync(fd); ++ } ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { +- int fd; +- char buf[64]; +- struct lo_data *lo = lo_data(req); +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", +- ino, fi->flags); +- +- /* With writeback cache, kernel may send read requests even +- when userspace opened write-only */ +- if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { +- fi->flags &= ~O_ACCMODE; +- fi->flags |= O_RDWR; +- } +- +- /* With writeback cache, O_APPEND is handled by the kernel. +- This breaks atomicity (since the file may change in the +- underlying filesystem, so that the kernel's idea of the +- end of the file isn't accurate anymore). In this example, +- we just accept that. A more rigorous filesystem may want +- to return an error here */ +- if (lo->writeback && (fi->flags & O_APPEND)) +- fi->flags &= ~O_APPEND; +- +- sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); +- fd = open(buf, fi->flags & ~O_NOFOLLOW); +- if (fd == -1) +- return (void) fuse_reply_err(req, errno); +- +- fi->fh = fd; +- if (lo->cache == CACHE_NEVER) +- fi->direct_io = 1; +- else if (lo->cache == CACHE_ALWAYS) +- fi->keep_cache = 1; +- fuse_reply_open(req, fi); ++ int fd; ++ char buf[64]; ++ struct lo_data *lo = lo_data(req); ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, ++ fi->flags); ++ } ++ ++ /* ++ * With writeback cache, kernel may send read requests even ++ * when userspace opened write-only ++ */ ++ if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { ++ fi->flags &= ~O_ACCMODE; ++ fi->flags |= O_RDWR; ++ } ++ ++ /* ++ * With writeback cache, O_APPEND is handled by the kernel. ++ * This breaks atomicity (since the file may change in the ++ * underlying filesystem, so that the kernel's idea of the ++ * end of the file isn't accurate anymore). In this example, ++ * we just accept that. A more rigorous filesystem may want ++ * to return an error here ++ */ ++ if (lo->writeback && (fi->flags & O_APPEND)) { ++ fi->flags &= ~O_APPEND; ++ } ++ ++ sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ fd = open(buf, fi->flags & ~O_NOFOLLOW); ++ if (fd == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ ++ fi->fh = fd; ++ if (lo->cache == CACHE_NEVER) { ++ fi->direct_io = 1; ++ } else if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ fuse_reply_open(req, fi); + } + +-static void lo_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ++static void lo_release(fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi) + { +- (void) ino; ++ (void)ino; + +- close(fi->fh); +- fuse_reply_err(req, 0); ++ close(fi->fh); ++ fuse_reply_err(req, 0); + } + + static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { +- int res; +- (void) ino; +- res = close(dup(fi->fh)); +- fuse_reply_err(req, res == -1 ? errno : 0); ++ int res; ++ (void)ino; ++ res = close(dup(fi->fh)); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- int res; +- (void) ino; +- if (datasync) +- res = fdatasync(fi->fh); +- else +- res = fsync(fi->fh); +- fuse_reply_err(req, res == -1 ? errno : 0); ++ int res; ++ (void)ino; ++ if (datasync) { ++ res = fdatasync(fi->fh); ++ } else { ++ res = fsync(fi->fh); ++ } ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + +-static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, +- off_t offset, struct fuse_file_info *fi) ++static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, ++ struct fuse_file_info *fi) + { +- struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); ++ struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); + +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_read(ino=%" PRIu64 ", size=%zd, " +- "off=%lu)\n", ino, size, (unsigned long) offset); ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_read(ino=%" PRIu64 ", size=%zd, " ++ "off=%lu)\n", ++ ino, size, (unsigned long)offset); ++ } + +- buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- buf.buf[0].fd = fi->fh; +- buf.buf[0].pos = offset; ++ buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ buf.buf[0].fd = fi->fh; ++ buf.buf[0].pos = offset; + +- fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); ++ fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); + } + + static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, +- struct fuse_bufvec *in_buf, off_t off, +- struct fuse_file_info *fi) ++ struct fuse_bufvec *in_buf, off_t off, ++ struct fuse_file_info *fi) + { +- (void) ino; +- ssize_t res; +- struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); +- +- out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- out_buf.buf[0].fd = fi->fh; +- out_buf.buf[0].pos = off; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", +- ino, out_buf.buf[0].size, (unsigned long) off); +- +- res = fuse_buf_copy(&out_buf, in_buf, 0); +- if(res < 0) +- fuse_reply_err(req, -res); +- else +- fuse_reply_write(req, (size_t) res); ++ (void)ino; ++ ssize_t res; ++ struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); ++ ++ out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; ++ out_buf.buf[0].fd = fi->fh; ++ out_buf.buf[0].pos = off; ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, ++ out_buf.buf[0].size, (unsigned long)off); ++ } ++ ++ res = fuse_buf_copy(&out_buf, in_buf, 0); ++ if (res < 0) { ++ fuse_reply_err(req, -res); ++ } else { ++ fuse_reply_write(req, (size_t)res); ++ } + } + + static void lo_statfs(fuse_req_t req, fuse_ino_t ino) + { +- int res; +- struct statvfs stbuf; +- +- res = fstatvfs(lo_fd(req, ino), &stbuf); +- if (res == -1) +- fuse_reply_err(req, errno); +- else +- fuse_reply_statfs(req, &stbuf); ++ int res; ++ struct statvfs stbuf; ++ ++ res = fstatvfs(lo_fd(req, ino), &stbuf); ++ if (res == -1) { ++ fuse_reply_err(req, errno); ++ } else { ++ fuse_reply_statfs(req, &stbuf); ++ } + } + +-static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, +- off_t offset, off_t length, struct fuse_file_info *fi) ++static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, ++ off_t length, struct fuse_file_info *fi) + { +- int err = EOPNOTSUPP; +- (void) ino; ++ int err = EOPNOTSUPP; ++ (void)ino; + + #ifdef HAVE_FALLOCATE +- err = fallocate(fi->fh, mode, offset, length); +- if (err < 0) +- err = errno; ++ err = fallocate(fi->fh, mode, offset, length); ++ if (err < 0) { ++ err = errno; ++ } + + #elif defined(HAVE_POSIX_FALLOCATE) +- if (mode) { +- fuse_reply_err(req, EOPNOTSUPP); +- return; +- } ++ if (mode) { ++ fuse_reply_err(req, EOPNOTSUPP); ++ return; ++ } + +- err = posix_fallocate(fi->fh, offset, length); ++ err = posix_fallocate(fi->fh, offset, length); + #endif + +- fuse_reply_err(req, err); ++ fuse_reply_err(req, err); + } + + static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, +- int op) ++ int op) + { +- int res; +- (void) ino; ++ int res; ++ (void)ino; + +- res = flock(fi->fh, op); ++ res = flock(fi->fh, op); + +- fuse_reply_err(req, res == -1 ? errno : 0); ++ fuse_reply_err(req, res == -1 ? errno : 0); + } + + static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, +- size_t size) ++ size_t size) + { +- char *value = NULL; +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; +- +- saverr = ENOSYS; +- if (!lo_data(req)->xattr) +- goto out; +- +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", +- ino, name, size); +- } +- +- if (inode->is_symlink) { +- /* Sorry, no race free way to getxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- +- sprintf(procname, "/proc/self/fd/%i", inode->fd); +- +- if (size) { +- value = malloc(size); +- if (!value) +- goto out_err; +- +- ret = getxattr(procname, name, value, size); +- if (ret == -1) +- goto out_err; +- saverr = 0; +- if (ret == 0) +- goto out; +- +- fuse_reply_buf(req, value, ret); +- } else { +- ret = getxattr(procname, name, NULL, 0); +- if (ret == -1) +- goto out_err; +- +- fuse_reply_xattr(req, ret); +- } ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ino, name, ++ size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to getxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ ++ ret = getxattr(procname, name, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } ++ saverr = 0; ++ if (ret == 0) { ++ goto out; ++ } ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = getxattr(procname, name, NULL, 0); ++ if (ret == -1) { ++ goto out_err; ++ } ++ ++ fuse_reply_xattr(req, ret); ++ } + out_free: +- free(value); +- return; ++ free(value); ++ return; + + out_err: +- saverr = errno; ++ saverr = errno; + out: +- fuse_reply_err(req, saverr); +- goto out_free; ++ fuse_reply_err(req, saverr); ++ goto out_free; + } + + static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + { +- char *value = NULL; +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; +- +- saverr = ENOSYS; +- if (!lo_data(req)->xattr) +- goto out; +- +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", +- ino, size); +- } +- +- if (inode->is_symlink) { +- /* Sorry, no race free way to listxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } +- +- sprintf(procname, "/proc/self/fd/%i", inode->fd); +- +- if (size) { +- value = malloc(size); +- if (!value) +- goto out_err; +- +- ret = listxattr(procname, value, size); +- if (ret == -1) +- goto out_err; +- saverr = 0; +- if (ret == 0) +- goto out; +- +- fuse_reply_buf(req, value, ret); +- } else { +- ret = listxattr(procname, NULL, 0); +- if (ret == -1) +- goto out_err; +- +- fuse_reply_xattr(req, ret); +- } ++ char *value = NULL; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; ++ ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } ++ ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ++ ino, size); ++ } ++ ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to listxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } ++ ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ ++ ret = listxattr(procname, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } ++ saverr = 0; ++ if (ret == 0) { ++ goto out; ++ } ++ ++ fuse_reply_buf(req, value, ret); ++ } else { ++ ret = listxattr(procname, NULL, 0); ++ if (ret == -1) { ++ goto out_err; ++ } ++ ++ fuse_reply_xattr(req, ret); ++ } + out_free: +- free(value); +- return; ++ free(value); ++ return; + + out_err: +- saverr = errno; ++ saverr = errno; + out: +- fuse_reply_err(req, saverr); +- goto out_free; ++ fuse_reply_err(req, saverr); ++ goto out_free; + } + + static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, +- const char *value, size_t size, int flags) ++ const char *value, size_t size, int flags) + { +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; + +- saverr = ENOSYS; +- if (!lo_data(req)->xattr) +- goto out; ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", +- ino, name, value, size); +- } ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", ++ ino, name, value, size); ++ } + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); + +- ret = setxattr(procname, name, value, size, flags); +- saverr = ret == -1 ? errno : 0; ++ ret = setxattr(procname, name, value, size, flags); ++ saverr = ret == -1 ? errno : 0; + + out: +- fuse_reply_err(req, saverr); ++ fuse_reply_err(req, saverr); + } + + static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + { +- char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- ssize_t ret; +- int saverr; ++ char procname[64]; ++ struct lo_inode *inode = lo_inode(req, ino); ++ ssize_t ret; ++ int saverr; + +- saverr = ENOSYS; +- if (!lo_data(req)->xattr) +- goto out; ++ saverr = ENOSYS; ++ if (!lo_data(req)->xattr) { ++ goto out; ++ } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", +- ino, name); +- } ++ if (lo_debug(req)) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ++ ino, name); ++ } + +- if (inode->is_symlink) { +- /* Sorry, no race free way to setxattr on symlink. */ +- saverr = EPERM; +- goto out; +- } ++ if (inode->is_symlink) { ++ /* Sorry, no race free way to setxattr on symlink. */ ++ saverr = EPERM; ++ goto out; ++ } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); + +- ret = removexattr(procname, name); +- saverr = ret == -1 ? errno : 0; ++ ret = removexattr(procname, name); ++ saverr = ret == -1 ? errno : 0; + + out: +- fuse_reply_err(req, saverr); ++ fuse_reply_err(req, saverr); + } + + #ifdef HAVE_COPY_FILE_RANGE + static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, +- struct fuse_file_info *fi_in, +- fuse_ino_t ino_out, off_t off_out, +- struct fuse_file_info *fi_out, size_t len, +- int flags) ++ struct fuse_file_info *fi_in, fuse_ino_t ino_out, ++ off_t off_out, struct fuse_file_info *fi_out, ++ size_t len, int flags) + { +- ssize_t res; +- +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, size=%zd, flags=0x%x)\n", +- ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, +- len, flags); +- +- res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, +- flags); +- if (res < 0) +- fuse_reply_err(req, -errno); +- else +- fuse_reply_write(req, res); ++ ssize_t res; ++ ++ if (lo_debug(req)) ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, ino=%" PRIu64 "/fd=%lu, " ++ "off=%lu, size=%zd, flags=0x%x)\n", ++ ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, len, ++ flags); ++ ++ res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, flags); ++ if (res < 0) { ++ fuse_reply_err(req, -errno); ++ } else { ++ fuse_reply_write(req, res); ++ } + } + #endif + + static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, +- struct fuse_file_info *fi) ++ struct fuse_file_info *fi) + { +- off_t res; +- +- (void)ino; +- res = lseek(fi->fh, off, whence); +- if (res != -1) +- fuse_reply_lseek(req, res); +- else +- fuse_reply_err(req, errno); ++ off_t res; ++ ++ (void)ino; ++ res = lseek(fi->fh, off, whence); ++ if (res != -1) { ++ fuse_reply_lseek(req, res); ++ } else { ++ fuse_reply_err(req, errno); ++ } + } + + static struct fuse_lowlevel_ops lo_oper = { +- .init = lo_init, +- .lookup = lo_lookup, +- .mkdir = lo_mkdir, +- .mknod = lo_mknod, +- .symlink = lo_symlink, +- .link = lo_link, +- .unlink = lo_unlink, +- .rmdir = lo_rmdir, +- .rename = lo_rename, +- .forget = lo_forget, +- .forget_multi = lo_forget_multi, +- .getattr = lo_getattr, +- .setattr = lo_setattr, +- .readlink = lo_readlink, +- .opendir = lo_opendir, +- .readdir = lo_readdir, +- .readdirplus = lo_readdirplus, +- .releasedir = lo_releasedir, +- .fsyncdir = lo_fsyncdir, +- .create = lo_create, +- .open = lo_open, +- .release = lo_release, +- .flush = lo_flush, +- .fsync = lo_fsync, +- .read = lo_read, +- .write_buf = lo_write_buf, +- .statfs = lo_statfs, +- .fallocate = lo_fallocate, +- .flock = lo_flock, +- .getxattr = lo_getxattr, +- .listxattr = lo_listxattr, +- .setxattr = lo_setxattr, +- .removexattr = lo_removexattr, ++ .init = lo_init, ++ .lookup = lo_lookup, ++ .mkdir = lo_mkdir, ++ .mknod = lo_mknod, ++ .symlink = lo_symlink, ++ .link = lo_link, ++ .unlink = lo_unlink, ++ .rmdir = lo_rmdir, ++ .rename = lo_rename, ++ .forget = lo_forget, ++ .forget_multi = lo_forget_multi, ++ .getattr = lo_getattr, ++ .setattr = lo_setattr, ++ .readlink = lo_readlink, ++ .opendir = lo_opendir, ++ .readdir = lo_readdir, ++ .readdirplus = lo_readdirplus, ++ .releasedir = lo_releasedir, ++ .fsyncdir = lo_fsyncdir, ++ .create = lo_create, ++ .open = lo_open, ++ .release = lo_release, ++ .flush = lo_flush, ++ .fsync = lo_fsync, ++ .read = lo_read, ++ .write_buf = lo_write_buf, ++ .statfs = lo_statfs, ++ .fallocate = lo_fallocate, ++ .flock = lo_flock, ++ .getxattr = lo_getxattr, ++ .listxattr = lo_listxattr, ++ .setxattr = lo_setxattr, ++ .removexattr = lo_removexattr, + #ifdef HAVE_COPY_FILE_RANGE +- .copy_file_range = lo_copy_file_range, ++ .copy_file_range = lo_copy_file_range, + #endif +- .lseek = lo_lseek, ++ .lseek = lo_lseek, + }; + + int main(int argc, char *argv[]) + { +- struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +- struct fuse_session *se; +- struct fuse_cmdline_opts opts; +- struct lo_data lo = { .debug = 0, +- .writeback = 0 }; +- int ret = -1; +- +- /* Don't mask creation mode, kernel already did that */ +- umask(0); +- +- pthread_mutex_init(&lo.mutex, NULL); +- lo.root.next = lo.root.prev = &lo.root; +- lo.root.fd = -1; +- lo.cache = CACHE_NORMAL; +- +- if (fuse_parse_cmdline(&args, &opts) != 0) +- return 1; +- if (opts.show_help) { +- printf("usage: %s [options] \n\n", argv[0]); +- fuse_cmdline_help(); +- fuse_lowlevel_help(); +- ret = 0; +- goto err_out1; +- } else if (opts.show_version) { +- fuse_lowlevel_version(); +- ret = 0; +- goto err_out1; +- } +- +- if(opts.mountpoint == NULL) { +- printf("usage: %s [options] \n", argv[0]); +- printf(" %s --help\n", argv[0]); +- ret = 1; +- goto err_out1; +- } +- +- if (fuse_opt_parse(&args, &lo, lo_opts, NULL)== -1) +- return 1; +- +- lo.debug = opts.debug; +- lo.root.refcount = 2; +- if (lo.source) { +- struct stat stat; +- int res; +- +- res = lstat(lo.source, &stat); +- if (res == -1) { +- fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", +- lo.source); +- exit(1); +- } +- if (!S_ISDIR(stat.st_mode)) { +- fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); +- exit(1); +- } +- +- } else { +- lo.source = "/"; +- } +- lo.root.is_symlink = false; +- if (!lo.timeout_set) { +- switch (lo.cache) { +- case CACHE_NEVER: +- lo.timeout = 0.0; +- break; +- +- case CACHE_NORMAL: +- lo.timeout = 1.0; +- break; +- +- case CACHE_ALWAYS: +- lo.timeout = 86400.0; +- break; +- } +- } else if (lo.timeout < 0) { +- fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", +- lo.timeout); +- exit(1); +- } +- +- lo.root.fd = open(lo.source, O_PATH); +- if (lo.root.fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", +- lo.source); +- exit(1); +- } +- +- se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); +- if (se == NULL) +- goto err_out1; +- +- if (fuse_set_signal_handlers(se) != 0) +- goto err_out2; +- +- if (fuse_session_mount(se, opts.mountpoint) != 0) +- goto err_out3; +- +- fuse_daemonize(opts.foreground); +- +- /* Block until ctrl+c or fusermount -u */ +- if (opts.singlethread) +- ret = fuse_session_loop(se); +- else +- ret = fuse_session_loop_mt(se, opts.clone_fd); +- +- fuse_session_unmount(se); ++ struct fuse_args args = FUSE_ARGS_INIT(argc, argv); ++ struct fuse_session *se; ++ struct fuse_cmdline_opts opts; ++ struct lo_data lo = { .debug = 0, .writeback = 0 }; ++ int ret = -1; ++ ++ /* Don't mask creation mode, kernel already did that */ ++ umask(0); ++ ++ pthread_mutex_init(&lo.mutex, NULL); ++ lo.root.next = lo.root.prev = &lo.root; ++ lo.root.fd = -1; ++ lo.cache = CACHE_NORMAL; ++ ++ if (fuse_parse_cmdline(&args, &opts) != 0) { ++ return 1; ++ } ++ if (opts.show_help) { ++ printf("usage: %s [options] \n\n", argv[0]); ++ fuse_cmdline_help(); ++ fuse_lowlevel_help(); ++ ret = 0; ++ goto err_out1; ++ } else if (opts.show_version) { ++ fuse_lowlevel_version(); ++ ret = 0; ++ goto err_out1; ++ } ++ ++ if (opts.mountpoint == NULL) { ++ printf("usage: %s [options] \n", argv[0]); ++ printf(" %s --help\n", argv[0]); ++ ret = 1; ++ goto err_out1; ++ } ++ ++ if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { ++ return 1; ++ } ++ ++ lo.debug = opts.debug; ++ lo.root.refcount = 2; ++ if (lo.source) { ++ struct stat stat; ++ int res; ++ ++ res = lstat(lo.source, &stat); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", ++ lo.source); ++ exit(1); ++ } ++ if (!S_ISDIR(stat.st_mode)) { ++ fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); ++ exit(1); ++ } ++ ++ } else { ++ lo.source = "/"; ++ } ++ lo.root.is_symlink = false; ++ if (!lo.timeout_set) { ++ switch (lo.cache) { ++ case CACHE_NEVER: ++ lo.timeout = 0.0; ++ break; ++ ++ case CACHE_NORMAL: ++ lo.timeout = 1.0; ++ break; ++ ++ case CACHE_ALWAYS: ++ lo.timeout = 86400.0; ++ break; ++ } ++ } else if (lo.timeout < 0) { ++ fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout); ++ exit(1); ++ } ++ ++ lo.root.fd = open(lo.source, O_PATH); ++ if (lo.root.fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", lo.source); ++ exit(1); ++ } ++ ++ se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); ++ if (se == NULL) { ++ goto err_out1; ++ } ++ ++ if (fuse_set_signal_handlers(se) != 0) { ++ goto err_out2; ++ } ++ ++ if (fuse_session_mount(se, opts.mountpoint) != 0) { ++ goto err_out3; ++ } ++ ++ fuse_daemonize(opts.foreground); ++ ++ /* Block until ctrl+c or fusermount -u */ ++ if (opts.singlethread) { ++ ret = fuse_session_loop(se); ++ } else { ++ ret = fuse_session_loop_mt(se, opts.clone_fd); ++ } ++ ++ fuse_session_unmount(se); + err_out3: +- fuse_remove_signal_handlers(se); ++ fuse_remove_signal_handlers(se); + err_out2: +- fuse_session_destroy(se); ++ fuse_session_destroy(se); + err_out1: +- free(opts.mountpoint); +- fuse_opt_free_args(&args); ++ free(opts.mountpoint); ++ fuse_opt_free_args(&args); + +- if (lo.root.fd >= 0) +- close(lo.root.fd); ++ if (lo.root.fd >= 0) { ++ close(lo.root.fd); ++ } + +- return ret ? 1 : 0; ++ return ret ? 1 : 0; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Handle-hard-reboot.patch b/kvm-virtiofsd-Handle-hard-reboot.patch new file mode 100755 index 0000000000000000000000000000000000000000..88880302396c10ef03c0cce3c13b341d7b9a4ad4 --- /dev/null +++ b/kvm-virtiofsd-Handle-hard-reboot.patch @@ -0,0 +1,65 @@ +From 616407b06517361ce444dcc0960aeaf55b52da33 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:41 +0100 +Subject: [PATCH 070/116] virtiofsd: Handle hard reboot +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-67-dgilbert@redhat.com> +Patchwork-id: 93521 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 066/112] virtiofsd: Handle hard reboot +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Handle a + mount + hard reboot (without unmount) + mount + +we get another 'init' which FUSE doesn't normally expect. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e8556f49098b5d95634e592d79a97f761b76c96e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 16 +++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 7d742b5..65f91da 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2433,7 +2433,21 @@ void fuse_session_process_buf_int(struct fuse_session *se, + goto reply_err; + } + } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) { +- goto reply_err; ++ if (fuse_lowlevel_is_virtio(se)) { ++ /* ++ * TODO: This is after a hard reboot typically, we need to do ++ * a destroy, but we can't reply to this request yet so ++ * we can't use do_destroy ++ */ ++ fuse_log(FUSE_LOG_DEBUG, "%s: reinit\n", __func__); ++ se->got_destroy = 1; ++ se->got_init = 0; ++ if (se->op.destroy) { ++ se->op.destroy(se->userdata); ++ } ++ } else { ++ goto reply_err; ++ } + } + + err = EACCES; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Handle-reinit.patch b/kvm-virtiofsd-Handle-reinit.patch new file mode 100755 index 0000000000000000000000000000000000000000..3f9577bb379bef9b60814b6718f3b4538dfc51d8 --- /dev/null +++ b/kvm-virtiofsd-Handle-reinit.patch @@ -0,0 +1,53 @@ +From 485adfa1aa1b3e2d1449edf5c42d6ec396cbfb5d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:40 +0100 +Subject: [PATCH 069/116] virtiofsd: Handle reinit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-66-dgilbert@redhat.com> +Patchwork-id: 93520 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 065/112] virtiofsd: Handle reinit +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Allow init->destroy->init for mount->umount->mount + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit c806d6435fe95fd54b379920aca2f4e3ea1f3258) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index a7a1968..7d742b5 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2028,6 +2028,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + } + + se->got_init = 1; ++ se->got_destroy = 0; + if (se->op.init) { + se->op.init(se->userdata, &se->conn); + } +@@ -2130,6 +2131,7 @@ static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, + (void)iter; + + se->got_destroy = 1; ++ se->got_init = 0; + if (se->op.destroy) { + se->op.destroy(se->userdata); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Keep-track-of-replies.patch b/kvm-virtiofsd-Keep-track-of-replies.patch new file mode 100755 index 0000000000000000000000000000000000000000..18be3e0b6caa1c234350beb199ef5f34fea8f0aa --- /dev/null +++ b/kvm-virtiofsd-Keep-track-of-replies.patch @@ -0,0 +1,116 @@ +From c818a1cb603cad07aa5c49ce808aa09435667c7c Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:04 +0100 +Subject: [PATCH 033/116] virtiofsd: Keep track of replies +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-30-dgilbert@redhat.com> +Patchwork-id: 93481 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 029/112] virtiofsd: Keep track of replies +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Keep track of whether we sent a reply to a request; this is a bit +paranoid but it means: + a) We should always recycle an element even if there was an error + in the request + b) Never try and send two replies on one queue element + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 2f65e69a7f22da8d20c747f34f339ebb40a0634f) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 05d0e29..f1adeb6 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -44,6 +44,7 @@ struct fv_QueueInfo { + + /* The element for the command currently being processed */ + VuVirtqElement *qe; ++ bool reply_sent; + }; + + /* +@@ -178,6 +179,7 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + { + VuVirtqElement *elem; + VuVirtq *q; ++ int ret = 0; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); +@@ -191,6 +193,7 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + assert(out->unique); + /* For virtio we always have ch */ + assert(ch); ++ assert(!ch->qi->reply_sent); + elem = ch->qi->qe; + q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; + +@@ -208,19 +211,23 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + if (in_len < sizeof(struct fuse_out_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", + __func__, elem->index); +- return -E2BIG; ++ ret = -E2BIG; ++ goto err; + } + if (in_len < tosend_len) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", + __func__, elem->index, tosend_len); +- return -E2BIG; ++ ret = -E2BIG; ++ goto err; + } + + copy_iov(iov, count, in_sg, in_num, tosend_len); + vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); + vu_queue_notify(&se->virtio_dev->dev, q); ++ ch->qi->reply_sent = true; + +- return 0; ++err: ++ return ret; + } + + /* Thread function for individual queues, created when a queue is 'started' */ +@@ -296,6 +303,9 @@ static void *fv_queue_thread(void *opaque) + break; + } + ++ qi->qe = elem; ++ qi->reply_sent = false; ++ + if (!fbuf.mem) { + fbuf.mem = malloc(se->bufsize); + assert(fbuf.mem); +@@ -331,6 +341,13 @@ static void *fv_queue_thread(void *opaque) + /* TODO: Add checks for fuse_session_exited */ + fuse_session_process_buf_int(se, &fbuf, &ch); + ++ if (!qi->reply_sent) { ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", ++ __func__, elem->index); ++ /* I think we've still got to recycle the element */ ++ vu_queue_push(dev, q, elem, 0); ++ vu_queue_notify(dev, q); ++ } + qi->qe = NULL; + free(elem); + elem = NULL; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch b/kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch new file mode 100755 index 0000000000000000000000000000000000000000..5e054f344438062d89d4a00db5fbfd2d9b223ab1 --- /dev/null +++ b/kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch @@ -0,0 +1,143 @@ +From b37344c38b866c7e7fb773b4a3172a39306bac7e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:42 +0100 +Subject: [PATCH 071/116] virtiofsd: Kill threads when queues are stopped +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-68-dgilbert@redhat.com> +Patchwork-id: 93522 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 067/112] virtiofsd: Kill threads when queues are stopped +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Kill the threads we've started when the queues get stopped. + +Signed-off-by: Dr. David Alan Gilbert +With improvements by: +Signed-off-by: Eryu Guan +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 10477ac47fc57d00a84802ff97c15450cd8021c1) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 51 +++++++++++++++++++++++++++++++++++++------ + 1 file changed, 44 insertions(+), 7 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 872968f..7a8774a 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -41,6 +41,7 @@ struct fv_QueueInfo { + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; ++ int kill_fd; /* For killing the thread */ + + /* The element for the command currently being processed */ + VuVirtqElement *qe; +@@ -412,14 +413,17 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { +- struct pollfd pf[1]; ++ struct pollfd pf[2]; + pf[0].fd = qi->kick_fd; + pf[0].events = POLLIN; + pf[0].revents = 0; ++ pf[1].fd = qi->kill_fd; ++ pf[1].events = POLLIN; ++ pf[1].revents = 0; + + fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__, + qi->qidx); +- int poll_res = ppoll(pf, 1, NULL, NULL); ++ int poll_res = ppoll(pf, 2, NULL, NULL); + + if (poll_res == -1) { + if (errno == EINTR) { +@@ -430,12 +434,23 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n"); + break; + } +- assert(poll_res == 1); ++ assert(poll_res >= 1); + if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n", + __func__, pf[0].revents, qi->qidx); + break; + } ++ if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) { ++ fuse_log(FUSE_LOG_ERR, ++ "%s: Unexpected poll revents %x Queue %d killfd\n", ++ __func__, pf[1].revents, qi->qidx); ++ break; ++ } ++ if (pf[1].revents) { ++ fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n", ++ __func__, qi->qidx); ++ break; ++ } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__, + qi->qidx); +@@ -589,6 +604,28 @@ out: + return NULL; + } + ++static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) ++{ ++ int ret; ++ struct fv_QueueInfo *ourqi; ++ ++ assert(qidx < vud->nqueues); ++ ourqi = vud->qi[qidx]; ++ ++ /* Kill the thread */ ++ if (eventfd_write(ourqi->kill_fd, 1)) { ++ fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n", ++ qidx, strerror(errno)); ++ } ++ ret = pthread_join(ourqi->thread, NULL); ++ if (ret) { ++ fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n", ++ __func__, qidx, ret); ++ } ++ close(ourqi->kill_fd); ++ ourqi->kick_fd = -1; ++} ++ + /* Callback from libvhost-user on start or stop of a queue */ + static void fv_queue_set_started(VuDev *dev, int qidx, bool started) + { +@@ -633,16 +670,16 @@ static void fv_queue_set_started(VuDev *dev, int qidx, bool started) + } + ourqi = vud->qi[qidx]; + ourqi->kick_fd = dev->vq[qidx].kick_fd; ++ ++ ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE); ++ assert(ourqi->kill_fd != -1); + if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", + __func__, qidx); + assert(0); + } + } else { +- /* TODO: Kill the thread */ +- assert(qidx < vud->nqueues); +- ourqi = vud->qi[qidx]; +- ourqi->kick_fd = -1; ++ fv_queue_cleanup_thread(vud, qidx); + } + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch b/kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch new file mode 100755 index 0000000000000000000000000000000000000000..98211cbfa1b4b391122951ab3c352218b8db4b16 --- /dev/null +++ b/kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch @@ -0,0 +1,96 @@ +From f09f13f9a001a50ee3465c165f4bbaf870fcadb9 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:53 +0100 +Subject: [PATCH 022/116] virtiofsd: Make fsync work even if only inode is + passed in +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-19-dgilbert@redhat.com> +Patchwork-id: 93472 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 018/112] virtiofsd: Make fsync work even if only inode is passed in +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +If caller has not sent file handle in request, then using inode, retrieve +the fd opened using O_PATH and use that to open file again and issue +fsync. This will be needed when dax_flush() calls fsync. At that time +we only have inode information (and not file). + +Signed-off-by: Vivek Goyal +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 1b209805f8159c3f4d89ddb9390a5f64887cebff) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 6 +++++- + tools/virtiofsd/passthrough_ll.c | 28 ++++++++++++++++++++++++++-- + 2 files changed, 31 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 514d79c..8552cfb 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1075,7 +1075,11 @@ static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + fi.fh = arg->fh; + + if (req->se->op.fsync) { +- req->se->op.fsync(req, nodeid, datasync, &fi); ++ if (fi.fh == (uint64_t)-1) { ++ req->se->op.fsync(req, nodeid, datasync, NULL); ++ } else { ++ req->se->op.fsync(req, nodeid, datasync, &fi); ++ } + } else { + fuse_reply_err(req, ENOSYS); + } +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 6c4da18..26ac870 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -903,10 +903,34 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + { + int res; + (void)ino; ++ int fd; ++ char *buf; ++ ++ fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, ++ (void *)fi); ++ ++ if (!fi) { ++ res = asprintf(&buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ if (res == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ ++ fd = open(buf, O_RDWR); ++ free(buf); ++ if (fd == -1) { ++ return (void)fuse_reply_err(req, errno); ++ } ++ } else { ++ fd = fi->fh; ++ } ++ + if (datasync) { +- res = fdatasync(fi->fh); ++ res = fdatasync(fd); + } else { +- res = fsync(fi->fh); ++ res = fsync(fd); ++ } ++ if (!fi) { ++ close(fd); + } + fuse_reply_err(req, res == -1 ? errno : 0); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch b/kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch new file mode 100755 index 0000000000000000000000000000000000000000..2c9874dea6b4de566deff03d6291578fb831dbe5 --- /dev/null +++ b/kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch @@ -0,0 +1,257 @@ +From a96042f05eaf494fbe26a9cbd940f5f815f782f9 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:56 +0100 +Subject: [PATCH 025/116] virtiofsd: Open vhost connection instead of mounting +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-22-dgilbert@redhat.com> +Patchwork-id: 93476 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 021/112] virtiofsd: Open vhost connection instead of mounting +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +When run with vhost-user options we conect to the QEMU instead +via a socket. Start this off by creating the socket. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit d14bf584dd965821e80d14c16d9292a464b1ab85) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 7 ++-- + tools/virtiofsd/fuse_lowlevel.c | 55 ++++------------------------ + tools/virtiofsd/fuse_virtio.c | 79 +++++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_virtio.h | 23 ++++++++++++ + 4 files changed, 114 insertions(+), 50 deletions(-) + create mode 100644 tools/virtiofsd/fuse_virtio.c + create mode 100644 tools/virtiofsd/fuse_virtio.h + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 26b1a7d..82d6ac7 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -6,9 +6,10 @@ + * See the file COPYING.LIB + */ + +-#define FUSE_USE_VERSION 31 +- ++#ifndef FUSE_I_H ++#define FUSE_I_H + ++#define FUSE_USE_VERSION 31 + #include "fuse.h" + #include "fuse_lowlevel.h" + +@@ -101,3 +102,5 @@ void fuse_session_process_buf_int(struct fuse_session *se, + + /* room needed in buffer to accommodate header */ + #define FUSE_BUFFER_HEADER_SIZE 0x1000 ++ ++#endif +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 17e8718..5df124e 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -14,6 +14,7 @@ + #include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" ++#include "fuse_virtio.h" + + #include + #include +@@ -2202,6 +2203,11 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out4; + } + ++ if (!se->vu_socket_path) { ++ fprintf(stderr, "fuse: missing -o vhost_user_socket option\n"); ++ goto out4; ++ } ++ + se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE; + + list_init_req(&se->list); +@@ -2224,54 +2230,7 @@ out1: + + int fuse_session_mount(struct fuse_session *se) + { +- int fd; +- +- /* +- * Make sure file descriptors 0, 1 and 2 are open, otherwise chaos +- * would ensue. +- */ +- do { +- fd = open("/dev/null", O_RDWR); +- if (fd > 2) { +- close(fd); +- } +- } while (fd >= 0 && fd <= 2); +- +- /* +- * To allow FUSE daemons to run without privileges, the caller may open +- * /dev/fuse before launching the file system and pass on the file +- * descriptor by specifying /dev/fd/N as the mount point. Note that the +- * parent process takes care of performing the mount in this case. +- */ +- fd = fuse_mnt_parse_fuse_fd(mountpoint); +- if (fd != -1) { +- if (fcntl(fd, F_GETFD) == -1) { +- fuse_log(FUSE_LOG_ERR, "fuse: Invalid file descriptor /dev/fd/%u\n", +- fd); +- return -1; +- } +- se->fd = fd; +- return 0; +- } +- +- /* Open channel */ +- fd = fuse_kern_mount(mountpoint, se->mo); +- if (fd == -1) { +- return -1; +- } +- se->fd = fd; +- +- /* Save mountpoint */ +- se->mountpoint = strdup(mountpoint); +- if (se->mountpoint == NULL) { +- goto error_out; +- } +- +- return 0; +- +-error_out: +- fuse_kern_unmount(mountpoint, fd); +- return -1; ++ return virtio_session_mount(se); + } + + int fuse_session_fd(struct fuse_session *se) +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +new file mode 100644 +index 0000000..cbef6ff +--- /dev/null ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -0,0 +1,79 @@ ++/* ++ * virtio-fs glue for FUSE ++ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates ++ * ++ * Authors: ++ * Dave Gilbert ++ * ++ * Implements the glue between libfuse and libvhost-user ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ ++ ++#include "fuse_i.h" ++#include "standard-headers/linux/fuse.h" ++#include "fuse_misc.h" ++#include "fuse_opt.h" ++#include "fuse_virtio.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* From spec */ ++struct virtio_fs_config { ++ char tag[36]; ++ uint32_t num_queues; ++}; ++ ++int virtio_session_mount(struct fuse_session *se) ++{ ++ struct sockaddr_un un; ++ mode_t old_umask; ++ ++ if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) { ++ fuse_log(FUSE_LOG_ERR, "Socket path too long\n"); ++ return -1; ++ } ++ ++ se->fd = -1; ++ ++ /* ++ * Create the Unix socket to communicate with qemu ++ * based on QEMU's vhost-user-bridge ++ */ ++ unlink(se->vu_socket_path); ++ strcpy(un.sun_path, se->vu_socket_path); ++ size_t addr_len = sizeof(un); ++ ++ int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0); ++ if (listen_sock == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n"); ++ return -1; ++ } ++ un.sun_family = AF_UNIX; ++ ++ /* ++ * Unfortunately bind doesn't let you set the mask on the socket, ++ * so set umask to 077 and restore it later. ++ */ ++ old_umask = umask(0077); ++ if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n"); ++ umask(old_umask); ++ return -1; ++ } ++ umask(old_umask); ++ ++ if (listen(listen_sock, 1) == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n"); ++ return -1; ++ } ++ ++ return -1; ++} +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +new file mode 100644 +index 0000000..8f2edb6 +--- /dev/null ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -0,0 +1,23 @@ ++/* ++ * virtio-fs glue for FUSE ++ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates ++ * ++ * Authors: ++ * Dave Gilbert ++ * ++ * Implements the glue between libfuse and libvhost-user ++ * ++ * This program can be distributed under the terms of the GNU LGPLv2. ++ * See the file COPYING.LIB ++ */ ++ ++#ifndef FUSE_VIRTIO_H ++#define FUSE_VIRTIO_H ++ ++#include "fuse_i.h" ++ ++struct fuse_session; ++ ++int virtio_session_mount(struct fuse_session *se); ++ ++#endif +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch b/kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch new file mode 100755 index 0000000000000000000000000000000000000000..8d8de781d2306eac9972509a1047a4f46b32878b --- /dev/null +++ b/kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch @@ -0,0 +1,76 @@ +From ade3dcad8a907d281549b341a8908851e36ba458 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:31 +0100 +Subject: [PATCH 060/116] virtiofsd: Parse flag FUSE_WRITE_KILL_PRIV +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-57-dgilbert@redhat.com> +Patchwork-id: 93505 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 056/112] virtiofsd: Parse flag FUSE_WRITE_KILL_PRIV +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +Caller can set FUSE_WRITE_KILL_PRIV in write_flags. Parse it and pass it +to the filesystem. + +Signed-off-by: Vivek Goyal +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f779bc5265e7e7abb13a03d4bfbc74151afc15c2) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_common.h | 6 +++++- + tools/virtiofsd/fuse_lowlevel.c | 4 +++- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index f8f6433..686c42c 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -93,8 +93,12 @@ struct fuse_file_info { + */ + unsigned int cache_readdir:1; + ++ /* Indicates that suid/sgid bits should be removed upon write */ ++ unsigned int kill_priv:1; ++ ++ + /** Padding. Reserved for future use*/ +- unsigned int padding:25; ++ unsigned int padding:24; + unsigned int padding2:32; + + /* +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 02e1d83..2d6dc5a 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1142,6 +1142,7 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; ++ fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; +@@ -1177,7 +1178,8 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + fi.fh = arg->fh; +- fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; ++ fi.writepage = !!(arg->write_flags & FUSE_WRITE_CACHE); ++ fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + if (ibufv->count == 1) { + assert(!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch b/kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch new file mode 100755 index 0000000000000000000000000000000000000000..7d095c9c94ccfb2355bcf7fa1d672aa19d8003ba --- /dev/null +++ b/kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch @@ -0,0 +1,140 @@ +From d5986c804f05070a07dfe702f7c66357daaa1ab6 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:20 +0100 +Subject: [PATCH 049/116] virtiofsd: Pass write iov's all the way through +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-46-dgilbert@redhat.com> +Patchwork-id: 93497 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 045/112] virtiofsd: Pass write iov's all the way through +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Pass the write iov pointing to guest RAM all the way through rather +than copying the data. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Xiao Yang +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e17f7a580e2c599330ad3a6946be615ca2fe97d9) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 79 +++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 73 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index fd588a4..872968f 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -454,6 +454,10 @@ static void *fv_queue_thread(void *opaque) + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + + while (1) { ++ bool allocated_bufv = false; ++ struct fuse_bufvec bufv; ++ struct fuse_bufvec *pbufv; ++ + /* + * An element contains one request and the space to send our + * response They're spread over multiple descriptors in a +@@ -495,14 +499,76 @@ static void *fv_queue_thread(void *opaque) + __func__, elem->index); + assert(0); /* TODO */ + } +- copy_from_iov(&fbuf, out_num, out_sg); +- fbuf.size = out_len; ++ /* Copy just the first element and look at it */ ++ copy_from_iov(&fbuf, 1, out_sg); ++ ++ if (out_num > 2 && ++ out_sg[0].iov_len == sizeof(struct fuse_in_header) && ++ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE && ++ out_sg[1].iov_len == sizeof(struct fuse_write_in)) { ++ /* ++ * For a write we don't actually need to copy the ++ * data, we can just do it straight out of guest memory ++ * but we must still copy the headers in case the guest ++ * was nasty and changed them while we were using them. ++ */ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); ++ ++ /* copy the fuse_write_in header after the fuse_in_header */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len; ++ ++ /* Allocate the bufv, with space for the rest of the iov */ ++ allocated_bufv = true; ++ pbufv = malloc(sizeof(struct fuse_bufvec) + ++ sizeof(struct fuse_buf) * (out_num - 2)); ++ if (!pbufv) { ++ vu_queue_unpop(dev, q, elem, 0); ++ free(elem); ++ fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", ++ __func__); ++ goto out; ++ } ++ ++ pbufv->count = 1; ++ pbufv->buf[0] = fbuf; ++ ++ size_t iovindex, pbufvindex; ++ iovindex = 2; /* 2 headers, separate iovs */ ++ pbufvindex = 1; /* 2 headers, 1 fusebuf */ ++ ++ for (; iovindex < out_num; iovindex++, pbufvindex++) { ++ pbufv->count++; ++ pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ ++ pbufv->buf[pbufvindex].flags = 0; ++ pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; ++ pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; ++ } ++ } else { ++ /* Normal (non fast write) path */ ++ ++ /* Copy the rest of the buffer */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, out_num - 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_len; + +- /* TODO! Endianness of header */ ++ /* TODO! Endianness of header */ + +- /* TODO: Add checks for fuse_session_exited */ +- struct fuse_bufvec bufv = { .buf[0] = fbuf, .count = 1 }; +- fuse_session_process_buf_int(se, &bufv, &ch); ++ /* TODO: Add checks for fuse_session_exited */ ++ bufv.buf[0] = fbuf; ++ bufv.count = 1; ++ pbufv = &bufv; ++ } ++ pbufv->idx = 0; ++ pbufv->off = 0; ++ fuse_session_process_buf_int(se, pbufv, &ch); ++ ++ if (allocated_bufv) { ++ free(pbufv); ++ } + + if (!qi->reply_sent) { + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", +@@ -516,6 +582,7 @@ static void *fv_queue_thread(void *opaque) + elem = NULL; + } + } ++out: + pthread_mutex_destroy(&ch.lock); + free(fbuf.mem); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch b/kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch new file mode 100755 index 0000000000000000000000000000000000000000..834ced122c038024ca6757bf0684dc3396c072cf --- /dev/null +++ b/kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch @@ -0,0 +1,168 @@ +From 9e4320eec5204da851ac95fb7a7e6520c9ccee7d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:19 +0100 +Subject: [PATCH 048/116] virtiofsd: Plumb fuse_bufvec through to do_write_buf +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-45-dgilbert@redhat.com> +Patchwork-id: 93499 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 044/112] virtiofsd: Plumb fuse_bufvec through to do_write_buf +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Let fuse_session_process_buf_int take a fuse_bufvec * instead of a +fuse_buf; and then through to do_write_buf - where in the best +case it can pass that straight through to op.write_buf without copying +(other than skipping a header). + +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 469f9d2fc405b0508e6cf1b4b5bbcadfc82064e5) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 2 +- + tools/virtiofsd/fuse_lowlevel.c | 61 +++++++++++++++++++++++++++-------------- + tools/virtiofsd/fuse_virtio.c | 3 +- + 3 files changed, 44 insertions(+), 22 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 45995f3..a20854f 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -100,7 +100,7 @@ int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + void fuse_free_req(fuse_req_t req); + + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, ++ struct fuse_bufvec *bufv, + struct fuse_chan *ch); + + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 95f4db8..7e10995 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1004,11 +1004,12 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, +- const struct fuse_buf *ibuf) ++ struct fuse_bufvec *ibufv) + { + struct fuse_session *se = req->se; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, ++ struct fuse_bufvec *pbufv = ibufv; ++ struct fuse_bufvec tmpbufv = { ++ .buf[0] = ibufv->buf[0], + .count = 1, + }; + struct fuse_write_in *arg = (struct fuse_write_in *)inarg; +@@ -1018,22 +1019,31 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + fi.fh = arg->fh; + fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; + +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- bufv.buf[0].mem = PARAM(arg); +- } +- +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); +- fuse_reply_err(req, EIO); +- return; ++ if (ibufv->count == 1) { ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ tmpbufv.buf[0].mem = PARAM(arg); ++ } ++ tmpbufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); ++ if (tmpbufv.buf[0].size < arg->size) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: do_write_buf: buffer size too small\n"); ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ tmpbufv.buf[0].size = arg->size; ++ pbufv = &tmpbufv; ++ } else { ++ /* ++ * Input bufv contains the headers in the first element ++ * and the data in the rest, we need to skip that first element ++ */ ++ ibufv->buf[0].size = 0; + } +- bufv.buf[0].size = arg->size; + +- se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); ++ se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); + } + + static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) +@@ -2024,13 +2034,24 @@ static const char *opname(enum fuse_opcode opcode) + void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf) + { +- fuse_session_process_buf_int(se, buf, NULL); ++ struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 }; ++ fuse_session_process_buf_int(se, &bufv, NULL); + } + ++/* ++ * Restriction: ++ * bufv is normally a single entry buffer, except for a write ++ * where (if it's in memory) then the bufv may be multiple entries, ++ * where the first entry contains all headers and subsequent entries ++ * contain data ++ * bufv shall not use any offsets etc to make the data anything ++ * other than contiguous starting from 0. ++ */ + void fuse_session_process_buf_int(struct fuse_session *se, +- const struct fuse_buf *buf, ++ struct fuse_bufvec *bufv, + struct fuse_chan *ch) + { ++ const struct fuse_buf *buf = bufv->buf; + struct fuse_in_header *in; + const void *inarg; + struct fuse_req *req; +@@ -2108,7 +2129,7 @@ void fuse_session_process_buf_int(struct fuse_session *se, + + inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { +- do_write_buf(req, in->nodeid, inarg, buf); ++ do_write_buf(req, in->nodeid, inarg, bufv); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + } +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 635f877..fd588a4 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -501,7 +501,8 @@ static void *fv_queue_thread(void *opaque) + /* TODO! Endianness of header */ + + /* TODO: Add checks for fuse_session_exited */ +- fuse_session_process_buf_int(se, &fbuf, &ch); ++ struct fuse_bufvec bufv = { .buf[0] = fbuf, .count = 1 }; ++ fuse_session_process_buf_int(se, &bufv, &ch); + + if (!qi->reply_sent) { + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Poll-kick_fd-for-queue.patch b/kvm-virtiofsd-Poll-kick_fd-for-queue.patch new file mode 100755 index 0000000000000000000000000000000000000000..d7c6c0a91abde66186f895b9b5274f43e13b77c4 --- /dev/null +++ b/kvm-virtiofsd-Poll-kick_fd-for-queue.patch @@ -0,0 +1,97 @@ +From 083b944fac29bc3115a19eb38e176f6b23f04938 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:01 +0100 +Subject: [PATCH 030/116] virtiofsd: Poll kick_fd for queue +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-27-dgilbert@redhat.com> +Patchwork-id: 93483 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 026/112] virtiofsd: Poll kick_fd for queue +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +In the queue thread poll the kick_fd we're passed. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 5dcd1f56141378226d33dc3df68ec57913e0aa04) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 40 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 39 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 2a94bb3..05e7258 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -100,13 +101,50 @@ static void fv_panic(VuDev *dev, const char *err) + exit(EXIT_FAILURE); + } + ++/* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { + struct fv_QueueInfo *qi = opaque; + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { +- /* TODO */ ++ struct pollfd pf[1]; ++ pf[0].fd = qi->kick_fd; ++ pf[0].events = POLLIN; ++ pf[0].revents = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__, ++ qi->qidx); ++ int poll_res = ppoll(pf, 1, NULL, NULL); ++ ++ if (poll_res == -1) { ++ if (errno == EINTR) { ++ fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", ++ __func__); ++ continue; ++ } ++ fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n"); ++ break; ++ } ++ assert(poll_res == 1); ++ if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { ++ fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n", ++ __func__, pf[0].revents, qi->qidx); ++ break; ++ } ++ assert(pf[0].revents & POLLIN); ++ fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__, ++ qi->qidx); ++ ++ eventfd_t evalue; ++ if (eventfd_read(qi->kick_fd, &evalue)) { ++ fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); ++ break; ++ } ++ if (qi->virtio_dev->se->debug) { ++ fprintf(stderr, "%s: Queue %d gave evalue: %zx\n", __func__, ++ qi->qidx, (size_t)evalue); ++ } + } + + return NULL; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch b/kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch new file mode 100755 index 0000000000000000000000000000000000000000..d4e1ea1d2c2f772f6d2f6fbd1af22d02ee78ba48 --- /dev/null +++ b/kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch @@ -0,0 +1,144 @@ +From ab336e3aea97d76c1b2ac725d19b4518f47dd8f0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:59 +0100 +Subject: [PATCH 088/116] virtiofsd: Prevent multiply running with same + vhost_user_socket +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-85-dgilbert@redhat.com> +Patchwork-id: 93541 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 084/112] virtiofsd: Prevent multiply running with same vhost_user_socket +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +virtiofsd can run multiply even if the vhost_user_socket is same path. + + ]# ./virtiofsd -o vhost_user_socket=/tmp/vhostqemu -o source=/tmp/share & + [1] 244965 + virtio_session_mount: Waiting for vhost-user socket connection... + ]# ./virtiofsd -o vhost_user_socket=/tmp/vhostqemu -o source=/tmp/share & + [2] 244966 + virtio_session_mount: Waiting for vhost-user socket connection... + ]# + +The user will get confused about the situation and maybe the cause of the +unexpected problem. So it's better to prevent the multiple running. + +Create a regular file under localstatedir directory to exclude the +vhost_user_socket. To create and lock the file, use qemu_write_pidfile() +because the API has some sanity checks and file lock. + +Signed-off-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert + Applied fixes from Stefan's review and moved osdep include +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 96814800d2b49d18737c36e021c387697ec40c62) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 1 + + tools/virtiofsd/fuse_virtio.c | 49 ++++++++++++++++++++++++++++++++++++++++- + 2 files changed, 49 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 440508a..aac282f 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -18,6 +18,7 @@ + + #include + #include ++#include + #include + #include + #include +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index e7bd772..b7948de 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -13,11 +13,12 @@ + + #include "qemu/osdep.h" + #include "qemu/iov.h" +-#include "fuse_virtio.h" ++#include "qapi/error.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" + #include "fuse_misc.h" + #include "fuse_opt.h" ++#include "fuse_virtio.h" + + #include + #include +@@ -743,6 +744,42 @@ int virtio_loop(struct fuse_session *se) + return 0; + } + ++static void strreplace(char *s, char old, char new) ++{ ++ for (; *s; ++s) { ++ if (*s == old) { ++ *s = new; ++ } ++ } ++} ++ ++static bool fv_socket_lock(struct fuse_session *se) ++{ ++ g_autofree gchar *sk_name = NULL; ++ g_autofree gchar *pidfile = NULL; ++ g_autofree gchar *dir = NULL; ++ Error *local_err = NULL; ++ ++ dir = qemu_get_local_state_pathname("run/virtiofsd"); ++ ++ if (g_mkdir_with_parents(dir, S_IRWXU) < 0) { ++ fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s", ++ __func__, dir, strerror(errno)); ++ return false; ++ } ++ ++ sk_name = g_strdup(se->vu_socket_path); ++ strreplace(sk_name, '/', '.'); ++ pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name); ++ ++ if (!qemu_write_pidfile(pidfile, &local_err)) { ++ error_report_err(local_err); ++ return false; ++ } ++ ++ return true; ++} ++ + static int fv_create_listen_socket(struct fuse_session *se) + { + struct sockaddr_un un; +@@ -758,6 +795,16 @@ static int fv_create_listen_socket(struct fuse_session *se) + return -1; + } + ++ if (!strlen(se->vu_socket_path)) { ++ fuse_log(FUSE_LOG_ERR, "Socket path is empty\n"); ++ return -1; ++ } ++ ++ /* Check the vu_socket_path is already used */ ++ if (!fv_socket_lock(se)) { ++ return -1; ++ } ++ + /* + * Create the Unix socket to communicate with qemu + * based on QEMU's vhost-user-bridge +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch b/kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch new file mode 100755 index 0000000000000000000000000000000000000000..f30f23a7046d7019ff0e1bd0fabd966bed88ba47 --- /dev/null +++ b/kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch @@ -0,0 +1,945 @@ +From e7c1ad608117b21f80c762f5505a66b21c56e9d3 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:40 +0100 +Subject: [PATCH 009/116] virtiofsd: Pull in kernel's fuse.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-6-dgilbert@redhat.com> +Patchwork-id: 93460 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 005/112] virtiofsd: Pull in kernel's fuse.h +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Update scripts/update-linux-headers.sh to add fuse.h and +use it to pull in fuse.h from the kernel; from v5.5-rc1 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a62a9e192bc5f0aa0bc076b51db5a069add87c78) +Signed-off-by: Miroslav Rezanina +--- + include/standard-headers/linux/fuse.h | 891 ++++++++++++++++++++++++++++++++++ + scripts/update-linux-headers.sh | 1 + + 2 files changed, 892 insertions(+) + create mode 100644 include/standard-headers/linux/fuse.h + +diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h +new file mode 100644 +index 0000000..f4df0a4 +--- /dev/null ++++ b/include/standard-headers/linux/fuse.h +@@ -0,0 +1,891 @@ ++/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ ++/* ++ This file defines the kernel interface of FUSE ++ Copyright (C) 2001-2008 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU GPL. ++ See the file COPYING. ++ ++ This -- and only this -- header file may also be distributed under ++ the terms of the BSD Licence as follows: ++ ++ Copyright (C) 2001-2007 Miklos Szeredi. All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ 1. Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ 2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ ++ THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE ++ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ SUCH DAMAGE. ++*/ ++ ++/* ++ * This file defines the kernel interface of FUSE ++ * ++ * Protocol changelog: ++ * ++ * 7.1: ++ * - add the following messages: ++ * FUSE_SETATTR, FUSE_SYMLINK, FUSE_MKNOD, FUSE_MKDIR, FUSE_UNLINK, ++ * FUSE_RMDIR, FUSE_RENAME, FUSE_LINK, FUSE_OPEN, FUSE_READ, FUSE_WRITE, ++ * FUSE_RELEASE, FUSE_FSYNC, FUSE_FLUSH, FUSE_SETXATTR, FUSE_GETXATTR, ++ * FUSE_LISTXATTR, FUSE_REMOVEXATTR, FUSE_OPENDIR, FUSE_READDIR, ++ * FUSE_RELEASEDIR ++ * - add padding to messages to accommodate 32-bit servers on 64-bit kernels ++ * ++ * 7.2: ++ * - add FOPEN_DIRECT_IO and FOPEN_KEEP_CACHE flags ++ * - add FUSE_FSYNCDIR message ++ * ++ * 7.3: ++ * - add FUSE_ACCESS message ++ * - add FUSE_CREATE message ++ * - add filehandle to fuse_setattr_in ++ * ++ * 7.4: ++ * - add frsize to fuse_kstatfs ++ * - clean up request size limit checking ++ * ++ * 7.5: ++ * - add flags and max_write to fuse_init_out ++ * ++ * 7.6: ++ * - add max_readahead to fuse_init_in and fuse_init_out ++ * ++ * 7.7: ++ * - add FUSE_INTERRUPT message ++ * - add POSIX file lock support ++ * ++ * 7.8: ++ * - add lock_owner and flags fields to fuse_release_in ++ * - add FUSE_BMAP message ++ * - add FUSE_DESTROY message ++ * ++ * 7.9: ++ * - new fuse_getattr_in input argument of GETATTR ++ * - add lk_flags in fuse_lk_in ++ * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in ++ * - add blksize field to fuse_attr ++ * - add file flags field to fuse_read_in and fuse_write_in ++ * - Add ATIME_NOW and MTIME_NOW flags to fuse_setattr_in ++ * ++ * 7.10 ++ * - add nonseekable open flag ++ * ++ * 7.11 ++ * - add IOCTL message ++ * - add unsolicited notification support ++ * - add POLL message and NOTIFY_POLL notification ++ * ++ * 7.12 ++ * - add umask flag to input argument of create, mknod and mkdir ++ * - add notification messages for invalidation of inodes and ++ * directory entries ++ * ++ * 7.13 ++ * - make max number of background requests and congestion threshold ++ * tunables ++ * ++ * 7.14 ++ * - add splice support to fuse device ++ * ++ * 7.15 ++ * - add store notify ++ * - add retrieve notify ++ * ++ * 7.16 ++ * - add BATCH_FORGET request ++ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct ++ * fuse_ioctl_iovec' instead of ambiguous 'struct iovec' ++ * - add FUSE_IOCTL_32BIT flag ++ * ++ * 7.17 ++ * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK ++ * ++ * 7.18 ++ * - add FUSE_IOCTL_DIR flag ++ * - add FUSE_NOTIFY_DELETE ++ * ++ * 7.19 ++ * - add FUSE_FALLOCATE ++ * ++ * 7.20 ++ * - add FUSE_AUTO_INVAL_DATA ++ * ++ * 7.21 ++ * - add FUSE_READDIRPLUS ++ * - send the requested events in POLL request ++ * ++ * 7.22 ++ * - add FUSE_ASYNC_DIO ++ * ++ * 7.23 ++ * - add FUSE_WRITEBACK_CACHE ++ * - add time_gran to fuse_init_out ++ * - add reserved space to fuse_init_out ++ * - add FATTR_CTIME ++ * - add ctime and ctimensec to fuse_setattr_in ++ * - add FUSE_RENAME2 request ++ * - add FUSE_NO_OPEN_SUPPORT flag ++ * ++ * 7.24 ++ * - add FUSE_LSEEK for SEEK_HOLE and SEEK_DATA support ++ * ++ * 7.25 ++ * - add FUSE_PARALLEL_DIROPS ++ * ++ * 7.26 ++ * - add FUSE_HANDLE_KILLPRIV ++ * - add FUSE_POSIX_ACL ++ * ++ * 7.27 ++ * - add FUSE_ABORT_ERROR ++ * ++ * 7.28 ++ * - add FUSE_COPY_FILE_RANGE ++ * - add FOPEN_CACHE_DIR ++ * - add FUSE_MAX_PAGES, add max_pages to init_out ++ * - add FUSE_CACHE_SYMLINKS ++ * ++ * 7.29 ++ * - add FUSE_NO_OPENDIR_SUPPORT flag ++ * ++ * 7.30 ++ * - add FUSE_EXPLICIT_INVAL_DATA ++ * - add FUSE_IOCTL_COMPAT_X32 ++ * ++ * 7.31 ++ * - add FUSE_WRITE_KILL_PRIV flag ++ * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING ++ * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag ++ */ ++ ++#ifndef _LINUX_FUSE_H ++#define _LINUX_FUSE_H ++ ++#include ++ ++/* ++ * Version negotiation: ++ * ++ * Both the kernel and userspace send the version they support in the ++ * INIT request and reply respectively. ++ * ++ * If the major versions match then both shall use the smallest ++ * of the two minor versions for communication. ++ * ++ * If the kernel supports a larger major version, then userspace shall ++ * reply with the major version it supports, ignore the rest of the ++ * INIT message and expect a new INIT message from the kernel with a ++ * matching major version. ++ * ++ * If the library supports a larger major version, then it shall fall ++ * back to the major protocol version sent by the kernel for ++ * communication and reply with that major version (and an arbitrary ++ * supported minor version). ++ */ ++ ++/** Version number of this interface */ ++#define FUSE_KERNEL_VERSION 7 ++ ++/** Minor version number of this interface */ ++#define FUSE_KERNEL_MINOR_VERSION 31 ++ ++/** The node ID of the root inode */ ++#define FUSE_ROOT_ID 1 ++ ++/* Make sure all structures are padded to 64bit boundary, so 32bit ++ userspace works under 64bit kernels */ ++ ++struct fuse_attr { ++ uint64_t ino; ++ uint64_t size; ++ uint64_t blocks; ++ uint64_t atime; ++ uint64_t mtime; ++ uint64_t ctime; ++ uint32_t atimensec; ++ uint32_t mtimensec; ++ uint32_t ctimensec; ++ uint32_t mode; ++ uint32_t nlink; ++ uint32_t uid; ++ uint32_t gid; ++ uint32_t rdev; ++ uint32_t blksize; ++ uint32_t padding; ++}; ++ ++struct fuse_kstatfs { ++ uint64_t blocks; ++ uint64_t bfree; ++ uint64_t bavail; ++ uint64_t files; ++ uint64_t ffree; ++ uint32_t bsize; ++ uint32_t namelen; ++ uint32_t frsize; ++ uint32_t padding; ++ uint32_t spare[6]; ++}; ++ ++struct fuse_file_lock { ++ uint64_t start; ++ uint64_t end; ++ uint32_t type; ++ uint32_t pid; /* tgid */ ++}; ++ ++/** ++ * Bitmasks for fuse_setattr_in.valid ++ */ ++#define FATTR_MODE (1 << 0) ++#define FATTR_UID (1 << 1) ++#define FATTR_GID (1 << 2) ++#define FATTR_SIZE (1 << 3) ++#define FATTR_ATIME (1 << 4) ++#define FATTR_MTIME (1 << 5) ++#define FATTR_FH (1 << 6) ++#define FATTR_ATIME_NOW (1 << 7) ++#define FATTR_MTIME_NOW (1 << 8) ++#define FATTR_LOCKOWNER (1 << 9) ++#define FATTR_CTIME (1 << 10) ++ ++/** ++ * Flags returned by the OPEN request ++ * ++ * FOPEN_DIRECT_IO: bypass page cache for this open file ++ * FOPEN_KEEP_CACHE: don't invalidate the data cache on open ++ * FOPEN_NONSEEKABLE: the file is not seekable ++ * FOPEN_CACHE_DIR: allow caching this directory ++ * FOPEN_STREAM: the file is stream-like (no file position at all) ++ */ ++#define FOPEN_DIRECT_IO (1 << 0) ++#define FOPEN_KEEP_CACHE (1 << 1) ++#define FOPEN_NONSEEKABLE (1 << 2) ++#define FOPEN_CACHE_DIR (1 << 3) ++#define FOPEN_STREAM (1 << 4) ++ ++/** ++ * INIT request/reply flags ++ * ++ * FUSE_ASYNC_READ: asynchronous read requests ++ * FUSE_POSIX_LOCKS: remote locking for POSIX file locks ++ * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported) ++ * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem ++ * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." ++ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB ++ * FUSE_DONT_MASK: don't apply umask to file mode on create operations ++ * FUSE_SPLICE_WRITE: kernel supports splice write on the device ++ * FUSE_SPLICE_MOVE: kernel supports splice move on the device ++ * FUSE_SPLICE_READ: kernel supports splice read on the device ++ * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks ++ * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories ++ * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages ++ * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one) ++ * FUSE_READDIRPLUS_AUTO: adaptive readdirplus ++ * FUSE_ASYNC_DIO: asynchronous direct I/O submission ++ * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes ++ * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens ++ * FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir ++ * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc ++ * FUSE_POSIX_ACL: filesystem supports posix acls ++ * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED ++ * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages ++ * FUSE_CACHE_SYMLINKS: cache READLINK responses ++ * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir ++ * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request ++ * FUSE_MAP_ALIGNMENT: map_alignment field is valid ++ */ ++#define FUSE_ASYNC_READ (1 << 0) ++#define FUSE_POSIX_LOCKS (1 << 1) ++#define FUSE_FILE_OPS (1 << 2) ++#define FUSE_ATOMIC_O_TRUNC (1 << 3) ++#define FUSE_EXPORT_SUPPORT (1 << 4) ++#define FUSE_BIG_WRITES (1 << 5) ++#define FUSE_DONT_MASK (1 << 6) ++#define FUSE_SPLICE_WRITE (1 << 7) ++#define FUSE_SPLICE_MOVE (1 << 8) ++#define FUSE_SPLICE_READ (1 << 9) ++#define FUSE_FLOCK_LOCKS (1 << 10) ++#define FUSE_HAS_IOCTL_DIR (1 << 11) ++#define FUSE_AUTO_INVAL_DATA (1 << 12) ++#define FUSE_DO_READDIRPLUS (1 << 13) ++#define FUSE_READDIRPLUS_AUTO (1 << 14) ++#define FUSE_ASYNC_DIO (1 << 15) ++#define FUSE_WRITEBACK_CACHE (1 << 16) ++#define FUSE_NO_OPEN_SUPPORT (1 << 17) ++#define FUSE_PARALLEL_DIROPS (1 << 18) ++#define FUSE_HANDLE_KILLPRIV (1 << 19) ++#define FUSE_POSIX_ACL (1 << 20) ++#define FUSE_ABORT_ERROR (1 << 21) ++#define FUSE_MAX_PAGES (1 << 22) ++#define FUSE_CACHE_SYMLINKS (1 << 23) ++#define FUSE_NO_OPENDIR_SUPPORT (1 << 24) ++#define FUSE_EXPLICIT_INVAL_DATA (1 << 25) ++#define FUSE_MAP_ALIGNMENT (1 << 26) ++ ++/** ++ * CUSE INIT request/reply flags ++ * ++ * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl ++ */ ++#define CUSE_UNRESTRICTED_IOCTL (1 << 0) ++ ++/** ++ * Release flags ++ */ ++#define FUSE_RELEASE_FLUSH (1 << 0) ++#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1) ++ ++/** ++ * Getattr flags ++ */ ++#define FUSE_GETATTR_FH (1 << 0) ++ ++/** ++ * Lock flags ++ */ ++#define FUSE_LK_FLOCK (1 << 0) ++ ++/** ++ * WRITE flags ++ * ++ * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed ++ * FUSE_WRITE_LOCKOWNER: lock_owner field is valid ++ * FUSE_WRITE_KILL_PRIV: kill suid and sgid bits ++ */ ++#define FUSE_WRITE_CACHE (1 << 0) ++#define FUSE_WRITE_LOCKOWNER (1 << 1) ++#define FUSE_WRITE_KILL_PRIV (1 << 2) ++ ++/** ++ * Read flags ++ */ ++#define FUSE_READ_LOCKOWNER (1 << 1) ++ ++/** ++ * Ioctl flags ++ * ++ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine ++ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed ++ * FUSE_IOCTL_RETRY: retry with new iovecs ++ * FUSE_IOCTL_32BIT: 32bit ioctl ++ * FUSE_IOCTL_DIR: is a directory ++ * FUSE_IOCTL_COMPAT_X32: x32 compat ioctl on 64bit machine (64bit time_t) ++ * ++ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs ++ */ ++#define FUSE_IOCTL_COMPAT (1 << 0) ++#define FUSE_IOCTL_UNRESTRICTED (1 << 1) ++#define FUSE_IOCTL_RETRY (1 << 2) ++#define FUSE_IOCTL_32BIT (1 << 3) ++#define FUSE_IOCTL_DIR (1 << 4) ++#define FUSE_IOCTL_COMPAT_X32 (1 << 5) ++ ++#define FUSE_IOCTL_MAX_IOV 256 ++ ++/** ++ * Poll flags ++ * ++ * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify ++ */ ++#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0) ++ ++/** ++ * Fsync flags ++ * ++ * FUSE_FSYNC_FDATASYNC: Sync data only, not metadata ++ */ ++#define FUSE_FSYNC_FDATASYNC (1 << 0) ++ ++enum fuse_opcode { ++ FUSE_LOOKUP = 1, ++ FUSE_FORGET = 2, /* no reply */ ++ FUSE_GETATTR = 3, ++ FUSE_SETATTR = 4, ++ FUSE_READLINK = 5, ++ FUSE_SYMLINK = 6, ++ FUSE_MKNOD = 8, ++ FUSE_MKDIR = 9, ++ FUSE_UNLINK = 10, ++ FUSE_RMDIR = 11, ++ FUSE_RENAME = 12, ++ FUSE_LINK = 13, ++ FUSE_OPEN = 14, ++ FUSE_READ = 15, ++ FUSE_WRITE = 16, ++ FUSE_STATFS = 17, ++ FUSE_RELEASE = 18, ++ FUSE_FSYNC = 20, ++ FUSE_SETXATTR = 21, ++ FUSE_GETXATTR = 22, ++ FUSE_LISTXATTR = 23, ++ FUSE_REMOVEXATTR = 24, ++ FUSE_FLUSH = 25, ++ FUSE_INIT = 26, ++ FUSE_OPENDIR = 27, ++ FUSE_READDIR = 28, ++ FUSE_RELEASEDIR = 29, ++ FUSE_FSYNCDIR = 30, ++ FUSE_GETLK = 31, ++ FUSE_SETLK = 32, ++ FUSE_SETLKW = 33, ++ FUSE_ACCESS = 34, ++ FUSE_CREATE = 35, ++ FUSE_INTERRUPT = 36, ++ FUSE_BMAP = 37, ++ FUSE_DESTROY = 38, ++ FUSE_IOCTL = 39, ++ FUSE_POLL = 40, ++ FUSE_NOTIFY_REPLY = 41, ++ FUSE_BATCH_FORGET = 42, ++ FUSE_FALLOCATE = 43, ++ FUSE_READDIRPLUS = 44, ++ FUSE_RENAME2 = 45, ++ FUSE_LSEEK = 46, ++ FUSE_COPY_FILE_RANGE = 47, ++ FUSE_SETUPMAPPING = 48, ++ FUSE_REMOVEMAPPING = 49, ++ ++ /* CUSE specific operations */ ++ CUSE_INIT = 4096, ++ ++ /* Reserved opcodes: helpful to detect structure endian-ness */ ++ CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */ ++ FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */ ++}; ++ ++enum fuse_notify_code { ++ FUSE_NOTIFY_POLL = 1, ++ FUSE_NOTIFY_INVAL_INODE = 2, ++ FUSE_NOTIFY_INVAL_ENTRY = 3, ++ FUSE_NOTIFY_STORE = 4, ++ FUSE_NOTIFY_RETRIEVE = 5, ++ FUSE_NOTIFY_DELETE = 6, ++ FUSE_NOTIFY_CODE_MAX, ++}; ++ ++/* The read buffer is required to be at least 8k, but may be much larger */ ++#define FUSE_MIN_READ_BUFFER 8192 ++ ++#define FUSE_COMPAT_ENTRY_OUT_SIZE 120 ++ ++struct fuse_entry_out { ++ uint64_t nodeid; /* Inode ID */ ++ uint64_t generation; /* Inode generation: nodeid:gen must ++ be unique for the fs's lifetime */ ++ uint64_t entry_valid; /* Cache timeout for the name */ ++ uint64_t attr_valid; /* Cache timeout for the attributes */ ++ uint32_t entry_valid_nsec; ++ uint32_t attr_valid_nsec; ++ struct fuse_attr attr; ++}; ++ ++struct fuse_forget_in { ++ uint64_t nlookup; ++}; ++ ++struct fuse_forget_one { ++ uint64_t nodeid; ++ uint64_t nlookup; ++}; ++ ++struct fuse_batch_forget_in { ++ uint32_t count; ++ uint32_t dummy; ++}; ++ ++struct fuse_getattr_in { ++ uint32_t getattr_flags; ++ uint32_t dummy; ++ uint64_t fh; ++}; ++ ++#define FUSE_COMPAT_ATTR_OUT_SIZE 96 ++ ++struct fuse_attr_out { ++ uint64_t attr_valid; /* Cache timeout for the attributes */ ++ uint32_t attr_valid_nsec; ++ uint32_t dummy; ++ struct fuse_attr attr; ++}; ++ ++#define FUSE_COMPAT_MKNOD_IN_SIZE 8 ++ ++struct fuse_mknod_in { ++ uint32_t mode; ++ uint32_t rdev; ++ uint32_t umask; ++ uint32_t padding; ++}; ++ ++struct fuse_mkdir_in { ++ uint32_t mode; ++ uint32_t umask; ++}; ++ ++struct fuse_rename_in { ++ uint64_t newdir; ++}; ++ ++struct fuse_rename2_in { ++ uint64_t newdir; ++ uint32_t flags; ++ uint32_t padding; ++}; ++ ++struct fuse_link_in { ++ uint64_t oldnodeid; ++}; ++ ++struct fuse_setattr_in { ++ uint32_t valid; ++ uint32_t padding; ++ uint64_t fh; ++ uint64_t size; ++ uint64_t lock_owner; ++ uint64_t atime; ++ uint64_t mtime; ++ uint64_t ctime; ++ uint32_t atimensec; ++ uint32_t mtimensec; ++ uint32_t ctimensec; ++ uint32_t mode; ++ uint32_t unused4; ++ uint32_t uid; ++ uint32_t gid; ++ uint32_t unused5; ++}; ++ ++struct fuse_open_in { ++ uint32_t flags; ++ uint32_t unused; ++}; ++ ++struct fuse_create_in { ++ uint32_t flags; ++ uint32_t mode; ++ uint32_t umask; ++ uint32_t padding; ++}; ++ ++struct fuse_open_out { ++ uint64_t fh; ++ uint32_t open_flags; ++ uint32_t padding; ++}; ++ ++struct fuse_release_in { ++ uint64_t fh; ++ uint32_t flags; ++ uint32_t release_flags; ++ uint64_t lock_owner; ++}; ++ ++struct fuse_flush_in { ++ uint64_t fh; ++ uint32_t unused; ++ uint32_t padding; ++ uint64_t lock_owner; ++}; ++ ++struct fuse_read_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t read_flags; ++ uint64_t lock_owner; ++ uint32_t flags; ++ uint32_t padding; ++}; ++ ++#define FUSE_COMPAT_WRITE_IN_SIZE 24 ++ ++struct fuse_write_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t write_flags; ++ uint64_t lock_owner; ++ uint32_t flags; ++ uint32_t padding; ++}; ++ ++struct fuse_write_out { ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++#define FUSE_COMPAT_STATFS_SIZE 48 ++ ++struct fuse_statfs_out { ++ struct fuse_kstatfs st; ++}; ++ ++struct fuse_fsync_in { ++ uint64_t fh; ++ uint32_t fsync_flags; ++ uint32_t padding; ++}; ++ ++struct fuse_setxattr_in { ++ uint32_t size; ++ uint32_t flags; ++}; ++ ++struct fuse_getxattr_in { ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++struct fuse_getxattr_out { ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++struct fuse_lk_in { ++ uint64_t fh; ++ uint64_t owner; ++ struct fuse_file_lock lk; ++ uint32_t lk_flags; ++ uint32_t padding; ++}; ++ ++struct fuse_lk_out { ++ struct fuse_file_lock lk; ++}; ++ ++struct fuse_access_in { ++ uint32_t mask; ++ uint32_t padding; ++}; ++ ++struct fuse_init_in { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t max_readahead; ++ uint32_t flags; ++}; ++ ++#define FUSE_COMPAT_INIT_OUT_SIZE 8 ++#define FUSE_COMPAT_22_INIT_OUT_SIZE 24 ++ ++struct fuse_init_out { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t max_readahead; ++ uint32_t flags; ++ uint16_t max_background; ++ uint16_t congestion_threshold; ++ uint32_t max_write; ++ uint32_t time_gran; ++ uint16_t max_pages; ++ uint16_t map_alignment; ++ uint32_t unused[8]; ++}; ++ ++#define CUSE_INIT_INFO_MAX 4096 ++ ++struct cuse_init_in { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t unused; ++ uint32_t flags; ++}; ++ ++struct cuse_init_out { ++ uint32_t major; ++ uint32_t minor; ++ uint32_t unused; ++ uint32_t flags; ++ uint32_t max_read; ++ uint32_t max_write; ++ uint32_t dev_major; /* chardev major */ ++ uint32_t dev_minor; /* chardev minor */ ++ uint32_t spare[10]; ++}; ++ ++struct fuse_interrupt_in { ++ uint64_t unique; ++}; ++ ++struct fuse_bmap_in { ++ uint64_t block; ++ uint32_t blocksize; ++ uint32_t padding; ++}; ++ ++struct fuse_bmap_out { ++ uint64_t block; ++}; ++ ++struct fuse_ioctl_in { ++ uint64_t fh; ++ uint32_t flags; ++ uint32_t cmd; ++ uint64_t arg; ++ uint32_t in_size; ++ uint32_t out_size; ++}; ++ ++struct fuse_ioctl_iovec { ++ uint64_t base; ++ uint64_t len; ++}; ++ ++struct fuse_ioctl_out { ++ int32_t result; ++ uint32_t flags; ++ uint32_t in_iovs; ++ uint32_t out_iovs; ++}; ++ ++struct fuse_poll_in { ++ uint64_t fh; ++ uint64_t kh; ++ uint32_t flags; ++ uint32_t events; ++}; ++ ++struct fuse_poll_out { ++ uint32_t revents; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_poll_wakeup_out { ++ uint64_t kh; ++}; ++ ++struct fuse_fallocate_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint64_t length; ++ uint32_t mode; ++ uint32_t padding; ++}; ++ ++struct fuse_in_header { ++ uint32_t len; ++ uint32_t opcode; ++ uint64_t unique; ++ uint64_t nodeid; ++ uint32_t uid; ++ uint32_t gid; ++ uint32_t pid; ++ uint32_t padding; ++}; ++ ++struct fuse_out_header { ++ uint32_t len; ++ int32_t error; ++ uint64_t unique; ++}; ++ ++struct fuse_dirent { ++ uint64_t ino; ++ uint64_t off; ++ uint32_t namelen; ++ uint32_t type; ++ char name[]; ++}; ++ ++#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) ++#define FUSE_DIRENT_ALIGN(x) \ ++ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1)) ++#define FUSE_DIRENT_SIZE(d) \ ++ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) ++ ++struct fuse_direntplus { ++ struct fuse_entry_out entry_out; ++ struct fuse_dirent dirent; ++}; ++ ++#define FUSE_NAME_OFFSET_DIRENTPLUS \ ++ offsetof(struct fuse_direntplus, dirent.name) ++#define FUSE_DIRENTPLUS_SIZE(d) \ ++ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen) ++ ++struct fuse_notify_inval_inode_out { ++ uint64_t ino; ++ int64_t off; ++ int64_t len; ++}; ++ ++struct fuse_notify_inval_entry_out { ++ uint64_t parent; ++ uint32_t namelen; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_delete_out { ++ uint64_t parent; ++ uint64_t child; ++ uint32_t namelen; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_store_out { ++ uint64_t nodeid; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++struct fuse_notify_retrieve_out { ++ uint64_t notify_unique; ++ uint64_t nodeid; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t padding; ++}; ++ ++/* Matches the size of fuse_write_in */ ++struct fuse_notify_retrieve_in { ++ uint64_t dummy1; ++ uint64_t offset; ++ uint32_t size; ++ uint32_t dummy2; ++ uint64_t dummy3; ++ uint64_t dummy4; ++}; ++ ++/* Device ioctls: */ ++#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t) ++ ++struct fuse_lseek_in { ++ uint64_t fh; ++ uint64_t offset; ++ uint32_t whence; ++ uint32_t padding; ++}; ++ ++struct fuse_lseek_out { ++ uint64_t offset; ++}; ++ ++struct fuse_copy_file_range_in { ++ uint64_t fh_in; ++ uint64_t off_in; ++ uint64_t nodeid_out; ++ uint64_t fh_out; ++ uint64_t off_out; ++ uint64_t len; ++ uint64_t flags; ++}; ++ ++#endif /* _LINUX_FUSE_H */ +diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh +index f76d773..29c27f4 100755 +--- a/scripts/update-linux-headers.sh ++++ b/scripts/update-linux-headers.sh +@@ -186,6 +186,7 @@ rm -rf "$output/include/standard-headers/linux" + mkdir -p "$output/include/standard-headers/linux" + for i in "$tmpdir"/include/linux/*virtio*.h \ + "$tmpdir/include/linux/qemu_fw_cfg.h" \ ++ "$tmpdir/include/linux/fuse.h" \ + "$tmpdir/include/linux/input.h" \ + "$tmpdir/include/linux/input-event-codes.h" \ + "$tmpdir/include/linux/pci_regs.h" \ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Pull-in-upstream-headers.patch b/kvm-virtiofsd-Pull-in-upstream-headers.patch new file mode 100755 index 0000000000000000000000000000000000000000..78784fbd0812641dcfce27cd9c4cbf7942b07ea9 --- /dev/null +++ b/kvm-virtiofsd-Pull-in-upstream-headers.patch @@ -0,0 +1,4911 @@ +From 434b51e5c2fce756906dec4803900397bc98ad72 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:39 +0100 +Subject: [PATCH 008/116] virtiofsd: Pull in upstream headers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-5-dgilbert@redhat.com> +Patchwork-id: 93457 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 004/112] virtiofsd: Pull in upstream headers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Pull in headers fromlibfuse's upstream fuse-3.8.0 + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit ee46c78901eb7fa78e328e04c0494ad6d207238b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse.h | 1275 ++++++++++++++++++++ + tools/virtiofsd/fuse_common.h | 823 +++++++++++++ + tools/virtiofsd/fuse_i.h | 139 +++ + tools/virtiofsd/fuse_log.h | 82 ++ + tools/virtiofsd/fuse_lowlevel.h | 2089 +++++++++++++++++++++++++++++++++ + tools/virtiofsd/fuse_misc.h | 59 + + tools/virtiofsd/fuse_opt.h | 271 +++++ + tools/virtiofsd/passthrough_helpers.h | 76 ++ + 8 files changed, 4814 insertions(+) + create mode 100644 tools/virtiofsd/fuse.h + create mode 100644 tools/virtiofsd/fuse_common.h + create mode 100644 tools/virtiofsd/fuse_i.h + create mode 100644 tools/virtiofsd/fuse_log.h + create mode 100644 tools/virtiofsd/fuse_lowlevel.h + create mode 100644 tools/virtiofsd/fuse_misc.h + create mode 100644 tools/virtiofsd/fuse_opt.h + create mode 100644 tools/virtiofsd/passthrough_helpers.h + +diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +new file mode 100644 +index 0000000..883f6e5 +--- /dev/null ++++ b/tools/virtiofsd/fuse.h +@@ -0,0 +1,1275 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#ifndef FUSE_H_ ++#define FUSE_H_ ++ ++/** @file ++ * ++ * This file defines the library interface of FUSE ++ * ++ * IMPORTANT: you should define FUSE_USE_VERSION before including this header. ++ */ ++ ++#include "fuse_common.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ----------------------------------------------------------- * ++ * Basic FUSE API * ++ * ----------------------------------------------------------- */ ++ ++/** Handle for a FUSE filesystem */ ++struct fuse; ++ ++/** ++ * Readdir flags, passed to ->readdir() ++ */ ++enum fuse_readdir_flags { ++ /** ++ * "Plus" mode. ++ * ++ * The kernel wants to prefill the inode cache during readdir. The ++ * filesystem may honour this by filling in the attributes and setting ++ * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also ++ * just ignore this flag completely. ++ */ ++ FUSE_READDIR_PLUS = (1 << 0), ++}; ++ ++enum fuse_fill_dir_flags { ++ /** ++ * "Plus" mode: all file attributes are valid ++ * ++ * The attributes are used by the kernel to prefill the inode cache ++ * during a readdir. ++ * ++ * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set ++ * and vice versa. ++ */ ++ FUSE_FILL_DIR_PLUS = (1 << 1), ++}; ++ ++/** Function to add an entry in a readdir() operation ++ * ++ * The *off* parameter can be any non-zero value that enables the ++ * filesystem to identify the current point in the directory ++ * stream. It does not need to be the actual physical position. A ++ * value of zero is reserved to indicate that seeking in directories ++ * is not supported. ++ * ++ * @param buf the buffer passed to the readdir() operation ++ * @param name the file name of the directory entry ++ * @param stat file attributes, can be NULL ++ * @param off offset of the next entry or zero ++ * @param flags fill flags ++ * @return 1 if buffer is full, zero otherwise ++ */ ++typedef int (*fuse_fill_dir_t) (void *buf, const char *name, ++ const struct stat *stbuf, off_t off, ++ enum fuse_fill_dir_flags flags); ++/** ++ * Configuration of the high-level API ++ * ++ * This structure is initialized from the arguments passed to ++ * fuse_new(), and then passed to the file system's init() handler ++ * which should ensure that the configuration is compatible with the ++ * file system implementation. ++ */ ++struct fuse_config { ++ /** ++ * If `set_gid` is non-zero, the st_gid attribute of each file ++ * is overwritten with the value of `gid`. ++ */ ++ int set_gid; ++ unsigned int gid; ++ ++ /** ++ * If `set_uid` is non-zero, the st_uid attribute of each file ++ * is overwritten with the value of `uid`. ++ */ ++ int set_uid; ++ unsigned int uid; ++ ++ /** ++ * If `set_mode` is non-zero, the any permissions bits set in ++ * `umask` are unset in the st_mode attribute of each file. ++ */ ++ int set_mode; ++ unsigned int umask; ++ ++ /** ++ * The timeout in seconds for which name lookups will be ++ * cached. ++ */ ++ double entry_timeout; ++ ++ /** ++ * The timeout in seconds for which a negative lookup will be ++ * cached. This means, that if file did not exist (lookup ++ * retuned ENOENT), the lookup will only be redone after the ++ * timeout, and the file/directory will be assumed to not ++ * exist until then. A value of zero means that negative ++ * lookups are not cached. ++ */ ++ double negative_timeout; ++ ++ /** ++ * The timeout in seconds for which file/directory attributes ++ * (as returned by e.g. the `getattr` handler) are cached. ++ */ ++ double attr_timeout; ++ ++ /** ++ * Allow requests to be interrupted ++ */ ++ int intr; ++ ++ /** ++ * Specify which signal number to send to the filesystem when ++ * a request is interrupted. The default is hardcoded to ++ * USR1. ++ */ ++ int intr_signal; ++ ++ /** ++ * Normally, FUSE assigns inodes to paths only for as long as ++ * the kernel is aware of them. With this option inodes are ++ * instead remembered for at least this many seconds. This ++ * will require more memory, but may be necessary when using ++ * applications that make use of inode numbers. ++ * ++ * A number of -1 means that inodes will be remembered for the ++ * entire life-time of the file-system process. ++ */ ++ int remember; ++ ++ /** ++ * The default behavior is that if an open file is deleted, ++ * the file is renamed to a hidden file (.fuse_hiddenXXX), and ++ * only removed when the file is finally released. This ++ * relieves the filesystem implementation of having to deal ++ * with this problem. This option disables the hiding ++ * behavior, and files are removed immediately in an unlink ++ * operation (or in a rename operation which overwrites an ++ * existing file). ++ * ++ * It is recommended that you not use the hard_remove ++ * option. When hard_remove is set, the following libc ++ * functions fail on unlinked files (returning errno of ++ * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2), ++ * ftruncate(2), fstat(2), fchmod(2), fchown(2) ++ */ ++ int hard_remove; ++ ++ /** ++ * Honor the st_ino field in the functions getattr() and ++ * fill_dir(). This value is used to fill in the st_ino field ++ * in the stat(2), lstat(2), fstat(2) functions and the d_ino ++ * field in the readdir(2) function. The filesystem does not ++ * have to guarantee uniqueness, however some applications ++ * rely on this value being unique for the whole filesystem. ++ * ++ * Note that this does *not* affect the inode that libfuse ++ * and the kernel use internally (also called the "nodeid"). ++ */ ++ int use_ino; ++ ++ /** ++ * If use_ino option is not given, still try to fill in the ++ * d_ino field in readdir(2). If the name was previously ++ * looked up, and is still in the cache, the inode number ++ * found there will be used. Otherwise it will be set to -1. ++ * If use_ino option is given, this option is ignored. ++ */ ++ int readdir_ino; ++ ++ /** ++ * This option disables the use of page cache (file content cache) ++ * in the kernel for this filesystem. This has several affects: ++ * ++ * 1. Each read(2) or write(2) system call will initiate one ++ * or more read or write operations, data will not be ++ * cached in the kernel. ++ * ++ * 2. The return value of the read() and write() system calls ++ * will correspond to the return values of the read and ++ * write operations. This is useful for example if the ++ * file size is not known in advance (before reading it). ++ * ++ * Internally, enabling this option causes fuse to set the ++ * `direct_io` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int direct_io; ++ ++ /** ++ * This option disables flushing the cache of the file ++ * contents on every open(2). This should only be enabled on ++ * filesystems where the file data is never changed ++ * externally (not through the mounted FUSE filesystem). Thus ++ * it is not suitable for network filesystems and other ++ * intermediate filesystems. ++ * ++ * NOTE: if this option is not specified (and neither ++ * direct_io) data is still cached after the open(2), so a ++ * read(2) system call will not always initiate a read ++ * operation. ++ * ++ * Internally, enabling this option causes fuse to set the ++ * `keep_cache` field of `struct fuse_file_info` - overwriting ++ * any value that was put there by the file system. ++ */ ++ int kernel_cache; ++ ++ /** ++ * This option is an alternative to `kernel_cache`. Instead of ++ * unconditionally keeping cached data, the cached data is ++ * invalidated on open(2) if if the modification time or the ++ * size of the file has changed since it was last opened. ++ */ ++ int auto_cache; ++ ++ /** ++ * The timeout in seconds for which file attributes are cached ++ * for the purpose of checking if auto_cache should flush the ++ * file data on open. ++ */ ++ int ac_attr_timeout_set; ++ double ac_attr_timeout; ++ ++ /** ++ * If this option is given the file-system handlers for the ++ * following operations will not receive path information: ++ * read, write, flush, release, fsync, readdir, releasedir, ++ * fsyncdir, lock, ioctl and poll. ++ * ++ * For the truncate, getattr, chmod, chown and utimens ++ * operations the path will be provided only if the struct ++ * fuse_file_info argument is NULL. ++ */ ++ int nullpath_ok; ++ ++ /** ++ * The remaining options are used by libfuse internally and ++ * should not be touched. ++ */ ++ int show_help; ++ char *modules; ++ int debug; ++}; ++ ++ ++/** ++ * The file system operations: ++ * ++ * Most of these should work very similarly to the well known UNIX ++ * file system operations. A major exception is that instead of ++ * returning an error in 'errno', the operation should return the ++ * negated error value (-errno) directly. ++ * ++ * All methods are optional, but some are essential for a useful ++ * filesystem (e.g. getattr). Open, flush, release, fsync, opendir, ++ * releasedir, fsyncdir, access, create, truncate, lock, init and ++ * destroy are special purpose methods, without which a full featured ++ * filesystem can still be implemented. ++ * ++ * In general, all methods are expected to perform any necessary ++ * permission checking. However, a filesystem may delegate this task ++ * to the kernel by passing the `default_permissions` mount option to ++ * `fuse_new()`. In this case, methods will only be called if ++ * the kernel's permission check has succeeded. ++ * ++ * Almost all operations take a path which can be of any length. ++ */ ++struct fuse_operations { ++ /** Get file attributes. ++ * ++ * Similar to stat(). The 'st_dev' and 'st_blksize' fields are ++ * ignored. The 'st_ino' field is ignored except if the 'use_ino' ++ * mount option is given. In that case it is passed to userspace, ++ * but libfuse and the kernel will still assign a different ++ * inode for internal use (called the "nodeid"). ++ * ++ * `fi` will always be NULL if the file is not currently open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*getattr) (const char *, struct stat *, struct fuse_file_info *fi); ++ ++ /** Read the target of a symbolic link ++ * ++ * The buffer should be filled with a null terminated string. The ++ * buffer size argument includes the space for the terminating ++ * null character. If the linkname is too long to fit in the ++ * buffer, it should be truncated. The return value should be 0 ++ * for success. ++ */ ++ int (*readlink) (const char *, char *, size_t); ++ ++ /** Create a file node ++ * ++ * This is called for creation of all non-directory, non-symlink ++ * nodes. If the filesystem defines a create() method, then for ++ * regular files that will be called instead. ++ */ ++ int (*mknod) (const char *, mode_t, dev_t); ++ ++ /** Create a directory ++ * ++ * Note that the mode argument may not have the type specification ++ * bits set, i.e. S_ISDIR(mode) can be false. To obtain the ++ * correct directory type bits use mode|S_IFDIR ++ * */ ++ int (*mkdir) (const char *, mode_t); ++ ++ /** Remove a file */ ++ int (*unlink) (const char *); ++ ++ /** Remove a directory */ ++ int (*rmdir) (const char *); ++ ++ /** Create a symbolic link */ ++ int (*symlink) (const char *, const char *); ++ ++ /** Rename a file ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e. both must ++ * exist and neither may be deleted. ++ */ ++ int (*rename) (const char *, const char *, unsigned int flags); ++ ++ /** Create a hard link to a file */ ++ int (*link) (const char *, const char *); ++ ++ /** Change the permission bits of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ */ ++ int (*chmod) (const char *, mode_t, struct fuse_file_info *fi); ++ ++ /** Change the owner and group of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*chown) (const char *, uid_t, gid_t, struct fuse_file_info *fi); ++ ++ /** Change the size of a file ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*truncate) (const char *, off_t, struct fuse_file_info *fi); ++ ++ /** Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH) ++ * should be used by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount option is ++ * given, this check is already done by the kernel before calling ++ * open() and may thus be omitted by the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in other all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. ++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in for more details. ++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open will also succeed without being send ++ * to the filesystem process. ++ * ++ */ ++ int (*open) (const char *, struct fuse_file_info *); ++ ++ /** Read data from an open file ++ * ++ * Read should return exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the ++ * 'direct_io' mount option is specified, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ */ ++ int (*read) (const char *, char *, size_t, off_t, ++ struct fuse_file_info *); ++ ++ /** Write data to an open file ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the 'direct_io' ++ * mount option is specified (see read operation). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write) (const char *, const char *, size_t, off_t, ++ struct fuse_file_info *); ++ ++ /** Get file system statistics ++ * ++ * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored ++ */ ++ int (*statfs) (const char *, struct statvfs *); ++ ++ /** Possibly flush cached data ++ * ++ * BIG NOTE: This is not equivalent to fsync(). It's not a ++ * request to sync dirty data. ++ * ++ * Flush is called on each close() of a file descriptor, as opposed to ++ * release which is called on the close of the last file descriptor for ++ * a file. Under Linux, errors returned by flush() will be passed to ++ * userspace as errors from close(), so flush() is a good place to write ++ * back any cached dirty data. However, many applications ignore errors ++ * on close(), and on non-Linux systems, close() may succeed even if flush() ++ * returns an error. For these reasons, filesystems should not assume ++ * that errors returned by flush will ever be noticed or even ++ * delivered. ++ * ++ * NOTE: The flush() method may be called more than once for each ++ * open(). This happens if more than one file descriptor refers to an ++ * open file handle, e.g. due to dup(), dup2() or fork() calls. It is ++ * not possible to determine if a flush is final, so each flush should ++ * be treated equally. Multiple write-flush sequences are relatively ++ * rare, so this shouldn't be a problem. ++ * ++ * Filesystems shouldn't assume that flush will be called at any ++ * particular point. It may be called more times than expected, or not ++ * at all. ++ * ++ * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ int (*flush) (const char *, struct fuse_file_info *); ++ ++ /** Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open() call there will be exactly one release() call ++ * with the same flags and file handle. It is possible to ++ * have a file opened more than once, in which case only the last ++ * release will mean, that no more reads/writes will happen on the ++ * file. The return value of release is ignored. ++ */ ++ int (*release) (const char *, struct fuse_file_info *); ++ ++ /** Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data. ++ */ ++ int (*fsync) (const char *, int, struct fuse_file_info *); ++ ++ /** Set extended attributes */ ++ int (*setxattr) (const char *, const char *, const char *, size_t, int); ++ ++ /** Get extended attributes */ ++ int (*getxattr) (const char *, const char *, char *, size_t); ++ ++ /** List extended attributes */ ++ int (*listxattr) (const char *, char *, size_t); ++ ++ /** Remove extended attributes */ ++ int (*removexattr) (const char *, const char *); ++ ++ /** Open directory ++ * ++ * Unless the 'default_permissions' mount option is given, ++ * this method should check if opendir is permitted for this ++ * directory. Optionally opendir may also return an arbitrary ++ * filehandle in the fuse_file_info structure, which will be ++ * passed to readdir, releasedir and fsyncdir. ++ */ ++ int (*opendir) (const char *, struct fuse_file_info *); ++ ++ /** Read directory ++ * ++ * The filesystem may choose between two modes of operation: ++ * ++ * 1) The readdir implementation ignores the offset parameter, and ++ * passes zero to the filler function's offset. The filler ++ * function will not return '1' (unless an error happens), so the ++ * whole directory is read in a single readdir operation. ++ * ++ * 2) The readdir implementation keeps track of the offsets of the ++ * directory entries. It uses the offset parameter and always ++ * passes non-zero offset to the filler function. When the buffer ++ * is full (or an error happens) the filler function will return ++ * '1'. ++ */ ++ int (*readdir) (const char *, void *, fuse_fill_dir_t, off_t, ++ struct fuse_file_info *, enum fuse_readdir_flags); ++ ++ /** Release directory ++ */ ++ int (*releasedir) (const char *, struct fuse_file_info *); ++ ++ /** Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data ++ */ ++ int (*fsyncdir) (const char *, int, struct fuse_file_info *); ++ ++ /** ++ * Initialize filesystem ++ * ++ * The return value will passed in the `private_data` field of ++ * `struct fuse_context` to all file operations, and as a ++ * parameter to the destroy() method. It overrides the initial ++ * value provided to fuse_main() / fuse_new(). ++ */ ++ void *(*init) (struct fuse_conn_info *conn, ++ struct fuse_config *cfg); ++ ++ /** ++ * Clean up filesystem ++ * ++ * Called on filesystem exit. ++ */ ++ void (*destroy) (void *private_data); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() system call. If the ++ * 'default_permissions' mount option is given, this method is not ++ * called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ */ ++ int (*access) (const char *, int); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ */ ++ int (*create) (const char *, mode_t, struct fuse_file_info *); ++ ++ /** ++ * Perform POSIX file locking operation ++ * ++ * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW. ++ * ++ * For the meaning of fields in 'struct flock' see the man page ++ * for fcntl(2). The l_whence field will always be set to ++ * SEEK_SET. ++ * ++ * For checking lock ownership, the 'fuse_file_info->owner' ++ * argument must be used. ++ * ++ * For F_GETLK operation, the library will first check currently ++ * held locks, and if a conflicting lock is found it will return ++ * information without calling this method. This ensures, that ++ * for local locks the l_pid field is correctly filled in. The ++ * results may not be accurate in case of race conditions and in ++ * the presence of hard links, but it's unlikely that an ++ * application would rely on accurate GETLK results in these ++ * cases. If a conflicting lock is not found, this method will be ++ * called, and the filesystem may fill out l_pid by a meaningful ++ * value, or it may leave this field zero. ++ * ++ * For F_SETLK and F_SETLKW the l_pid field will be set to the pid ++ * of the process performing the locking operation. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*lock) (const char *, struct fuse_file_info *, int cmd, ++ struct flock *); ++ ++ /** ++ * Change the access and modification times of a file with ++ * nanosecond resolution ++ * ++ * This supersedes the old utime() interface. New applications ++ * should use this. ++ * ++ * `fi` will always be NULL if the file is not currenlty open, but ++ * may also be NULL if the file is open. ++ * ++ * See the utimensat(2) man page for details. ++ */ ++ int (*utimens) (const char *, const struct timespec tv[2], ++ struct fuse_file_info *fi); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ */ ++ int (*bmap) (const char *, size_t blocksize, uint64_t *idx); ++ ++ /** ++ * Ioctl ++ * ++ * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in ++ * 64bit environment. The size and direction of data is ++ * determined by _IOC_*() decoding of cmd. For _IOC_NONE, ++ * data will be NULL, for _IOC_WRITE data is out area, for ++ * _IOC_READ in area and if both are set in/out area. In all ++ * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes. ++ * ++ * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a ++ * directory file handle. ++ * ++ * Note : the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ int (*ioctl) (const char *, unsigned int cmd, void *arg, ++ struct fuse_file_info *, unsigned int flags, void *data); ++ ++ /** ++ * Poll for IO readiness events ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness. ++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ */ ++ int (*poll) (const char *, struct fuse_file_info *, ++ struct fuse_pollhandle *ph, unsigned *reventsp); ++ ++ /** Write contents of buffer to an open file ++ * ++ * Similar to the write() method, but data is supplied in a ++ * generic buffer. Use fuse_buf_copy() to transfer data to ++ * the destination. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ */ ++ int (*write_buf) (const char *, struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *); ++ ++ /** Store data from an open file in a buffer ++ * ++ * Similar to the read() method, but data is stored and ++ * returned in a generic buffer. ++ * ++ * No actual copying of data has to take place, the source ++ * file descriptor may simply be stored in the buffer for ++ * later data transfer. ++ * ++ * The buffer must be allocated dynamically and stored at the ++ * location pointed to by bufp. If the buffer contains memory ++ * regions, they too must be allocated using malloc(). The ++ * allocated memory will be freed by the caller. ++ */ ++ int (*read_buf) (const char *, struct fuse_bufvec **bufp, ++ size_t size, off_t off, struct fuse_file_info *); ++ /** ++ * Perform BSD file locking operation ++ * ++ * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN ++ * ++ * Nonblocking requests will be indicated by ORing LOCK_NB to ++ * the above operations ++ * ++ * For more information see the flock(2) manual page. ++ * ++ * Additionally fi->owner will be set to a value unique to ++ * this open file. This same value will be supplied to ++ * ->release() when the file is released. ++ * ++ * Note: if this method is not implemented, the kernel will still ++ * allow file locking to work locally. Hence it is only ++ * interesting for network filesystems and similar. ++ */ ++ int (*flock) (const char *, struct fuse_file_info *, int op); ++ ++ /** ++ * Allocates space for an open file ++ * ++ * This function ensures that required space is allocated for specified ++ * file. If this function returns success then any subsequent write ++ * request to specified range is guaranteed not to fail because of lack ++ * of space on the file system media. ++ */ ++ int (*fallocate) (const char *, int, off_t, off_t, ++ struct fuse_file_info *); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again. ++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. ++ */ ++ ssize_t (*copy_file_range) (const char *path_in, ++ struct fuse_file_info *fi_in, ++ off_t offset_in, const char *path_out, ++ struct fuse_file_info *fi_out, ++ off_t offset_out, size_t size, int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ */ ++ off_t (*lseek) (const char *, off_t off, int whence, struct fuse_file_info *); ++}; ++ ++/** Extra context that may be needed by some filesystems ++ * ++ * The uid, gid and pid fields are not filled in case of a writepage ++ * operation. ++ */ ++struct fuse_context { ++ /** Pointer to the fuse object */ ++ struct fuse *fuse; ++ ++ /** User ID of the calling process */ ++ uid_t uid; ++ ++ /** Group ID of the calling process */ ++ gid_t gid; ++ ++ /** Process ID of the calling thread */ ++ pid_t pid; ++ ++ /** Private filesystem data */ ++ void *private_data; ++ ++ /** Umask of the calling process */ ++ mode_t umask; ++}; ++ ++/** ++ * Main function of FUSE. ++ * ++ * This is for the lazy. This is all that has to be called from the ++ * main() function. ++ * ++ * This function does the following: ++ * - parses command line options, and handles --help and ++ * --version ++ * - installs signal handlers for INT, HUP, TERM and PIPE ++ * - registers an exit handler to unmount the filesystem on program exit ++ * - creates a fuse handle ++ * - registers the operations ++ * - calls either the single-threaded or the multi-threaded event loop ++ * ++ * Most file systems will have to parse some file-system specific ++ * arguments before calling this function. It is recommended to do ++ * this with fuse_opt_parse() and a processing function that passes ++ * through any unknown options (this can also be achieved by just ++ * passing NULL as the processing function). That way, the remaining ++ * options can be passed directly to fuse_main(). ++ * ++ * fuse_main() accepts all options that can be passed to ++ * fuse_parse_cmdline(), fuse_new(), or fuse_session_new(). ++ * ++ * Option parsing skips argv[0], which is assumed to contain the ++ * program name. This element must always be present and is used to ++ * construct a basic ``usage: `` message for the --help ++ * output. argv[0] may also be set to the empty string. In this case ++ * the usage message is suppressed. This can be used by file systems ++ * to print their own usage line first. See hello.c for an example of ++ * how to do this. ++ * ++ * Note: this is currently implemented as a macro. ++ * ++ * The following error codes may be returned from fuse_main(): ++ * 1: Invalid option arguments ++ * 2: No mount point specified ++ * 3: FUSE setup failed ++ * 4: Mounting failed ++ * 5: Failed to daemonize (detach from session) ++ * 6: Failed to set up signal handlers ++ * 7: An error occured during the life of the file system ++ * ++ * @param argc the argument counter passed to the main() function ++ * @param argv the argument vector passed to the main() function ++ * @param op the file system operation ++ * @param private_data Initial value for the `private_data` ++ * field of `struct fuse_context`. May be overridden by the ++ * `struct fuse_operations.init` handler. ++ * @return 0 on success, nonzero on failure ++ * ++ * Example usage, see hello.c ++ */ ++/* ++ int fuse_main(int argc, char *argv[], const struct fuse_operations *op, ++ void *private_data); ++*/ ++#define fuse_main(argc, argv, op, private_data) \ ++ fuse_main_real(argc, argv, op, sizeof(*(op)), private_data) ++ ++/* ----------------------------------------------------------- * ++ * More detailed API * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Print available options (high- and low-level) to stdout. This is ++ * not an exhaustive list, but includes only those options that may be ++ * of interest to an end-user of a file system. ++ * ++ * The function looks at the argument vector only to determine if ++ * there are additional modules to be loaded (module=foo option), ++ * and attempts to call their help functions as well. ++ * ++ * @param args the argument vector. ++ */ ++void fuse_lib_help(struct fuse_args *args); ++ ++/** ++ * Create a new FUSE filesystem. ++ * ++ * This function accepts most file-system independent mount options ++ * (like context, nodev, ro - see mount(8)), as well as the ++ * FUSE-specific mount options from mount.fuse(8). ++ * ++ * If the --help option is specified, the function writes a help text ++ * to stdout and returns NULL. ++ * ++ * Option parsing skips argv[0], which is assumed to contain the ++ * program name. This element must always be present and is used to ++ * construct a basic ``usage: `` message for the --help output. If ++ * argv[0] is set to the empty string, no usage message is included in ++ * the --help output. ++ * ++ * If an unknown option is passed in, an error message is written to ++ * stderr and the function returns NULL. ++ * ++ * @param args argument vector ++ * @param op the filesystem operations ++ * @param op_size the size of the fuse_operations structure ++ * @param private_data Initial value for the `private_data` ++ * field of `struct fuse_context`. May be overridden by the ++ * `struct fuse_operations.init` handler. ++ * @return the created FUSE handle ++ */ ++#if FUSE_USE_VERSION == 30 ++struct fuse *fuse_new_30(struct fuse_args *args, const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++#define fuse_new(args, op, size, data) fuse_new_30(args, op, size, data) ++#else ++struct fuse *fuse_new(struct fuse_args *args, const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++#endif ++ ++/** ++ * Mount a FUSE file system. ++ * ++ * @param mountpoint the mount point path ++ * @param f the FUSE handle ++ * ++ * @return 0 on success, -1 on failure. ++ **/ ++int fuse_mount(struct fuse *f, const char *mountpoint); ++ ++/** ++ * Unmount a FUSE file system. ++ * ++ * See fuse_session_unmount() for additional information. ++ * ++ * @param f the FUSE handle ++ **/ ++void fuse_unmount(struct fuse *f); ++ ++/** ++ * Destroy the FUSE handle. ++ * ++ * NOTE: This function does not unmount the filesystem. If this is ++ * needed, call fuse_unmount() before calling this function. ++ * ++ * @param f the FUSE handle ++ */ ++void fuse_destroy(struct fuse *f); ++ ++/** ++ * FUSE event loop. ++ * ++ * Requests from the kernel are processed, and the appropriate ++ * operations are called. ++ * ++ * For a description of the return value and the conditions when the ++ * event loop exits, refer to the documentation of ++ * fuse_session_loop(). ++ * ++ * @param f the FUSE handle ++ * @return see fuse_session_loop() ++ * ++ * See also: fuse_loop_mt() ++ */ ++int fuse_loop(struct fuse *f); ++ ++/** ++ * Flag session as terminated ++ * ++ * This function will cause any running event loops to exit on ++ * the next opportunity. ++ * ++ * @param f the FUSE handle ++ */ ++void fuse_exit(struct fuse *f); ++ ++/** ++ * FUSE event loop with multiple threads ++ * ++ * Requests from the kernel are processed, and the appropriate ++ * operations are called. Request are processed in parallel by ++ * distributing them between multiple threads. ++ * ++ * For a description of the return value and the conditions when the ++ * event loop exits, refer to the documentation of ++ * fuse_session_loop(). ++ * ++ * Note: using fuse_loop() instead of fuse_loop_mt() means you are running in ++ * single-threaded mode, and that you will not have to worry about reentrancy, ++ * though you will have to worry about recursive lookups. In single-threaded ++ * mode, FUSE will wait for one callback to return before calling another. ++ * ++ * Enabling multiple threads, by using fuse_loop_mt(), will cause FUSE to make ++ * multiple simultaneous calls into the various callback functions given by your ++ * fuse_operations record. ++ * ++ * If you are using multiple threads, you can enjoy all the parallel execution ++ * and interactive response benefits of threads, and you get to enjoy all the ++ * benefits of race conditions and locking bugs, too. Ensure that any code used ++ * in the callback function of fuse_operations is also thread-safe. ++ * ++ * @param f the FUSE handle ++ * @param config loop configuration ++ * @return see fuse_session_loop() ++ * ++ * See also: fuse_loop() ++ */ ++#if FUSE_USE_VERSION < 32 ++int fuse_loop_mt_31(struct fuse *f, int clone_fd); ++#define fuse_loop_mt(f, clone_fd) fuse_loop_mt_31(f, clone_fd) ++#else ++int fuse_loop_mt(struct fuse *f, struct fuse_loop_config *config); ++#endif ++ ++/** ++ * Get the current context ++ * ++ * The context is only valid for the duration of a filesystem ++ * operation, and thus must not be stored and used later. ++ * ++ * @return the context ++ */ ++struct fuse_context *fuse_get_context(void); ++ ++/** ++ * Get the current supplementary group IDs for the current request ++ * ++ * Similar to the getgroups(2) system call, except the return value is ++ * always the total number of group IDs, even if it is larger than the ++ * specified size. ++ * ++ * The current fuse kernel module in linux (as of 2.6.30) doesn't pass ++ * the group list to userspace, hence this function needs to parse ++ * "/proc/$TID/task/$TID/status" to get the group IDs. ++ * ++ * This feature may not be supported on all operating systems. In ++ * such a case this function will return -ENOSYS. ++ * ++ * @param size size of given array ++ * @param list array of group IDs to be filled in ++ * @return the total number of supplementary group IDs or -errno on failure ++ */ ++int fuse_getgroups(int size, gid_t list[]); ++ ++/** ++ * Check if the current request has already been interrupted ++ * ++ * @return 1 if the request has been interrupted, 0 otherwise ++ */ ++int fuse_interrupted(void); ++ ++/** ++ * Invalidates cache for the given path. ++ * ++ * This calls fuse_lowlevel_notify_inval_inode internally. ++ * ++ * @return 0 on successful invalidation, negative error value otherwise. ++ * This routine may return -ENOENT to indicate that there was ++ * no entry to be invalidated, e.g., because the path has not ++ * been seen before or has been forgotten; this should not be ++ * considered to be an error. ++ */ ++int fuse_invalidate_path(struct fuse *f, const char *path); ++ ++/** ++ * The real main function ++ * ++ * Do not call this directly, use fuse_main() ++ */ ++int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++ ++/** ++ * Start the cleanup thread when using option "remember". ++ * ++ * This is done automatically by fuse_loop_mt() ++ * @param fuse struct fuse pointer for fuse instance ++ * @return 0 on success and -1 on error ++ */ ++int fuse_start_cleanup_thread(struct fuse *fuse); ++ ++/** ++ * Stop the cleanup thread when using option "remember". ++ * ++ * This is done automatically by fuse_loop_mt() ++ * @param fuse struct fuse pointer for fuse instance ++ */ ++void fuse_stop_cleanup_thread(struct fuse *fuse); ++ ++/** ++ * Iterate over cache removing stale entries ++ * use in conjunction with "-oremember" ++ * ++ * NOTE: This is already done for the standard sessions ++ * ++ * @param fuse struct fuse pointer for fuse instance ++ * @return the number of seconds until the next cleanup ++ */ ++int fuse_clean_cache(struct fuse *fuse); ++ ++/* ++ * Stacking API ++ */ ++ ++/** ++ * Fuse filesystem object ++ * ++ * This is opaque object represents a filesystem layer ++ */ ++struct fuse_fs; ++ ++/* ++ * These functions call the relevant filesystem operation, and return ++ * the result. ++ * ++ * If the operation is not defined, they return -ENOSYS, with the ++ * exception of fuse_fs_open, fuse_fs_release, fuse_fs_opendir, ++ * fuse_fs_releasedir and fuse_fs_statfs, which return 0. ++ */ ++ ++int fuse_fs_getattr(struct fuse_fs *fs, const char *path, struct stat *buf, ++ struct fuse_file_info *fi); ++int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, ++ const char *newpath, unsigned int flags); ++int fuse_fs_unlink(struct fuse_fs *fs, const char *path); ++int fuse_fs_rmdir(struct fuse_fs *fs, const char *path); ++int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, ++ const char *path); ++int fuse_fs_link(struct fuse_fs *fs, const char *oldpath, const char *newpath); ++int fuse_fs_release(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_open(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_read(struct fuse_fs *fs, const char *path, char *buf, size_t size, ++ off_t off, struct fuse_file_info *fi); ++int fuse_fs_read_buf(struct fuse_fs *fs, const char *path, ++ struct fuse_bufvec **bufp, size_t size, off_t off, ++ struct fuse_file_info *fi); ++int fuse_fs_write(struct fuse_fs *fs, const char *path, const char *buf, ++ size_t size, off_t off, struct fuse_file_info *fi); ++int fuse_fs_write_buf(struct fuse_fs *fs, const char *path, ++ struct fuse_bufvec *buf, off_t off, ++ struct fuse_file_info *fi); ++int fuse_fs_fsync(struct fuse_fs *fs, const char *path, int datasync, ++ struct fuse_file_info *fi); ++int fuse_fs_flush(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_statfs(struct fuse_fs *fs, const char *path, struct statvfs *buf); ++int fuse_fs_opendir(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_readdir(struct fuse_fs *fs, const char *path, void *buf, ++ fuse_fill_dir_t filler, off_t off, ++ struct fuse_file_info *fi, enum fuse_readdir_flags flags); ++int fuse_fs_fsyncdir(struct fuse_fs *fs, const char *path, int datasync, ++ struct fuse_file_info *fi); ++int fuse_fs_releasedir(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi); ++int fuse_fs_create(struct fuse_fs *fs, const char *path, mode_t mode, ++ struct fuse_file_info *fi); ++int fuse_fs_lock(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi, int cmd, struct flock *lock); ++int fuse_fs_flock(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi, int op); ++int fuse_fs_chmod(struct fuse_fs *fs, const char *path, mode_t mode, ++ struct fuse_file_info *fi); ++int fuse_fs_chown(struct fuse_fs *fs, const char *path, uid_t uid, gid_t gid, ++ struct fuse_file_info *fi); ++int fuse_fs_truncate(struct fuse_fs *fs, const char *path, off_t size, ++ struct fuse_file_info *fi); ++int fuse_fs_utimens(struct fuse_fs *fs, const char *path, ++ const struct timespec tv[2], struct fuse_file_info *fi); ++int fuse_fs_access(struct fuse_fs *fs, const char *path, int mask); ++int fuse_fs_readlink(struct fuse_fs *fs, const char *path, char *buf, ++ size_t len); ++int fuse_fs_mknod(struct fuse_fs *fs, const char *path, mode_t mode, ++ dev_t rdev); ++int fuse_fs_mkdir(struct fuse_fs *fs, const char *path, mode_t mode); ++int fuse_fs_setxattr(struct fuse_fs *fs, const char *path, const char *name, ++ const char *value, size_t size, int flags); ++int fuse_fs_getxattr(struct fuse_fs *fs, const char *path, const char *name, ++ char *value, size_t size); ++int fuse_fs_listxattr(struct fuse_fs *fs, const char *path, char *list, ++ size_t size); ++int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, ++ const char *name); ++int fuse_fs_bmap(struct fuse_fs *fs, const char *path, size_t blocksize, ++ uint64_t *idx); ++int fuse_fs_ioctl(struct fuse_fs *fs, const char *path, unsigned int cmd, ++ void *arg, struct fuse_file_info *fi, unsigned int flags, ++ void *data); ++int fuse_fs_poll(struct fuse_fs *fs, const char *path, ++ struct fuse_file_info *fi, struct fuse_pollhandle *ph, ++ unsigned *reventsp); ++int fuse_fs_fallocate(struct fuse_fs *fs, const char *path, int mode, ++ off_t offset, off_t length, struct fuse_file_info *fi); ++ssize_t fuse_fs_copy_file_range(struct fuse_fs *fs, const char *path_in, ++ struct fuse_file_info *fi_in, off_t off_in, ++ const char *path_out, ++ struct fuse_file_info *fi_out, off_t off_out, ++ size_t len, int flags); ++off_t fuse_fs_lseek(struct fuse_fs *fs, const char *path, off_t off, int whence, ++ struct fuse_file_info *fi); ++void fuse_fs_init(struct fuse_fs *fs, struct fuse_conn_info *conn, ++ struct fuse_config *cfg); ++void fuse_fs_destroy(struct fuse_fs *fs); ++ ++int fuse_notify_poll(struct fuse_pollhandle *ph); ++ ++/** ++ * Create a new fuse filesystem object ++ * ++ * This is usually called from the factory of a fuse module to create ++ * a new instance of a filesystem. ++ * ++ * @param op the filesystem operations ++ * @param op_size the size of the fuse_operations structure ++ * @param private_data Initial value for the `private_data` ++ * field of `struct fuse_context`. May be overridden by the ++ * `struct fuse_operations.init` handler. ++ * @return a new filesystem object ++ */ ++struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size, ++ void *private_data); ++ ++/** ++ * Factory for creating filesystem objects ++ * ++ * The function may use and remove options from 'args' that belong ++ * to this module. ++ * ++ * For now the 'fs' vector always contains exactly one filesystem. ++ * This is the filesystem which will be below the newly created ++ * filesystem in the stack. ++ * ++ * @param args the command line arguments ++ * @param fs NULL terminated filesystem object vector ++ * @return the new filesystem object ++ */ ++typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args, ++ struct fuse_fs *fs[]); ++/** ++ * Register filesystem module ++ * ++ * If the "-omodules=*name*_:..." option is present, filesystem ++ * objects are created and pushed onto the stack with the *factory_* ++ * function. ++ * ++ * @param name_ the name of this filesystem module ++ * @param factory_ the factory function for this filesystem module ++ */ ++#define FUSE_REGISTER_MODULE(name_, factory_) \ ++ fuse_module_factory_t fuse_module_ ## name_ ## _factory = factory_ ++ ++/** Get session from fuse object */ ++struct fuse_session *fuse_get_session(struct fuse *f); ++ ++/** ++ * Open a FUSE file descriptor and set up the mount for the given ++ * mountpoint and flags. ++ * ++ * @param mountpoint reference to the mount in the file system ++ * @param options mount options ++ * @return the FUSE file descriptor or -1 upon error ++ */ ++int fuse_open_channel(const char *mountpoint, const char *options); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_H_ */ +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +new file mode 100644 +index 0000000..2d686b2 +--- /dev/null ++++ b/tools/virtiofsd/fuse_common.h +@@ -0,0 +1,823 @@ ++/* FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++/** @file */ ++ ++#if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_) ++#error "Never include directly; use or instead." ++#endif ++ ++#ifndef FUSE_COMMON_H_ ++#define FUSE_COMMON_H_ ++ ++#include "fuse_opt.h" ++#include "fuse_log.h" ++#include ++#include ++ ++/** Major version of FUSE library interface */ ++#define FUSE_MAJOR_VERSION 3 ++ ++/** Minor version of FUSE library interface */ ++#define FUSE_MINOR_VERSION 2 ++ ++#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) ++#define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/** ++ * Information about an open file. ++ * ++ * File Handles are created by the open, opendir, and create methods and closed ++ * by the release and releasedir methods. Multiple file handles may be ++ * concurrently open for the same file. Generally, a client will create one ++ * file handle per file descriptor, though in some cases multiple file ++ * descriptors can share a single file handle. ++ */ ++struct fuse_file_info { ++ /** Open flags. Available in open() and release() */ ++ int flags; ++ ++ /** In case of a write operation indicates if this was caused ++ by a delayed write from the page cache. If so, then the ++ context's pid, uid, and gid fields will not be valid, and ++ the *fh* value may not match the *fh* value that would ++ have been sent with the corresponding individual write ++ requests if write caching had been disabled. */ ++ unsigned int writepage : 1; ++ ++ /** Can be filled in by open, to use direct I/O on this file. */ ++ unsigned int direct_io : 1; ++ ++ /** Can be filled in by open. It signals the kernel that any ++ currently cached file data (ie., data that the filesystem ++ provided the last time the file was open) need not be ++ invalidated. Has no effect when set in other contexts (in ++ particular it does nothing when set by opendir()). */ ++ unsigned int keep_cache : 1; ++ ++ /** Indicates a flush operation. Set in flush operation, also ++ maybe set in highlevel lock operation and lowlevel release ++ operation. */ ++ unsigned int flush : 1; ++ ++ /** Can be filled in by open, to indicate that the file is not ++ seekable. */ ++ unsigned int nonseekable : 1; ++ ++ /* Indicates that flock locks for this file should be ++ released. If set, lock_owner shall contain a valid value. ++ May only be set in ->release(). */ ++ unsigned int flock_release : 1; ++ ++ /** Can be filled in by opendir. It signals the kernel to ++ enable caching of entries returned by readdir(). Has no ++ effect when set in other contexts (in particular it does ++ nothing when set by open()). */ ++ unsigned int cache_readdir : 1; ++ ++ /** Padding. Reserved for future use*/ ++ unsigned int padding : 25; ++ unsigned int padding2 : 32; ++ ++ /** File handle id. May be filled in by filesystem in create, ++ * open, and opendir(). Available in most other file operations on the ++ * same file handle. */ ++ uint64_t fh; ++ ++ /** Lock owner id. Available in locking operations and flush */ ++ uint64_t lock_owner; ++ ++ /** Requested poll events. Available in ->poll. Only set on kernels ++ which support it. If unsupported, this field is set to zero. */ ++ uint32_t poll_events; ++}; ++ ++/** ++ * Configuration parameters passed to fuse_session_loop_mt() and ++ * fuse_loop_mt(). ++ */ ++struct fuse_loop_config { ++ /** ++ * whether to use separate device fds for each thread ++ * (may increase performance) ++ */ ++ int clone_fd; ++ ++ /** ++ * The maximum number of available worker threads before they ++ * start to get deleted when they become idle. If not ++ * specified, the default is 10. ++ * ++ * Adjusting this has performance implications; a very small number ++ * of threads in the pool will cause a lot of thread creation and ++ * deletion overhead and performance may suffer. When set to 0, a new ++ * thread will be created to service every operation. ++ */ ++ unsigned int max_idle_threads; ++}; ++ ++/************************************************************************** ++ * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' * ++ **************************************************************************/ ++ ++/** ++ * Indicates that the filesystem supports asynchronous read requests. ++ * ++ * If this capability is not requested/available, the kernel will ++ * ensure that there is at most one pending read request per ++ * file-handle at any time, and will attempt to order read requests by ++ * increasing offset. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_ASYNC_READ (1 << 0) ++ ++/** ++ * Indicates that the filesystem supports "remote" locking. ++ * ++ * This feature is enabled by default when supported by the kernel, ++ * and if getlk() and setlk() handlers are implemented. ++ */ ++#define FUSE_CAP_POSIX_LOCKS (1 << 1) ++ ++/** ++ * Indicates that the filesystem supports the O_TRUNC open flag. If ++ * disabled, and an application specifies O_TRUNC, fuse first calls ++ * truncate() and then open() with O_TRUNC filtered out. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) ++ ++/** ++ * Indicates that the filesystem supports lookups of "." and "..". ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) ++ ++/** ++ * Indicates that the kernel should not apply the umask to the ++ * file mode on create operations. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_DONT_MASK (1 << 6) ++ ++/** ++ * Indicates that libfuse should try to use splice() when writing to ++ * the fuse device. This may improve performance. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_SPLICE_WRITE (1 << 7) ++ ++/** ++ * Indicates that libfuse should try to move pages instead of copying when ++ * writing to / reading from the fuse device. This may improve performance. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_SPLICE_MOVE (1 << 8) ++ ++/** ++ * Indicates that libfuse should try to use splice() when reading from ++ * the fuse device. This may improve performance. ++ * ++ * This feature is enabled by default when supported by the kernel and ++ * if the filesystem implements a write_buf() handler. ++ */ ++#define FUSE_CAP_SPLICE_READ (1 << 9) ++ ++/** ++ * If set, the calls to flock(2) will be emulated using POSIX locks and must ++ * then be handled by the filesystem's setlock() handler. ++ * ++ * If not set, flock(2) calls will be handled by the FUSE kernel module ++ * internally (so any access that does not go through the kernel cannot be taken ++ * into account). ++ * ++ * This feature is enabled by default when supported by the kernel and ++ * if the filesystem implements a flock() handler. ++ */ ++#define FUSE_CAP_FLOCK_LOCKS (1 << 10) ++ ++/** ++ * Indicates that the filesystem supports ioctl's on directories. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_IOCTL_DIR (1 << 11) ++ ++/** ++ * Traditionally, while a file is open the FUSE kernel module only ++ * asks the filesystem for an update of the file's attributes when a ++ * client attempts to read beyond EOF. This is unsuitable for ++ * e.g. network filesystems, where the file contents may change ++ * without the kernel knowing about it. ++ * ++ * If this flag is set, FUSE will check the validity of the attributes ++ * on every read. If the attributes are no longer valid (i.e., if the ++ * *attr_timeout* passed to fuse_reply_attr() or set in `struct ++ * fuse_entry_param` has passed), it will first issue a `getattr` ++ * request. If the new mtime differs from the previous value, any ++ * cached file *contents* will be invalidated as well. ++ * ++ * This flag should always be set when available. If all file changes ++ * go through the kernel, *attr_timeout* should be set to a very large ++ * number to avoid unnecessary getattr() calls. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) ++ ++/** ++ * Indicates that the filesystem supports readdirplus. ++ * ++ * This feature is enabled by default when supported by the kernel and if the ++ * filesystem implements a readdirplus() handler. ++ */ ++#define FUSE_CAP_READDIRPLUS (1 << 13) ++ ++/** ++ * Indicates that the filesystem supports adaptive readdirplus. ++ * ++ * If FUSE_CAP_READDIRPLUS is not set, this flag has no effect. ++ * ++ * If FUSE_CAP_READDIRPLUS is set and this flag is not set, the kernel ++ * will always issue readdirplus() requests to retrieve directory ++ * contents. ++ * ++ * If FUSE_CAP_READDIRPLUS is set and this flag is set, the kernel ++ * will issue both readdir() and readdirplus() requests, depending on ++ * how much information is expected to be required. ++ * ++ * As of Linux 4.20, the algorithm is as follows: when userspace ++ * starts to read directory entries, issue a READDIRPLUS request to ++ * the filesystem. If any entry attributes have been looked up by the ++ * time userspace requests the next batch of entries continue with ++ * READDIRPLUS, otherwise switch to plain READDIR. This will reasult ++ * in eg plain "ls" triggering READDIRPLUS first then READDIR after ++ * that because it doesn't do lookups. "ls -l" should result in all ++ * READDIRPLUS, except if dentries are already cached. ++ * ++ * This feature is enabled by default when supported by the kernel and ++ * if the filesystem implements both a readdirplus() and a readdir() ++ * handler. ++ */ ++#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) ++ ++/** ++ * Indicates that the filesystem supports asynchronous direct I/O submission. ++ * ++ * If this capability is not requested/available, the kernel will ensure that ++ * there is at most one pending read and one pending write request per direct ++ * I/O file-handle at any time. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_ASYNC_DIO (1 << 15) ++ ++/** ++ * Indicates that writeback caching should be enabled. This means that ++ * individual write request may be buffered and merged in the kernel ++ * before they are send to the filesystem. ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) ++ ++/** ++ * Indicates support for zero-message opens. If this flag is set in ++ * the `capable` field of the `fuse_conn_info` structure, then the ++ * filesystem may return `ENOSYS` from the open() handler to indicate ++ * success. Further attempts to open files will be handled in the ++ * kernel. (If this flag is not set, returning ENOSYS will be treated ++ * as an error and signaled to the caller). ++ * ++ * Setting (or unsetting) this flag in the `want` field has *no ++ * effect*. ++ */ ++#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) ++ ++/** ++ * Indicates support for parallel directory operations. If this flag ++ * is unset, the FUSE kernel module will ensure that lookup() and ++ * readdir() requests are never issued concurrently for the same ++ * directory. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) ++ ++/** ++ * Indicates support for POSIX ACLs. ++ * ++ * If this feature is enabled, the kernel will cache and have ++ * responsibility for enforcing ACLs. ACL will be stored as xattrs and ++ * passed to userspace, which is responsible for updating the ACLs in ++ * the filesystem, keeping the file mode in sync with the ACL, and ++ * ensuring inheritance of default ACLs when new filesystem nodes are ++ * created. Note that this requires that the file system is able to ++ * parse and interpret the xattr representation of ACLs. ++ * ++ * Enabling this feature implicitly turns on the ++ * ``default_permissions`` mount option (even if it was not passed to ++ * mount(2)). ++ * ++ * This feature is disabled by default. ++ */ ++#define FUSE_CAP_POSIX_ACL (1 << 19) ++ ++/** ++ * Indicates that the filesystem is responsible for unsetting ++ * setuid and setgid bits when a file is written, truncated, or ++ * its owner is changed. ++ * ++ * This feature is enabled by default when supported by the kernel. ++ */ ++#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) ++ ++/** ++ * Indicates support for zero-message opendirs. If this flag is set in ++ * the `capable` field of the `fuse_conn_info` structure, then the filesystem ++ * may return `ENOSYS` from the opendir() handler to indicate success. Further ++ * opendir and releasedir messages will be handled in the kernel. (If this ++ * flag is not set, returning ENOSYS will be treated as an error and signalled ++ * to the caller.) ++ * ++ * Setting (or unsetting) this flag in the `want` field has *no effect*. ++ */ ++#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) ++ ++/** ++ * Ioctl flags ++ * ++ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine ++ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed ++ * FUSE_IOCTL_RETRY: retry with new iovecs ++ * FUSE_IOCTL_DIR: is a directory ++ * ++ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs ++ */ ++#define FUSE_IOCTL_COMPAT (1 << 0) ++#define FUSE_IOCTL_UNRESTRICTED (1 << 1) ++#define FUSE_IOCTL_RETRY (1 << 2) ++#define FUSE_IOCTL_DIR (1 << 4) ++ ++#define FUSE_IOCTL_MAX_IOV 256 ++ ++/** ++ * Connection information, passed to the ->init() method ++ * ++ * Some of the elements are read-write, these can be changed to ++ * indicate the value requested by the filesystem. The requested ++ * value must usually be smaller than the indicated value. ++ */ ++struct fuse_conn_info { ++ /** ++ * Major version of the protocol (read-only) ++ */ ++ unsigned proto_major; ++ ++ /** ++ * Minor version of the protocol (read-only) ++ */ ++ unsigned proto_minor; ++ ++ /** ++ * Maximum size of the write buffer ++ */ ++ unsigned max_write; ++ ++ /** ++ * Maximum size of read requests. A value of zero indicates no ++ * limit. However, even if the filesystem does not specify a ++ * limit, the maximum size of read requests will still be ++ * limited by the kernel. ++ * ++ * NOTE: For the time being, the maximum size of read requests ++ * must be set both here *and* passed to fuse_session_new() ++ * using the ``-o max_read=`` mount option. At some point ++ * in the future, specifying the mount option will no longer ++ * be necessary. ++ */ ++ unsigned max_read; ++ ++ /** ++ * Maximum readahead ++ */ ++ unsigned max_readahead; ++ ++ /** ++ * Capability flags that the kernel supports (read-only) ++ */ ++ unsigned capable; ++ ++ /** ++ * Capability flags that the filesystem wants to enable. ++ * ++ * libfuse attempts to initialize this field with ++ * reasonable default values before calling the init() handler. ++ */ ++ unsigned want; ++ ++ /** ++ * Maximum number of pending "background" requests. A ++ * background request is any type of request for which the ++ * total number is not limited by other means. As of kernel ++ * 4.8, only two types of requests fall into this category: ++ * ++ * 1. Read-ahead requests ++ * 2. Asynchronous direct I/O requests ++ * ++ * Read-ahead requests are generated (if max_readahead is ++ * non-zero) by the kernel to preemptively fill its caches ++ * when it anticipates that userspace will soon read more ++ * data. ++ * ++ * Asynchronous direct I/O requests are generated if ++ * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large ++ * direct I/O request. In this case the kernel will internally ++ * split it up into multiple smaller requests and submit them ++ * to the filesystem concurrently. ++ * ++ * Note that the following requests are *not* background ++ * requests: writeback requests (limited by the kernel's ++ * flusher algorithm), regular (i.e., synchronous and ++ * buffered) userspace read/write requests (limited to one per ++ * thread), asynchronous read requests (Linux's io_submit(2) ++ * call actually blocks, so these are also limited to one per ++ * thread). ++ */ ++ unsigned max_background; ++ ++ /** ++ * Kernel congestion threshold parameter. If the number of pending ++ * background requests exceeds this number, the FUSE kernel module will ++ * mark the filesystem as "congested". This instructs the kernel to ++ * expect that queued requests will take some time to complete, and to ++ * adjust its algorithms accordingly (e.g. by putting a waiting thread ++ * to sleep instead of using a busy-loop). ++ */ ++ unsigned congestion_threshold; ++ ++ /** ++ * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible ++ * for updating mtime and ctime when write requests are received. The ++ * updated values are passed to the filesystem with setattr() requests. ++ * However, if the filesystem does not support the full resolution of ++ * the kernel timestamps (nanoseconds), the mtime and ctime values used ++ * by kernel and filesystem will differ (and result in an apparent ++ * change of times after a cache flush). ++ * ++ * To prevent this problem, this variable can be used to inform the ++ * kernel about the timestamp granularity supported by the file-system. ++ * The value should be power of 10. The default is 1, i.e. full ++ * nano-second resolution. Filesystems supporting only second resolution ++ * should set this to 1000000000. ++ */ ++ unsigned time_gran; ++ ++ /** ++ * For future use. ++ */ ++ unsigned reserved[22]; ++}; ++ ++struct fuse_session; ++struct fuse_pollhandle; ++struct fuse_conn_info_opts; ++ ++/** ++ * This function parses several command-line options that can be used ++ * to override elements of struct fuse_conn_info. The pointer returned ++ * by this function should be passed to the ++ * fuse_apply_conn_info_opts() method by the file system's init() ++ * handler. ++ * ++ * Before using this function, think twice if you really want these ++ * parameters to be adjustable from the command line. In most cases, ++ * they should be determined by the file system internally. ++ * ++ * The following options are recognized: ++ * ++ * -o max_write=N sets conn->max_write ++ * -o max_readahead=N sets conn->max_readahead ++ * -o max_background=N sets conn->max_background ++ * -o congestion_threshold=N sets conn->congestion_threshold ++ * -o async_read sets FUSE_CAP_ASYNC_READ in conn->want ++ * -o sync_read unsets FUSE_CAP_ASYNC_READ in conn->want ++ * -o atomic_o_trunc sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want ++ * -o no_remote_lock Equivalent to -o no_remote_flock,no_remote_posix_lock ++ * -o no_remote_flock Unsets FUSE_CAP_FLOCK_LOCKS in conn->want ++ * -o no_remote_posix_lock Unsets FUSE_CAP_POSIX_LOCKS in conn->want ++ * -o [no_]splice_write (un-)sets FUSE_CAP_SPLICE_WRITE in conn->want ++ * -o [no_]splice_move (un-)sets FUSE_CAP_SPLICE_MOVE in conn->want ++ * -o [no_]splice_read (un-)sets FUSE_CAP_SPLICE_READ in conn->want ++ * -o [no_]auto_inval_data (un-)sets FUSE_CAP_AUTO_INVAL_DATA in conn->want ++ * -o readdirplus=no unsets FUSE_CAP_READDIRPLUS in conn->want ++ * -o readdirplus=yes sets FUSE_CAP_READDIRPLUS and unsets ++ * FUSE_CAP_READDIRPLUS_AUTO in conn->want ++ * -o readdirplus=auto sets FUSE_CAP_READDIRPLUS and ++ * FUSE_CAP_READDIRPLUS_AUTO in conn->want ++ * -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in conn->want ++ * -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in conn->want ++ * -o time_gran=N sets conn->time_gran ++ * ++ * Known options will be removed from *args*, unknown options will be ++ * passed through unchanged. ++ * ++ * @param args argument vector (input+output) ++ * @return parsed options ++ **/ ++struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args); ++ ++/** ++ * This function applies the (parsed) parameters in *opts* to the ++ * *conn* pointer. It may modify the following fields: wants, ++ * max_write, max_readahead, congestion_threshold, max_background, ++ * time_gran. A field is only set (or unset) if the corresponding ++ * option has been explicitly set. ++ */ ++void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, ++ struct fuse_conn_info *conn); ++ ++/** ++ * Go into the background ++ * ++ * @param foreground if true, stay in the foreground ++ * @return 0 on success, -1 on failure ++ */ ++int fuse_daemonize(int foreground); ++ ++/** ++ * Get the version of the library ++ * ++ * @return the version ++ */ ++int fuse_version(void); ++ ++/** ++ * Get the full package version string of the library ++ * ++ * @return the package version ++ */ ++const char *fuse_pkgversion(void); ++ ++/** ++ * Destroy poll handle ++ * ++ * @param ph the poll handle ++ */ ++void fuse_pollhandle_destroy(struct fuse_pollhandle *ph); ++ ++/* ----------------------------------------------------------- * ++ * Data buffer * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Buffer flags ++ */ ++enum fuse_buf_flags { ++ /** ++ * Buffer contains a file descriptor ++ * ++ * If this flag is set, the .fd field is valid, otherwise the ++ * .mem fields is valid. ++ */ ++ FUSE_BUF_IS_FD = (1 << 1), ++ ++ /** ++ * Seek on the file descriptor ++ * ++ * If this flag is set then the .pos field is valid and is ++ * used to seek to the given offset before performing ++ * operation on file descriptor. ++ */ ++ FUSE_BUF_FD_SEEK = (1 << 2), ++ ++ /** ++ * Retry operation on file descriptor ++ * ++ * If this flag is set then retry operation on file descriptor ++ * until .size bytes have been copied or an error or EOF is ++ * detected. ++ */ ++ FUSE_BUF_FD_RETRY = (1 << 3), ++}; ++ ++/** ++ * Buffer copy flags ++ */ ++enum fuse_buf_copy_flags { ++ /** ++ * Don't use splice(2) ++ * ++ * Always fall back to using read and write instead of ++ * splice(2) to copy data from one file descriptor to another. ++ * ++ * If this flag is not set, then only fall back if splice is ++ * unavailable. ++ */ ++ FUSE_BUF_NO_SPLICE = (1 << 1), ++ ++ /** ++ * Force splice ++ * ++ * Always use splice(2) to copy data from one file descriptor ++ * to another. If splice is not available, return -EINVAL. ++ */ ++ FUSE_BUF_FORCE_SPLICE = (1 << 2), ++ ++ /** ++ * Try to move data with splice. ++ * ++ * If splice is used, try to move pages from the source to the ++ * destination instead of copying. See documentation of ++ * SPLICE_F_MOVE in splice(2) man page. ++ */ ++ FUSE_BUF_SPLICE_MOVE = (1 << 3), ++ ++ /** ++ * Don't block on the pipe when copying data with splice ++ * ++ * Makes the operations on the pipe non-blocking (if the pipe ++ * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) ++ * man page. ++ */ ++ FUSE_BUF_SPLICE_NONBLOCK= (1 << 4), ++}; ++ ++/** ++ * Single data buffer ++ * ++ * Generic data buffer for I/O, extended attributes, etc... Data may ++ * be supplied as a memory pointer or as a file descriptor ++ */ ++struct fuse_buf { ++ /** ++ * Size of data in bytes ++ */ ++ size_t size; ++ ++ /** ++ * Buffer flags ++ */ ++ enum fuse_buf_flags flags; ++ ++ /** ++ * Memory pointer ++ * ++ * Used unless FUSE_BUF_IS_FD flag is set. ++ */ ++ void *mem; ++ ++ /** ++ * File descriptor ++ * ++ * Used if FUSE_BUF_IS_FD flag is set. ++ */ ++ int fd; ++ ++ /** ++ * File position ++ * ++ * Used if FUSE_BUF_FD_SEEK flag is set. ++ */ ++ off_t pos; ++}; ++ ++/** ++ * Data buffer vector ++ * ++ * An array of data buffers, each containing a memory pointer or a ++ * file descriptor. ++ * ++ * Allocate dynamically to add more than one buffer. ++ */ ++struct fuse_bufvec { ++ /** ++ * Number of buffers in the array ++ */ ++ size_t count; ++ ++ /** ++ * Index of current buffer within the array ++ */ ++ size_t idx; ++ ++ /** ++ * Current offset within the current buffer ++ */ ++ size_t off; ++ ++ /** ++ * Array of buffers ++ */ ++ struct fuse_buf buf[1]; ++}; ++ ++/* Initialize bufvec with a single buffer of given size */ ++#define FUSE_BUFVEC_INIT(size__) \ ++ ((struct fuse_bufvec) { \ ++ /* .count= */ 1, \ ++ /* .idx = */ 0, \ ++ /* .off = */ 0, \ ++ /* .buf = */ { /* [0] = */ { \ ++ /* .size = */ (size__), \ ++ /* .flags = */ (enum fuse_buf_flags) 0, \ ++ /* .mem = */ NULL, \ ++ /* .fd = */ -1, \ ++ /* .pos = */ 0, \ ++ } } \ ++ } ) ++ ++/** ++ * Get total size of data in a fuse buffer vector ++ * ++ * @param bufv buffer vector ++ * @return size of data ++ */ ++size_t fuse_buf_size(const struct fuse_bufvec *bufv); ++ ++/** ++ * Copy data from one buffer vector to another ++ * ++ * @param dst destination buffer vector ++ * @param src source buffer vector ++ * @param flags flags controlling the copy ++ * @return actual number of bytes copied or -errno on error ++ */ ++ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src, ++ enum fuse_buf_copy_flags flags); ++ ++/* ----------------------------------------------------------- * ++ * Signal handling * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Exit session on HUP, TERM and INT signals and ignore PIPE signal ++ * ++ * Stores session in a global variable. May only be called once per ++ * process until fuse_remove_signal_handlers() is called. ++ * ++ * Once either of the POSIX signals arrives, the signal handler calls ++ * fuse_session_exit(). ++ * ++ * @param se the session to exit ++ * @return 0 on success, -1 on failure ++ * ++ * See also: ++ * fuse_remove_signal_handlers() ++ */ ++int fuse_set_signal_handlers(struct fuse_session *se); ++ ++/** ++ * Restore default signal handlers ++ * ++ * Resets global session. After this fuse_set_signal_handlers() may ++ * be called again. ++ * ++ * @param se the same session as given in fuse_set_signal_handlers() ++ * ++ * See also: ++ * fuse_set_signal_handlers() ++ */ ++void fuse_remove_signal_handlers(struct fuse_session *se); ++ ++/* ----------------------------------------------------------- * ++ * Compatibility stuff * ++ * ----------------------------------------------------------- */ ++ ++#if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30 ++# error only API version 30 or greater is supported ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++ ++/* ++ * This interface uses 64 bit off_t. ++ * ++ * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags! ++ */ ++ ++#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && !defined __cplusplus ++_Static_assert(sizeof(off_t) == 8, "fuse: off_t must be 64bit"); ++#else ++struct _fuse_off_t_must_be_64bit_dummy_struct \ ++ { unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1); }; ++#endif ++ ++#endif /* FUSE_COMMON_H_ */ +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +new file mode 100644 +index 0000000..d38b630 +--- /dev/null ++++ b/tools/virtiofsd/fuse_i.h +@@ -0,0 +1,139 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include "fuse.h" ++#include "fuse_lowlevel.h" ++ ++struct mount_opts; ++ ++struct fuse_req { ++ struct fuse_session *se; ++ uint64_t unique; ++ int ctr; ++ pthread_mutex_t lock; ++ struct fuse_ctx ctx; ++ struct fuse_chan *ch; ++ int interrupted; ++ unsigned int ioctl_64bit : 1; ++ union { ++ struct { ++ uint64_t unique; ++ } i; ++ struct { ++ fuse_interrupt_func_t func; ++ void *data; ++ } ni; ++ } u; ++ struct fuse_req *next; ++ struct fuse_req *prev; ++}; ++ ++struct fuse_notify_req { ++ uint64_t unique; ++ void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, ++ const void *, const struct fuse_buf *); ++ struct fuse_notify_req *next; ++ struct fuse_notify_req *prev; ++}; ++ ++struct fuse_session { ++ char *mountpoint; ++ volatile int exited; ++ int fd; ++ struct mount_opts *mo; ++ int debug; ++ int deny_others; ++ struct fuse_lowlevel_ops op; ++ int got_init; ++ struct cuse_data *cuse_data; ++ void *userdata; ++ uid_t owner; ++ struct fuse_conn_info conn; ++ struct fuse_req list; ++ struct fuse_req interrupts; ++ pthread_mutex_t lock; ++ int got_destroy; ++ pthread_key_t pipe_key; ++ int broken_splice_nonblock; ++ uint64_t notify_ctr; ++ struct fuse_notify_req notify_list; ++ size_t bufsize; ++ int error; ++}; ++ ++struct fuse_chan { ++ pthread_mutex_t lock; ++ int ctr; ++ int fd; ++}; ++ ++/** ++ * Filesystem module ++ * ++ * Filesystem modules are registered with the FUSE_REGISTER_MODULE() ++ * macro. ++ * ++ */ ++struct fuse_module { ++ char *name; ++ fuse_module_factory_t factory; ++ struct fuse_module *next; ++ struct fusemod_so *so; ++ int ctr; ++}; ++ ++/* ----------------------------------------------------------- * ++ * Channel interface (when using -o clone_fd) * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Obtain counted reference to the channel ++ * ++ * @param ch the channel ++ * @return the channel ++ */ ++struct fuse_chan *fuse_chan_get(struct fuse_chan *ch); ++ ++/** ++ * Drop counted reference to a channel ++ * ++ * @param ch the channel ++ */ ++void fuse_chan_put(struct fuse_chan *ch); ++ ++struct mount_opts *parse_mount_opts(struct fuse_args *args); ++void destroy_mount_opts(struct mount_opts *mo); ++void fuse_mount_version(void); ++unsigned get_max_read(struct mount_opts *o); ++void fuse_kern_unmount(const char *mountpoint, int fd); ++int fuse_kern_mount(const char *mountpoint, struct mount_opts *mo); ++ ++int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, ++ int count); ++void fuse_free_req(fuse_req_t req); ++ ++void cuse_lowlevel_init(fuse_req_t req, fuse_ino_t nodeide, const void *inarg); ++ ++int fuse_start_thread(pthread_t *thread_id, void *(*func)(void *), void *arg); ++ ++int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, ++ struct fuse_chan *ch); ++void fuse_session_process_buf_int(struct fuse_session *se, ++ const struct fuse_buf *buf, struct fuse_chan *ch); ++ ++struct fuse *fuse_new_31(struct fuse_args *args, const struct fuse_operations *op, ++ size_t op_size, void *private_data); ++int fuse_loop_mt_32(struct fuse *f, struct fuse_loop_config *config); ++int fuse_session_loop_mt_32(struct fuse_session *se, struct fuse_loop_config *config); ++ ++#define FUSE_MAX_MAX_PAGES 256 ++#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 ++ ++/* room needed in buffer to accommodate header */ ++#define FUSE_BUFFER_HEADER_SIZE 0x1000 ++ +diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h +new file mode 100644 +index 0000000..5e112e0 +--- /dev/null ++++ b/tools/virtiofsd/fuse_log.h +@@ -0,0 +1,82 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2019 Red Hat, Inc. ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#ifndef FUSE_LOG_H_ ++#define FUSE_LOG_H_ ++ ++/** @file ++ * ++ * This file defines the logging interface of FUSE ++ */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/** ++ * Log severity level ++ * ++ * These levels correspond to syslog(2) log levels since they are widely used. ++ */ ++enum fuse_log_level { ++ FUSE_LOG_EMERG, ++ FUSE_LOG_ALERT, ++ FUSE_LOG_CRIT, ++ FUSE_LOG_ERR, ++ FUSE_LOG_WARNING, ++ FUSE_LOG_NOTICE, ++ FUSE_LOG_INFO, ++ FUSE_LOG_DEBUG ++}; ++ ++/** ++ * Log message handler function. ++ * ++ * This function must be thread-safe. It may be called from any libfuse ++ * function, including fuse_parse_cmdline() and other functions invoked before ++ * a FUSE filesystem is created. ++ * ++ * Install a custom log message handler function using fuse_set_log_func(). ++ * ++ * @param level log severity level ++ * @param fmt sprintf-style format string including newline ++ * @param ap format string arguments ++ */ ++typedef void (*fuse_log_func_t)(enum fuse_log_level level, ++ const char *fmt, va_list ap); ++ ++/** ++ * Install a custom log handler function. ++ * ++ * Log messages are emitted by libfuse functions to report errors and debug ++ * information. Messages are printed to stderr by default but this can be ++ * overridden by installing a custom log message handler function. ++ * ++ * The log message handler function is global and affects all FUSE filesystems ++ * created within this process. ++ * ++ * @param func a custom log message handler function or NULL to revert to ++ * the default ++ */ ++void fuse_set_log_func(fuse_log_func_t func); ++ ++/** ++ * Emit a log message ++ * ++ * @param level severity level (FUSE_LOG_ERR, FUSE_LOG_DEBUG, etc) ++ * @param fmt sprintf-style format string including newline ++ */ ++void fuse_log(enum fuse_log_level level, const char *fmt, ...); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_LOG_H_ */ +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +new file mode 100644 +index 0000000..18c6363 +--- /dev/null ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -0,0 +1,2089 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#ifndef FUSE_LOWLEVEL_H_ ++#define FUSE_LOWLEVEL_H_ ++ ++/** @file ++ * ++ * Low level API ++ * ++ * IMPORTANT: you should define FUSE_USE_VERSION before including this ++ * header. To use the newest API define it to 31 (recommended for any ++ * new application). ++ */ ++ ++#ifndef FUSE_USE_VERSION ++#error FUSE_USE_VERSION not defined ++#endif ++ ++#include "fuse_common.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ----------------------------------------------------------- * ++ * Miscellaneous definitions * ++ * ----------------------------------------------------------- */ ++ ++/** The node ID of the root inode */ ++#define FUSE_ROOT_ID 1 ++ ++/** Inode number type */ ++typedef uint64_t fuse_ino_t; ++ ++/** Request pointer type */ ++typedef struct fuse_req *fuse_req_t; ++ ++/** ++ * Session ++ * ++ * This provides hooks for processing requests, and exiting ++ */ ++struct fuse_session; ++ ++/** Directory entry parameters supplied to fuse_reply_entry() */ ++struct fuse_entry_param { ++ /** Unique inode number ++ * ++ * In lookup, zero means negative entry (from version 2.5) ++ * Returning ENOENT also means negative entry, but by setting zero ++ * ino the kernel may cache negative entries for entry_timeout ++ * seconds. ++ */ ++ fuse_ino_t ino; ++ ++ /** Generation number for this entry. ++ * ++ * If the file system will be exported over NFS, the ++ * ino/generation pairs need to be unique over the file ++ * system's lifetime (rather than just the mount time). So if ++ * the file system reuses an inode after it has been deleted, ++ * it must assign a new, previously unused generation number ++ * to the inode at the same time. ++ * ++ */ ++ uint64_t generation; ++ ++ /** Inode attributes. ++ * ++ * Even if attr_timeout == 0, attr must be correct. For example, ++ * for open(), FUSE uses attr.st_size from lookup() to determine ++ * how many bytes to request. If this value is not correct, ++ * incorrect data will be returned. ++ */ ++ struct stat attr; ++ ++ /** Validity timeout (in seconds) for inode attributes. If ++ attributes only change as a result of requests that come ++ through the kernel, this should be set to a very large ++ value. */ ++ double attr_timeout; ++ ++ /** Validity timeout (in seconds) for the name. If directory ++ entries are changed/deleted only as a result of requests ++ that come through the kernel, this should be set to a very ++ large value. */ ++ double entry_timeout; ++}; ++ ++/** ++ * Additional context associated with requests. ++ * ++ * Note that the reported client uid, gid and pid may be zero in some ++ * situations. For example, if the FUSE file system is running in a ++ * PID or user namespace but then accessed from outside the namespace, ++ * there is no valid uid/pid/gid that could be reported. ++ */ ++struct fuse_ctx { ++ /** User ID of the calling process */ ++ uid_t uid; ++ ++ /** Group ID of the calling process */ ++ gid_t gid; ++ ++ /** Thread ID of the calling process */ ++ pid_t pid; ++ ++ /** Umask of the calling process */ ++ mode_t umask; ++}; ++ ++struct fuse_forget_data { ++ fuse_ino_t ino; ++ uint64_t nlookup; ++}; ++ ++/* 'to_set' flags in setattr */ ++#define FUSE_SET_ATTR_MODE (1 << 0) ++#define FUSE_SET_ATTR_UID (1 << 1) ++#define FUSE_SET_ATTR_GID (1 << 2) ++#define FUSE_SET_ATTR_SIZE (1 << 3) ++#define FUSE_SET_ATTR_ATIME (1 << 4) ++#define FUSE_SET_ATTR_MTIME (1 << 5) ++#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) ++#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) ++#define FUSE_SET_ATTR_CTIME (1 << 10) ++ ++/* ----------------------------------------------------------- * ++ * Request methods and replies * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Low level filesystem operations ++ * ++ * Most of the methods (with the exception of init and destroy) ++ * receive a request handle (fuse_req_t) as their first argument. ++ * This handle must be passed to one of the specified reply functions. ++ * ++ * This may be done inside the method invocation, or after the call ++ * has returned. The request handle is valid until one of the reply ++ * functions is called. ++ * ++ * Other pointer arguments (name, fuse_file_info, etc) are not valid ++ * after the call has returned, so if they are needed later, their ++ * contents have to be copied. ++ * ++ * In general, all methods are expected to perform any necessary ++ * permission checking. However, a filesystem may delegate this task ++ * to the kernel by passing the `default_permissions` mount option to ++ * `fuse_session_new()`. In this case, methods will only be called if ++ * the kernel's permission check has succeeded. ++ * ++ * The filesystem sometimes needs to handle a return value of -ENOENT ++ * from the reply function, which means, that the request was ++ * interrupted, and the reply discarded. For example if ++ * fuse_reply_open() return -ENOENT means, that the release method for ++ * this file will not be called. ++ */ ++struct fuse_lowlevel_ops { ++ /** ++ * Initialize filesystem ++ * ++ * This function is called when libfuse establishes ++ * communication with the FUSE kernel module. The file system ++ * should use this module to inspect and/or modify the ++ * connection parameters provided in the `conn` structure. ++ * ++ * Note that some parameters may be overwritten by options ++ * passed to fuse_session_new() which take precedence over the ++ * values set in this handler. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*init) (void *userdata, struct fuse_conn_info *conn); ++ ++ /** ++ * Clean up filesystem. ++ * ++ * Called on filesystem exit. When this method is called, the ++ * connection to the kernel may be gone already, so that eg. calls ++ * to fuse_lowlevel_notify_* will fail. ++ * ++ * There's no reply to this function ++ * ++ * @param userdata the user data passed to fuse_session_new() ++ */ ++ void (*destroy) (void *userdata); ++ ++ /** ++ * Look up a directory entry by name and get its attributes. ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name the name to look up ++ */ ++ void (*lookup) (fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Forget about an inode ++ * ++ * This function is called when the kernel removes an inode ++ * from its internal caches. ++ * ++ * The inode's lookup count increases by one for every call to ++ * fuse_reply_entry and fuse_reply_create. The nlookup parameter ++ * indicates by how much the lookup count should be decreased. ++ * ++ * Inodes with a non-zero lookup count may receive request from ++ * the kernel even after calls to unlink, rmdir or (when ++ * overwriting an existing file) rename. Filesystems must handle ++ * such requests properly and it is recommended to defer removal ++ * of the inode until the lookup count reaches zero. Calls to ++ * unlink, rmdir or rename will be followed closely by forget ++ * unless the file or directory is open, in which case the ++ * kernel issues forget only after the release or releasedir ++ * calls. ++ * ++ * Note that if a file system will be exported over NFS the ++ * inodes lifetime must extend even beyond forget. See the ++ * generation field in struct fuse_entry_param above. ++ * ++ * On unmount the lookup count for all inodes implicitly drops ++ * to zero. It is not guaranteed that the file system will ++ * receive corresponding forget messages for the affected ++ * inodes. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param nlookup the number of lookups to forget ++ */ ++ void (*forget) (fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); ++ ++ /** ++ * Get file attributes. ++ * ++ * If writeback caching is enabled, the kernel may have a ++ * better idea of a file's length than the FUSE file system ++ * (eg if there has been a write that extended the file size, ++ * but that has not yet been passed to the filesystem.n ++ * ++ * In this case, the st_size value provided by the file system ++ * will be ignored. ++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi for future use, currently always NULL ++ */ ++ void (*getattr) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Set file attributes ++ * ++ * In the 'attr' argument only members indicated by the 'to_set' ++ * bitmask contain valid values. Other members contain undefined ++ * values. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits if the file ++ * size or owner is being changed. ++ * ++ * If the setattr was invoked from the ftruncate() system call ++ * under Linux kernel versions 2.6.15 or later, the fi->fh will ++ * contain the value set by the open method or will be undefined ++ * if the open method didn't set any value. Otherwise (not ++ * ftruncate call, or kernel version earlier than 2.6.15) the fi ++ * parameter will be NULL. ++ * ++ * Valid replies: ++ * fuse_reply_attr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param attr the attributes ++ * @param to_set bit mask of attributes which should be set ++ * @param fi file information, or NULL ++ */ ++ void (*setattr) (fuse_req_t req, fuse_ino_t ino, struct stat *attr, ++ int to_set, struct fuse_file_info *fi); ++ ++ /** ++ * Read symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_readlink ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ */ ++ void (*readlink) (fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Create file node ++ * ++ * Create a regular file, character device, block device, fifo or ++ * socket node. ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param rdev the device number (only valid if created file is a device) ++ */ ++ void (*mknod) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, dev_t rdev); ++ ++ /** ++ * Create a directory ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode with which to create the new file ++ */ ++ void (*mkdir) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode); ++ ++ /** ++ * Remove a file ++ * ++ * If the file's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*unlink) (fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Remove a directory ++ * ++ * If the directory's inode's lookup count is non-zero, the ++ * file system is expected to postpone any removal of the ++ * inode until the lookup count reaches zero (see description ++ * of the forget function). ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to remove ++ */ ++ void (*rmdir) (fuse_req_t req, fuse_ino_t parent, const char *name); ++ ++ /** ++ * Create a symbolic link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param link the contents of the symbolic link ++ * @param parent inode number of the parent directory ++ * @param name to create ++ */ ++ void (*symlink) (fuse_req_t req, const char *link, fuse_ino_t parent, ++ const char *name); ++ ++ /** Rename a file ++ * ++ * If the target exists it should be atomically replaced. If ++ * the target's inode's lookup count is non-zero, the file ++ * system is expected to postpone any removal of the inode ++ * until the lookup count reaches zero (see description of the ++ * forget function). ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EINVAL, i.e. all ++ * future bmap requests will fail with EINVAL without being ++ * send to the filesystem process. ++ * ++ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If ++ * RENAME_NOREPLACE is specified, the filesystem must not ++ * overwrite *newname* if it exists and return an error ++ * instead. If `RENAME_EXCHANGE` is specified, the filesystem ++ * must atomically exchange the two files, i.e. both must ++ * exist and neither may be deleted. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the old parent directory ++ * @param name old name ++ * @param newparent inode number of the new parent directory ++ * @param newname new name ++ */ ++ void (*rename) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ fuse_ino_t newparent, const char *newname, ++ unsigned int flags); ++ ++ /** ++ * Create a hard link ++ * ++ * Valid replies: ++ * fuse_reply_entry ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the old inode number ++ * @param newparent inode number of the new parent directory ++ * @param newname new name to create ++ */ ++ void (*link) (fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, ++ const char *newname); ++ ++ /** ++ * Open a file ++ * ++ * Open flags are available in fi->flags. The following rules ++ * apply. ++ * ++ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be ++ * filtered out / handled by the kernel. ++ * ++ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used ++ * by the filesystem to check if the operation is ++ * permitted. If the ``-o default_permissions`` mount ++ * option is given, this check is already done by the ++ * kernel before calling open() and may thus be omitted by ++ * the filesystem. ++ * ++ * - When writeback caching is enabled, the kernel may send ++ * read requests even for files opened with O_WRONLY. The ++ * filesystem should be prepared to handle this. ++ * ++ * - When writeback caching is disabled, the filesystem is ++ * expected to properly handle the O_APPEND flag and ensure ++ * that each write is appending to the end of the file. ++ * ++ * - When writeback caching is enabled, the kernel will ++ * handle O_APPEND. However, unless all changes to the file ++ * come through the kernel this will not work reliably. The ++ * filesystem should thus either ignore the O_APPEND flag ++ * (and let the kernel handle it), or return an error ++ * (indicating that reliably O_APPEND is not available). ++ * ++ * Filesystem may store an arbitrary file handle (pointer, ++ * index, etc) in fi->fh, and use this in other all other file ++ * operations (read, write, flush, release, fsync). ++ * ++ * Filesystem may also implement stateless file I/O and not store ++ * anything in fi->fh. ++ * ++ * There are also some flags (direct_io, keep_cache) which the ++ * filesystem may set in fi, to change the way the file is opened. ++ * See fuse_file_info structure in for more details. ++ * ++ * If this request is answered with an error code of ENOSYS ++ * and FUSE_CAP_NO_OPEN_SUPPORT is set in ++ * `fuse_conn_info.capable`, this is treated as success and ++ * future calls to open and release will also succeed without being ++ * sent to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*open) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Read data ++ * ++ * Read should send exactly the number of bytes requested except ++ * on EOF or error, otherwise the rest of the data will be ++ * substituted with zeroes. An exception to this is when the file ++ * has been opened in 'direct_io' mode, in which case the return ++ * value of the read system call will reflect the return value of ++ * this operation. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_iov ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size number of bytes to read ++ * @param off offset to read from ++ * @param fi file information ++ */ ++ void (*read) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Write data ++ * ++ * Write should return exactly the number of bytes requested ++ * except on error. An exception to this is when the file has ++ * been opened in 'direct_io' mode, in which case the return value ++ * of the write system call will reflect the return value of this ++ * operation. ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param buf data to write ++ * @param size number of bytes to write ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write) (fuse_req_t req, fuse_ino_t ino, const char *buf, ++ size_t size, off_t off, struct fuse_file_info *fi); ++ ++ /** ++ * Flush method ++ * ++ * This is called on each close() of the opened file. ++ * ++ * Since file descriptors can be duplicated (dup, dup2, fork), for ++ * one open call there may be many flush calls. ++ * ++ * Filesystems shouldn't assume that flush will always be called ++ * after some writes, or that if will be called at all. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * ++ * NOTE: the name of the method is misleading, since (unlike ++ * fsync) the filesystem is not forced to flush pending writes. ++ * One reason to flush data is if the filesystem wants to return ++ * write errors during close. However, such use is non-portable ++ * because POSIX does not require [close] to wait for delayed I/O to ++ * complete. ++ * ++ * If the filesystem supports file locking operations (setlk, ++ * getlk) it should remove all locks belonging to 'fi->owner'. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to flush() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * ++ * [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html ++ */ ++ void (*flush) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Release an open file ++ * ++ * Release is called when there are no more references to an open ++ * file: all file descriptors are closed and all memory mappings ++ * are unmapped. ++ * ++ * For every open call there will be exactly one release call (unless ++ * the filesystem is force-unmounted). ++ * ++ * The filesystem may reply with an error, but error values are ++ * not returned to close() or munmap() which triggered the ++ * release. ++ * ++ * fi->fh will contain the value set by the open method, or will ++ * be undefined if the open method didn't set any value. ++ * fi->flags will contain the same flags as for open. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*release) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize file contents ++ * ++ * If the datasync parameter is non-zero, then only the user data ++ * should be flushed, not the meta data. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsync() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsync) (fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Open a directory ++ * ++ * Filesystem may store an arbitrary file handle (pointer, index, ++ * etc) in fi->fh, and use this in other all other directory ++ * stream operations (readdir, releasedir, fsyncdir). ++ * ++ * If this request is answered with an error code of ENOSYS and ++ * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, ++ * this is treated as success and future calls to opendir and ++ * releasedir will also succeed without being sent to the filesystem ++ * process. In addition, the kernel will cache readdir results ++ * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. ++ * ++ * Valid replies: ++ * fuse_reply_open ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*opendir) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Read directory ++ * ++ * Send a buffer filled using fuse_add_direntry(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * Returning a directory entry from readdir() does not affect ++ * its lookup count. ++ * ++ * If off_t is non-zero, then it will correspond to one of the off_t ++ * values that was previously returned by readdir() for the same ++ * directory handle. In this case, readdir() should skip over entries ++ * coming before the position defined by the off_t value. If entries ++ * are added or removed while the directory handle is open, they filesystem ++ * may still include the entries that have been removed, and may not ++ * report the entries that have been created. However, addition or ++ * removal of entries must never cause readdir() to skip over unrelated ++ * entries or to report them more than once. This means ++ * that off_t can not be a simple index that enumerates the entries ++ * that have been returned but must contain sufficient information to ++ * uniquely determine the next directory entry to return even when the ++ * set of entries is changing. ++ * ++ * The function does not have to report the '.' and '..' ++ * entries, but is allowed to do so. Note that, if readdir does ++ * not return '.' or '..', they will not be implicitly returned, ++ * and this behavior is observable by the caller. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdir) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Release an open directory ++ * ++ * For every opendir call there will be exactly one releasedir ++ * call (unless the filesystem is force-unmounted). ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ */ ++ void (*releasedir) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Synchronize directory contents ++ * ++ * If the datasync parameter is non-zero, then only the directory ++ * contents should be flushed, not the meta data. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * If this request is answered with an error code of ENOSYS, ++ * this is treated as success and future calls to fsyncdir() will ++ * succeed automatically without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param datasync flag indicating if only data should be flushed ++ * @param fi file information ++ */ ++ void (*fsyncdir) (fuse_req_t req, fuse_ino_t ino, int datasync, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Get file system statistics ++ * ++ * Valid replies: ++ * fuse_reply_statfs ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number, zero means "undefined" ++ */ ++ void (*statfs) (fuse_req_t req, fuse_ino_t ino); ++ ++ /** ++ * Set an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future setxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ */ ++ void (*setxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, ++ const char *value, size_t size, int flags); ++ ++ /** ++ * Get an extended attribute ++ * ++ * If size is zero, the size of the value should be sent with ++ * fuse_reply_xattr. ++ * ++ * If the size is non-zero, and the value fits in the buffer, the ++ * value should be sent with fuse_reply_buf. ++ * ++ * If the size is too small for the value, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future getxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ * @param size maximum size of the value to send ++ */ ++ void (*getxattr) (fuse_req_t req, fuse_ino_t ino, const char *name, ++ size_t size); ++ ++ /** ++ * List extended attribute names ++ * ++ * If size is zero, the total size of the attribute list should be ++ * sent with fuse_reply_xattr. ++ * ++ * If the size is non-zero, and the null character separated ++ * attribute list fits in the buffer, the list should be sent with ++ * fuse_reply_buf. ++ * ++ * If the size is too small for the list, the ERANGE error should ++ * be sent. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future listxattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_xattr ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum size of the list to send ++ */ ++ void (*listxattr) (fuse_req_t req, fuse_ino_t ino, size_t size); ++ ++ /** ++ * Remove an extended attribute ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future removexattr() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param name of the extended attribute ++ */ ++ void (*removexattr) (fuse_req_t req, fuse_ino_t ino, const char *name); ++ ++ /** ++ * Check file access permissions ++ * ++ * This will be called for the access() and chdir() system ++ * calls. If the 'default_permissions' mount option is given, ++ * this method is not called. ++ * ++ * This method is not called under Linux kernel versions 2.4.x ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent success, i.e. this and all future access() ++ * requests will succeed without being send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param mask requested access mode ++ */ ++ void (*access) (fuse_req_t req, fuse_ino_t ino, int mask); ++ ++ /** ++ * Create and open a file ++ * ++ * If the file does not exist, first create it with the specified ++ * mode, and then open it. ++ * ++ * See the description of the open handler for more ++ * information. ++ * ++ * If this method is not implemented or under Linux kernel ++ * versions earlier than 2.6.15, the mknod() and open() methods ++ * will be called instead. ++ * ++ * If this request is answered with an error code of ENOSYS, the handler ++ * is treated as not implemented (i.e., for this and future requests the ++ * mknod() and open() handlers will be called instead). ++ * ++ * Valid replies: ++ * fuse_reply_create ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param parent inode number of the parent directory ++ * @param name to create ++ * @param mode file type and mode with which to create the new file ++ * @param fi file information ++ */ ++ void (*create) (fuse_req_t req, fuse_ino_t parent, const char *name, ++ mode_t mode, struct fuse_file_info *fi); ++ ++ /** ++ * Test for a POSIX file lock ++ * ++ * Valid replies: ++ * fuse_reply_lock ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to test ++ */ ++ void (*getlk) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi, struct flock *lock); ++ ++ /** ++ * Acquire, modify or release a POSIX file lock ++ * ++ * For POSIX threads (NPTL) there's a 1-1 relation between pid and ++ * owner, but otherwise this is not always the case. For checking ++ * lock ownership, 'fi->owner' must be used. The l_pid field in ++ * 'struct flock' should only be used to fill in this field in ++ * getlk(). ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param lock the region/type to set ++ * @param sleep locking operation may sleep ++ */ ++ void (*setlk) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi, ++ struct flock *lock, int sleep); ++ ++ /** ++ * Map block index within file to block index within device ++ * ++ * Note: This makes sense only for block device backed filesystems ++ * mounted with the 'blkdev' option ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. all future bmap() requests will ++ * fail with the same error code without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_bmap ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param blocksize unit of block index ++ * @param idx block index within file ++ */ ++ void (*bmap) (fuse_req_t req, fuse_ino_t ino, size_t blocksize, ++ uint64_t idx); ++ ++ /** ++ * Ioctl ++ * ++ * Note: For unrestricted ioctls (not allowed for FUSE ++ * servers), data in and out areas can be discovered by giving ++ * iovs and setting FUSE_IOCTL_RETRY in *flags*. For ++ * restricted ioctls, kernel prepares in/out data area ++ * according to the information encoded in cmd. ++ * ++ * Valid replies: ++ * fuse_reply_ioctl_retry ++ * fuse_reply_ioctl ++ * fuse_reply_ioctl_iov ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param cmd ioctl command ++ * @param arg ioctl argument ++ * @param fi file information ++ * @param flags for FUSE_IOCTL_* flags ++ * @param in_buf data fetched from the caller ++ * @param in_bufsz number of fetched bytes ++ * @param out_bufsz maximum size of output data ++ * ++ * Note : the unsigned long request submitted by the application ++ * is truncated to 32 bits. ++ */ ++ void (*ioctl) (fuse_req_t req, fuse_ino_t ino, unsigned int cmd, ++ void *arg, struct fuse_file_info *fi, unsigned flags, ++ const void *in_buf, size_t in_bufsz, size_t out_bufsz); ++ ++ /** ++ * Poll for IO readiness ++ * ++ * Note: If ph is non-NULL, the client should notify ++ * when IO readiness events occur by calling ++ * fuse_lowlevel_notify_poll() with the specified ph. ++ * ++ * Regardless of the number of times poll with a non-NULL ph ++ * is received, single notification is enough to clear all. ++ * Notifying more times incurs overhead but doesn't harm ++ * correctness. ++ * ++ * The callee is responsible for destroying ph with ++ * fuse_pollhandle_destroy() when no longer in use. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as success (with a kernel-defined default poll-mask) and ++ * future calls to pull() will succeed the same way without being send ++ * to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_poll ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param ph poll handle to be used for notification ++ */ ++ void (*poll) (fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct fuse_pollhandle *ph); ++ ++ /** ++ * Write data made available in a buffer ++ * ++ * This is a more generic version of the ->write() method. If ++ * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the ++ * kernel supports splicing from the fuse device, then the ++ * data will be made available in pipe for supporting zero ++ * copy data transfer. ++ * ++ * buf->count is guaranteed to be one (and thus buf->idx is ++ * always zero). The write_buf handler must ensure that ++ * bufv->off is correctly updated (reflecting the number of ++ * bytes read from bufv->buf[0]). ++ * ++ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is ++ * expected to reset the setuid and setgid bits. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param bufv buffer containing the data ++ * @param off offset to write to ++ * @param fi file information ++ */ ++ void (*write_buf) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_bufvec *bufv, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Callback function for the retrieve request ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() ++ * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() ++ * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() ++ * @param bufv the buffer containing the returned data ++ */ ++ void (*retrieve_reply) (fuse_req_t req, void *cookie, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv); ++ ++ /** ++ * Forget about multiple inodes ++ * ++ * See description of the forget function for more ++ * information. ++ * ++ * Valid replies: ++ * fuse_reply_none ++ * ++ * @param req request handle ++ */ ++ void (*forget_multi) (fuse_req_t req, size_t count, ++ struct fuse_forget_data *forgets); ++ ++ /** ++ * Acquire, modify or release a BSD file lock ++ * ++ * Note: if the locking methods are not implemented, the kernel ++ * will still allow file locking to work locally. Hence these are ++ * only interesting for network filesystems and similar. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param fi file information ++ * @param op the locking operation, see flock(2) ++ */ ++ void (*flock) (fuse_req_t req, fuse_ino_t ino, ++ struct fuse_file_info *fi, int op); ++ ++ /** ++ * Allocate requested space. If this function returns success then ++ * subsequent writes to the specified range shall not fail due to the lack ++ * of free space on the file system storage media. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future fallocate() requests will fail with EOPNOTSUPP without being ++ * send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param offset starting point for allocated region ++ * @param length size of allocated region ++ * @param mode determines the operation to be performed on the given range, ++ * see fallocate(2) ++ */ ++ void (*fallocate) (fuse_req_t req, fuse_ino_t ino, int mode, ++ off_t offset, off_t length, struct fuse_file_info *fi); ++ ++ /** ++ * Read directory with attributes ++ * ++ * Send a buffer filled using fuse_add_direntry_plus(), with size not ++ * exceeding the requested size. Send an empty buffer on end of ++ * stream. ++ * ++ * fi->fh will contain the value set by the opendir method, or ++ * will be undefined if the opendir method didn't set any value. ++ * ++ * In contrast to readdir() (which does not affect the lookup counts), ++ * the lookup count of every entry returned by readdirplus(), except "." ++ * and "..", is incremented by one. ++ * ++ * Valid replies: ++ * fuse_reply_buf ++ * fuse_reply_data ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param size maximum number of bytes to send ++ * @param off offset to continue reading the directory stream ++ * @param fi file information ++ */ ++ void (*readdirplus) (fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, ++ struct fuse_file_info *fi); ++ ++ /** ++ * Copy a range of data from one file to another ++ * ++ * Performs an optimized copy between two file descriptors without the ++ * additional cost of transferring data through the FUSE kernel module ++ * to user space (glibc) and then back into the FUSE filesystem again. ++ * ++ * In case this method is not implemented, glibc falls back to reading ++ * data from the source and writing to the destination. Effectively ++ * doing an inefficient copy of the data. ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all ++ * future copy_file_range() requests will fail with EOPNOTSUPP without ++ * being send to the filesystem process. ++ * ++ * Valid replies: ++ * fuse_reply_write ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino_in the inode number or the source file ++ * @param off_in starting point from were the data should be read ++ * @param fi_in file information of the source file ++ * @param ino_out the inode number or the destination file ++ * @param off_out starting point where the data should be written ++ * @param fi_out file information of the destination file ++ * @param len maximum size of the data to copy ++ * @param flags passed along with the copy_file_range() syscall ++ */ ++ void (*copy_file_range) (fuse_req_t req, fuse_ino_t ino_in, ++ off_t off_in, struct fuse_file_info *fi_in, ++ fuse_ino_t ino_out, off_t off_out, ++ struct fuse_file_info *fi_out, size_t len, ++ int flags); ++ ++ /** ++ * Find next data or hole after the specified offset ++ * ++ * If this request is answered with an error code of ENOSYS, this is ++ * treated as a permanent failure, i.e. all future lseek() requests will ++ * fail with the same error code without being send to the filesystem ++ * process. ++ * ++ * Valid replies: ++ * fuse_reply_lseek ++ * fuse_reply_err ++ * ++ * @param req request handle ++ * @param ino the inode number ++ * @param off offset to start search from ++ * @param whence either SEEK_DATA or SEEK_HOLE ++ * @param fi file information ++ */ ++ void (*lseek) (fuse_req_t req, fuse_ino_t ino, off_t off, int whence, ++ struct fuse_file_info *fi); ++}; ++ ++/** ++ * Reply with an error code or success. ++ * ++ * Possible requests: ++ * all except forget ++ * ++ * Whereever possible, error codes should be chosen from the list of ++ * documented error conditions in the corresponding system calls ++ * manpage. ++ * ++ * An error code of ENOSYS is sometimes treated specially. This is ++ * indicated in the documentation of the affected handler functions. ++ * ++ * The following requests may be answered with a zero error code: ++ * unlink, rmdir, rename, flush, release, fsync, fsyncdir, setxattr, ++ * removexattr, setlk. ++ * ++ * @param req request handle ++ * @param err the positive error value, or zero for success ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_err(fuse_req_t req, int err); ++ ++/** ++ * Don't send reply ++ * ++ * Possible requests: ++ * forget ++ * forget_multi ++ * retrieve_reply ++ * ++ * @param req request handle ++ */ ++void fuse_reply_none(fuse_req_t req); ++ ++/** ++ * Reply with a directory entry ++ * ++ * Possible requests: ++ * lookup, mknod, mkdir, symlink, link ++ * ++ * Side effects: ++ * increments the lookup count on success ++ * ++ * @param req request handle ++ * @param e the entry parameters ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e); ++ ++/** ++ * Reply with a directory entry and open parameters ++ * ++ * currently the following members of 'fi' are used: ++ * fh, direct_io, keep_cache ++ * ++ * Possible requests: ++ * create ++ * ++ * Side effects: ++ * increments the lookup count on success ++ * ++ * @param req request handle ++ * @param e the entry parameters ++ * @param fi file information ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, ++ const struct fuse_file_info *fi); ++ ++/** ++ * Reply with attributes ++ * ++ * Possible requests: ++ * getattr, setattr ++ * ++ * @param req request handle ++ * @param attr the attributes ++ * @param attr_timeout validity timeout (in seconds) for the attributes ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_attr(fuse_req_t req, const struct stat *attr, ++ double attr_timeout); ++ ++/** ++ * Reply with the contents of a symbolic link ++ * ++ * Possible requests: ++ * readlink ++ * ++ * @param req request handle ++ * @param link symbolic link contents ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_readlink(fuse_req_t req, const char *link); ++ ++/** ++ * Reply with open parameters ++ * ++ * currently the following members of 'fi' are used: ++ * fh, direct_io, keep_cache ++ * ++ * Possible requests: ++ * open, opendir ++ * ++ * @param req request handle ++ * @param fi file information ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *fi); ++ ++/** ++ * Reply with number of bytes written ++ * ++ * Possible requests: ++ * write ++ * ++ * @param req request handle ++ * @param count the number of bytes written ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_write(fuse_req_t req, size_t count); ++ ++/** ++ * Reply with data ++ * ++ * Possible requests: ++ * read, readdir, getxattr, listxattr ++ * ++ * @param req request handle ++ * @param buf buffer containing data ++ * @param size the size of data in bytes ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); ++ ++/** ++ * Reply with data copied/moved from buffer(s) ++ * ++ * Zero copy data transfer ("splicing") will be used under ++ * the following circumstances: ++ * ++ * 1. FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.want, and ++ * 2. the kernel supports splicing from the fuse device ++ * (FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.capable), and ++ * 3. *flags* does not contain FUSE_BUF_NO_SPLICE ++ * 4. The amount of data that is provided in file-descriptor backed ++ * buffers (i.e., buffers for which bufv[n].flags == FUSE_BUF_FD) ++ * is at least twice the page size. ++ * ++ * In order for SPLICE_F_MOVE to be used, the following additional ++ * conditions have to be fulfilled: ++ * ++ * 1. FUSE_CAP_SPLICE_MOVE is set in fuse_conn_info.want, and ++ * 2. the kernel supports it (i.e, FUSE_CAP_SPLICE_MOVE is set in ++ fuse_conn_info.capable), and ++ * 3. *flags* contains FUSE_BUF_SPLICE_MOVE ++ * ++ * Note that, if splice is used, the data is actually spliced twice: ++ * once into a temporary pipe (to prepend header data), and then again ++ * into the kernel. If some of the provided buffers are memory-backed, ++ * the data in them is copied in step one and spliced in step two. ++ * ++ * The FUSE_BUF_SPLICE_FORCE_SPLICE and FUSE_BUF_SPLICE_NONBLOCK flags ++ * are silently ignored. ++ * ++ * Possible requests: ++ * read, readdir, getxattr, listxattr ++ * ++ * Side effects: ++ * when used to return data from a readdirplus() (but not readdir()) ++ * call, increments the lookup count of each returned entry by one ++ * on success. ++ * ++ * @param req request handle ++ * @param bufv buffer vector ++ * @param flags flags controlling the copy ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags); ++ ++/** ++ * Reply with data vector ++ * ++ * Possible requests: ++ * read, readdir, getxattr, listxattr ++ * ++ * @param req request handle ++ * @param iov the vector containing the data ++ * @param count the size of vector ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count); ++ ++/** ++ * Reply with filesystem statistics ++ * ++ * Possible requests: ++ * statfs ++ * ++ * @param req request handle ++ * @param stbuf filesystem statistics ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf); ++ ++/** ++ * Reply with needed buffer size ++ * ++ * Possible requests: ++ * getxattr, listxattr ++ * ++ * @param req request handle ++ * @param count the buffer size needed in bytes ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_xattr(fuse_req_t req, size_t count); ++ ++/** ++ * Reply with file lock information ++ * ++ * Possible requests: ++ * getlk ++ * ++ * @param req request handle ++ * @param lock the lock information ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_lock(fuse_req_t req, const struct flock *lock); ++ ++/** ++ * Reply with block index ++ * ++ * Possible requests: ++ * bmap ++ * ++ * @param req request handle ++ * @param idx block index within device ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_bmap(fuse_req_t req, uint64_t idx); ++ ++/* ----------------------------------------------------------- * ++ * Filling a buffer in readdir * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Add a directory entry to the buffer ++ * ++ * Buffer needs to be large enough to hold the entry. If it's not, ++ * then the entry is not filled in but the size of the entry is still ++ * returned. The caller can check this by comparing the bufsize ++ * parameter with the returned entry size. If the entry size is ++ * larger than the buffer size, the operation failed. ++ * ++ * From the 'stbuf' argument the st_ino field and bits 12-15 of the ++ * st_mode field are used. The other fields are ignored. ++ * ++ * *off* should be any non-zero value that the filesystem can use to ++ * identify the current point in the directory stream. It does not ++ * need to be the actual physical position. A value of zero is ++ * reserved to mean "from the beginning", and should therefore never ++ * be used (the first call to fuse_add_direntry should be passed the ++ * offset of the second directory entry). ++ * ++ * @param req request handle ++ * @param buf the point where the new entry will be added to the buffer ++ * @param bufsize remaining size of the buffer ++ * @param name the name of the entry ++ * @param stbuf the file attributes ++ * @param off the offset of the next entry ++ * @return the space needed for the entry ++ */ ++size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, const struct stat *stbuf, ++ off_t off); ++ ++/** ++ * Add a directory entry to the buffer with the attributes ++ * ++ * See documentation of `fuse_add_direntry()` for more details. ++ * ++ * @param req request handle ++ * @param buf the point where the new entry will be added to the buffer ++ * @param bufsize remaining size of the buffer ++ * @param name the name of the entry ++ * @param e the directory entry ++ * @param off the offset of the next entry ++ * @return the space needed for the entry ++ */ ++size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, ++ const char *name, ++ const struct fuse_entry_param *e, off_t off); ++ ++/** ++ * Reply to ask for data fetch and output buffer preparation. ioctl ++ * will be retried with the specified input data fetched and output ++ * buffer prepared. ++ * ++ * Possible requests: ++ * ioctl ++ * ++ * @param req request handle ++ * @param in_iov iovec specifying data to fetch from the caller ++ * @param in_count number of entries in in_iov ++ * @param out_iov iovec specifying addresses to write output to ++ * @param out_count number of entries in out_iov ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_ioctl_retry(fuse_req_t req, ++ const struct iovec *in_iov, size_t in_count, ++ const struct iovec *out_iov, size_t out_count); ++ ++/** ++ * Reply to finish ioctl ++ * ++ * Possible requests: ++ * ioctl ++ * ++ * @param req request handle ++ * @param result result to be passed to the caller ++ * @param buf buffer containing output data ++ * @param size length of output data ++ */ ++int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size); ++ ++/** ++ * Reply to finish ioctl with iov buffer ++ * ++ * Possible requests: ++ * ioctl ++ * ++ * @param req request handle ++ * @param result result to be passed to the caller ++ * @param iov the vector containing the data ++ * @param count the size of vector ++ */ ++int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, ++ int count); ++ ++/** ++ * Reply with poll result event mask ++ * ++ * @param req request handle ++ * @param revents poll result event mask ++ */ ++int fuse_reply_poll(fuse_req_t req, unsigned revents); ++ ++/** ++ * Reply with offset ++ * ++ * Possible requests: ++ * lseek ++ * ++ * @param req request handle ++ * @param off offset of next data or hole ++ * @return zero for success, -errno for failure to send reply ++ */ ++int fuse_reply_lseek(fuse_req_t req, off_t off); ++ ++/* ----------------------------------------------------------- * ++ * Notification * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Notify IO readiness event ++ * ++ * For more information, please read comment for poll operation. ++ * ++ * @param ph poll handle to notify IO readiness event for ++ */ ++int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph); ++ ++/** ++ * Notify to invalidate cache for an inode. ++ * ++ * Added in FUSE protocol version 7.12. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * If the filesystem has writeback caching enabled, invalidating an ++ * inode will first trigger a writeback of all dirty pages. The call ++ * will block until all writeback requests have completed and the ++ * inode has been invalidated. It will, however, not wait for ++ * completion of pending writeback requests that have been issued ++ * before. ++ * ++ * If there are no dirty pages, this function will never block. ++ * ++ * @param se the session object ++ * @param ino the inode number ++ * @param off the offset in the inode where to start invalidating ++ * or negative to invalidate attributes only ++ * @param len the amount of cache to invalidate or 0 for all ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, ++ off_t off, off_t len); ++ ++/** ++ * Notify to invalidate parent attributes and the dentry matching ++ * parent/name ++ * ++ * To avoid a deadlock this function must not be called in the ++ * execution path of a related filesytem operation or within any code ++ * that could hold a lock that could be needed to execute such an ++ * operation. As of kernel 4.18, a "related operation" is a lookup(), ++ * symlink(), mknod(), mkdir(), unlink(), rename(), link() or create() ++ * request for the parent, and a setattr(), unlink(), rmdir(), ++ * rename(), setxattr(), removexattr(), readdir() or readdirplus() ++ * request for the inode itself. ++ * ++ * When called correctly, this function will never block. ++ * ++ * Added in FUSE protocol version 7.12. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * @param se the session object ++ * @param parent inode number ++ * @param name file name ++ * @param namelen strlen() of file name ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, ++ const char *name, size_t namelen); ++ ++/** ++ * This function behaves like fuse_lowlevel_notify_inval_entry() with ++ * the following additional effect (at least as of Linux kernel 4.8): ++ * ++ * If the provided *child* inode matches the inode that is currently ++ * associated with the cached dentry, and if there are any inotify ++ * watches registered for the dentry, then the watchers are informed ++ * that the dentry has been deleted. ++ * ++ * To avoid a deadlock this function must not be called while ++ * executing a related filesytem operation or while holding a lock ++ * that could be needed to execute such an operation (see the ++ * description of fuse_lowlevel_notify_inval_entry() for more ++ * details). ++ * ++ * When called correctly, this function will never block. ++ * ++ * Added in FUSE protocol version 7.18. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * @param se the session object ++ * @param parent inode number ++ * @param child inode number ++ * @param name file name ++ * @param namelen strlen() of file name ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_delete(struct fuse_session *se, ++ fuse_ino_t parent, fuse_ino_t child, ++ const char *name, size_t namelen); ++ ++/** ++ * Store data to the kernel buffers ++ * ++ * Synchronously store data in the kernel buffers belonging to the ++ * given inode. The stored data is marked up-to-date (no read will be ++ * performed against it, unless it's invalidated or evicted from the ++ * cache). ++ * ++ * If the stored data overflows the current file size, then the size ++ * is extended, similarly to a write(2) on the filesystem. ++ * ++ * If this function returns an error, then the store wasn't fully ++ * completed, but it may have been partially completed. ++ * ++ * Added in FUSE protocol version 7.15. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * @param se the session object ++ * @param ino the inode number ++ * @param offset the starting offset into the file to store to ++ * @param bufv buffer vector ++ * @param flags flags controlling the copy ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, ++ off_t offset, struct fuse_bufvec *bufv, ++ enum fuse_buf_copy_flags flags); ++/** ++ * Retrieve data from the kernel buffers ++ * ++ * Retrieve data in the kernel buffers belonging to the given inode. ++ * If successful then the retrieve_reply() method will be called with ++ * the returned data. ++ * ++ * Only present pages are returned in the retrieve reply. Retrieving ++ * stops when it finds a non-present page and only data prior to that ++ * is returned. ++ * ++ * If this function returns an error, then the retrieve will not be ++ * completed and no reply will be sent. ++ * ++ * This function doesn't change the dirty state of pages in the kernel ++ * buffer. For dirty pages the write() method will be called ++ * regardless of having been retrieved previously. ++ * ++ * Added in FUSE protocol version 7.15. If the kernel does not support ++ * this (or a newer) version, the function will return -ENOSYS and do ++ * nothing. ++ * ++ * @param se the session object ++ * @param ino the inode number ++ * @param size the number of bytes to retrieve ++ * @param offset the starting offset into the file to retrieve from ++ * @param cookie user data to supply to the reply callback ++ * @return zero for success, -errno for failure ++ */ ++int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, ++ size_t size, off_t offset, void *cookie); ++ ++ ++/* ----------------------------------------------------------- * ++ * Utility functions * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Get the userdata from the request ++ * ++ * @param req request handle ++ * @return the user data passed to fuse_session_new() ++ */ ++void *fuse_req_userdata(fuse_req_t req); ++ ++/** ++ * Get the context from the request ++ * ++ * The pointer returned by this function will only be valid for the ++ * request's lifetime ++ * ++ * @param req request handle ++ * @return the context structure ++ */ ++const struct fuse_ctx *fuse_req_ctx(fuse_req_t req); ++ ++/** ++ * Get the current supplementary group IDs for the specified request ++ * ++ * Similar to the getgroups(2) system call, except the return value is ++ * always the total number of group IDs, even if it is larger than the ++ * specified size. ++ * ++ * The current fuse kernel module in linux (as of 2.6.30) doesn't pass ++ * the group list to userspace, hence this function needs to parse ++ * "/proc/$TID/task/$TID/status" to get the group IDs. ++ * ++ * This feature may not be supported on all operating systems. In ++ * such a case this function will return -ENOSYS. ++ * ++ * @param req request handle ++ * @param size size of given array ++ * @param list array of group IDs to be filled in ++ * @return the total number of supplementary group IDs or -errno on failure ++ */ ++int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]); ++ ++/** ++ * Callback function for an interrupt ++ * ++ * @param req interrupted request ++ * @param data user data ++ */ ++typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data); ++ ++/** ++ * Register/unregister callback for an interrupt ++ * ++ * If an interrupt has already happened, then the callback function is ++ * called from within this function, hence it's not possible for ++ * interrupts to be lost. ++ * ++ * @param req request handle ++ * @param func the callback function or NULL for unregister ++ * @param data user data passed to the callback function ++ */ ++void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, ++ void *data); ++ ++/** ++ * Check if a request has already been interrupted ++ * ++ * @param req request handle ++ * @return 1 if the request has been interrupted, 0 otherwise ++ */ ++int fuse_req_interrupted(fuse_req_t req); ++ ++ ++/* ----------------------------------------------------------- * ++ * Inquiry functions * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Print low-level version information to stdout. ++ */ ++void fuse_lowlevel_version(void); ++ ++/** ++ * Print available low-level options to stdout. This is not an ++ * exhaustive list, but includes only those options that may be of ++ * interest to an end-user of a file system. ++ */ ++void fuse_lowlevel_help(void); ++ ++/** ++ * Print available options for `fuse_parse_cmdline()`. ++ */ ++void fuse_cmdline_help(void); ++ ++/* ----------------------------------------------------------- * ++ * Filesystem setup & teardown * ++ * ----------------------------------------------------------- */ ++ ++struct fuse_cmdline_opts { ++ int singlethread; ++ int foreground; ++ int debug; ++ int nodefault_subtype; ++ char *mountpoint; ++ int show_version; ++ int show_help; ++ int clone_fd; ++ unsigned int max_idle_threads; ++}; ++ ++/** ++ * Utility function to parse common options for simple file systems ++ * using the low-level API. A help text that describes the available ++ * options can be printed with `fuse_cmdline_help`. A single ++ * non-option argument is treated as the mountpoint. Multiple ++ * non-option arguments will result in an error. ++ * ++ * If neither -o subtype= or -o fsname= options are given, a new ++ * subtype option will be added and set to the basename of the program ++ * (the fsname will remain unset, and then defaults to "fuse"). ++ * ++ * Known options will be removed from *args*, unknown options will ++ * remain. ++ * ++ * @param args argument vector (input+output) ++ * @param opts output argument for parsed options ++ * @return 0 on success, -1 on failure ++ */ ++int fuse_parse_cmdline(struct fuse_args *args, ++ struct fuse_cmdline_opts *opts); ++ ++/** ++ * Create a low level session. ++ * ++ * Returns a session structure suitable for passing to ++ * fuse_session_mount() and fuse_session_loop(). ++ * ++ * This function accepts most file-system independent mount options ++ * (like context, nodev, ro - see mount(8)), as well as the general ++ * fuse mount options listed in mount.fuse(8) (e.g. -o allow_root and ++ * -o default_permissions, but not ``-o use_ino``). Instead of `-o ++ * debug`, debugging may also enabled with `-d` or `--debug`. ++ * ++ * If not all options are known, an error message is written to stderr ++ * and the function returns NULL. ++ * ++ * Option parsing skips argv[0], which is assumed to contain the ++ * program name. To prevent accidentally passing an option in ++ * argv[0], this element must always be present (even if no options ++ * are specified). It may be set to the empty string ('\0') if no ++ * reasonable value can be provided. ++ * ++ * @param args argument vector ++ * @param op the (low-level) filesystem operations ++ * @param op_size sizeof(struct fuse_lowlevel_ops) ++ * @param userdata user data ++ * ++ * @return the fuse session on success, NULL on failure ++ **/ ++struct fuse_session *fuse_session_new(struct fuse_args *args, ++ const struct fuse_lowlevel_ops *op, ++ size_t op_size, void *userdata); ++ ++/** ++ * Mount a FUSE file system. ++ * ++ * @param mountpoint the mount point path ++ * @param se session object ++ * ++ * @return 0 on success, -1 on failure. ++ **/ ++int fuse_session_mount(struct fuse_session *se, const char *mountpoint); ++ ++/** ++ * Enter a single threaded, blocking event loop. ++ * ++ * When the event loop terminates because the connection to the FUSE ++ * kernel module has been closed, this function returns zero. This ++ * happens when the filesystem is unmounted regularly (by the ++ * filesystem owner or root running the umount(8) or fusermount(1) ++ * command), or if connection is explicitly severed by writing ``1`` ++ * to the``abort`` file in ``/sys/fs/fuse/connections/NNN``. The only ++ * way to distinguish between these two conditions is to check if the ++ * filesystem is still mounted after the session loop returns. ++ * ++ * When some error occurs during request processing, the function ++ * returns a negated errno(3) value. ++ * ++ * If the loop has been terminated because of a signal handler ++ * installed by fuse_set_signal_handlers(), this function returns the ++ * (positive) signal value that triggered the exit. ++ * ++ * @param se the session ++ * @return 0, -errno, or a signal value ++ */ ++int fuse_session_loop(struct fuse_session *se); ++ ++/** ++ * Enter a multi-threaded event loop. ++ * ++ * For a description of the return value and the conditions when the ++ * event loop exits, refer to the documentation of ++ * fuse_session_loop(). ++ * ++ * @param se the session ++ * @param config session loop configuration ++ * @return see fuse_session_loop() ++ */ ++#if FUSE_USE_VERSION < 32 ++int fuse_session_loop_mt_31(struct fuse_session *se, int clone_fd); ++#define fuse_session_loop_mt(se, clone_fd) fuse_session_loop_mt_31(se, clone_fd) ++#else ++int fuse_session_loop_mt(struct fuse_session *se, struct fuse_loop_config *config); ++#endif ++ ++/** ++ * Flag a session as terminated. ++ * ++ * This function is invoked by the POSIX signal handlers, when ++ * registered using fuse_set_signal_handlers(). It will cause any ++ * running event loops to terminate on the next opportunity. ++ * ++ * @param se the session ++ */ ++void fuse_session_exit(struct fuse_session *se); ++ ++/** ++ * Reset the terminated flag of a session ++ * ++ * @param se the session ++ */ ++void fuse_session_reset(struct fuse_session *se); ++ ++/** ++ * Query the terminated flag of a session ++ * ++ * @param se the session ++ * @return 1 if exited, 0 if not exited ++ */ ++int fuse_session_exited(struct fuse_session *se); ++ ++/** ++ * Ensure that file system is unmounted. ++ * ++ * In regular operation, the file system is typically unmounted by the ++ * user calling umount(8) or fusermount(1), which then terminates the ++ * FUSE session loop. However, the session loop may also terminate as ++ * a result of an explicit call to fuse_session_exit() (e.g. by a ++ * signal handler installed by fuse_set_signal_handler()). In this ++ * case the filesystem remains mounted, but any attempt to access it ++ * will block (while the filesystem process is still running) or give ++ * an ESHUTDOWN error (after the filesystem process has terminated). ++ * ++ * If the communication channel with the FUSE kernel module is still ++ * open (i.e., if the session loop was terminated by an explicit call ++ * to fuse_session_exit()), this function will close it and unmount ++ * the filesystem. If the communication channel has been closed by the ++ * kernel, this method will do (almost) nothing. ++ * ++ * NOTE: The above semantics mean that if the connection to the kernel ++ * is terminated via the ``/sys/fs/fuse/connections/NNN/abort`` file, ++ * this method will *not* unmount the filesystem. ++ * ++ * @param se the session ++ */ ++void fuse_session_unmount(struct fuse_session *se); ++ ++/** ++ * Destroy a session ++ * ++ * @param se the session ++ */ ++void fuse_session_destroy(struct fuse_session *se); ++ ++/* ----------------------------------------------------------- * ++ * Custom event loop support * ++ * ----------------------------------------------------------- */ ++ ++/** ++ * Return file descriptor for communication with kernel. ++ * ++ * The file selector can be used to integrate FUSE with a custom event ++ * loop. Whenever data is available for reading on the provided fd, ++ * the event loop should call `fuse_session_receive_buf` followed by ++ * `fuse_session_process_buf` to process the request. ++ * ++ * The returned file descriptor is valid until `fuse_session_unmount` ++ * is called. ++ * ++ * @param se the session ++ * @return a file descriptor ++ */ ++int fuse_session_fd(struct fuse_session *se); ++ ++/** ++ * Process a raw request supplied in a generic buffer ++ * ++ * The fuse_buf may contain a memory buffer or a pipe file descriptor. ++ * ++ * @param se the session ++ * @param buf the fuse_buf containing the request ++ */ ++void fuse_session_process_buf(struct fuse_session *se, ++ const struct fuse_buf *buf); ++ ++/** ++ * Read a raw request from the kernel into the supplied buffer. ++ * ++ * Depending on file system options, system capabilities, and request ++ * size the request is either read into a memory buffer or spliced ++ * into a temporary pipe. ++ * ++ * @param se the session ++ * @param buf the fuse_buf to store the request in ++ * @return the actual size of the raw request, or -errno on error ++ */ ++int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_LOWLEVEL_H_ */ +diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h +new file mode 100644 +index 0000000..2f6663e +--- /dev/null ++++ b/tools/virtiofsd/fuse_misc.h +@@ -0,0 +1,59 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB ++*/ ++ ++#include ++ ++/* ++ Versioned symbols cannot be used in some cases because it ++ - confuse the dynamic linker in uClibc ++ - not supported on MacOSX (in MachO binary format) ++*/ ++#if (!defined(__UCLIBC__) && !defined(__APPLE__)) ++#define FUSE_SYMVER(x) __asm__(x) ++#else ++#define FUSE_SYMVER(x) ++#endif ++ ++#ifndef USE_UCLIBC ++#define fuse_mutex_init(mut) pthread_mutex_init(mut, NULL) ++#else ++/* Is this hack still needed? */ ++static inline void fuse_mutex_init(pthread_mutex_t *mut) ++{ ++ pthread_mutexattr_t attr; ++ pthread_mutexattr_init(&attr); ++ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); ++ pthread_mutex_init(mut, &attr); ++ pthread_mutexattr_destroy(&attr); ++} ++#endif ++ ++#ifdef HAVE_STRUCT_STAT_ST_ATIM ++/* Linux */ ++#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec) ++#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec) ++#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec) ++#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val) ++#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val) ++#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val) ++#elif defined(HAVE_STRUCT_STAT_ST_ATIMESPEC) ++/* FreeBSD */ ++#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec) ++#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec) ++#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec) ++#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val) ++#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val) ++#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val) ++#else ++#define ST_ATIM_NSEC(stbuf) 0 ++#define ST_CTIM_NSEC(stbuf) 0 ++#define ST_MTIM_NSEC(stbuf) 0 ++#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0) ++#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0) ++#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0) ++#endif +diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h +new file mode 100644 +index 0000000..d8573e7 +--- /dev/null ++++ b/tools/virtiofsd/fuse_opt.h +@@ -0,0 +1,271 @@ ++/* ++ FUSE: Filesystem in Userspace ++ Copyright (C) 2001-2007 Miklos Szeredi ++ ++ This program can be distributed under the terms of the GNU LGPLv2. ++ See the file COPYING.LIB. ++*/ ++ ++#ifndef FUSE_OPT_H_ ++#define FUSE_OPT_H_ ++ ++/** @file ++ * ++ * This file defines the option parsing interface of FUSE ++ */ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/** ++ * Option description ++ * ++ * This structure describes a single option, and action associated ++ * with it, in case it matches. ++ * ++ * More than one such match may occur, in which case the action for ++ * each match is executed. ++ * ++ * There are three possible actions in case of a match: ++ * ++ * i) An integer (int or unsigned) variable determined by 'offset' is ++ * set to 'value' ++ * ++ * ii) The processing function is called, with 'value' as the key ++ * ++ * iii) An integer (any) or string (char *) variable determined by ++ * 'offset' is set to the value of an option parameter ++ * ++ * 'offset' should normally be either set to ++ * ++ * - 'offsetof(struct foo, member)' actions i) and iii) ++ * ++ * - -1 action ii) ++ * ++ * The 'offsetof()' macro is defined in the header. ++ * ++ * The template determines which options match, and also have an ++ * effect on the action. Normally the action is either i) or ii), but ++ * if a format is present in the template, then action iii) is ++ * performed. ++ * ++ * The types of templates are: ++ * ++ * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only ++ * themselves. Invalid values are "--" and anything beginning ++ * with "-o" ++ * ++ * 2) "foo", "foo-bar", etc. These match "-ofoo", "-ofoo-bar" or ++ * the relevant option in a comma separated option list ++ * ++ * 3) "bar=", "--foo=", etc. These are variations of 1) and 2) ++ * which have a parameter ++ * ++ * 4) "bar=%s", "--foo=%lu", etc. Same matching as above but perform ++ * action iii). ++ * ++ * 5) "-x ", etc. Matches either "-xparam" or "-x param" as ++ * two separate arguments ++ * ++ * 6) "-x %s", etc. Combination of 4) and 5) ++ * ++ * If the format is "%s", memory is allocated for the string unlike with ++ * scanf(). The previous value (if non-NULL) stored at the this location is ++ * freed. ++ */ ++struct fuse_opt { ++ /** Matching template and optional parameter formatting */ ++ const char *templ; ++ ++ /** ++ * Offset of variable within 'data' parameter of fuse_opt_parse() ++ * or -1 ++ */ ++ unsigned long offset; ++ ++ /** ++ * Value to set the variable to, or to be passed as 'key' to the ++ * processing function. Ignored if template has a format ++ */ ++ int value; ++}; ++ ++/** ++ * Key option. In case of a match, the processing function will be ++ * called with the specified key. ++ */ ++#define FUSE_OPT_KEY(templ, key) { templ, -1U, key } ++ ++/** ++ * Last option. An array of 'struct fuse_opt' must end with a NULL ++ * template value ++ */ ++#define FUSE_OPT_END { NULL, 0, 0 } ++ ++/** ++ * Argument list ++ */ ++struct fuse_args { ++ /** Argument count */ ++ int argc; ++ ++ /** Argument vector. NULL terminated */ ++ char **argv; ++ ++ /** Is 'argv' allocated? */ ++ int allocated; ++}; ++ ++/** ++ * Initializer for 'struct fuse_args' ++ */ ++#define FUSE_ARGS_INIT(argc, argv) { argc, argv, 0 } ++ ++/** ++ * Key value passed to the processing function if an option did not ++ * match any template ++ */ ++#define FUSE_OPT_KEY_OPT -1 ++ ++/** ++ * Key value passed to the processing function for all non-options ++ * ++ * Non-options are the arguments beginning with a character other than ++ * '-' or all arguments after the special '--' option ++ */ ++#define FUSE_OPT_KEY_NONOPT -2 ++ ++/** ++ * Special key value for options to keep ++ * ++ * Argument is not passed to processing function, but behave as if the ++ * processing function returned 1 ++ */ ++#define FUSE_OPT_KEY_KEEP -3 ++ ++/** ++ * Special key value for options to discard ++ * ++ * Argument is not passed to processing function, but behave as if the ++ * processing function returned zero ++ */ ++#define FUSE_OPT_KEY_DISCARD -4 ++ ++/** ++ * Processing function ++ * ++ * This function is called if ++ * - option did not match any 'struct fuse_opt' ++ * - argument is a non-option ++ * - option did match and offset was set to -1 ++ * ++ * The 'arg' parameter will always contain the whole argument or ++ * option including the parameter if exists. A two-argument option ++ * ("-x foo") is always converted to single argument option of the ++ * form "-xfoo" before this function is called. ++ * ++ * Options of the form '-ofoo' are passed to this function without the ++ * '-o' prefix. ++ * ++ * The return value of this function determines whether this argument ++ * is to be inserted into the output argument vector, or discarded. ++ * ++ * @param data is the user data passed to the fuse_opt_parse() function ++ * @param arg is the whole argument or option ++ * @param key determines why the processing function was called ++ * @param outargs the current output argument list ++ * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept ++ */ ++typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, ++ struct fuse_args *outargs); ++ ++/** ++ * Option parsing function ++ * ++ * If 'args' was returned from a previous call to fuse_opt_parse() or ++ * it was constructed from ++ * ++ * A NULL 'args' is equivalent to an empty argument vector ++ * ++ * A NULL 'opts' is equivalent to an 'opts' array containing a single ++ * end marker ++ * ++ * A NULL 'proc' is equivalent to a processing function always ++ * returning '1' ++ * ++ * @param args is the input and output argument list ++ * @param data is the user data ++ * @param opts is the option description array ++ * @param proc is the processing function ++ * @return -1 on error, 0 on success ++ */ ++int fuse_opt_parse(struct fuse_args *args, void *data, ++ const struct fuse_opt opts[], fuse_opt_proc_t proc); ++ ++/** ++ * Add an option to a comma separated option list ++ * ++ * @param opts is a pointer to an option list, may point to a NULL value ++ * @param opt is the option to add ++ * @return -1 on allocation error, 0 on success ++ */ ++int fuse_opt_add_opt(char **opts, const char *opt); ++ ++/** ++ * Add an option, escaping commas, to a comma separated option list ++ * ++ * @param opts is a pointer to an option list, may point to a NULL value ++ * @param opt is the option to add ++ * @return -1 on allocation error, 0 on success ++ */ ++int fuse_opt_add_opt_escaped(char **opts, const char *opt); ++ ++/** ++ * Add an argument to a NULL terminated argument vector ++ * ++ * @param args is the structure containing the current argument list ++ * @param arg is the new argument to add ++ * @return -1 on allocation error, 0 on success ++ */ ++int fuse_opt_add_arg(struct fuse_args *args, const char *arg); ++ ++/** ++ * Add an argument at the specified position in a NULL terminated ++ * argument vector ++ * ++ * Adds the argument to the N-th position. This is useful for adding ++ * options at the beginning of the array which must not come after the ++ * special '--' option. ++ * ++ * @param args is the structure containing the current argument list ++ * @param pos is the position at which to add the argument ++ * @param arg is the new argument to add ++ * @return -1 on allocation error, 0 on success ++ */ ++int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg); ++ ++/** ++ * Free the contents of argument list ++ * ++ * The structure itself is not freed ++ * ++ * @param args is the structure containing the argument list ++ */ ++void fuse_opt_free_args(struct fuse_args *args); ++ ++ ++/** ++ * Check if an option matches ++ * ++ * @param opts is the option description array ++ * @param opt is the option to match ++ * @return 1 if a match is found, 0 if not ++ */ ++int fuse_opt_match(const struct fuse_opt opts[], const char *opt); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* FUSE_OPT_H_ */ +diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h +new file mode 100644 +index 0000000..6b77c33 +--- /dev/null ++++ b/tools/virtiofsd/passthrough_helpers.h +@@ -0,0 +1,76 @@ ++/* ++ * FUSE: Filesystem in Userspace ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE ++ */ ++ ++/* ++ * Creates files on the underlying file system in response to a FUSE_MKNOD ++ * operation ++ */ ++static int mknod_wrapper(int dirfd, const char *path, const char *link, ++ int mode, dev_t rdev) ++{ ++ int res; ++ ++ if (S_ISREG(mode)) { ++ res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); ++ if (res >= 0) ++ res = close(res); ++ } else if (S_ISDIR(mode)) { ++ res = mkdirat(dirfd, path, mode); ++ } else if (S_ISLNK(mode) && link != NULL) { ++ res = symlinkat(link, dirfd, path); ++ } else if (S_ISFIFO(mode)) { ++ res = mkfifoat(dirfd, path, mode); ++#ifdef __FreeBSD__ ++ } else if (S_ISSOCK(mode)) { ++ struct sockaddr_un su; ++ int fd; ++ ++ if (strlen(path) >= sizeof(su.sun_path)) { ++ errno = ENAMETOOLONG; ++ return -1; ++ } ++ fd = socket(AF_UNIX, SOCK_STREAM, 0); ++ if (fd >= 0) { ++ /* ++ * We must bind the socket to the underlying file ++ * system to create the socket file, even though ++ * we'll never listen on this socket. ++ */ ++ su.sun_family = AF_UNIX; ++ strncpy(su.sun_path, path, sizeof(su.sun_path)); ++ res = bindat(dirfd, fd, (struct sockaddr*)&su, ++ sizeof(su)); ++ if (res == 0) ++ close(fd); ++ } else { ++ res = -1; ++ } ++#endif ++ } else { ++ res = mknodat(dirfd, path, mode, rdev); ++ } ++ ++ return res; ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Remove-fuse_req_getgroups.patch b/kvm-virtiofsd-Remove-fuse_req_getgroups.patch new file mode 100755 index 0000000000000000000000000000000000000000..27e71f26d6d2867177e910c66abbed26a943ac60 --- /dev/null +++ b/kvm-virtiofsd-Remove-fuse_req_getgroups.patch @@ -0,0 +1,193 @@ +From 7a1860c83ff042f3e796c449e780ee0528107213 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:08 +0000 +Subject: [PATCH 12/18] virtiofsd: Remove fuse_req_getgroups +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-2-dgilbert@redhat.com> +Patchwork-id: 94122 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 1/7] virtiofsd: Remove fuse_req_getgroups +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. David Alan Gilbert" + +Remove fuse_req_getgroups that's unused in virtiofsd; it came in +from libfuse but we don't actually use it. It was called from +fuse_getgroups which we previously removed (but had left it's header +in). + +Coverity had complained about null termination in it, but removing +it is the easiest answer. + +Fixes: Coverity CID: 1413117 (String not null terminated) +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 988717b46b6424907618cb845ace9d69062703af) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse.h | 20 ----------- + tools/virtiofsd/fuse_lowlevel.c | 77 ----------------------------------------- + tools/virtiofsd/fuse_lowlevel.h | 21 ----------- + 3 files changed, 118 deletions(-) + +diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +index 7a4c713..aba13fe 100644 +--- a/tools/virtiofsd/fuse.h ++++ b/tools/virtiofsd/fuse.h +@@ -1007,26 +1007,6 @@ void fuse_exit(struct fuse *f); + struct fuse_context *fuse_get_context(void); + + /** +- * Get the current supplementary group IDs for the current request +- * +- * Similar to the getgroups(2) system call, except the return value is +- * always the total number of group IDs, even if it is larger than the +- * specified size. +- * +- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass +- * the group list to userspace, hence this function needs to parse +- * "/proc/$TID/task/$TID/status" to get the group IDs. +- * +- * This feature may not be supported on all operating systems. In +- * such a case this function will return -ENOSYS. +- * +- * @param size size of given array +- * @param list array of group IDs to be filled in +- * @return the total number of supplementary group IDs or -errno on failure +- */ +-int fuse_getgroups(int size, gid_t list[]); +- +-/** + * Check if the current request has already been interrupted + * + * @return 1 if the request has been interrupted, 0 otherwise +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index de2e2e0..01c418a 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2667,83 +2667,6 @@ int fuse_lowlevel_is_virtio(struct fuse_session *se) + return !!se->virtio_dev; + } + +-#ifdef linux +-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) +-{ +- char *buf; +- size_t bufsize = 1024; +- char path[128]; +- int ret; +- int fd; +- unsigned long pid = req->ctx.pid; +- char *s; +- +- sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); +- +-retry: +- buf = malloc(bufsize); +- if (buf == NULL) { +- return -ENOMEM; +- } +- +- ret = -EIO; +- fd = open(path, O_RDONLY); +- if (fd == -1) { +- goto out_free; +- } +- +- ret = read(fd, buf, bufsize); +- close(fd); +- if (ret < 0) { +- ret = -EIO; +- goto out_free; +- } +- +- if ((size_t)ret == bufsize) { +- free(buf); +- bufsize *= 4; +- goto retry; +- } +- +- ret = -EIO; +- s = strstr(buf, "\nGroups:"); +- if (s == NULL) { +- goto out_free; +- } +- +- s += 8; +- ret = 0; +- while (1) { +- char *end; +- unsigned long val = strtoul(s, &end, 0); +- if (end == s) { +- break; +- } +- +- s = end; +- if (ret < size) { +- list[ret] = val; +- } +- ret++; +- } +- +-out_free: +- free(buf); +- return ret; +-} +-#else /* linux */ +-/* +- * This is currently not implemented on other than Linux... +- */ +-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) +-{ +- (void)req; +- (void)size; +- (void)list; +- return -ENOSYS; +-} +-#endif +- + void fuse_session_exit(struct fuse_session *se) + { + se->exited = 1; +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 138041e..8f6d705 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1705,27 +1705,6 @@ void *fuse_req_userdata(fuse_req_t req); + const struct fuse_ctx *fuse_req_ctx(fuse_req_t req); + + /** +- * Get the current supplementary group IDs for the specified request +- * +- * Similar to the getgroups(2) system call, except the return value is +- * always the total number of group IDs, even if it is larger than the +- * specified size. +- * +- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass +- * the group list to userspace, hence this function needs to parse +- * "/proc/$TID/task/$TID/status" to get the group IDs. +- * +- * This feature may not be supported on all operating systems. In +- * such a case this function will return -ENOSYS. +- * +- * @param req request handle +- * @param size size of given array +- * @param list array of group IDs to be filled in +- * @return the total number of supplementary group IDs or -errno on failure +- */ +-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]); +- +-/** + * Callback function for an interrupt + * + * @param req interrupted request +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch b/kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch new file mode 100755 index 0000000000000000000000000000000000000000..7f9c5bb838f51955c0ad0f7233cf46ad1f06236a --- /dev/null +++ b/kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch @@ -0,0 +1,271 @@ +From 80237df2b22eca685037456e65d149fed4654165 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:48 +0100 +Subject: [PATCH 017/116] virtiofsd: Remove unused enum fuse_buf_copy_flags +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-14-dgilbert@redhat.com> +Patchwork-id: 93465 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 013/112] virtiofsd: Remove unused enum fuse_buf_copy_flags +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Xiao Yang + +Signed-off-by: Xiao Yang +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8c3fe75e0308ba2f01d160ace534b7e386cea808) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 7 +++--- + tools/virtiofsd/fuse_common.h | 46 +--------------------------------------- + tools/virtiofsd/fuse_lowlevel.c | 13 +++++------- + tools/virtiofsd/fuse_lowlevel.h | 35 ++---------------------------- + tools/virtiofsd/passthrough_ll.c | 4 ++-- + 5 files changed, 13 insertions(+), 92 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 5df946c..4d507f3 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -171,7 +171,7 @@ static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, + + static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) ++ size_t len) + { + int src_is_fd = src->flags & FUSE_BUF_IS_FD; + int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; +@@ -224,8 +224,7 @@ static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) + return 1; + } + +-ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, +- enum fuse_buf_copy_flags flags) ++ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) + { + size_t copied = 0; + +@@ -249,7 +248,7 @@ ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv, + dst_len = dst->size - dstv->off; + len = min_size(src_len, dst_len); + +- res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len, flags); ++ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len); + if (res < 0) { + if (!copied) { + return res; +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index bd9bf86..0cb33ac 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -605,48 +605,6 @@ enum fuse_buf_flags { + }; + + /** +- * Buffer copy flags +- */ +-enum fuse_buf_copy_flags { +- /** +- * Don't use splice(2) +- * +- * Always fall back to using read and write instead of +- * splice(2) to copy data from one file descriptor to another. +- * +- * If this flag is not set, then only fall back if splice is +- * unavailable. +- */ +- FUSE_BUF_NO_SPLICE = (1 << 1), +- +- /** +- * Force splice +- * +- * Always use splice(2) to copy data from one file descriptor +- * to another. If splice is not available, return -EINVAL. +- */ +- FUSE_BUF_FORCE_SPLICE = (1 << 2), +- +- /** +- * Try to move data with splice. +- * +- * If splice is used, try to move pages from the source to the +- * destination instead of copying. See documentation of +- * SPLICE_F_MOVE in splice(2) man page. +- */ +- FUSE_BUF_SPLICE_MOVE = (1 << 3), +- +- /** +- * Don't block on the pipe when copying data with splice +- * +- * Makes the operations on the pipe non-blocking (if the pipe +- * is full or empty). See SPLICE_F_NONBLOCK in the splice(2) +- * man page. +- */ +- FUSE_BUF_SPLICE_NONBLOCK = (1 << 4), +-}; +- +-/** + * Single data buffer + * + * Generic data buffer for I/O, extended attributes, etc... Data may +@@ -741,11 +699,9 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv); + * + * @param dst destination buffer vector + * @param src source buffer vector +- * @param flags flags controlling the copy + * @return actual number of bytes copied or -errno on error + */ +-ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src, +- enum fuse_buf_copy_flags flags); ++ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src); + + /* + * Signal handling +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index eb0ec49..3da80de 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -490,16 +490,14 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + + static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, unsigned int flags) ++ struct fuse_bufvec *buf) + { + size_t len = fuse_buf_size(buf); +- (void)flags; + + return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); + } + +-int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) + { + struct iovec iov[2]; + struct fuse_out_header out; +@@ -511,7 +509,7 @@ int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, + out.unique = req->unique; + out.error = 0; + +- res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv, flags); ++ res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv); + if (res <= 0) { + fuse_free_req(req); + return res; +@@ -1969,8 +1967,7 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + } + + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags) ++ off_t offset, struct fuse_bufvec *bufv) + { + struct fuse_out_header out; + struct fuse_notify_store_out outarg; +@@ -1999,7 +1996,7 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + +- res = fuse_send_data_iov(se, NULL, iov, 2, bufv, flags); ++ res = fuse_send_data_iov(se, NULL, iov, 2, bufv); + if (res > 0) { + res = -res; + } +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 12a84b4..2fa225d 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1363,33 +1363,6 @@ int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + /** + * Reply with data copied/moved from buffer(s) + * +- * Zero copy data transfer ("splicing") will be used under +- * the following circumstances: +- * +- * 1. FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.want, and +- * 2. the kernel supports splicing from the fuse device +- * (FUSE_CAP_SPLICE_WRITE is set in fuse_conn_info.capable), and +- * 3. *flags* does not contain FUSE_BUF_NO_SPLICE +- * 4. The amount of data that is provided in file-descriptor backed +- * buffers (i.e., buffers for which bufv[n].flags == FUSE_BUF_FD) +- * is at least twice the page size. +- * +- * In order for SPLICE_F_MOVE to be used, the following additional +- * conditions have to be fulfilled: +- * +- * 1. FUSE_CAP_SPLICE_MOVE is set in fuse_conn_info.want, and +- * 2. the kernel supports it (i.e, FUSE_CAP_SPLICE_MOVE is set in +- fuse_conn_info.capable), and +- * 3. *flags* contains FUSE_BUF_SPLICE_MOVE +- * +- * Note that, if splice is used, the data is actually spliced twice: +- * once into a temporary pipe (to prepend header data), and then again +- * into the kernel. If some of the provided buffers are memory-backed, +- * the data in them is copied in step one and spliced in step two. +- * +- * The FUSE_BUF_SPLICE_FORCE_SPLICE and FUSE_BUF_SPLICE_NONBLOCK flags +- * are silently ignored. +- * + * Possible requests: + * read, readdir, getxattr, listxattr + * +@@ -1400,11 +1373,9 @@ int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + * + * @param req request handle + * @param bufv buffer vector +- * @param flags flags controlling the copy + * @return zero for success, -errno for failure to send reply + */ +-int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv); + + /** + * Reply with data vector +@@ -1705,12 +1676,10 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + * @param ino the inode number + * @param offset the starting offset into the file to store to + * @param bufv buffer vector +- * @param flags flags controlling the copy + * @return zero for success, -errno for failure + */ + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv, +- enum fuse_buf_copy_flags flags); ++ off_t offset, struct fuse_bufvec *bufv); + + /* + * Utility functions +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9377718..126a56c 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -931,7 +931,7 @@ static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + buf.buf[0].fd = fi->fh; + buf.buf[0].pos = offset; + +- fuse_reply_data(req, &buf, FUSE_BUF_SPLICE_MOVE); ++ fuse_reply_data(req, &buf); + } + + static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, +@@ -952,7 +952,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + out_buf.buf[0].size, (unsigned long)off); + } + +- res = fuse_buf_copy(&out_buf, in_buf, 0); ++ res = fuse_buf_copy(&out_buf, in_buf); + if (res < 0) { + fuse_reply_err(req, -res); + } else { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch b/kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch new file mode 100755 index 0000000000000000000000000000000000000000..e1a3cd1c6488419e4386b0f3f8e45b691a176331 --- /dev/null +++ b/kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch @@ -0,0 +1,72 @@ +From b8d62021f28114f054571b96ec0cd4dad4476923 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:14 +0100 +Subject: [PATCH 103/116] virtiofsd: Reset O_DIRECT flag during file open +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-100-dgilbert@redhat.com> +Patchwork-id: 93553 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 099/112] virtiofsd: Reset O_DIRECT flag during file open +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +If an application wants to do direct IO and opens a file with O_DIRECT +in guest, that does not necessarily mean that we need to bypass page +cache on host as well. So reset this flag on host. + +If somebody needs to bypass page cache on host as well (and it is safe to +do so), we can add a knob in daemon later to control this behavior. + +I check virtio-9p and they do reset O_DIRECT flag. + +Signed-off-by: Vivek Goyal +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 65da4539803373ec4eec97ffc49ee90083e56efd) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ccbbec1..948cb19 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1721,6 +1721,13 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out; + } + ++ /* ++ * O_DIRECT in guest should not necessarily mean bypassing page ++ * cache on host as well. If somebody needs that behavior, it ++ * probably should be a configuration knob in daemon. ++ */ ++ fi->flags &= ~O_DIRECT; ++ + fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); + err = fd == -1 ? errno : 0; +@@ -1950,6 +1957,13 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + fi->flags &= ~O_APPEND; + } + ++ /* ++ * O_DIRECT in guest should not necessarily mean bypassing page ++ * cache on host as well. If somebody needs that behavior, it ++ * probably should be a configuration knob in daemon. ++ */ ++ fi->flags &= ~O_DIRECT; ++ + sprintf(buf, "%i", lo_fd(req, ino)); + fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); + if (fd == -1) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Send-replies-to-messages.patch b/kvm-virtiofsd-Send-replies-to-messages.patch new file mode 100755 index 0000000000000000000000000000000000000000..5453fda1692be60813453a7f05ef44895fd0a2a3 --- /dev/null +++ b/kvm-virtiofsd-Send-replies-to-messages.patch @@ -0,0 +1,199 @@ +From bb1f691dc410ce11ac9675ced70e78a3ce2511b0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:03 +0100 +Subject: [PATCH 032/116] virtiofsd: Send replies to messages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-29-dgilbert@redhat.com> +Patchwork-id: 93485 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 028/112] virtiofsd: Send replies to messages +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Route fuse out messages back through the same queue elements +that had the command that triggered the request. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit df57ba919ec3edef9cc208d35685095e6e92713e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 4 ++ + tools/virtiofsd/fuse_virtio.c | 107 ++++++++++++++++++++++++++++++++++++++-- + tools/virtiofsd/fuse_virtio.h | 4 ++ + 3 files changed, 111 insertions(+), 4 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index af09fa2..380d93b 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -171,6 +171,10 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + } + } + ++ if (fuse_lowlevel_is_virtio(se)) { ++ return virtio_send_msg(se, ch, iov, count); ++ } ++ + abort(); /* virtio should have taken it before here */ + return 0; + } +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 3841b20..05d0e29 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -41,6 +41,9 @@ struct fv_QueueInfo { + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; ++ ++ /* The element for the command currently being processed */ ++ VuVirtqElement *qe; + }; + + /* +@@ -121,6 +124,105 @@ static void copy_from_iov(struct fuse_buf *buf, size_t out_num, + } + } + ++/* ++ * Copy from one iov to another, the given number of bytes ++ * The caller must have checked sizes. ++ */ ++static void copy_iov(struct iovec *src_iov, int src_count, ++ struct iovec *dst_iov, int dst_count, size_t to_copy) ++{ ++ size_t dst_offset = 0; ++ /* Outer loop copies 'src' elements */ ++ while (to_copy) { ++ assert(src_count); ++ size_t src_len = src_iov[0].iov_len; ++ size_t src_offset = 0; ++ ++ if (src_len > to_copy) { ++ src_len = to_copy; ++ } ++ /* Inner loop copies contents of one 'src' to maybe multiple dst. */ ++ while (src_len) { ++ assert(dst_count); ++ size_t dst_len = dst_iov[0].iov_len - dst_offset; ++ if (dst_len > src_len) { ++ dst_len = src_len; ++ } ++ ++ memcpy(dst_iov[0].iov_base + dst_offset, ++ src_iov[0].iov_base + src_offset, dst_len); ++ src_len -= dst_len; ++ to_copy -= dst_len; ++ src_offset += dst_len; ++ dst_offset += dst_len; ++ ++ assert(dst_offset <= dst_iov[0].iov_len); ++ if (dst_offset == dst_iov[0].iov_len) { ++ dst_offset = 0; ++ dst_iov++; ++ dst_count--; ++ } ++ } ++ src_iov++; ++ src_count--; ++ } ++} ++ ++/* ++ * Called back by ll whenever it wants to send a reply/message back ++ * The 1st element of the iov starts with the fuse_out_header ++ * 'unique'==0 means it's a notify message. ++ */ ++int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count) ++{ ++ VuVirtqElement *elem; ++ VuVirtq *q; ++ ++ assert(count >= 1); ++ assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); ++ ++ struct fuse_out_header *out = iov[0].iov_base; ++ /* TODO: Endianness! */ ++ ++ size_t tosend_len = iov_size(iov, count); ++ ++ /* unique == 0 is notification, which we don't support */ ++ assert(out->unique); ++ /* For virtio we always have ch */ ++ assert(ch); ++ elem = ch->qi->qe; ++ q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ ++ /* The 'in' part of the elem is to qemu */ ++ unsigned int in_num = elem->in_num; ++ struct iovec *in_sg = elem->in_sg; ++ size_t in_len = iov_size(in_sg, in_num); ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", ++ __func__, elem->index, in_num, in_len); ++ ++ /* ++ * The elem should have room for a 'fuse_out_header' (out from fuse) ++ * plus the data based on the len in the header. ++ */ ++ if (in_len < sizeof(struct fuse_out_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", ++ __func__, elem->index); ++ return -E2BIG; ++ } ++ if (in_len < tosend_len) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", ++ __func__, elem->index, tosend_len); ++ return -E2BIG; ++ } ++ ++ copy_iov(iov, count, in_sg, in_num, tosend_len); ++ vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); ++ vu_queue_notify(&se->virtio_dev->dev, q); ++ ++ return 0; ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { +@@ -226,13 +328,10 @@ static void *fv_queue_thread(void *opaque) + + /* TODO! Endianness of header */ + +- /* TODO: Fixup fuse_send_msg */ + /* TODO: Add checks for fuse_session_exited */ + fuse_session_process_buf_int(se, &fbuf, &ch); + +- /* TODO: vu_queue_push(dev, q, elem, qi->write_count); */ +- vu_queue_notify(dev, q); +- ++ qi->qe = NULL; + free(elem); + elem = NULL; + } +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index 23026d6..135a148 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -22,4 +22,8 @@ int virtio_session_mount(struct fuse_session *se); + + int virtio_loop(struct fuse_session *se); + ++ ++int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, ++ struct iovec *iov, int count); ++ + #endif +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Show-submounts.patch b/kvm-virtiofsd-Show-submounts.patch new file mode 100755 index 0000000000000000000000000000000000000000..d45a0300628c4371c4732dbe6b5c9adad1e7319c --- /dev/null +++ b/kvm-virtiofsd-Show-submounts.patch @@ -0,0 +1,51 @@ +From 717373379510cd6ecf8c6d0e1aae65edfac4551d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:58 +0100 +Subject: [PATCH 7/9] virtiofsd: Show submounts + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-6-dgilbert@redhat.com> +Patchwork-id: 96273 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 5/7] virtiofsd: Show submounts +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Max Reitz + +Currently, setup_mounts() bind-mounts the shared directory without +MS_REC. This makes all submounts disappear. + +Pass MS_REC so that the guest can see submounts again. + +Fixes: 5baa3b8e95064c2434bd9e2f312edd5e9ae275dc +Signed-off-by: Max Reitz +Message-Id: <20200424133516.73077-1-mreitz@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert + Changed Fixes to point to the commit with the problem rather than + the commit that turned it on +(cherry picked from commit ace0829c0d08f0e5f1451e402e94495bc2166772) + +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 73d8405..614ba55 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2670,7 +2670,7 @@ static void setup_mounts(const char *source) + int oldroot; + int newroot; + +- if (mount(source, source, NULL, MS_BIND, NULL) < 0) { ++ if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); + exit(1); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Start-queue-threads.patch b/kvm-virtiofsd-Start-queue-threads.patch new file mode 100755 index 0000000000000000000000000000000000000000..8b03cd62af39adbaf977fba3d128c8273019554f --- /dev/null +++ b/kvm-virtiofsd-Start-queue-threads.patch @@ -0,0 +1,165 @@ +From 38282d996cde61261211160577b366b83cad8012 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:00 +0100 +Subject: [PATCH 029/116] virtiofsd: Start queue threads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-26-dgilbert@redhat.com> +Patchwork-id: 93479 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 025/112] virtiofsd: Start queue threads +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Start a thread for each queue when we get notified it's been started. + +Signed-off-by: Dr. David Alan Gilbert +fix by: +Signed-off-by: Jun Piao +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e4c55a3c144493b436e40031e2eed61a84eca47b) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 89 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 89 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 4819e56..2a94bb3 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -11,6 +11,7 @@ + * See the file COPYING.LIB + */ + ++#include "qemu/osdep.h" + #include "fuse_virtio.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" +@@ -30,6 +31,15 @@ + + #include "contrib/libvhost-user/libvhost-user.h" + ++struct fv_QueueInfo { ++ pthread_t thread; ++ struct fv_VuDev *virtio_dev; ++ ++ /* Our queue index, corresponds to array position */ ++ int qidx; ++ int kick_fd; ++}; ++ + /* + * We pass the dev element into libvhost-user + * and then use it to get back to the outer +@@ -38,6 +48,13 @@ + struct fv_VuDev { + VuDev dev; + struct fuse_session *se; ++ ++ /* ++ * The following pair of fields are only accessed in the main ++ * virtio_loop ++ */ ++ size_t nqueues; ++ struct fv_QueueInfo **qi; + }; + + /* From spec */ +@@ -83,6 +100,75 @@ static void fv_panic(VuDev *dev, const char *err) + exit(EXIT_FAILURE); + } + ++static void *fv_queue_thread(void *opaque) ++{ ++ struct fv_QueueInfo *qi = opaque; ++ fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, ++ qi->qidx, qi->kick_fd); ++ while (1) { ++ /* TODO */ ++ } ++ ++ return NULL; ++} ++ ++/* Callback from libvhost-user on start or stop of a queue */ ++static void fv_queue_set_started(VuDev *dev, int qidx, bool started) ++{ ++ struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev); ++ struct fv_QueueInfo *ourqi; ++ ++ fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx, ++ started); ++ assert(qidx >= 0); ++ ++ /* ++ * Ignore additional request queues for now. passthrough_ll.c must be ++ * audited for thread-safety issues first. It was written with a ++ * well-behaved client in mind and may not protect against all types of ++ * races yet. ++ */ ++ if (qidx > 1) { ++ fuse_log(FUSE_LOG_ERR, ++ "%s: multiple request queues not yet implemented, please only " ++ "configure 1 request queue\n", ++ __func__); ++ exit(EXIT_FAILURE); ++ } ++ ++ if (started) { ++ /* Fire up a thread to watch this queue */ ++ if (qidx >= vud->nqueues) { ++ vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0])); ++ assert(vud->qi); ++ memset(vud->qi + vud->nqueues, 0, ++ sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues))); ++ vud->nqueues = qidx + 1; ++ } ++ if (!vud->qi[qidx]) { ++ vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1); ++ assert(vud->qi[qidx]); ++ vud->qi[qidx]->virtio_dev = vud; ++ vud->qi[qidx]->qidx = qidx; ++ } else { ++ /* Shouldn't have been started */ ++ assert(vud->qi[qidx]->kick_fd == -1); ++ } ++ ourqi = vud->qi[qidx]; ++ ourqi->kick_fd = dev->vq[qidx].kick_fd; ++ if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { ++ fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", ++ __func__, qidx); ++ assert(0); ++ } ++ } else { ++ /* TODO: Kill the thread */ ++ assert(qidx < vud->nqueues); ++ ourqi = vud->qi[qidx]; ++ ourqi->kick_fd = -1; ++ } ++} ++ + static bool fv_queue_order(VuDev *dev, int qidx) + { + return false; +@@ -92,6 +178,9 @@ static const VuDevIface fv_iface = { + .get_features = fv_get_features, + .set_features = fv_set_features, + ++ /* Don't need process message, we've not got any at vhost-user level */ ++ .queue_set_started = fv_queue_set_started, ++ + .queue_is_processed_in_order = fv_queue_order, + }; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Start-reading-commands-from-queue.patch b/kvm-virtiofsd-Start-reading-commands-from-queue.patch new file mode 100755 index 0000000000000000000000000000000000000000..20224802c5fa3dc264c81350524299c6467bc1da --- /dev/null +++ b/kvm-virtiofsd-Start-reading-commands-from-queue.patch @@ -0,0 +1,200 @@ +From b4af2eff8ecadb4e2c9520602455f77fac2cb943 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:02 +0100 +Subject: [PATCH 031/116] virtiofsd: Start reading commands from queue +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-28-dgilbert@redhat.com> +Patchwork-id: 93484 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 027/112] virtiofsd: Start reading commands from queue +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Pop queue elements off queues, copy the data from them and +pass that to fuse. + + Note: 'out' in a VuVirtqElement is from QEMU + 'in' in libfuse is into the daemon + + So we read from the out iov's to get a fuse_in_header + +When we get a kick we've got to read all the elements until the queue +is empty. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit b509e1228b3e5eb83c14819045988999fc2dbd1b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 2 + + tools/virtiofsd/fuse_virtio.c | 99 +++++++++++++++++++++++++++++++++++++++++-- + 2 files changed, 98 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index ec04449..1126723 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -14,6 +14,7 @@ + #include "fuse_lowlevel.h" + + struct fv_VuDev; ++struct fv_QueueInfo; + + struct fuse_req { + struct fuse_session *se; +@@ -75,6 +76,7 @@ struct fuse_chan { + pthread_mutex_t lock; + int ctr; + int fd; ++ struct fv_QueueInfo *qi; + }; + + /** +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 05e7258..3841b20 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -12,6 +12,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/iov.h" + #include "fuse_virtio.h" + #include "fuse_i.h" + #include "standard-headers/linux/fuse.h" +@@ -32,6 +33,7 @@ + + #include "contrib/libvhost-user/libvhost-user.h" + ++struct fv_VuDev; + struct fv_QueueInfo { + pthread_t thread; + struct fv_VuDev *virtio_dev; +@@ -101,10 +103,41 @@ static void fv_panic(VuDev *dev, const char *err) + exit(EXIT_FAILURE); + } + ++/* ++ * Copy from an iovec into a fuse_buf (memory only) ++ * Caller must ensure there is space ++ */ ++static void copy_from_iov(struct fuse_buf *buf, size_t out_num, ++ const struct iovec *out_sg) ++{ ++ void *dest = buf->mem; ++ ++ while (out_num) { ++ size_t onelen = out_sg->iov_len; ++ memcpy(dest, out_sg->iov_base, onelen); ++ dest += onelen; ++ out_sg++; ++ out_num--; ++ } ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { + struct fv_QueueInfo *qi = opaque; ++ struct VuDev *dev = &qi->virtio_dev->dev; ++ struct VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ struct fuse_session *se = qi->virtio_dev->se; ++ struct fuse_chan ch; ++ struct fuse_buf fbuf; ++ ++ fbuf.mem = NULL; ++ fbuf.flags = 0; ++ ++ fuse_mutex_init(&ch.lock); ++ ch.fd = (int)0xdaff0d111; ++ ch.qi = qi; ++ + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { +@@ -141,11 +174,71 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); + break; + } +- if (qi->virtio_dev->se->debug) { +- fprintf(stderr, "%s: Queue %d gave evalue: %zx\n", __func__, +- qi->qidx, (size_t)evalue); ++ /* out is from guest, in is too guest */ ++ unsigned int in_bytes, out_bytes; ++ vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", ++ __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); ++ ++ while (1) { ++ /* ++ * An element contains one request and the space to send our ++ * response They're spread over multiple descriptors in a ++ * scatter/gather set and we can't trust the guest to keep them ++ * still; so copy in/out. ++ */ ++ VuVirtqElement *elem = vu_queue_pop(dev, q, sizeof(VuVirtqElement)); ++ if (!elem) { ++ break; ++ } ++ ++ if (!fbuf.mem) { ++ fbuf.mem = malloc(se->bufsize); ++ assert(fbuf.mem); ++ assert(se->bufsize > sizeof(struct fuse_in_header)); ++ } ++ /* The 'out' part of the elem is from qemu */ ++ unsigned int out_num = elem->out_num; ++ struct iovec *out_sg = elem->out_sg; ++ size_t out_len = iov_size(out_sg, out_num); ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: elem %d: with %d out desc of length %zd\n", __func__, ++ elem->index, out_num, out_len); ++ ++ /* ++ * The elem should contain a 'fuse_in_header' (in to fuse) ++ * plus the data based on the len in the header. ++ */ ++ if (out_len < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", ++ __func__, elem->index); ++ assert(0); /* TODO */ ++ } ++ if (out_len > se->bufsize) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", ++ __func__, elem->index); ++ assert(0); /* TODO */ ++ } ++ copy_from_iov(&fbuf, out_num, out_sg); ++ fbuf.size = out_len; ++ ++ /* TODO! Endianness of header */ ++ ++ /* TODO: Fixup fuse_send_msg */ ++ /* TODO: Add checks for fuse_session_exited */ ++ fuse_session_process_buf_int(se, &fbuf, &ch); ++ ++ /* TODO: vu_queue_push(dev, q, elem, qi->write_count); */ ++ vu_queue_notify(dev, q); ++ ++ free(elem); ++ elem = NULL; + } + } ++ pthread_mutex_destroy(&ch.lock); ++ free(fbuf.mem); + + return NULL; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Start-wiring-up-vhost-user.patch b/kvm-virtiofsd-Start-wiring-up-vhost-user.patch new file mode 100755 index 0000000000000000000000000000000000000000..7b50118419ae6f3ce6d66d782a13882c2d15cc61 --- /dev/null +++ b/kvm-virtiofsd-Start-wiring-up-vhost-user.patch @@ -0,0 +1,247 @@ +From 020f593031b0b54e4c35faffea489b700aed6a72 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:57 +0100 +Subject: [PATCH 026/116] virtiofsd: Start wiring up vhost-user +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-23-dgilbert@redhat.com> +Patchwork-id: 93477 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 022/112] virtiofsd: Start wiring up vhost-user +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Listen on our unix socket for the connection from QEMU, when we get it +initialise vhost-user and dive into our own loop variant (currently +dummy). + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f6f3573c6f271af5ded63ce28589a113f7205c72) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 4 ++ + tools/virtiofsd/fuse_lowlevel.c | 5 +++ + tools/virtiofsd/fuse_lowlevel.h | 7 ++++ + tools/virtiofsd/fuse_virtio.c | 87 +++++++++++++++++++++++++++++++++++++++- + tools/virtiofsd/fuse_virtio.h | 2 + + tools/virtiofsd/passthrough_ll.c | 7 +--- + 6 files changed, 106 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 82d6ac7..ec04449 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -13,6 +13,8 @@ + #include "fuse.h" + #include "fuse_lowlevel.h" + ++struct fv_VuDev; ++ + struct fuse_req { + struct fuse_session *se; + uint64_t unique; +@@ -65,6 +67,8 @@ struct fuse_session { + size_t bufsize; + int error; + char *vu_socket_path; ++ int vu_socketfd; ++ struct fv_VuDev *virtio_dev; + }; + + struct fuse_chan { +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 5df124e..af09fa2 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2242,6 +2242,11 @@ void fuse_session_unmount(struct fuse_session *se) + { + } + ++int fuse_lowlevel_is_virtio(struct fuse_session *se) ++{ ++ return se->vu_socket_path != NULL; ++} ++ + #ifdef linux + int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) + { +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 2fa225d..f6b3470 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1755,6 +1755,13 @@ void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + */ + int fuse_req_interrupted(fuse_req_t req); + ++/** ++ * Check if the session is connected via virtio ++ * ++ * @param se session object ++ * @return 1 if the session is a virtio session ++ */ ++int fuse_lowlevel_is_virtio(struct fuse_session *se); + + /* + * Inquiry functions +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index cbef6ff..2ae3c76 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -19,18 +19,78 @@ + + #include + #include ++#include + #include + #include + #include + #include + #include + ++#include "contrib/libvhost-user/libvhost-user.h" ++ ++/* ++ * We pass the dev element into libvhost-user ++ * and then use it to get back to the outer ++ * container for other data. ++ */ ++struct fv_VuDev { ++ VuDev dev; ++ struct fuse_session *se; ++}; ++ + /* From spec */ + struct virtio_fs_config { + char tag[36]; + uint32_t num_queues; + }; + ++/* ++ * Callback from libvhost-user if there's a new fd we're supposed to listen ++ * to, typically a queue kick? ++ */ ++static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb, ++ void *data) ++{ ++ fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); ++} ++ ++/* ++ * Callback from libvhost-user if we're no longer supposed to listen on an fd ++ */ ++static void fv_remove_watch(VuDev *dev, int fd) ++{ ++ fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); ++} ++ ++/* Callback from libvhost-user to panic */ ++static void fv_panic(VuDev *dev, const char *err) ++{ ++ fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err); ++ /* TODO: Allow reconnects?? */ ++ exit(EXIT_FAILURE); ++} ++ ++static bool fv_queue_order(VuDev *dev, int qidx) ++{ ++ return false; ++} ++ ++static const VuDevIface fv_iface = { ++ /* TODO: Add other callbacks */ ++ .queue_is_processed_in_order = fv_queue_order, ++}; ++ ++int virtio_loop(struct fuse_session *se) ++{ ++ fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__); ++ ++ while (1) { ++ /* TODO: Add stuffing */ ++ } ++ ++ fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); ++} ++ + int virtio_session_mount(struct fuse_session *se) + { + struct sockaddr_un un; +@@ -75,5 +135,30 @@ int virtio_session_mount(struct fuse_session *se) + return -1; + } + +- return -1; ++ fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n", ++ __func__); ++ int data_sock = accept(listen_sock, NULL, NULL); ++ if (data_sock == -1) { ++ fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n"); ++ close(listen_sock); ++ return -1; ++ } ++ close(listen_sock); ++ fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n", ++ __func__); ++ ++ /* TODO: Some cleanup/deallocation! */ ++ se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1); ++ if (!se->virtio_dev) { ++ fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__); ++ close(data_sock); ++ return -1; ++ } ++ ++ se->vu_socketfd = data_sock; ++ se->virtio_dev->se = se; ++ vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch, ++ fv_remove_watch, &fv_iface); ++ ++ return 0; + } +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index 8f2edb6..23026d6 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -20,4 +20,6 @@ struct fuse_session; + + int virtio_session_mount(struct fuse_session *se); + ++int virtio_loop(struct fuse_session *se); ++ + #endif +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index fc9b264..037c5d7 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -36,6 +36,7 @@ + */ + + #include "qemu/osdep.h" ++#include "fuse_virtio.h" + #include "fuse_lowlevel.h" + #include + #include +@@ -1395,11 +1396,7 @@ int main(int argc, char *argv[]) + fuse_daemonize(opts.foreground); + + /* Block until ctrl+c or fusermount -u */ +- if (opts.singlethread) { +- ret = fuse_session_loop(se); +- } else { +- ret = fuse_session_loop_mt(se, opts.clone_fd); +- } ++ ret = virtio_loop(se); + + fuse_session_unmount(se); + err_out3: +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Support-remote-posix-locks.patch b/kvm-virtiofsd-Support-remote-posix-locks.patch new file mode 100755 index 0000000000000000000000000000000000000000..e60364a7c656ecea1d10e6639d0705a625da713d --- /dev/null +++ b/kvm-virtiofsd-Support-remote-posix-locks.patch @@ -0,0 +1,355 @@ +From 8e46d0862c4c204f92c08ce2ae961921f270efb5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:03 +0100 +Subject: [PATCH 092/116] virtiofsd: Support remote posix locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-89-dgilbert@redhat.com> +Patchwork-id: 93537 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 088/112] virtiofsd: Support remote posix locks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +Doing posix locks with-in guest kernel are not sufficient if a file/dir +is being shared by multiple guests. So we need the notion of daemon doing +the locks which are visible to rest of the guests. + +Given posix locks are per process, one can not call posix lock API on host, +otherwise bunch of basic posix locks properties are broken. For example, +If two processes (A and B) in guest open the file and take locks on different +sections of file, if one of the processes closes the fd, it will close +fd on virtiofsd and all posix locks on file will go away. This means if +process A closes the fd, then locks of process B will go away too. + +Similar other problems exist too. + +This patch set tries to emulate posix locks while using open file +description locks provided on Linux. + +Daemon provides two options (-o posix_lock, -o no_posix_lock) to enable +or disable posix locking in daemon. By default it is enabled. + +There are few issues though. + +- GETLK() returns pid of process holding lock. As we are emulating locks + using OFD, and these locks are not per process and don't return pid + of process, so GETLK() in guest does not reuturn process pid. + +- As of now only F_SETLK is supported and not F_SETLKW. We can't block + the thread in virtiofsd for arbitrary long duration as there is only + one thread serving the queue. That means unlock request will not make + it to daemon and F_SETLKW will block infinitely and bring virtio-fs + to a halt. This is a solvable problem though and will require significant + changes in virtiofsd and kernel. Left as a TODO item for now. + +Signed-off-by: Vivek Goyal +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0e81414c54161296212f6bc8a1c70526c4a9755a) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 3 + + tools/virtiofsd/passthrough_ll.c | 189 +++++++++++++++++++++++++++++++++++++++ + 2 files changed, 192 insertions(+) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5672024..33749bf 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -156,6 +156,9 @@ void fuse_cmdline_help(void) + " allowed (default: 10)\n" + " -o norace disable racy fallback\n" + " default: false\n" ++ " -o posix_lock|no_posix_lock\n" ++ " enable/disable remote posix lock\n" ++ " default: posix_lock\n" + " -o readdirplus|no_readdirplus\n" + " enable/disable readirplus\n" + " default: readdirplus except with " +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 05b5f89..9414935 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -67,6 +67,12 @@ + #include "passthrough_helpers.h" + #include "seccomp.h" + ++/* Keep track of inode posix locks for each owner. */ ++struct lo_inode_plock { ++ uint64_t lock_owner; ++ int fd; /* fd for OFD locks */ ++}; ++ + struct lo_map_elem { + union { + struct lo_inode *inode; +@@ -95,6 +101,8 @@ struct lo_inode { + struct lo_key key; + uint64_t refcount; /* protected by lo->mutex */ + fuse_ino_t fuse_ino; ++ pthread_mutex_t plock_mutex; ++ GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ + }; + + struct lo_cred { +@@ -114,6 +122,7 @@ struct lo_data { + int norace; + int writeback; + int flock; ++ int posix_lock; + int xattr; + char *source; + double timeout; +@@ -137,6 +146,8 @@ static const struct fuse_opt lo_opts[] = { + { "source=%s", offsetof(struct lo_data, source), 0 }, + { "flock", offsetof(struct lo_data, flock), 1 }, + { "no_flock", offsetof(struct lo_data, flock), 0 }, ++ { "posix_lock", offsetof(struct lo_data, posix_lock), 1 }, ++ { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 }, + { "xattr", offsetof(struct lo_data, xattr), 1 }, + { "no_xattr", offsetof(struct lo_data, xattr), 0 }, + { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, +@@ -485,6 +496,17 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } ++ ++ if (conn->capable & FUSE_CAP_POSIX_LOCKS) { ++ if (lo->posix_lock) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n"); ++ conn->want |= FUSE_CAP_POSIX_LOCKS; ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n"); ++ conn->want &= ~FUSE_CAP_POSIX_LOCKS; ++ } ++ } ++ + if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || + lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); +@@ -772,6 +794,19 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + return p; + } + ++/* value_destroy_func for posix_locks GHashTable */ ++static void posix_locks_value_destroy(gpointer data) ++{ ++ struct lo_inode_plock *plock = data; ++ ++ /* ++ * We had used open() for locks and had only one fd. So ++ * closing this fd should release all OFD locks. ++ */ ++ close(plock->fd); ++ free(plock); ++} ++ + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct fuse_entry_param *e) + { +@@ -825,6 +860,9 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + newfd = -1; + inode->key.ino = e->attr.st_ino; + inode->key.dev = e->attr.st_dev; ++ pthread_mutex_init(&inode->plock_mutex, NULL); ++ inode->posix_locks = g_hash_table_new_full( ++ g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); + + pthread_mutex_lock(&lo->mutex); + inode->fuse_ino = lo_add_inode_mapping(req, inode); +@@ -1160,6 +1198,11 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + if (!inode->refcount) { + lo_map_remove(&lo->ino_map, inode->fuse_ino); + g_hash_table_remove(lo->inodes, &inode->key); ++ if (g_hash_table_size(inode->posix_locks)) { ++ fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n"); ++ } ++ g_hash_table_destroy(inode->posix_locks); ++ pthread_mutex_destroy(&inode->plock_mutex); + pthread_mutex_unlock(&lo->mutex); + close(inode->fd); + free(inode); +@@ -1516,6 +1559,136 @@ out: + } + } + ++/* Should be called with inode->plock_mutex held */ ++static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, ++ struct lo_inode *inode, ++ uint64_t lock_owner, ++ pid_t pid, int *err) ++{ ++ struct lo_inode_plock *plock; ++ char procname[64]; ++ int fd; ++ ++ plock = ++ g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner)); ++ ++ if (plock) { ++ return plock; ++ } ++ ++ plock = malloc(sizeof(struct lo_inode_plock)); ++ if (!plock) { ++ *err = ENOMEM; ++ return NULL; ++ } ++ ++ /* Open another instance of file which can be used for ofd locks. */ ++ sprintf(procname, "%i", inode->fd); ++ ++ /* TODO: What if file is not writable? */ ++ fd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (fd == -1) { ++ *err = errno; ++ free(plock); ++ return NULL; ++ } ++ ++ plock->lock_owner = lock_owner; ++ plock->fd = fd; ++ g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner), ++ plock); ++ return plock; ++} ++ ++static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ struct lo_inode_plock *plock; ++ int ret, saverr = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_getlk(ino=%" PRIu64 ", flags=%d)" ++ " owner=0x%lx, l_type=%d l_start=0x%lx" ++ " l_len=0x%lx\n", ++ ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start, ++ lock->l_len); ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ pthread_mutex_lock(&inode->plock_mutex); ++ plock = ++ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); ++ if (!plock) { ++ pthread_mutex_unlock(&inode->plock_mutex); ++ fuse_reply_err(req, ret); ++ return; ++ } ++ ++ ret = fcntl(plock->fd, F_OFD_GETLK, lock); ++ if (ret == -1) { ++ saverr = errno; ++ } ++ pthread_mutex_unlock(&inode->plock_mutex); ++ ++ if (saverr) { ++ fuse_reply_err(req, saverr); ++ } else { ++ fuse_reply_lock(req, lock); ++ } ++} ++ ++static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, ++ struct flock *lock, int sleep) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode; ++ struct lo_inode_plock *plock; ++ int ret, saverr = 0; ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_setlk(ino=%" PRIu64 ", flags=%d)" ++ " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d" ++ " l_start=0x%lx l_len=0x%lx\n", ++ ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep, ++ lock->l_whence, lock->l_start, lock->l_len); ++ ++ if (sleep) { ++ fuse_reply_err(req, EOPNOTSUPP); ++ return; ++ } ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ pthread_mutex_lock(&inode->plock_mutex); ++ plock = ++ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); ++ ++ if (!plock) { ++ pthread_mutex_unlock(&inode->plock_mutex); ++ fuse_reply_err(req, ret); ++ return; ++ } ++ ++ /* TODO: Is it alright to modify flock? */ ++ lock->l_pid = 0; ++ ret = fcntl(plock->fd, F_OFD_SETLK, lock); ++ if (ret == -1) { ++ saverr = errno; ++ } ++ pthread_mutex_unlock(&inode->plock_mutex); ++ fuse_reply_err(req, saverr); ++} ++ + static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { +@@ -1617,6 +1790,19 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { + int res; + (void)ino; ++ struct lo_inode *inode; ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ /* An fd is going away. Cleanup associated posix locks */ ++ pthread_mutex_lock(&inode->plock_mutex); ++ g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner)); ++ pthread_mutex_unlock(&inode->plock_mutex); ++ + res = close(dup(lo_fi_fd(req, fi))); + fuse_reply_err(req, res == -1 ? errno : 0); + } +@@ -2080,6 +2266,8 @@ static struct fuse_lowlevel_ops lo_oper = { + .releasedir = lo_releasedir, + .fsyncdir = lo_fsyncdir, + .create = lo_create, ++ .getlk = lo_getlk, ++ .setlk = lo_setlk, + .open = lo_open, + .release = lo_release, + .flush = lo_flush, +@@ -2434,6 +2622,7 @@ int main(int argc, char *argv[]) + struct lo_data lo = { + .debug = 0, + .writeback = 0, ++ .posix_lock = 1, + .proc_self_fd = -1, + }; + struct lo_map_elem *root_elem; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Trim-down-imported-files.patch b/kvm-virtiofsd-Trim-down-imported-files.patch new file mode 100755 index 0000000000000000000000000000000000000000..f3f1e851fa27b43aa137b9b38decb7047c705181 --- /dev/null +++ b/kvm-virtiofsd-Trim-down-imported-files.patch @@ -0,0 +1,1582 @@ +From 9d3788b1c2fa5cb4f14e292232a05c6a5217802d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:44 +0100 +Subject: [PATCH 013/116] virtiofsd: Trim down imported files +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-10-dgilbert@redhat.com> +Patchwork-id: 93463 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 009/112] virtiofsd: Trim down imported files +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +There's a lot of the original fuse code we don't need; trim them down. + +Signed-off-by: Dr. David Alan Gilbert +with additional trimming by: +Signed-off-by: Misono Tomohiro +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Xiao Yang +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a3e23f325439a290c504d6bbc48c2e742149ecab) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 71 +--- + tools/virtiofsd/fuse.h | 46 --- + tools/virtiofsd/fuse_common.h | 32 -- + tools/virtiofsd/fuse_i.h | 41 --- + tools/virtiofsd/fuse_log.h | 8 - + tools/virtiofsd/fuse_lowlevel.c | 675 +--------------------------------- + tools/virtiofsd/fuse_lowlevel.h | 28 -- + tools/virtiofsd/fuse_opt.h | 8 - + tools/virtiofsd/helper.c | 143 ------- + tools/virtiofsd/passthrough_helpers.h | 26 -- + tools/virtiofsd/passthrough_ll.c | 1 - + 11 files changed, 8 insertions(+), 1071 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 5ab9b87..aefb7db 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -157,73 +157,6 @@ static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, + return copied; + } + +-#ifdef HAVE_SPLICE +-static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) +-{ +- int splice_flags = 0; +- off_t *srcpos = NULL; +- off_t *dstpos = NULL; +- off_t srcpos_val; +- off_t dstpos_val; +- ssize_t res; +- size_t copied = 0; +- +- if (flags & FUSE_BUF_SPLICE_MOVE) +- splice_flags |= SPLICE_F_MOVE; +- if (flags & FUSE_BUF_SPLICE_NONBLOCK) +- splice_flags |= SPLICE_F_NONBLOCK; +- +- if (src->flags & FUSE_BUF_FD_SEEK) { +- srcpos_val = src->pos + src_off; +- srcpos = &srcpos_val; +- } +- if (dst->flags & FUSE_BUF_FD_SEEK) { +- dstpos_val = dst->pos + dst_off; +- dstpos = &dstpos_val; +- } +- +- while (len) { +- res = splice(src->fd, srcpos, dst->fd, dstpos, len, +- splice_flags); +- if (res == -1) { +- if (copied) +- break; +- +- if (errno != EINVAL || (flags & FUSE_BUF_FORCE_SPLICE)) +- return -errno; +- +- /* Maybe splice is not supported for this combination */ +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, +- len); +- } +- if (res == 0) +- break; +- +- copied += res; +- if (!(src->flags & FUSE_BUF_FD_RETRY) && +- !(dst->flags & FUSE_BUF_FD_RETRY)) { +- break; +- } +- +- len -= res; +- } +- +- return copied; +-} +-#else +-static ssize_t fuse_buf_splice(const struct fuse_buf *dst, size_t dst_off, +- const struct fuse_buf *src, size_t src_off, +- size_t len, enum fuse_buf_copy_flags flags) +-{ +- (void) flags; +- +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); +-} +-#endif +- +- + static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len, enum fuse_buf_copy_flags flags) +@@ -247,10 +180,8 @@ static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + return fuse_buf_write(dst, dst_off, src, src_off, len); + } else if (!dst_is_fd) { + return fuse_buf_read(dst, dst_off, src, src_off, len); +- } else if (flags & FUSE_BUF_NO_SPLICE) { +- return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); + } else { +- return fuse_buf_splice(dst, dst_off, src, src_off, len, flags); ++ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); + } + } + +diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h +index 883f6e5..3202fba 100644 +--- a/tools/virtiofsd/fuse.h ++++ b/tools/virtiofsd/fuse.h +@@ -25,10 +25,6 @@ + #include + #include + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /* ----------------------------------------------------------- * + * Basic FUSE API * + * ----------------------------------------------------------- */ +@@ -979,44 +975,6 @@ int fuse_loop(struct fuse *f); + void fuse_exit(struct fuse *f); + + /** +- * FUSE event loop with multiple threads +- * +- * Requests from the kernel are processed, and the appropriate +- * operations are called. Request are processed in parallel by +- * distributing them between multiple threads. +- * +- * For a description of the return value and the conditions when the +- * event loop exits, refer to the documentation of +- * fuse_session_loop(). +- * +- * Note: using fuse_loop() instead of fuse_loop_mt() means you are running in +- * single-threaded mode, and that you will not have to worry about reentrancy, +- * though you will have to worry about recursive lookups. In single-threaded +- * mode, FUSE will wait for one callback to return before calling another. +- * +- * Enabling multiple threads, by using fuse_loop_mt(), will cause FUSE to make +- * multiple simultaneous calls into the various callback functions given by your +- * fuse_operations record. +- * +- * If you are using multiple threads, you can enjoy all the parallel execution +- * and interactive response benefits of threads, and you get to enjoy all the +- * benefits of race conditions and locking bugs, too. Ensure that any code used +- * in the callback function of fuse_operations is also thread-safe. +- * +- * @param f the FUSE handle +- * @param config loop configuration +- * @return see fuse_session_loop() +- * +- * See also: fuse_loop() +- */ +-#if FUSE_USE_VERSION < 32 +-int fuse_loop_mt_31(struct fuse *f, int clone_fd); +-#define fuse_loop_mt(f, clone_fd) fuse_loop_mt_31(f, clone_fd) +-#else +-int fuse_loop_mt(struct fuse *f, struct fuse_loop_config *config); +-#endif +- +-/** + * Get the current context + * + * The context is only valid for the duration of a filesystem +@@ -1268,8 +1226,4 @@ struct fuse_session *fuse_get_session(struct fuse *f); + */ + int fuse_open_channel(const char *mountpoint, const char *options); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_H_ */ +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index 2d686b2..bf8f8cc 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -28,10 +28,6 @@ + #define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) + #define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /** + * Information about an open file. + * +@@ -100,30 +96,6 @@ struct fuse_file_info { + uint32_t poll_events; + }; + +-/** +- * Configuration parameters passed to fuse_session_loop_mt() and +- * fuse_loop_mt(). +- */ +-struct fuse_loop_config { +- /** +- * whether to use separate device fds for each thread +- * (may increase performance) +- */ +- int clone_fd; +- +- /** +- * The maximum number of available worker threads before they +- * start to get deleted when they become idle. If not +- * specified, the default is 10. +- * +- * Adjusting this has performance implications; a very small number +- * of threads in the pool will cause a lot of thread creation and +- * deletion overhead and performance may suffer. When set to 0, a new +- * thread will be created to service every operation. +- */ +- unsigned int max_idle_threads; +-}; +- + /************************************************************************** + * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' * + **************************************************************************/ +@@ -802,10 +774,6 @@ void fuse_remove_signal_handlers(struct fuse_session *se); + # error only API version 30 or greater is supported + #endif + +-#ifdef __cplusplus +-} +-#endif +- + + /* + * This interface uses 64 bit off_t. +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index d38b630..b39522e 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -9,8 +9,6 @@ + #include "fuse.h" + #include "fuse_lowlevel.h" + +-struct mount_opts; +- + struct fuse_req { + struct fuse_session *se; + uint64_t unique; +@@ -45,7 +43,6 @@ struct fuse_session { + char *mountpoint; + volatile int exited; + int fd; +- struct mount_opts *mo; + int debug; + int deny_others; + struct fuse_lowlevel_ops op; +@@ -58,7 +55,6 @@ struct fuse_session { + struct fuse_req interrupts; + pthread_mutex_t lock; + int got_destroy; +- pthread_key_t pipe_key; + int broken_splice_nonblock; + uint64_t notify_ctr; + struct fuse_notify_req notify_list; +@@ -87,53 +83,16 @@ struct fuse_module { + int ctr; + }; + +-/* ----------------------------------------------------------- * +- * Channel interface (when using -o clone_fd) * +- * ----------------------------------------------------------- */ +- +-/** +- * Obtain counted reference to the channel +- * +- * @param ch the channel +- * @return the channel +- */ +-struct fuse_chan *fuse_chan_get(struct fuse_chan *ch); +- +-/** +- * Drop counted reference to a channel +- * +- * @param ch the channel +- */ +-void fuse_chan_put(struct fuse_chan *ch); +- +-struct mount_opts *parse_mount_opts(struct fuse_args *args); +-void destroy_mount_opts(struct mount_opts *mo); +-void fuse_mount_version(void); +-unsigned get_max_read(struct mount_opts *o); +-void fuse_kern_unmount(const char *mountpoint, int fd); +-int fuse_kern_mount(const char *mountpoint, struct mount_opts *mo); +- + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count); + void fuse_free_req(fuse_req_t req); + +-void cuse_lowlevel_init(fuse_req_t req, fuse_ino_t nodeide, const void *inarg); +- +-int fuse_start_thread(pthread_t *thread_id, void *(*func)(void *), void *arg); +- +-int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, +- struct fuse_chan *ch); + void fuse_session_process_buf_int(struct fuse_session *se, + const struct fuse_buf *buf, struct fuse_chan *ch); + +-struct fuse *fuse_new_31(struct fuse_args *args, const struct fuse_operations *op, +- size_t op_size, void *private_data); +-int fuse_loop_mt_32(struct fuse *f, struct fuse_loop_config *config); +-int fuse_session_loop_mt_32(struct fuse_session *se, struct fuse_loop_config *config); + + #define FUSE_MAX_MAX_PAGES 256 + #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + + /* room needed in buffer to accommodate header */ + #define FUSE_BUFFER_HEADER_SIZE 0x1000 +- +diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h +index 5e112e0..0af700d 100644 +--- a/tools/virtiofsd/fuse_log.h ++++ b/tools/virtiofsd/fuse_log.h +@@ -16,10 +16,6 @@ + + #include + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /** + * Log severity level + * +@@ -75,8 +71,4 @@ void fuse_set_log_func(fuse_log_func_t func); + */ + void fuse_log(enum fuse_log_level level, const char *fmt, ...); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_LOG_H_ */ +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index f2d7038..e6fa247 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -16,7 +16,6 @@ + #include "fuse_kernel.h" + #include "fuse_opt.h" + #include "fuse_misc.h" +-#include "mount_util.h" + + #include + #include +@@ -28,12 +27,6 @@ + #include + #include + +-#ifndef F_LINUX_SPECIFIC_BASE +-#define F_LINUX_SPECIFIC_BASE 1024 +-#endif +-#ifndef F_SETPIPE_SZ +-#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +-#endif + + + #define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) +@@ -137,7 +130,6 @@ void fuse_free_req(fuse_req_t req) + req->u.ni.data = NULL; + list_del_req(req); + ctr = --req->ctr; +- fuse_chan_put(req->ch); + req->ch = NULL; + pthread_mutex_unlock(&se->lock); + if (!ctr) +@@ -184,19 +176,7 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + } + } + +- ssize_t res = writev(ch ? ch->fd : se->fd, +- iov, count); +- int err = errno; +- +- if (res == -1) { +- assert(se != NULL); +- +- /* ENOENT means the operation was interrupted */ +- if (!fuse_session_exited(se) && err != ENOENT) +- perror("fuse: writing device"); +- return -err; +- } +- ++ abort(); /* virtio should have taken it before here */ + return 0; + } + +@@ -480,10 +460,6 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + struct fuse_bufvec *buf, + size_t len) + { +- struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); +- void *mbuf; +- int res; +- + /* Optimize common case */ + if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && + !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { +@@ -496,350 +472,10 @@ static int fuse_send_data_iov_fallback(struct fuse_session *se, + return fuse_send_msg(se, ch, iov, iov_count); + } + +- res = posix_memalign(&mbuf, pagesize, len); +- if (res != 0) +- return res; +- +- mem_buf.buf[0].mem = mbuf; +- res = fuse_buf_copy(&mem_buf, buf, 0); +- if (res < 0) { +- free(mbuf); +- return -res; +- } +- len = res; +- +- iov[iov_count].iov_base = mbuf; +- iov[iov_count].iov_len = len; +- iov_count++; +- res = fuse_send_msg(se, ch, iov, iov_count); +- free(mbuf); +- +- return res; +-} +- +-struct fuse_ll_pipe { +- size_t size; +- int can_grow; +- int pipe[2]; +-}; +- +-static void fuse_ll_pipe_free(struct fuse_ll_pipe *llp) +-{ +- close(llp->pipe[0]); +- close(llp->pipe[1]); +- free(llp); +-} +- +-#ifdef HAVE_SPLICE +-#if !defined(HAVE_PIPE2) || !defined(O_CLOEXEC) +-static int fuse_pipe(int fds[2]) +-{ +- int rv = pipe(fds); +- +- if (rv == -1) +- return rv; +- +- if (fcntl(fds[0], F_SETFL, O_NONBLOCK) == -1 || +- fcntl(fds[1], F_SETFL, O_NONBLOCK) == -1 || +- fcntl(fds[0], F_SETFD, FD_CLOEXEC) == -1 || +- fcntl(fds[1], F_SETFD, FD_CLOEXEC) == -1) { +- close(fds[0]); +- close(fds[1]); +- rv = -1; +- } +- return rv; +-} +-#else +-static int fuse_pipe(int fds[2]) +-{ +- return pipe2(fds, O_CLOEXEC | O_NONBLOCK); +-} +-#endif +- +-static struct fuse_ll_pipe *fuse_ll_get_pipe(struct fuse_session *se) +-{ +- struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); +- if (llp == NULL) { +- int res; +- +- llp = malloc(sizeof(struct fuse_ll_pipe)); +- if (llp == NULL) +- return NULL; +- +- res = fuse_pipe(llp->pipe); +- if (res == -1) { +- free(llp); +- return NULL; +- } +- +- /* +- *the default size is 16 pages on linux +- */ +- llp->size = pagesize * 16; +- llp->can_grow = 1; +- +- pthread_setspecific(se->pipe_key, llp); +- } +- +- return llp; +-} +-#endif +- +-static void fuse_ll_clear_pipe(struct fuse_session *se) +-{ +- struct fuse_ll_pipe *llp = pthread_getspecific(se->pipe_key); +- if (llp) { +- pthread_setspecific(se->pipe_key, NULL); +- fuse_ll_pipe_free(llp); +- } +-} +- +-#if defined(HAVE_SPLICE) && defined(HAVE_VMSPLICE) +-static int read_back(int fd, char *buf, size_t len) +-{ +- int res; +- +- res = read(fd, buf, len); +- if (res == -1) { +- fuse_log(FUSE_LOG_ERR, "fuse: internal error: failed to read back from pipe: %s\n", strerror(errno)); +- return -EIO; +- } +- if (res != len) { +- fuse_log(FUSE_LOG_ERR, "fuse: internal error: short read back from pipe: %i from %zi\n", res, len); +- return -EIO; +- } ++ abort(); /* Will have taken vhost path */ + return 0; + } + +-static int grow_pipe_to_max(int pipefd) +-{ +- int max; +- int res; +- int maxfd; +- char buf[32]; +- +- maxfd = open("/proc/sys/fs/pipe-max-size", O_RDONLY); +- if (maxfd < 0) +- return -errno; +- +- res = read(maxfd, buf, sizeof(buf) - 1); +- if (res < 0) { +- int saved_errno; +- +- saved_errno = errno; +- close(maxfd); +- return -saved_errno; +- } +- close(maxfd); +- buf[res] = '\0'; +- +- max = atoi(buf); +- res = fcntl(pipefd, F_SETPIPE_SZ, max); +- if (res < 0) +- return -errno; +- return max; +-} +- +-static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, +- struct iovec *iov, int iov_count, +- struct fuse_bufvec *buf, unsigned int flags) +-{ +- int res; +- size_t len = fuse_buf_size(buf); +- struct fuse_out_header *out = iov[0].iov_base; +- struct fuse_ll_pipe *llp; +- int splice_flags; +- size_t pipesize; +- size_t total_fd_size; +- size_t idx; +- size_t headerlen; +- struct fuse_bufvec pipe_buf = FUSE_BUFVEC_INIT(len); +- +- if (se->broken_splice_nonblock) +- goto fallback; +- +- if (flags & FUSE_BUF_NO_SPLICE) +- goto fallback; +- +- total_fd_size = 0; +- for (idx = buf->idx; idx < buf->count; idx++) { +- if (buf->buf[idx].flags & FUSE_BUF_IS_FD) { +- total_fd_size = buf->buf[idx].size; +- if (idx == buf->idx) +- total_fd_size -= buf->off; +- } +- } +- if (total_fd_size < 2 * pagesize) +- goto fallback; +- +- if (se->conn.proto_minor < 14 || +- !(se->conn.want & FUSE_CAP_SPLICE_WRITE)) +- goto fallback; +- +- llp = fuse_ll_get_pipe(se); +- if (llp == NULL) +- goto fallback; +- +- +- headerlen = iov_length(iov, iov_count); +- +- out->len = headerlen + len; +- +- /* +- * Heuristic for the required pipe size, does not work if the +- * source contains less than page size fragments +- */ +- pipesize = pagesize * (iov_count + buf->count + 1) + out->len; +- +- if (llp->size < pipesize) { +- if (llp->can_grow) { +- res = fcntl(llp->pipe[0], F_SETPIPE_SZ, pipesize); +- if (res == -1) { +- res = grow_pipe_to_max(llp->pipe[0]); +- if (res > 0) +- llp->size = res; +- llp->can_grow = 0; +- goto fallback; +- } +- llp->size = res; +- } +- if (llp->size < pipesize) +- goto fallback; +- } +- +- +- res = vmsplice(llp->pipe[1], iov, iov_count, SPLICE_F_NONBLOCK); +- if (res == -1) +- goto fallback; +- +- if (res != headerlen) { +- res = -EIO; +- fuse_log(FUSE_LOG_ERR, "fuse: short vmsplice to pipe: %u/%zu\n", res, +- headerlen); +- goto clear_pipe; +- } +- +- pipe_buf.buf[0].flags = FUSE_BUF_IS_FD; +- pipe_buf.buf[0].fd = llp->pipe[1]; +- +- res = fuse_buf_copy(&pipe_buf, buf, +- FUSE_BUF_FORCE_SPLICE | FUSE_BUF_SPLICE_NONBLOCK); +- if (res < 0) { +- if (res == -EAGAIN || res == -EINVAL) { +- /* +- * Should only get EAGAIN on kernels with +- * broken SPLICE_F_NONBLOCK support (<= +- * 2.6.35) where this error or a short read is +- * returned even if the pipe itself is not +- * full +- * +- * EINVAL might mean that splice can't handle +- * this combination of input and output. +- */ +- if (res == -EAGAIN) +- se->broken_splice_nonblock = 1; +- +- pthread_setspecific(se->pipe_key, NULL); +- fuse_ll_pipe_free(llp); +- goto fallback; +- } +- res = -res; +- goto clear_pipe; +- } +- +- if (res != 0 && res < len) { +- struct fuse_bufvec mem_buf = FUSE_BUFVEC_INIT(len); +- void *mbuf; +- size_t now_len = res; +- /* +- * For regular files a short count is either +- * 1) due to EOF, or +- * 2) because of broken SPLICE_F_NONBLOCK (see above) +- * +- * For other inputs it's possible that we overflowed +- * the pipe because of small buffer fragments. +- */ +- +- res = posix_memalign(&mbuf, pagesize, len); +- if (res != 0) +- goto clear_pipe; +- +- mem_buf.buf[0].mem = mbuf; +- mem_buf.off = now_len; +- res = fuse_buf_copy(&mem_buf, buf, 0); +- if (res > 0) { +- char *tmpbuf; +- size_t extra_len = res; +- /* +- * Trickiest case: got more data. Need to get +- * back the data from the pipe and then fall +- * back to regular write. +- */ +- tmpbuf = malloc(headerlen); +- if (tmpbuf == NULL) { +- free(mbuf); +- res = ENOMEM; +- goto clear_pipe; +- } +- res = read_back(llp->pipe[0], tmpbuf, headerlen); +- free(tmpbuf); +- if (res != 0) { +- free(mbuf); +- goto clear_pipe; +- } +- res = read_back(llp->pipe[0], mbuf, now_len); +- if (res != 0) { +- free(mbuf); +- goto clear_pipe; +- } +- len = now_len + extra_len; +- iov[iov_count].iov_base = mbuf; +- iov[iov_count].iov_len = len; +- iov_count++; +- res = fuse_send_msg(se, ch, iov, iov_count); +- free(mbuf); +- return res; +- } +- free(mbuf); +- res = now_len; +- } +- len = res; +- out->len = headerlen + len; +- +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, success, outsize: %i (splice)\n", +- (unsigned long long) out->unique, out->len); +- } +- +- splice_flags = 0; +- if ((flags & FUSE_BUF_SPLICE_MOVE) && +- (se->conn.want & FUSE_CAP_SPLICE_MOVE)) +- splice_flags |= SPLICE_F_MOVE; +- +- res = splice(llp->pipe[0], NULL, ch ? ch->fd : se->fd, +- NULL, out->len, splice_flags); +- if (res == -1) { +- res = -errno; +- perror("fuse: splice from pipe"); +- goto clear_pipe; +- } +- if (res != out->len) { +- res = -EIO; +- fuse_log(FUSE_LOG_ERR, "fuse: short splice from pipe: %u/%u\n", +- res, out->len); +- goto clear_pipe; +- } +- return 0; +- +-clear_pipe: +- fuse_ll_clear_pipe(se); +- return res; +- +-fallback: +- return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); +-} +-#else + static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int iov_count, + struct fuse_bufvec *buf, unsigned int flags) +@@ -849,7 +485,6 @@ static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + + return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); + } +-#endif + + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv, + enum fuse_buf_copy_flags flags) +@@ -1408,16 +1043,11 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + if (bufv.buf[0].size < arg->size) { + fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); + fuse_reply_err(req, EIO); +- goto out; ++ return; + } + bufv.buf[0].size = arg->size; + + se->op.write_buf(req, nodeid, &bufv, arg->offset, &fi); +- +-out: +- /* Need to reset the pipe if ->write_buf() didn't consume all data */ +- if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) +- fuse_ll_clear_pipe(se); + } + + static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) +@@ -2038,17 +1668,6 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + return; + } + +- unsigned max_read_mo = get_max_read(se->mo); +- if (se->conn.max_read != max_read_mo) { +- fuse_log(FUSE_LOG_ERR, "fuse: error: init() and fuse_session_new() " +- "requested different maximum read size (%u vs %u)\n", +- se->conn.max_read, max_read_mo); +- fuse_reply_err(req, EPROTO); +- se->error = -EPROTO; +- fuse_session_exit(se); +- return; +- } +- + if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { + se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; + } +@@ -2364,8 +1983,6 @@ static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, + } + out: + free(rreq); +- if ((ibuf->flags & FUSE_BUF_IS_FD) && bufv.idx < bufv.count) +- fuse_ll_clear_pipe(se); + } + + int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +@@ -2496,7 +2113,6 @@ static struct { + [FUSE_RENAME2] = { do_rename2, "RENAME2" }, + [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, + [FUSE_LSEEK] = { do_lseek, "LSEEK" }, +- [CUSE_INIT] = { cuse_lowlevel_init, "CUSE_INIT" }, + }; + + #define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) +@@ -2509,21 +2125,6 @@ static const char *opname(enum fuse_opcode opcode) + return fuse_ll_ops[opcode].name; + } + +-static int fuse_ll_copy_from_pipe(struct fuse_bufvec *dst, +- struct fuse_bufvec *src) +-{ +- ssize_t res = fuse_buf_copy(dst, src, 0); +- if (res < 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", strerror(-res)); +- return res; +- } +- if ((size_t)res < fuse_buf_size(dst)) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); +- return -1; +- } +- return 0; +-} +- + void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf) + { +@@ -2533,36 +2134,12 @@ void fuse_session_process_buf(struct fuse_session *se, + void fuse_session_process_buf_int(struct fuse_session *se, + const struct fuse_buf *buf, struct fuse_chan *ch) + { +- const size_t write_header_size = sizeof(struct fuse_in_header) + +- sizeof(struct fuse_write_in); +- struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 }; +- struct fuse_bufvec tmpbuf = FUSE_BUFVEC_INIT(write_header_size); + struct fuse_in_header *in; + const void *inarg; + struct fuse_req *req; +- void *mbuf = NULL; + int err; +- int res; +- +- if (buf->flags & FUSE_BUF_IS_FD) { +- if (buf->size < tmpbuf.buf[0].size) +- tmpbuf.buf[0].size = buf->size; + +- mbuf = malloc(tmpbuf.buf[0].size); +- if (mbuf == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate header\n"); +- goto clear_pipe; +- } +- tmpbuf.buf[0].mem = mbuf; +- +- res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); +- if (res < 0) +- goto clear_pipe; +- +- in = mbuf; +- } else { +- in = buf->mem; +- } ++ in = buf->mem; + + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, +@@ -2584,14 +2161,14 @@ void fuse_session_process_buf_int(struct fuse_session *se, + }; + + fuse_send_msg(se, ch, &iov, 1); +- goto clear_pipe; ++ return; + } + + req->unique = in->unique; + req->ctx.uid = in->uid; + req->ctx.gid = in->gid; + req->ctx.pid = in->pid; +- req->ch = ch ? fuse_chan_get(ch) : NULL; ++ req->ch = ch; + + err = EIO; + if (!se->got_init) { +@@ -2627,28 +2204,6 @@ void fuse_session_process_buf_int(struct fuse_session *se, + fuse_reply_err(intr, EAGAIN); + } + +- if ((buf->flags & FUSE_BUF_IS_FD) && write_header_size < buf->size && +- (in->opcode != FUSE_WRITE || !se->op.write_buf) && +- in->opcode != FUSE_NOTIFY_REPLY) { +- void *newmbuf; +- +- err = ENOMEM; +- newmbuf = realloc(mbuf, buf->size); +- if (newmbuf == NULL) +- goto reply_err; +- mbuf = newmbuf; +- +- tmpbuf = FUSE_BUFVEC_INIT(buf->size - write_header_size); +- tmpbuf.buf[0].mem = (char *)mbuf + write_header_size; +- +- res = fuse_ll_copy_from_pipe(&tmpbuf, &bufv); +- err = -res; +- if (res < 0) +- goto reply_err; +- +- in = mbuf; +- } +- + inarg = (void *) &in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) + do_write_buf(req, in->nodeid, inarg, buf); +@@ -2657,16 +2212,10 @@ void fuse_session_process_buf_int(struct fuse_session *se, + else + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + +-out_free: +- free(mbuf); + return; + + reply_err: + fuse_reply_err(req, err); +-clear_pipe: +- if (buf->flags & FUSE_BUF_IS_FD) +- fuse_ll_clear_pipe(se); +- goto out_free; + } + + #define LL_OPTION(n,o,v) \ +@@ -2684,7 +2233,6 @@ void fuse_lowlevel_version(void) + { + printf("using FUSE kernel interface version %i.%i\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); +- fuse_mount_version(); + } + + void fuse_lowlevel_help(void) +@@ -2692,204 +2240,29 @@ void fuse_lowlevel_help(void) + /* These are not all options, but the ones that are + potentially of interest to an end-user */ + printf( +-" -o allow_other allow access by all users\n" + " -o allow_root allow access by root\n" +-" -o auto_unmount auto unmount on process termination\n"); ++); + } + + void fuse_session_destroy(struct fuse_session *se) + { +- struct fuse_ll_pipe *llp; +- + if (se->got_init && !se->got_destroy) { + if (se->op.destroy) + se->op.destroy(se->userdata); + } +- llp = pthread_getspecific(se->pipe_key); +- if (llp != NULL) +- fuse_ll_pipe_free(llp); +- pthread_key_delete(se->pipe_key); + pthread_mutex_destroy(&se->lock); + free(se->cuse_data); + if (se->fd != -1) + close(se->fd); +- destroy_mount_opts(se->mo); + free(se); + } + + +-static void fuse_ll_pipe_destructor(void *data) +-{ +- struct fuse_ll_pipe *llp = data; +- fuse_ll_pipe_free(llp); +-} +- +-int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf) +-{ +- return fuse_session_receive_buf_int(se, buf, NULL); +-} +- +-int fuse_session_receive_buf_int(struct fuse_session *se, struct fuse_buf *buf, +- struct fuse_chan *ch) +-{ +- int err; +- ssize_t res; +-#ifdef HAVE_SPLICE +- size_t bufsize = se->bufsize; +- struct fuse_ll_pipe *llp; +- struct fuse_buf tmpbuf; +- +- if (se->conn.proto_minor < 14 || !(se->conn.want & FUSE_CAP_SPLICE_READ)) +- goto fallback; +- +- llp = fuse_ll_get_pipe(se); +- if (llp == NULL) +- goto fallback; +- +- if (llp->size < bufsize) { +- if (llp->can_grow) { +- res = fcntl(llp->pipe[0], F_SETPIPE_SZ, bufsize); +- if (res == -1) { +- llp->can_grow = 0; +- res = grow_pipe_to_max(llp->pipe[0]); +- if (res > 0) +- llp->size = res; +- goto fallback; +- } +- llp->size = res; +- } +- if (llp->size < bufsize) +- goto fallback; +- } +- +- res = splice(ch ? ch->fd : se->fd, +- NULL, llp->pipe[1], NULL, bufsize, 0); +- err = errno; +- +- if (fuse_session_exited(se)) +- return 0; +- +- if (res == -1) { +- if (err == ENODEV) { +- /* Filesystem was unmounted, or connection was aborted +- via /sys/fs/fuse/connections */ +- fuse_session_exit(se); +- return 0; +- } +- if (err != EINTR && err != EAGAIN) +- perror("fuse: splice from device"); +- return -err; +- } +- +- if (res < sizeof(struct fuse_in_header)) { +- fuse_log(FUSE_LOG_ERR, "short splice from fuse device\n"); +- return -EIO; +- } +- +- tmpbuf = (struct fuse_buf) { +- .size = res, +- .flags = FUSE_BUF_IS_FD, +- .fd = llp->pipe[0], +- }; +- +- /* +- * Don't bother with zero copy for small requests. +- * fuse_loop_mt() needs to check for FORGET so this more than +- * just an optimization. +- */ +- if (res < sizeof(struct fuse_in_header) + +- sizeof(struct fuse_write_in) + pagesize) { +- struct fuse_bufvec src = { .buf[0] = tmpbuf, .count = 1 }; +- struct fuse_bufvec dst = { .count = 1 }; +- +- if (!buf->mem) { +- buf->mem = malloc(se->bufsize); +- if (!buf->mem) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: failed to allocate read buffer\n"); +- return -ENOMEM; +- } +- } +- buf->size = se->bufsize; +- buf->flags = 0; +- dst.buf[0] = *buf; +- +- res = fuse_buf_copy(&dst, &src, 0); +- if (res < 0) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: %s\n", +- strerror(-res)); +- fuse_ll_clear_pipe(se); +- return res; +- } +- if (res < tmpbuf.size) { +- fuse_log(FUSE_LOG_ERR, "fuse: copy from pipe: short read\n"); +- fuse_ll_clear_pipe(se); +- return -EIO; +- } +- assert(res == tmpbuf.size); +- +- } else { +- /* Don't overwrite buf->mem, as that would cause a leak */ +- buf->fd = tmpbuf.fd; +- buf->flags = tmpbuf.flags; +- } +- buf->size = tmpbuf.size; +- +- return res; +- +-fallback: +-#endif +- if (!buf->mem) { +- buf->mem = malloc(se->bufsize); +- if (!buf->mem) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: failed to allocate read buffer\n"); +- return -ENOMEM; +- } +- } +- +-restart: +- res = read(ch ? ch->fd : se->fd, buf->mem, se->bufsize); +- err = errno; +- +- if (fuse_session_exited(se)) +- return 0; +- if (res == -1) { +- /* ENOENT means the operation was interrupted, it's safe +- to restart */ +- if (err == ENOENT) +- goto restart; +- +- if (err == ENODEV) { +- /* Filesystem was unmounted, or connection was aborted +- via /sys/fs/fuse/connections */ +- fuse_session_exit(se); +- return 0; +- } +- /* Errors occurring during normal operation: EINTR (read +- interrupted), EAGAIN (nonblocking I/O), ENODEV (filesystem +- umounted) */ +- if (err != EINTR && err != EAGAIN) +- perror("fuse: reading device"); +- return -err; +- } +- if ((size_t) res < sizeof(struct fuse_in_header)) { +- fuse_log(FUSE_LOG_ERR, "short read on fuse device\n"); +- return -EIO; +- } +- +- buf->size = res; +- +- return res; +-} +- + struct fuse_session *fuse_session_new(struct fuse_args *args, + const struct fuse_lowlevel_ops *op, + size_t op_size, void *userdata) + { +- int err; + struct fuse_session *se; +- struct mount_opts *mo; + + if (sizeof(struct fuse_lowlevel_ops) < op_size) { + fuse_log(FUSE_LOG_ERR, "fuse: warning: library too old, some operations may not work\n"); +@@ -2913,20 +2286,6 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + /* Parse options */ + if(fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) + goto out2; +- if(se->deny_others) { +- /* Allowing access only by root is done by instructing +- * kernel to allow access by everyone, and then restricting +- * access to root and mountpoint owner in libfuse. +- */ +- // We may be adding the option a second time, but +- // that doesn't hurt. +- if(fuse_opt_add_arg(args, "-oallow_other") == -1) +- goto out2; +- } +- mo = parse_mount_opts(args); +- if (mo == NULL) +- goto out3; +- + if(args->argc == 1 && + args->argv[0][0] == '-') { + fuse_log(FUSE_LOG_ERR, "fuse: warning: argv[0] looks like an option, but " +@@ -2940,9 +2299,6 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out4; + } + +- if (se->debug) +- fuse_log(FUSE_LOG_DEBUG, "FUSE library version: %s\n", PACKAGE_VERSION); +- + se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + + FUSE_BUFFER_HEADER_SIZE; + +@@ -2952,26 +2308,14 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + se->notify_ctr = 1; + fuse_mutex_init(&se->lock); + +- err = pthread_key_create(&se->pipe_key, fuse_ll_pipe_destructor); +- if (err) { +- fuse_log(FUSE_LOG_ERR, "fuse: failed to create thread specific key: %s\n", +- strerror(err)); +- goto out5; +- } +- + memcpy(&se->op, op, op_size); + se->owner = getuid(); + se->userdata = userdata; + +- se->mo = mo; + return se; + +-out5: +- pthread_mutex_destroy(&se->lock); + out4: + fuse_opt_free_args(args); +-out3: +- free(mo); + out2: + free(se); + out1: +@@ -3035,11 +2379,6 @@ int fuse_session_fd(struct fuse_session *se) + + void fuse_session_unmount(struct fuse_session *se) + { +- if (se->mountpoint != NULL) { +- fuse_kern_unmount(se->mountpoint, se->fd); +- free(se->mountpoint); +- se->mountpoint = NULL; +- } + } + + #ifdef linux +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 18c6363..6b1adfc 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -31,10 +31,6 @@ + #include + #include + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /* ----------------------------------------------------------- * + * Miscellaneous definitions * + * ----------------------------------------------------------- */ +@@ -1863,14 +1859,12 @@ void fuse_cmdline_help(void); + * ----------------------------------------------------------- */ + + struct fuse_cmdline_opts { +- int singlethread; + int foreground; + int debug; + int nodefault_subtype; + char *mountpoint; + int show_version; + int show_help; +- int clone_fd; + unsigned int max_idle_threads; + }; + +@@ -1962,24 +1956,6 @@ int fuse_session_mount(struct fuse_session *se, const char *mountpoint); + int fuse_session_loop(struct fuse_session *se); + + /** +- * Enter a multi-threaded event loop. +- * +- * For a description of the return value and the conditions when the +- * event loop exits, refer to the documentation of +- * fuse_session_loop(). +- * +- * @param se the session +- * @param config session loop configuration +- * @return see fuse_session_loop() +- */ +-#if FUSE_USE_VERSION < 32 +-int fuse_session_loop_mt_31(struct fuse_session *se, int clone_fd); +-#define fuse_session_loop_mt(se, clone_fd) fuse_session_loop_mt_31(se, clone_fd) +-#else +-int fuse_session_loop_mt(struct fuse_session *se, struct fuse_loop_config *config); +-#endif +- +-/** + * Flag a session as terminated. + * + * This function is invoked by the POSIX signal handlers, when +@@ -2082,8 +2058,4 @@ void fuse_session_process_buf(struct fuse_session *se, + */ + int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_LOWLEVEL_H_ */ +diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h +index d8573e7..6910255 100644 +--- a/tools/virtiofsd/fuse_opt.h ++++ b/tools/virtiofsd/fuse_opt.h +@@ -14,10 +14,6 @@ + * This file defines the option parsing interface of FUSE + */ + +-#ifdef __cplusplus +-extern "C" { +-#endif +- + /** + * Option description + * +@@ -264,8 +260,4 @@ void fuse_opt_free_args(struct fuse_args *args); + */ + int fuse_opt_match(const struct fuse_opt opts[], const char *opt); + +-#ifdef __cplusplus +-} +-#endif +- + #endif /* FUSE_OPT_H_ */ +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 64ff7ad..5a2e64c 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -41,14 +41,10 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), + FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("-f", foreground), +- FUSE_HELPER_OPT("-s", singlethread), + FUSE_HELPER_OPT("fsname=", nodefault_subtype), + FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), +-#ifndef __FreeBSD__ + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), +-#endif +- FUSE_HELPER_OPT("clone_fd", clone_fd), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_OPT_END + }; +@@ -132,9 +128,6 @@ void fuse_cmdline_help(void) + " -V --version print version\n" + " -d -o debug enable debug output (implies -f)\n" + " -f foreground operation\n" +- " -s disable multi-threaded operation\n" +- " -o clone_fd use separate fuse device fd for each thread\n" +- " (may improve performance)\n" + " -o max_idle_threads the maximum number of idle worker threads\n" + " allowed (default: 10)\n"); + } +@@ -171,34 +164,6 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, + } + } + +-/* Under FreeBSD, there is no subtype option so this +- function actually sets the fsname */ +-static int add_default_subtype(const char *progname, struct fuse_args *args) +-{ +- int res; +- char *subtype_opt; +- +- const char *basename = strrchr(progname, '/'); +- if (basename == NULL) +- basename = progname; +- else if (basename[1] != '\0') +- basename++; +- +- subtype_opt = (char *) malloc(strlen(basename) + 64); +- if (subtype_opt == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); +- return -1; +- } +-#ifdef __FreeBSD__ +- sprintf(subtype_opt, "-ofsname=%s", basename); +-#else +- sprintf(subtype_opt, "-osubtype=%s", basename); +-#endif +- res = fuse_opt_add_arg(args, subtype_opt); +- free(subtype_opt); +- return res; +-} +- + int fuse_parse_cmdline(struct fuse_args *args, + struct fuse_cmdline_opts *opts) + { +@@ -210,14 +175,6 @@ int fuse_parse_cmdline(struct fuse_args *args, + fuse_helper_opt_proc) == -1) + return -1; + +- /* *Linux*: if neither -o subtype nor -o fsname are specified, +- set subtype to program's basename. +- *FreeBSD*: if fsname is not specified, set to program's +- basename. */ +- if (!opts->nodefault_subtype) +- if (add_default_subtype(args->argv[0], args) == -1) +- return -1; +- + return 0; + } + +@@ -276,88 +233,6 @@ int fuse_daemonize(int foreground) + return 0; + } + +-int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, +- size_t op_size, void *user_data) +-{ +- struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +- struct fuse *fuse; +- struct fuse_cmdline_opts opts; +- int res; +- +- if (fuse_parse_cmdline(&args, &opts) != 0) +- return 1; +- +- if (opts.show_version) { +- printf("FUSE library version %s\n", PACKAGE_VERSION); +- fuse_lowlevel_version(); +- res = 0; +- goto out1; +- } +- +- if (opts.show_help) { +- if(args.argv[0][0] != '\0') +- printf("usage: %s [options] \n\n", +- args.argv[0]); +- printf("FUSE options:\n"); +- fuse_cmdline_help(); +- fuse_lib_help(&args); +- res = 0; +- goto out1; +- } +- +- if (!opts.show_help && +- !opts.mountpoint) { +- fuse_log(FUSE_LOG_ERR, "error: no mountpoint specified\n"); +- res = 2; +- goto out1; +- } +- +- +- fuse = fuse_new_31(&args, op, op_size, user_data); +- if (fuse == NULL) { +- res = 3; +- goto out1; +- } +- +- if (fuse_mount(fuse,opts.mountpoint) != 0) { +- res = 4; +- goto out2; +- } +- +- if (fuse_daemonize(opts.foreground) != 0) { +- res = 5; +- goto out3; +- } +- +- struct fuse_session *se = fuse_get_session(fuse); +- if (fuse_set_signal_handlers(se) != 0) { +- res = 6; +- goto out3; +- } +- +- if (opts.singlethread) +- res = fuse_loop(fuse); +- else { +- struct fuse_loop_config loop_config; +- loop_config.clone_fd = opts.clone_fd; +- loop_config.max_idle_threads = opts.max_idle_threads; +- res = fuse_loop_mt_32(fuse, &loop_config); +- } +- if (res) +- res = 7; +- +- fuse_remove_signal_handlers(se); +-out3: +- fuse_unmount(fuse); +-out2: +- fuse_destroy(fuse); +-out1: +- free(opts.mountpoint); +- fuse_opt_free_args(&args); +- return res; +-} +- +- + void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, + struct fuse_conn_info *conn) + { +@@ -420,21 +295,3 @@ struct fuse_conn_info_opts* fuse_parse_conn_info_opts(struct fuse_args *args) + } + return opts; + } +- +-int fuse_open_channel(const char *mountpoint, const char* options) +-{ +- struct mount_opts *opts = NULL; +- int fd = -1; +- const char *argv[] = { "", "-o", options }; +- int argc = sizeof(argv) / sizeof(argv[0]); +- struct fuse_args args = FUSE_ARGS_INIT(argc, (char**) argv); +- +- opts = parse_mount_opts(&args); +- if (opts == NULL) +- return -1; +- +- fd = fuse_kern_mount(mountpoint, opts); +- destroy_mount_opts(opts); +- +- return fd; +-} +diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h +index 6b77c33..7c5f561 100644 +--- a/tools/virtiofsd/passthrough_helpers.h ++++ b/tools/virtiofsd/passthrough_helpers.h +@@ -42,32 +42,6 @@ static int mknod_wrapper(int dirfd, const char *path, const char *link, + res = symlinkat(link, dirfd, path); + } else if (S_ISFIFO(mode)) { + res = mkfifoat(dirfd, path, mode); +-#ifdef __FreeBSD__ +- } else if (S_ISSOCK(mode)) { +- struct sockaddr_un su; +- int fd; +- +- if (strlen(path) >= sizeof(su.sun_path)) { +- errno = ENAMETOOLONG; +- return -1; +- } +- fd = socket(AF_UNIX, SOCK_STREAM, 0); +- if (fd >= 0) { +- /* +- * We must bind the socket to the underlying file +- * system to create the socket file, even though +- * we'll never listen on this socket. +- */ +- su.sun_family = AF_UNIX; +- strncpy(su.sun_path, path, sizeof(su.sun_path)); +- res = bindat(dirfd, fd, (struct sockaddr*)&su, +- sizeof(su)); +- if (res == 0) +- close(fd); +- } else { +- res = -1; +- } +-#endif + } else { + res = mknodat(dirfd, path, mode, rdev); + } +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e1a6056..e5f7115 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1240,7 +1240,6 @@ int main(int argc, char *argv[]) + ret = 0; + goto err_out1; + } else if (opts.show_version) { +- printf("FUSE library version %s\n", fuse_pkgversion()); + fuse_lowlevel_version(); + ret = 0; + goto err_out1; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Trim-out-compatibility-code.patch b/kvm-virtiofsd-Trim-out-compatibility-code.patch new file mode 100755 index 0000000000000000000000000000000000000000..411af77d7a67eb2153d208e2c58992c5a31cd85a --- /dev/null +++ b/kvm-virtiofsd-Trim-out-compatibility-code.patch @@ -0,0 +1,545 @@ +From ff16b837e402de773581f77ca188f8806c0b500f Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:51 +0100 +Subject: [PATCH 020/116] virtiofsd: Trim out compatibility code +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-17-dgilbert@redhat.com> +Patchwork-id: 93468 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 016/112] virtiofsd: Trim out compatibility code +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +virtiofsd only supports major=7, minor>=31; trim out a lot of +old compatibility code. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 72c42e2d65510e073cf78fdc924d121c77fa0080) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 330 +++++++++++++++------------------------- + 1 file changed, 119 insertions(+), 211 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 07fb8a6..514d79c 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -387,16 +387,7 @@ static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f) + int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) + { + struct fuse_entry_out arg; +- size_t size = req->se->conn.proto_minor < 9 ? FUSE_COMPAT_ENTRY_OUT_SIZE : +- sizeof(arg); +- +- /* +- * before ABI 7.4 e->ino == 0 was invalid, only ENOENT meant +- * negative entry +- */ +- if (!e->ino && req->se->conn.proto_minor < 4) { +- return fuse_reply_err(req, ENOENT); +- } ++ size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + fill_entry(&arg, e); +@@ -407,9 +398,7 @@ int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + const struct fuse_file_info *f) + { + char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; +- size_t entrysize = req->se->conn.proto_minor < 9 ? +- FUSE_COMPAT_ENTRY_OUT_SIZE : +- sizeof(struct fuse_entry_out); ++ size_t entrysize = sizeof(struct fuse_entry_out); + struct fuse_entry_out *earg = (struct fuse_entry_out *)buf; + struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize); + +@@ -423,8 +412,7 @@ int fuse_reply_attr(fuse_req_t req, const struct stat *attr, + double attr_timeout) + { + struct fuse_attr_out arg; +- size_t size = +- req->se->conn.proto_minor < 9 ? FUSE_COMPAT_ATTR_OUT_SIZE : sizeof(arg); ++ size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + arg.attr_valid = calc_timeout_sec(attr_timeout); +@@ -519,8 +507,7 @@ int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) + int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) + { + struct fuse_statfs_out arg; +- size_t size = +- req->se->conn.proto_minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(arg); ++ size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + convert_statfs(stbuf, &arg.st); +@@ -604,45 +591,31 @@ int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, + iov[count].iov_len = sizeof(arg); + count++; + +- if (req->se->conn.proto_minor < 16) { +- if (in_count) { +- iov[count].iov_base = (void *)in_iov; +- iov[count].iov_len = sizeof(in_iov[0]) * in_count; +- count++; +- } ++ /* Can't handle non-compat 64bit ioctls on 32bit */ ++ if (sizeof(void *) == 4 && req->ioctl_64bit) { ++ res = fuse_reply_err(req, EINVAL); ++ goto out; ++ } + +- if (out_count) { +- iov[count].iov_base = (void *)out_iov; +- iov[count].iov_len = sizeof(out_iov[0]) * out_count; +- count++; ++ if (in_count) { ++ in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); ++ if (!in_fiov) { ++ goto enomem; + } +- } else { +- /* Can't handle non-compat 64bit ioctls on 32bit */ +- if (sizeof(void *) == 4 && req->ioctl_64bit) { +- res = fuse_reply_err(req, EINVAL); +- goto out; +- } +- +- if (in_count) { +- in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); +- if (!in_fiov) { +- goto enomem; +- } + +- iov[count].iov_base = (void *)in_fiov; +- iov[count].iov_len = sizeof(in_fiov[0]) * in_count; +- count++; ++ iov[count].iov_base = (void *)in_fiov; ++ iov[count].iov_len = sizeof(in_fiov[0]) * in_count; ++ count++; ++ } ++ if (out_count) { ++ out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); ++ if (!out_fiov) { ++ goto enomem; + } +- if (out_count) { +- out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); +- if (!out_fiov) { +- goto enomem; +- } + +- iov[count].iov_base = (void *)out_fiov; +- iov[count].iov_len = sizeof(out_fiov[0]) * out_count; +- count++; +- } ++ iov[count].iov_base = (void *)out_fiov; ++ iov[count].iov_len = sizeof(out_fiov[0]) * out_count; ++ count++; + } + + res = send_reply_iov(req, 0, iov, count); +@@ -784,14 +757,12 @@ static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + struct fuse_file_info *fip = NULL; + struct fuse_file_info fi; + +- if (req->se->conn.proto_minor >= 9) { +- struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; ++ struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; + +- if (arg->getattr_flags & FUSE_GETATTR_FH) { +- memset(&fi, 0, sizeof(fi)); +- fi.fh = arg->fh; +- fip = &fi; +- } ++ if (arg->getattr_flags & FUSE_GETATTR_FH) { ++ memset(&fi, 0, sizeof(fi)); ++ fi.fh = arg->fh; ++ fip = &fi; + } + + if (req->se->op.getattr) { +@@ -856,11 +827,7 @@ static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + struct fuse_mknod_in *arg = (struct fuse_mknod_in *)inarg; + char *name = PARAM(arg); + +- if (req->se->conn.proto_minor >= 12) { +- req->ctx.umask = arg->umask; +- } else { +- name = (char *)inarg + FUSE_COMPAT_MKNOD_IN_SIZE; +- } ++ req->ctx.umask = arg->umask; + + if (req->se->op.mknod) { + req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); +@@ -873,9 +840,7 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + { + struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *)inarg; + +- if (req->se->conn.proto_minor >= 12) { +- req->ctx.umask = arg->umask; +- } ++ req->ctx.umask = arg->umask; + + if (req->se->op.mkdir) { + req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); +@@ -967,11 +932,7 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + +- if (req->se->conn.proto_minor >= 12) { +- req->ctx.umask = arg->umask; +- } else { +- name = (char *)inarg + sizeof(struct fuse_open_in); +- } ++ req->ctx.umask = arg->umask; + + req->se->op.create(req, nodeid, name, arg->mode, &fi); + } else { +@@ -1003,10 +964,8 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 9) { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- } ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; + req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); +@@ -1023,13 +982,9 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + fi.fh = arg->fh; + fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; + +- if (req->se->conn.proto_minor < 9) { +- param = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- param = PARAM(arg); +- } ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ param = PARAM(arg); + + if (req->se->op.write) { + req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); +@@ -1053,21 +1008,14 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + fi.fh = arg->fh; + fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; + +- if (se->conn.proto_minor < 9) { +- bufv.buf[0].mem = ((char *)arg) + FUSE_COMPAT_WRITE_IN_SIZE; +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + FUSE_COMPAT_WRITE_IN_SIZE; +- assert(!(bufv.buf[0].flags & FUSE_BUF_IS_FD)); +- } else { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- bufv.buf[0].mem = PARAM(arg); +- } +- +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; ++ if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { ++ bufv.buf[0].mem = PARAM(arg); + } ++ ++ bufv.buf[0].size -= ++ sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); + if (bufv.buf[0].size < arg->size) { + fuse_log(FUSE_LOG_ERR, "fuse: do_write_buf: buffer size too small\n"); + fuse_reply_err(req, EIO); +@@ -1086,9 +1034,7 @@ static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.flush = 1; +- if (req->se->conn.proto_minor >= 7) { +- fi.lock_owner = arg->lock_owner; +- } ++ fi.lock_owner = arg->lock_owner; + + if (req->se->op.flush) { + req->se->op.flush(req, nodeid, &fi); +@@ -1105,10 +1051,8 @@ static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; +- if (req->se->conn.proto_minor >= 8) { +- fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; +- fi.lock_owner = arg->lock_owner; +- } ++ fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; ++ fi.lock_owner = arg->lock_owner; + if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { + fi.flock_release = 1; + fi.lock_owner = arg->lock_owner; +@@ -1477,8 +1421,7 @@ static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +- if (sizeof(void *) == 4 && req->se->conn.proto_minor >= 16 && +- !(flags & FUSE_IOCTL_32BIT)) { ++ if (sizeof(void *) == 4 && !(flags & FUSE_IOCTL_32BIT)) { + req->ioctl_64bit = 1; + } + +@@ -1603,7 +1546,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + outarg.major = FUSE_KERNEL_VERSION; + outarg.minor = FUSE_KERNEL_MINOR_VERSION; + +- if (arg->major < 7) { ++ if (arg->major < 7 || (arg->major == 7 && arg->minor < 31)) { + fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", + arg->major, arg->minor); + fuse_reply_err(req, EPROTO); +@@ -1616,81 +1559,71 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + return; + } + +- if (arg->minor >= 6) { +- if (arg->max_readahead < se->conn.max_readahead) { +- se->conn.max_readahead = arg->max_readahead; +- } +- if (arg->flags & FUSE_ASYNC_READ) { +- se->conn.capable |= FUSE_CAP_ASYNC_READ; +- } +- if (arg->flags & FUSE_POSIX_LOCKS) { +- se->conn.capable |= FUSE_CAP_POSIX_LOCKS; +- } +- if (arg->flags & FUSE_ATOMIC_O_TRUNC) { +- se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; +- } +- if (arg->flags & FUSE_EXPORT_SUPPORT) { +- se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; +- } +- if (arg->flags & FUSE_DONT_MASK) { +- se->conn.capable |= FUSE_CAP_DONT_MASK; +- } +- if (arg->flags & FUSE_FLOCK_LOCKS) { +- se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; +- } +- if (arg->flags & FUSE_AUTO_INVAL_DATA) { +- se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; +- } +- if (arg->flags & FUSE_DO_READDIRPLUS) { +- se->conn.capable |= FUSE_CAP_READDIRPLUS; +- } +- if (arg->flags & FUSE_READDIRPLUS_AUTO) { +- se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; +- } +- if (arg->flags & FUSE_ASYNC_DIO) { +- se->conn.capable |= FUSE_CAP_ASYNC_DIO; +- } +- if (arg->flags & FUSE_WRITEBACK_CACHE) { +- se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; +- } +- if (arg->flags & FUSE_NO_OPEN_SUPPORT) { +- se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; +- } +- if (arg->flags & FUSE_PARALLEL_DIROPS) { +- se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; +- } +- if (arg->flags & FUSE_POSIX_ACL) { +- se->conn.capable |= FUSE_CAP_POSIX_ACL; +- } +- if (arg->flags & FUSE_HANDLE_KILLPRIV) { +- se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; +- } +- if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { +- se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; +- } +- if (!(arg->flags & FUSE_MAX_PAGES)) { +- size_t max_bufsize = +- FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + +- FUSE_BUFFER_HEADER_SIZE; +- if (bufsize > max_bufsize) { +- bufsize = max_bufsize; +- } ++ if (arg->max_readahead < se->conn.max_readahead) { ++ se->conn.max_readahead = arg->max_readahead; ++ } ++ if (arg->flags & FUSE_ASYNC_READ) { ++ se->conn.capable |= FUSE_CAP_ASYNC_READ; ++ } ++ if (arg->flags & FUSE_POSIX_LOCKS) { ++ se->conn.capable |= FUSE_CAP_POSIX_LOCKS; ++ } ++ if (arg->flags & FUSE_ATOMIC_O_TRUNC) { ++ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; ++ } ++ if (arg->flags & FUSE_EXPORT_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; ++ } ++ if (arg->flags & FUSE_DONT_MASK) { ++ se->conn.capable |= FUSE_CAP_DONT_MASK; ++ } ++ if (arg->flags & FUSE_FLOCK_LOCKS) { ++ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; ++ } ++ if (arg->flags & FUSE_AUTO_INVAL_DATA) { ++ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; ++ } ++ if (arg->flags & FUSE_DO_READDIRPLUS) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS; ++ } ++ if (arg->flags & FUSE_READDIRPLUS_AUTO) { ++ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; ++ } ++ if (arg->flags & FUSE_ASYNC_DIO) { ++ se->conn.capable |= FUSE_CAP_ASYNC_DIO; ++ } ++ if (arg->flags & FUSE_WRITEBACK_CACHE) { ++ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; ++ } ++ if (arg->flags & FUSE_NO_OPEN_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; ++ } ++ if (arg->flags & FUSE_PARALLEL_DIROPS) { ++ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; ++ } ++ if (arg->flags & FUSE_POSIX_ACL) { ++ se->conn.capable |= FUSE_CAP_POSIX_ACL; ++ } ++ if (arg->flags & FUSE_HANDLE_KILLPRIV) { ++ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; ++ } ++ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { ++ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; ++ } ++ if (!(arg->flags & FUSE_MAX_PAGES)) { ++ size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + ++ FUSE_BUFFER_HEADER_SIZE; ++ if (bufsize > max_bufsize) { ++ bufsize = max_bufsize; + } +- } else { +- se->conn.max_readahead = 0; + } +- +- if (se->conn.proto_minor >= 14) { + #ifdef HAVE_SPLICE + #ifdef HAVE_VMSPLICE +- se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; ++ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; + #endif +- se->conn.capable |= FUSE_CAP_SPLICE_READ; ++ se->conn.capable |= FUSE_CAP_SPLICE_READ; + #endif +- } +- if (se->conn.proto_minor >= 18) { +- se->conn.capable |= FUSE_CAP_IOCTL_DIR; +- } ++ se->conn.capable |= FUSE_CAP_IOCTL_DIR; + + /* + * Default settings for modern filesystems. +@@ -1797,24 +1730,20 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + outarg.max_readahead = se->conn.max_readahead; + outarg.max_write = se->conn.max_write; +- if (se->conn.proto_minor >= 13) { +- if (se->conn.max_background >= (1 << 16)) { +- se->conn.max_background = (1 << 16) - 1; +- } +- if (se->conn.congestion_threshold > se->conn.max_background) { +- se->conn.congestion_threshold = se->conn.max_background; +- } +- if (!se->conn.congestion_threshold) { +- se->conn.congestion_threshold = se->conn.max_background * 3 / 4; +- } +- +- outarg.max_background = se->conn.max_background; +- outarg.congestion_threshold = se->conn.congestion_threshold; ++ if (se->conn.max_background >= (1 << 16)) { ++ se->conn.max_background = (1 << 16) - 1; ++ } ++ if (se->conn.congestion_threshold > se->conn.max_background) { ++ se->conn.congestion_threshold = se->conn.max_background; + } +- if (se->conn.proto_minor >= 23) { +- outarg.time_gran = se->conn.time_gran; ++ if (!se->conn.congestion_threshold) { ++ se->conn.congestion_threshold = se->conn.max_background * 3 / 4; + } + ++ outarg.max_background = se->conn.max_background; ++ outarg.congestion_threshold = se->conn.congestion_threshold; ++ outarg.time_gran = se->conn.time_gran; ++ + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, + outarg.minor); +@@ -1828,11 +1757,6 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + outarg.congestion_threshold); + fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); + } +- if (arg->minor < 5) { +- outargsize = FUSE_COMPAT_INIT_OUT_SIZE; +- } else if (arg->minor < 23) { +- outargsize = FUSE_COMPAT_22_INIT_OUT_SIZE; +- } + + send_reply_ok(req, &outarg, outargsize); + } +@@ -1896,10 +1820,6 @@ int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + return -EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { +- return -ENOSYS; +- } +- + outarg.ino = ino; + outarg.off = off; + outarg.len = len; +@@ -1920,10 +1840,6 @@ int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + return -EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 12) { +- return -ENOSYS; +- } +- + outarg.parent = parent; + outarg.namelen = namelen; + outarg.padding = 0; +@@ -1947,10 +1863,6 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + return -EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 18) { +- return -ENOSYS; +- } +- + outarg.parent = parent; + outarg.child = child; + outarg.namelen = namelen; +@@ -1977,10 +1889,6 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + return -EINVAL; + } + +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { +- return -ENOSYS; +- } +- + out.unique = 0; + out.error = FUSE_NOTIFY_STORE; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-Whitelist-fchmod.patch b/kvm-virtiofsd-Whitelist-fchmod.patch new file mode 100755 index 0000000000000000000000000000000000000000..a4f95d9c000575a6305032464d30cd1425643f6d --- /dev/null +++ b/kvm-virtiofsd-Whitelist-fchmod.patch @@ -0,0 +1,79 @@ +From 181ed1777c3dd50b1ff9907b0a4199e845af1270 Mon Sep 17 00:00:00 2001 +From: Max Reitz +Date: Fri, 18 Jun 2021 16:21:17 -0400 +Subject: [PATCH 1/4] virtiofsd: Whitelist fchmod +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Max Reitz +Message-id: <20210618162117.97775-2-mreitz@redhat.com> +Patchwork-id: 101719 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH 1/1] virtiofsd: Whitelist fchmod +Bugzilla: 1967914 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Vivek Goyal +RH-Acked-by: Connor Kuehl + +lo_setattr() invokes fchmod() in a rarely used code path, so it should +be whitelisted or virtiofsd will crash with EBADSYS. + +Said code path can be triggered for example as follows: + +On the host, in the shared directory, create a file with the sticky bit +set and a security.capability xattr: +(1) # touch foo +(2) # chmod u+s foo +(3) # setcap '' foo + +Then in the guest let some process truncate that file after it has +dropped all of its capabilities (at least CAP_FSETID): + +int main(int argc, char *argv[]) +{ + capng_setpid(getpid()); + capng_clear(CAPNG_SELECT_BOTH); + capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, 0); + capng_apply(CAPNG_SELECT_BOTH); + + ftruncate(open(argv[1], O_RDWR), 0); +} + +This will cause the guest kernel to drop the sticky bit (i.e. perform a +mode change) as part of the truncate (where FATTR_FH is set), and that +will cause virtiofsd to invoke fchmod() instead of fchmodat(). + +(A similar configuration exists further below with futimens() vs. +utimensat(), but the former is not a syscall but just a wrapper for the +latter, so no further whitelisting is required.) + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1842667 +Reported-by: Qian Cai +Cc: qemu-stable@nongnu.org +Signed-off-by: Max Reitz +Message-Id: <20200608093111.14942-1-mreitz@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Vivek Goyal +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 63659fe74e76f5c5285466f0c5cfbdca65b3688e) +Signed-off-by: Max Reitz +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/seccomp.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +index bd9e7b083c..3b1522acdd 100644 +--- a/tools/virtiofsd/seccomp.c ++++ b/tools/virtiofsd/seccomp.c +@@ -42,6 +42,7 @@ static const int syscall_whitelist[] = { + SCMP_SYS(exit_group), + SCMP_SYS(fallocate), + SCMP_SYS(fchdir), ++ SCMP_SYS(fchmod), + SCMP_SYS(fchmodat), + SCMP_SYS(fchownat), + SCMP_SYS(fcntl), +-- +2.27.0 + diff --git a/kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch b/kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch new file mode 100755 index 0000000000000000000000000000000000000000..a0882d5103f01ecb70e5e2ec03c004c234937cea --- /dev/null +++ b/kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch @@ -0,0 +1,93 @@ +From e4c8fd1060fb69a093064851ebf66dd82533ec0e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:17 +0100 +Subject: [PATCH 106/116] virtiofsd: add definition of fuse_buf_writev() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-103-dgilbert@redhat.com> +Patchwork-id: 93557 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 102/112] virtiofsd: add definition of fuse_buf_writev() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: piaojun + +Define fuse_buf_writev() which use pwritev and writev to improve io +bandwidth. Especially, the src bufs with 0 size should be skipped as +their mems are not *block_size* aligned which will cause writev failed +in direct io mode. + +Signed-off-by: Jun Piao +Suggested-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9ceaaa15cf21073c2b23058c374f61c30cd39c31) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 38 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 38 insertions(+) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 42a608f..37befeb 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -14,6 +14,7 @@ + #include "fuse_lowlevel.h" + #include + #include ++#include + #include + #include + +@@ -33,6 +34,43 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv) + return size; + } + ++__attribute__((unused)) ++static ssize_t fuse_buf_writev(struct fuse_buf *out_buf, ++ struct fuse_bufvec *in_buf) ++{ ++ ssize_t res, i, j; ++ size_t iovcnt = in_buf->count; ++ struct iovec *iov; ++ int fd = out_buf->fd; ++ ++ iov = calloc(iovcnt, sizeof(struct iovec)); ++ if (!iov) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0, j = 0; i < iovcnt; i++) { ++ /* Skip the buf with 0 size */ ++ if (in_buf->buf[i].size) { ++ iov[j].iov_base = in_buf->buf[i].mem; ++ iov[j].iov_len = in_buf->buf[i].size; ++ j++; ++ } ++ } ++ ++ if (out_buf->flags & FUSE_BUF_FD_SEEK) { ++ res = pwritev(fd, iov, iovcnt, out_buf->pos); ++ } else { ++ res = writev(fd, iov, iovcnt); ++ } ++ ++ if (res == -1) { ++ res = -errno; ++ } ++ ++ free(iov); ++ return res; ++} ++ + static size_t min_size(size_t s1, size_t s2) + { + return s1 < s2 ? s1 : s2; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch b/kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch new file mode 100755 index 0000000000000000000000000000000000000000..451f12b541a8451fa79a1d003ab15561972fa3c5 --- /dev/null +++ b/kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch @@ -0,0 +1,170 @@ +From f91a9bdc171142174110e9ff1716b611f6fb0039 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:07 +0100 +Subject: [PATCH 036/116] virtiofsd: add --fd=FDNUM fd passing option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-33-dgilbert@redhat.com> +Patchwork-id: 93487 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 032/112] virtiofsd: add --fd=FDNUM fd passing option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Although --socket-path=PATH is useful for manual invocations, management +tools typically create the UNIX domain socket themselves and pass it to +the vhost-user device backend. This way QEMU can be launched +immediately with a valid socket. No waiting for the vhost-user device +backend is required when fd passing is used. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit cee8e35d4386e34bf79c3ca2aab7f7b1bb48cf8d) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 16 ++++++++++++---- + tools/virtiofsd/fuse_virtio.c | 31 +++++++++++++++++++++++++------ + 3 files changed, 38 insertions(+), 10 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 1126723..45995f3 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -68,6 +68,7 @@ struct fuse_session { + size_t bufsize; + int error; + char *vu_socket_path; ++ int vu_listen_fd; + int vu_socketfd; + struct fv_VuDev *virtio_dev; + }; +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 4f4684d..95f4db8 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2130,6 +2130,7 @@ static const struct fuse_opt fuse_ll_opts[] = { + LL_OPTION("--debug", debug, 1), + LL_OPTION("allow_root", deny_others, 1), + LL_OPTION("--socket-path=%s", vu_socket_path, 0), ++ LL_OPTION("--fd=%d", vu_listen_fd, 0), + FUSE_OPT_END + }; + +@@ -2147,7 +2148,8 @@ void fuse_lowlevel_help(void) + */ + printf( + " -o allow_root allow access by root\n" +- " --socket-path=PATH path for the vhost-user socket\n"); ++ " --socket-path=PATH path for the vhost-user socket\n" ++ " --fd=FDNUM fd number of vhost-user socket\n"); + } + + void fuse_session_destroy(struct fuse_session *se) +@@ -2191,6 +2193,7 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out1; + } + se->fd = -1; ++ se->vu_listen_fd = -1; + se->conn.max_write = UINT_MAX; + se->conn.max_readahead = UINT_MAX; + +@@ -2212,8 +2215,13 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + goto out4; + } + +- if (!se->vu_socket_path) { +- fprintf(stderr, "fuse: missing -o vhost_user_socket option\n"); ++ if (!se->vu_socket_path && se->vu_listen_fd < 0) { ++ fuse_log(FUSE_LOG_ERR, "fuse: missing --socket-path or --fd option\n"); ++ goto out4; ++ } ++ if (se->vu_socket_path && se->vu_listen_fd >= 0) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: --socket-path and --fd cannot be given together\n"); + goto out4; + } + +@@ -2253,7 +2261,7 @@ void fuse_session_unmount(struct fuse_session *se) + + int fuse_lowlevel_is_virtio(struct fuse_session *se) + { +- return se->vu_socket_path != NULL; ++ return !!se->virtio_dev; + } + + #ifdef linux +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 7e2711b..635f877 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -638,18 +638,21 @@ int virtio_loop(struct fuse_session *se) + return 0; + } + +-int virtio_session_mount(struct fuse_session *se) ++static int fv_create_listen_socket(struct fuse_session *se) + { + struct sockaddr_un un; + mode_t old_umask; + ++ /* Nothing to do if fd is already initialized */ ++ if (se->vu_listen_fd >= 0) { ++ return 0; ++ } ++ + if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) { + fuse_log(FUSE_LOG_ERR, "Socket path too long\n"); + return -1; + } + +- se->fd = -1; +- + /* + * Create the Unix socket to communicate with qemu + * based on QEMU's vhost-user-bridge +@@ -682,15 +685,31 @@ int virtio_session_mount(struct fuse_session *se) + return -1; + } + ++ se->vu_listen_fd = listen_sock; ++ return 0; ++} ++ ++int virtio_session_mount(struct fuse_session *se) ++{ ++ int ret; ++ ++ ret = fv_create_listen_socket(se); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ se->fd = -1; ++ + fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n", + __func__); +- int data_sock = accept(listen_sock, NULL, NULL); ++ int data_sock = accept(se->vu_listen_fd, NULL, NULL); + if (data_sock == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n"); +- close(listen_sock); ++ close(se->vu_listen_fd); + return -1; + } +- close(listen_sock); ++ close(se->vu_listen_fd); ++ se->vu_listen_fd = -1; + fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n", + __func__); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-fuse_mbuf_iter-API.patch b/kvm-virtiofsd-add-fuse_mbuf_iter-API.patch new file mode 100755 index 0000000000000000000000000000000000000000..b874dc90e7957d680e0e713a17b7371553f9922b --- /dev/null +++ b/kvm-virtiofsd-add-fuse_mbuf_iter-API.patch @@ -0,0 +1,134 @@ +From 1b0edd3d0a2ee5c097bcf3501c1dfa937f02e473 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:21 +0100 +Subject: [PATCH 050/116] virtiofsd: add fuse_mbuf_iter API +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-47-dgilbert@redhat.com> +Patchwork-id: 93502 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 046/112] virtiofsd: add fuse_mbuf_iter API +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Introduce an API for consuming bytes from a buffer with size checks. +All FUSE operations will be converted to use this safe API instead of +void *inarg. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit dad157e880416ab3a0e45beaa0e81977516568bc) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 28 +++++++++++++++++++++++++ + tools/virtiofsd/fuse_common.h | 49 ++++++++++++++++++++++++++++++++++++++++++- + 2 files changed, 76 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 772efa9..42a608f 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -267,3 +267,31 @@ ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) + + return copied; + } ++ ++void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len) ++{ ++ void *ptr; ++ ++ if (len > iter->size - iter->pos) { ++ return NULL; ++ } ++ ++ ptr = iter->mem + iter->pos; ++ iter->pos += len; ++ return ptr; ++} ++ ++const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter) ++{ ++ const char *str = iter->mem + iter->pos; ++ size_t remaining = iter->size - iter->pos; ++ size_t i; ++ ++ for (i = 0; i < remaining; i++) { ++ if (str[i] == '\0') { ++ iter->pos += i + 1; ++ return str; ++ } ++ } ++ return NULL; ++} +diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h +index 0cb33ac..f8f6433 100644 +--- a/tools/virtiofsd/fuse_common.h ++++ b/tools/virtiofsd/fuse_common.h +@@ -703,10 +703,57 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv); + */ + ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src); + ++/** ++ * Memory buffer iterator ++ * ++ */ ++struct fuse_mbuf_iter { ++ /** ++ * Data pointer ++ */ ++ void *mem; ++ ++ /** ++ * Total length, in bytes ++ */ ++ size_t size; ++ ++ /** ++ * Offset from start of buffer ++ */ ++ size_t pos; ++}; ++ ++/* Initialize memory buffer iterator from a fuse_buf */ ++#define FUSE_MBUF_ITER_INIT(fbuf) \ ++ ((struct fuse_mbuf_iter){ \ ++ .mem = fbuf->mem, \ ++ .size = fbuf->size, \ ++ .pos = 0, \ ++ }) ++ ++/** ++ * Consume bytes from a memory buffer iterator ++ * ++ * @param iter memory buffer iterator ++ * @param len number of bytes to consume ++ * @return pointer to start of consumed bytes or ++ * NULL if advancing beyond end of buffer ++ */ ++void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len); ++ ++/** ++ * Consume a NUL-terminated string from a memory buffer iterator ++ * ++ * @param iter memory buffer iterator ++ * @return pointer to the string or ++ * NULL if advancing beyond end of buffer or there is no NUL-terminator ++ */ ++const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter); ++ + /* + * Signal handling + */ +- + /** + * Exit session on HUP, TERM and INT signals and ignore PIPE signal + * +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch b/kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch new file mode 100755 index 0000000000000000000000000000000000000000..bdef115e78029e0b11c1f40ba35dce4040e9b3db --- /dev/null +++ b/kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch @@ -0,0 +1,88 @@ +From 7a3c94e10b087c06635ef72aadb1550184dd5c58 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:58 +0100 +Subject: [PATCH 087/116] virtiofsd: add helper for lo_data cleanup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-84-dgilbert@redhat.com> +Patchwork-id: 93538 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 083/112] virtiofsd: add helper for lo_data cleanup +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +This offers an helper function for lo_data's cleanup. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 18a69cbbb6a4caa7c2040c6db4a33b044a32be7e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 056ebe8..e8dc5c7 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2407,6 +2407,26 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b) + return la->ino == lb->ino && la->dev == lb->dev; + } + ++static void fuse_lo_data_cleanup(struct lo_data *lo) ++{ ++ if (lo->inodes) { ++ g_hash_table_destroy(lo->inodes); ++ } ++ lo_map_destroy(&lo->fd_map); ++ lo_map_destroy(&lo->dirp_map); ++ lo_map_destroy(&lo->ino_map); ++ ++ if (lo->proc_self_fd >= 0) { ++ close(lo->proc_self_fd); ++ } ++ ++ if (lo->root.fd >= 0) { ++ close(lo->root.fd); ++ } ++ ++ free(lo->source); ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2554,22 +2574,7 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + +- if (lo.inodes) { +- g_hash_table_destroy(lo.inodes); +- } +- lo_map_destroy(&lo.fd_map); +- lo_map_destroy(&lo.dirp_map); +- lo_map_destroy(&lo.ino_map); +- +- if (lo.proc_self_fd >= 0) { +- close(lo.proc_self_fd); +- } +- +- if (lo.root.fd >= 0) { +- close(lo.root.fd); +- } +- +- free(lo.source); ++ fuse_lo_data_cleanup(&lo); + + return ret ? 1 : 0; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-o-source-PATH-to-help-output.patch b/kvm-virtiofsd-add-o-source-PATH-to-help-output.patch new file mode 100755 index 0000000000000000000000000000000000000000..5e81663f6ee3dc2e403fb4794fe92493d81ffb84 --- /dev/null +++ b/kvm-virtiofsd-add-o-source-PATH-to-help-output.patch @@ -0,0 +1,46 @@ +From c55995c25f60168e3cb6b5bae1bf9a47813383d0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:55 +0100 +Subject: [PATCH 024/116] virtiofsd: add -o source=PATH to help output +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-21-dgilbert@redhat.com> +Patchwork-id: 93474 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 020/112] virtiofsd: add -o source=PATH to help output +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +The -o source=PATH option will be used by most command-line invocations. +Let's document it! + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 4ff075f72be2f489c8998ae492ec5cdbbbd73e07) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 26ac870..fc9b264 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1319,6 +1319,7 @@ int main(int argc, char *argv[]) + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); ++ printf(" -o source=PATH shared directory tree\n"); + fuse_lowlevel_help(); + ret = 0; + goto err_out1; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-print-capabilities-option.patch b/kvm-virtiofsd-add-print-capabilities-option.patch new file mode 100755 index 0000000000000000000000000000000000000000..b57e4087630198858254e7f875b09e673c54f898 --- /dev/null +++ b/kvm-virtiofsd-add-print-capabilities-option.patch @@ -0,0 +1,121 @@ +From 23d81ee7564084f29e32fedaed5196ae1a5a3240 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:10 +0100 +Subject: [PATCH 039/116] virtiofsd: add --print-capabilities option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-36-dgilbert@redhat.com> +Patchwork-id: 93486 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 035/112] virtiofsd: add --print-capabilities option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Add the --print-capabilities option as per vhost-user.rst "Backend +programs conventions". Currently there are no advertised features. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 45018fbb0a73ce66fd3dd87ecd2872b45658add4) +Signed-off-by: Miroslav Rezanina +--- + docs/interop/vhost-user.json | 4 +++- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 2 ++ + tools/virtiofsd/passthrough_ll.c | 12 ++++++++++++ + 4 files changed, 18 insertions(+), 1 deletion(-) + +diff --git a/docs/interop/vhost-user.json b/docs/interop/vhost-user.json +index da6aaf5..d4ea1f7 100644 +--- a/docs/interop/vhost-user.json ++++ b/docs/interop/vhost-user.json +@@ -31,6 +31,7 @@ + # @rproc-serial: virtio remoteproc serial link + # @scsi: virtio scsi + # @vsock: virtio vsock transport ++# @fs: virtio fs (since 4.2) + # + # Since: 4.0 + ## +@@ -50,7 +51,8 @@ + 'rpmsg', + 'rproc-serial', + 'scsi', +- 'vsock' ++ 'vsock', ++ 'fs' + ] + } + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index f6b3470..0d61df8 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1794,6 +1794,7 @@ struct fuse_cmdline_opts { + int nodefault_subtype; + int show_version; + int show_help; ++ int print_capabilities; + unsigned int max_idle_threads; + }; + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index a3645fc..b8ec5ac 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -40,6 +40,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("--help", show_help), + FUSE_HELPER_OPT("-V", show_version), + FUSE_HELPER_OPT("--version", show_version), ++ FUSE_HELPER_OPT("--print-capabilities", print_capabilities), + FUSE_HELPER_OPT("-d", debug), + FUSE_HELPER_OPT("debug", debug), + FUSE_HELPER_OPT("-d", foreground), +@@ -135,6 +136,7 @@ void fuse_cmdline_help(void) + { + printf(" -h --help print help\n" + " -V --version print version\n" ++ " --print-capabilities print vhost-user.json\n" + " -d -o debug enable debug output (implies -f)\n" + " -f foreground operation\n" + " --daemonize run in background\n" +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 037c5d7..cd27c09 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1298,6 +1298,14 @@ static struct fuse_lowlevel_ops lo_oper = { + .lseek = lo_lseek, + }; + ++/* Print vhost-user.json backend program capabilities */ ++static void print_capabilities(void) ++{ ++ printf("{\n"); ++ printf(" \"type\": \"fs\"\n"); ++ printf("}\n"); ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -1328,6 +1336,10 @@ int main(int argc, char *argv[]) + fuse_lowlevel_version(); + ret = 0; + goto err_out1; ++ } else if (opts.print_capabilities) { ++ print_capabilities(); ++ ret = 0; ++ goto err_out1; + } + + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch b/kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch new file mode 100755 index 0000000000000000000000000000000000000000..a6a9cc9ea43ab6aaf72d18e48b4a37ed01c83cbd --- /dev/null +++ b/kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch @@ -0,0 +1,164 @@ +From 555ec3463b3dbfd6e08eac7840419d176f113e46 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:55 +0100 +Subject: [PATCH 4/9] virtiofsd: add --rlimit-nofile=NUM option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-3-dgilbert@redhat.com> +Patchwork-id: 96270 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 2/7] virtiofsd: add --rlimit-nofile=NUM option +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Stefan Hajnoczi + +Make it possible to specify the RLIMIT_NOFILE on the command-line. +Users running multiple virtiofsd processes should allocate a certain +number to each process so that the system-wide limit can never be +exhausted. + +When this option is set to 0 the rlimit is left at its current value. +This is useful when a management tool wants to configure the rlimit +itself. + +The default behavior remains unchanged: try to set the limit to +1,000,000 file descriptors if the current rlimit is lower. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Dr. David Alan Gilbert +Message-Id: <20200501140644.220940-2-stefanha@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 6dbb716877728ce4eb51619885ef6ef4ada9565f) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 23 +++++++++++++++++++++++ + tools/virtiofsd/passthrough_ll.c | 22 ++++++++-------------- + 3 files changed, 32 insertions(+), 14 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 8f6d705..562fd52 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts { + int syslog; + int log_level; + unsigned int max_idle_threads; ++ unsigned long rlimit_nofile; + }; + + /** +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 0801cf7..9b3eddc 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -23,6 +23,8 @@ + #include + #include + #include ++#include ++#include + #include + + #define FUSE_HELPER_OPT(t, p) \ +@@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile), + FUSE_HELPER_OPT("--syslog", syslog), + FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), + FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), +@@ -171,6 +174,9 @@ void fuse_cmdline_help(void) + " default: no_writeback\n" + " -o xattr|no_xattr enable/disable xattr\n" + " default: no_xattr\n" ++ " --rlimit-nofile= set maximum number of file descriptors\n" ++ " (0 leaves rlimit unchanged)\n" ++ " default: 1,000,000 if the current rlimit is lower\n" + ); + } + +@@ -191,11 +197,28 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, + } + } + ++static unsigned long get_default_rlimit_nofile(void) ++{ ++ rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ ++ struct rlimit rlim; ++ ++ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { ++ fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); ++ exit(1); ++ } ++ ++ if (rlim.rlim_cur >= max_fds) { ++ return 0; /* we have more fds available than required! */ ++ } ++ return max_fds; ++} ++ + int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + { + memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + + opts->max_idle_threads = 10; ++ opts->rlimit_nofile = get_default_rlimit_nofile(); + opts->foreground = 1; + + if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 50ff672..184ad0f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2711,24 +2711,18 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, + setup_seccomp(enable_syslog); + } + +-/* Raise the maximum number of open file descriptors */ +-static void setup_nofile_rlimit(void) ++/* Set the maximum number of open file descriptors */ ++static void setup_nofile_rlimit(unsigned long rlimit_nofile) + { +- const rlim_t max_fds = 1000000; +- struct rlimit rlim; +- +- if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { +- fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); +- exit(1); +- } ++ struct rlimit rlim = { ++ .rlim_cur = rlimit_nofile, ++ .rlim_max = rlimit_nofile, ++ }; + +- if (rlim.rlim_cur >= max_fds) { ++ if (rlimit_nofile == 0) { + return; /* nothing to do */ + } + +- rlim.rlim_cur = max_fds; +- rlim.rlim_max = max_fds; +- + if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { + /* Ignore SELinux denials */ + if (errno == EPERM) { +@@ -2981,7 +2975,7 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + +- setup_nofile_rlimit(); ++ setup_nofile_rlimit(opts.rlimit_nofile); + + /* Must be before sandbox since it wants /proc */ + setup_capng(); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-seccomp-whitelist.patch b/kvm-virtiofsd-add-seccomp-whitelist.patch new file mode 100755 index 0000000000000000000000000000000000000000..b34108e78b86b19162d3f95777c6df26ca08813b --- /dev/null +++ b/kvm-virtiofsd-add-seccomp-whitelist.patch @@ -0,0 +1,285 @@ +From 58c4e9473b364fb62aac797b0d69fd8ddb02c8c7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:30 +0100 +Subject: [PATCH 059/116] virtiofsd: add seccomp whitelist +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-56-dgilbert@redhat.com> +Patchwork-id: 93511 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 055/112] virtiofsd: add seccomp whitelist +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Only allow system calls that are needed by virtiofsd. All other system +calls cause SIGSYS to be directed at the thread and the process will +coredump. + +Restricting system calls reduces the kernel attack surface and limits +what the process can do when compromised. + +Signed-off-by: Stefan Hajnoczi +with additional entries by: +Signed-off-by: Ganesh Maharaj Mahalingam +Signed-off-by: Masayoshi Mizuma +Signed-off-by: Misono Tomohiro +Signed-off-by: piaojun +Signed-off-by: Vivek Goyal +Signed-off-by: Eric Ren +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 4f8bde99c175ffd86b5125098a4707d43f5e80c6) + +Signed-off-by: Miroslav Rezanina +--- + Makefile | 5 +- + tools/virtiofsd/Makefile.objs | 5 +- + tools/virtiofsd/passthrough_ll.c | 2 + + tools/virtiofsd/seccomp.c | 151 +++++++++++++++++++++++++++++++++++++++ + tools/virtiofsd/seccomp.h | 14 ++++ + 5 files changed, 174 insertions(+), 3 deletions(-) + create mode 100644 tools/virtiofsd/seccomp.c + create mode 100644 tools/virtiofsd/seccomp.h + +diff --git a/Makefile b/Makefile +index 0e9755d..6879a06 100644 +--- a/Makefile ++++ b/Makefile +@@ -330,7 +330,7 @@ endif + endif + endif + +-ifdef CONFIG_LINUX ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) + HELPERS-y += virtiofsd$(EXESUF) + vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json + endif +@@ -681,7 +681,8 @@ rdmacm-mux$(EXESUF): LIBS += "-libumad" + rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) + $(call LINK, $^) + +-ifdef CONFIG_LINUX # relies on Linux-specific syscalls ++# relies on Linux-specific syscalls ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) + virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a $(COMMON_LDADDS) + $(call LINK, $^) + endif +diff --git a/tools/virtiofsd/Makefile.objs b/tools/virtiofsd/Makefile.objs +index 45a8075..076f667 100644 +--- a/tools/virtiofsd/Makefile.objs ++++ b/tools/virtiofsd/Makefile.objs +@@ -5,5 +5,8 @@ virtiofsd-obj-y = buffer.o \ + fuse_signals.o \ + fuse_virtio.o \ + helper.o \ +- passthrough_ll.o ++ passthrough_ll.o \ ++ seccomp.o + ++seccomp.o-cflags := $(SECCOMP_CFLAGS) ++seccomp.o-libs := $(SECCOMP_LIBS) +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 0947d14..bd8925b 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -59,6 +59,7 @@ + #include + + #include "passthrough_helpers.h" ++#include "seccomp.h" + + struct lo_map_elem { + union { +@@ -2091,6 +2092,7 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se) + { + setup_namespaces(lo, se); + setup_mounts(lo->source); ++ setup_seccomp(); + } + + int main(int argc, char *argv[]) +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +new file mode 100644 +index 0000000..691fb63 +--- /dev/null ++++ b/tools/virtiofsd/seccomp.c +@@ -0,0 +1,151 @@ ++/* ++ * Seccomp sandboxing for virtiofsd ++ * ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#include "qemu/osdep.h" ++#include "seccomp.h" ++#include "fuse_i.h" ++#include "fuse_log.h" ++#include ++#include ++#include ++#include ++ ++/* Bodge for libseccomp 2.4.2 which broke ppoll */ ++#if !defined(__SNR_ppoll) && defined(__SNR_brk) ++#ifdef __NR_ppoll ++#define __SNR_ppoll __NR_ppoll ++#else ++#define __SNR_ppoll __PNR_ppoll ++#endif ++#endif ++ ++static const int syscall_whitelist[] = { ++ /* TODO ireg sem*() syscalls */ ++ SCMP_SYS(brk), ++ SCMP_SYS(capget), /* For CAP_FSETID */ ++ SCMP_SYS(capset), ++ SCMP_SYS(clock_gettime), ++ SCMP_SYS(clone), ++#ifdef __NR_clone3 ++ SCMP_SYS(clone3), ++#endif ++ SCMP_SYS(close), ++ SCMP_SYS(copy_file_range), ++ SCMP_SYS(dup), ++ SCMP_SYS(eventfd2), ++ SCMP_SYS(exit), ++ SCMP_SYS(exit_group), ++ SCMP_SYS(fallocate), ++ SCMP_SYS(fchmodat), ++ SCMP_SYS(fchownat), ++ SCMP_SYS(fcntl), ++ SCMP_SYS(fdatasync), ++ SCMP_SYS(fgetxattr), ++ SCMP_SYS(flistxattr), ++ SCMP_SYS(flock), ++ SCMP_SYS(fremovexattr), ++ SCMP_SYS(fsetxattr), ++ SCMP_SYS(fstat), ++ SCMP_SYS(fstatfs), ++ SCMP_SYS(fsync), ++ SCMP_SYS(ftruncate), ++ SCMP_SYS(futex), ++ SCMP_SYS(getdents), ++ SCMP_SYS(getdents64), ++ SCMP_SYS(getegid), ++ SCMP_SYS(geteuid), ++ SCMP_SYS(getpid), ++ SCMP_SYS(gettid), ++ SCMP_SYS(gettimeofday), ++ SCMP_SYS(linkat), ++ SCMP_SYS(lseek), ++ SCMP_SYS(madvise), ++ SCMP_SYS(mkdirat), ++ SCMP_SYS(mknodat), ++ SCMP_SYS(mmap), ++ SCMP_SYS(mprotect), ++ SCMP_SYS(mremap), ++ SCMP_SYS(munmap), ++ SCMP_SYS(newfstatat), ++ SCMP_SYS(open), ++ SCMP_SYS(openat), ++ SCMP_SYS(ppoll), ++ SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? */ ++ SCMP_SYS(preadv), ++ SCMP_SYS(pread64), ++ SCMP_SYS(pwritev), ++ SCMP_SYS(pwrite64), ++ SCMP_SYS(read), ++ SCMP_SYS(readlinkat), ++ SCMP_SYS(recvmsg), ++ SCMP_SYS(renameat), ++ SCMP_SYS(renameat2), ++ SCMP_SYS(rt_sigaction), ++ SCMP_SYS(rt_sigprocmask), ++ SCMP_SYS(rt_sigreturn), ++ SCMP_SYS(sendmsg), ++ SCMP_SYS(setresgid), ++ SCMP_SYS(setresuid), ++#ifdef __NR_setresgid32 ++ SCMP_SYS(setresgid32), ++#endif ++#ifdef __NR_setresuid32 ++ SCMP_SYS(setresuid32), ++#endif ++ SCMP_SYS(set_robust_list), ++ SCMP_SYS(symlinkat), ++ SCMP_SYS(time), /* Rarely needed, except on static builds */ ++ SCMP_SYS(tgkill), ++ SCMP_SYS(unlinkat), ++ SCMP_SYS(utimensat), ++ SCMP_SYS(write), ++ SCMP_SYS(writev), ++}; ++ ++void setup_seccomp(void) ++{ ++ scmp_filter_ctx ctx; ++ size_t i; ++ ++#ifdef SCMP_ACT_KILL_PROCESS ++ ctx = seccomp_init(SCMP_ACT_KILL_PROCESS); ++ /* Handle a newer libseccomp but an older kernel */ ++ if (!ctx && errno == EOPNOTSUPP) { ++ ctx = seccomp_init(SCMP_ACT_TRAP); ++ } ++#else ++ ctx = seccomp_init(SCMP_ACT_TRAP); ++#endif ++ if (!ctx) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_init() failed\n"); ++ exit(1); ++ } ++ ++ for (i = 0; i < G_N_ELEMENTS(syscall_whitelist); i++) { ++ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, ++ syscall_whitelist[i], 0) != 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d", ++ syscall_whitelist[i]); ++ exit(1); ++ } ++ } ++ ++ /* libvhost-user calls this for post-copy migration, we don't need it */ ++ if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOSYS), ++ SCMP_SYS(userfaultfd), 0) != 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add userfaultfd failed\n"); ++ exit(1); ++ } ++ ++ if (seccomp_load(ctx) < 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_load() failed\n"); ++ exit(1); ++ } ++ ++ seccomp_release(ctx); ++} +diff --git a/tools/virtiofsd/seccomp.h b/tools/virtiofsd/seccomp.h +new file mode 100644 +index 0000000..86bce72 +--- /dev/null ++++ b/tools/virtiofsd/seccomp.h +@@ -0,0 +1,14 @@ ++/* ++ * Seccomp sandboxing for virtiofsd ++ * ++ * Copyright (C) 2019 Red Hat, Inc. ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#ifndef VIRTIOFSD_SECCOMP_H ++#define VIRTIOFSD_SECCOMP_H ++ ++void setup_seccomp(void); ++ ++#endif /* VIRTIOFSD_SECCOMP_H */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-some-options-to-the-help-message.patch b/kvm-virtiofsd-add-some-options-to-the-help-message.patch new file mode 100755 index 0000000000000000000000000000000000000000..ac6dc542c37952a2eacbf360b653add9320ee914 --- /dev/null +++ b/kvm-virtiofsd-add-some-options-to-the-help-message.patch @@ -0,0 +1,74 @@ +From 6d62abb99b6b918f05f099b01a99f4326a69d650 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:26 +0100 +Subject: [PATCH 115/116] virtiofsd: add some options to the help message +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-112-dgilbert@redhat.com> +Patchwork-id: 93565 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 111/112] virtiofsd: add some options to the help message +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Masayoshi Mizuma + +Add following options to the help message: +- cache +- flock|no_flock +- norace +- posix_lock|no_posix_lock +- readdirplus|no_readdirplus +- timeout +- writeback|no_writeback +- xattr|no_xattr + +Signed-off-by: Masayoshi Mizuma + +dgilbert: Split cache, norace, posix_lock, readdirplus off + into our own earlier patches that added the options + +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 1d59b1b210d7c3b0bdf4b10ebe0bb1fccfcb8b95) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index f98d8f2..0801cf7 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -148,6 +148,8 @@ void fuse_cmdline_help(void) + " -o cache= cache mode. could be one of \"auto, " + "always, none\"\n" + " default: auto\n" ++ " -o flock|no_flock enable/disable flock\n" ++ " default: no_flock\n" + " -o log_level= log level, default to \"info\"\n" + " level could be one of \"debug, " + "info, warn, err\"\n" +@@ -163,7 +165,13 @@ void fuse_cmdline_help(void) + " enable/disable readirplus\n" + " default: readdirplus except with " + "cache=none\n" +- ); ++ " -o timeout= I/O timeout (second)\n" ++ " default: depends on cache= option.\n" ++ " -o writeback|no_writeback enable/disable writeback cache\n" ++ " default: no_writeback\n" ++ " -o xattr|no_xattr enable/disable xattr\n" ++ " default: no_xattr\n" ++ ); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-syslog-command-line-option.patch b/kvm-virtiofsd-add-syslog-command-line-option.patch new file mode 100755 index 0000000000000000000000000000000000000000..5b553422544b56d1803b99ede3445003f25ed054 --- /dev/null +++ b/kvm-virtiofsd-add-syslog-command-line-option.patch @@ -0,0 +1,239 @@ +From 6f5cf644bebc189bdb16f1caf3d7c47835d7c287 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:36 +0100 +Subject: [PATCH 065/116] virtiofsd: add --syslog command-line option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-62-dgilbert@redhat.com> +Patchwork-id: 93509 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 061/112] virtiofsd: add --syslog command-line option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Sometimes collecting output from stderr is inconvenient or does not fit +within the overall logging architecture. Add syslog(3) support for +cases where stderr cannot be used. + +Signed-off-by: Stefan Hajnoczi +dgilbert: Reworked as a logging function +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f185621d41f03a23b55795b89e6584253fa23505) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 2 ++ + tools/virtiofsd/passthrough_ll.c | 50 +++++++++++++++++++++++++++++++++++++--- + tools/virtiofsd/seccomp.c | 32 +++++++++++++++++-------- + tools/virtiofsd/seccomp.h | 4 +++- + 5 files changed, 76 insertions(+), 13 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 0d61df8..f2750bc 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1795,6 +1795,7 @@ struct fuse_cmdline_opts { + int show_version; + int show_help; + int print_capabilities; ++ int syslog; + unsigned int max_idle_threads; + }; + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5531425..9692ef9 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -54,6 +54,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), ++ FUSE_HELPER_OPT("--syslog", syslog), + FUSE_OPT_END + }; + +@@ -138,6 +139,7 @@ void fuse_cmdline_help(void) + " -V --version print version\n" + " --print-capabilities print vhost-user.json\n" + " -d -o debug enable debug output (implies -f)\n" ++ " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" + " -o max_idle_threads the maximum number of idle worker " +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c281d81..0372aca 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -58,6 +58,7 @@ + #include + #include + #include ++#include + #include + + #include "passthrough_helpers.h" +@@ -138,6 +139,7 @@ static const struct fuse_opt lo_opts[] = { + { "norace", offsetof(struct lo_data, norace), 1 }, + FUSE_OPT_END + }; ++static bool use_syslog = false; + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + +@@ -2262,11 +2264,12 @@ static void setup_mounts(const char *source) + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. + */ +-static void setup_sandbox(struct lo_data *lo, struct fuse_session *se) ++static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, ++ bool enable_syslog) + { + setup_namespaces(lo, se); + setup_mounts(lo->source); +- setup_seccomp(); ++ setup_seccomp(enable_syslog); + } + + /* Raise the maximum number of open file descriptors */ +@@ -2298,6 +2301,42 @@ static void setup_nofile_rlimit(void) + } + } + ++static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) ++{ ++ if (use_syslog) { ++ int priority = LOG_ERR; ++ switch (level) { ++ case FUSE_LOG_EMERG: ++ priority = LOG_EMERG; ++ break; ++ case FUSE_LOG_ALERT: ++ priority = LOG_ALERT; ++ break; ++ case FUSE_LOG_CRIT: ++ priority = LOG_CRIT; ++ break; ++ case FUSE_LOG_ERR: ++ priority = LOG_ERR; ++ break; ++ case FUSE_LOG_WARNING: ++ priority = LOG_WARNING; ++ break; ++ case FUSE_LOG_NOTICE: ++ priority = LOG_NOTICE; ++ break; ++ case FUSE_LOG_INFO: ++ priority = LOG_INFO; ++ break; ++ case FUSE_LOG_DEBUG: ++ priority = LOG_DEBUG; ++ break; ++ } ++ vsyslog(priority, fmt, ap); ++ } else { ++ vfprintf(stderr, fmt, ap); ++ } ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2336,6 +2375,11 @@ int main(int argc, char *argv[]) + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; + } ++ fuse_set_log_func(log_func); ++ use_syslog = opts.syslog; ++ if (use_syslog) { ++ openlog("virtiofsd", LOG_PID, LOG_DAEMON); ++ } + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); +@@ -2424,7 +2468,7 @@ int main(int argc, char *argv[]) + /* Must be before sandbox since it wants /proc */ + setup_capng(); + +- setup_sandbox(&lo, se); ++ setup_sandbox(&lo, se, opts.syslog); + + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); +diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c +index 691fb63..2d9d4a7 100644 +--- a/tools/virtiofsd/seccomp.c ++++ b/tools/virtiofsd/seccomp.c +@@ -107,11 +107,28 @@ static const int syscall_whitelist[] = { + SCMP_SYS(writev), + }; + +-void setup_seccomp(void) ++/* Syscalls used when --syslog is enabled */ ++static const int syscall_whitelist_syslog[] = { ++ SCMP_SYS(sendto), ++}; ++ ++static void add_whitelist(scmp_filter_ctx ctx, const int syscalls[], size_t len) + { +- scmp_filter_ctx ctx; + size_t i; + ++ for (i = 0; i < len; i++) { ++ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscalls[i], 0) != 0) { ++ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d failed\n", ++ syscalls[i]); ++ exit(1); ++ } ++ } ++} ++ ++void setup_seccomp(bool enable_syslog) ++{ ++ scmp_filter_ctx ctx; ++ + #ifdef SCMP_ACT_KILL_PROCESS + ctx = seccomp_init(SCMP_ACT_KILL_PROCESS); + /* Handle a newer libseccomp but an older kernel */ +@@ -126,13 +143,10 @@ void setup_seccomp(void) + exit(1); + } + +- for (i = 0; i < G_N_ELEMENTS(syscall_whitelist); i++) { +- if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, +- syscall_whitelist[i], 0) != 0) { +- fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d", +- syscall_whitelist[i]); +- exit(1); +- } ++ add_whitelist(ctx, syscall_whitelist, G_N_ELEMENTS(syscall_whitelist)); ++ if (enable_syslog) { ++ add_whitelist(ctx, syscall_whitelist_syslog, ++ G_N_ELEMENTS(syscall_whitelist_syslog)); + } + + /* libvhost-user calls this for post-copy migration, we don't need it */ +diff --git a/tools/virtiofsd/seccomp.h b/tools/virtiofsd/seccomp.h +index 86bce72..d47c8ea 100644 +--- a/tools/virtiofsd/seccomp.h ++++ b/tools/virtiofsd/seccomp.h +@@ -9,6 +9,8 @@ + #ifndef VIRTIOFSD_SECCOMP_H + #define VIRTIOFSD_SECCOMP_H + +-void setup_seccomp(void); ++#include ++ ++void setup_seccomp(bool enable_syslog); + + #endif /* VIRTIOFSD_SECCOMP_H */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-thread-pool-size-NUM-option.patch b/kvm-virtiofsd-add-thread-pool-size-NUM-option.patch new file mode 100755 index 0000000000000000000000000000000000000000..0241a9de23acfca4ce0e86e1363070b6725986e6 --- /dev/null +++ b/kvm-virtiofsd-add-thread-pool-size-NUM-option.patch @@ -0,0 +1,106 @@ +From 3dbfb932288eb5a55dfdc0eebca7e4c7f0cf6f33 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:22 +0100 +Subject: [PATCH 111/116] virtiofsd: add --thread-pool-size=NUM option +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-108-dgilbert@redhat.com> +Patchwork-id: 93561 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 107/112] virtiofsd: add --thread-pool-size=NUM option +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Add an option to control the size of the thread pool. Requests are now +processed in parallel by default. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 951b3120dbc971f08681e1d860360e4a1e638902) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 7 ++++++- + tools/virtiofsd/fuse_virtio.c | 5 +++-- + 3 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index 1447d86..4e47e58 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -72,6 +72,7 @@ struct fuse_session { + int vu_listen_fd; + int vu_socketfd; + struct fv_VuDev *virtio_dev; ++ int thread_pool_size; + }; + + struct fuse_chan { +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 79a4031..de2e2e0 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -28,6 +28,7 @@ + #include + #include + ++#define THREAD_POOL_SIZE 64 + + #define OFFSET_MAX 0x7fffffffffffffffLL + +@@ -2519,6 +2520,7 @@ static const struct fuse_opt fuse_ll_opts[] = { + LL_OPTION("allow_root", deny_others, 1), + LL_OPTION("--socket-path=%s", vu_socket_path, 0), + LL_OPTION("--fd=%d", vu_listen_fd, 0), ++ LL_OPTION("--thread-pool-size=%d", thread_pool_size, 0), + FUSE_OPT_END + }; + +@@ -2537,7 +2539,9 @@ void fuse_lowlevel_help(void) + printf( + " -o allow_root allow access by root\n" + " --socket-path=PATH path for the vhost-user socket\n" +- " --fd=FDNUM fd number of vhost-user socket\n"); ++ " --fd=FDNUM fd number of vhost-user socket\n" ++ " --thread-pool-size=NUM thread pool size limit (default %d)\n", ++ THREAD_POOL_SIZE); + } + + void fuse_session_destroy(struct fuse_session *se) +@@ -2591,6 +2595,7 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + } + se->fd = -1; + se->vu_listen_fd = -1; ++ se->thread_pool_size = THREAD_POOL_SIZE; + se->conn.max_write = UINT_MAX; + se->conn.max_readahead = UINT_MAX; + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 0dcf2ef..9f65823 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -572,10 +572,11 @@ static void *fv_queue_thread(void *opaque) + struct fv_QueueInfo *qi = opaque; + struct VuDev *dev = &qi->virtio_dev->dev; + struct VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ struct fuse_session *se = qi->virtio_dev->se; + GThreadPool *pool; + +- pool = g_thread_pool_new(fv_queue_worker, qi, 1 /* TODO max_threads */, +- TRUE, NULL); ++ pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, TRUE, ++ NULL); + if (!pool) { + fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__); + return NULL; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-add-vhost-user.json-file.patch b/kvm-virtiofsd-add-vhost-user.json-file.patch new file mode 100755 index 0000000000000000000000000000000000000000..a24b24f50206304c2bfc41d51d1a867380e0891c --- /dev/null +++ b/kvm-virtiofsd-add-vhost-user.json-file.patch @@ -0,0 +1,73 @@ +From 77eb3258e76a1ac240503572d4f41d45cb832ba2 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:09 +0100 +Subject: [PATCH 038/116] virtiofsd: add vhost-user.json file +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-35-dgilbert@redhat.com> +Patchwork-id: 93490 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 034/112] virtiofsd: add vhost-user.json file +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Install a vhost-user.json file describing virtiofsd. This allows +libvirt and other management tools to enumerate vhost-user backend +programs. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 315616ed50ba15a5d7236ade8a402a93898202de) +Signed-off-by: Miroslav Rezanina +--- + .gitignore | 1 + + Makefile | 1 + + tools/virtiofsd/50-qemu-virtiofsd.json.in | 5 +++++ + 3 files changed, 7 insertions(+) + create mode 100644 tools/virtiofsd/50-qemu-virtiofsd.json.in + +diff --git a/.gitignore b/.gitignore +index aefad32..d7a4f99 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -6,6 +6,7 @@ + /config-target.* + /config.status + /config-temp ++/tools/virtiofsd/50-qemu-virtiofsd.json + /elf2dmp + /trace-events-all + /trace/generated-events.h +diff --git a/Makefile b/Makefile +index 1526775..0e9755d 100644 +--- a/Makefile ++++ b/Makefile +@@ -332,6 +332,7 @@ endif + + ifdef CONFIG_LINUX + HELPERS-y += virtiofsd$(EXESUF) ++vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json + endif + + # Sphinx does not allow building manuals into the same directory as +diff --git a/tools/virtiofsd/50-qemu-virtiofsd.json.in b/tools/virtiofsd/50-qemu-virtiofsd.json.in +new file mode 100644 +index 0000000..9bcd86f +--- /dev/null ++++ b/tools/virtiofsd/50-qemu-virtiofsd.json.in +@@ -0,0 +1,5 @@ ++{ ++ "description": "QEMU virtiofsd vhost-user-fs", ++ "type": "fs", ++ "binary": "@libexecdir@/virtiofsd" ++} +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-cap-ng-helpers.patch b/kvm-virtiofsd-cap-ng-helpers.patch new file mode 100755 index 0000000000000000000000000000000000000000..305745dba2ab5f6d94226fab3ef891847839fded --- /dev/null +++ b/kvm-virtiofsd-cap-ng-helpers.patch @@ -0,0 +1,175 @@ +From f62613d8058bcb60b26727d980a37537103b0033 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:32 +0100 +Subject: [PATCH 061/116] virtiofsd: cap-ng helpers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-58-dgilbert@redhat.com> +Patchwork-id: 93512 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 057/112] virtiofsd: cap-ng helpers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +libcap-ng reads /proc during capng_get_caps_process, and virtiofsd's +sandboxing doesn't have /proc mounted; thus we have to do the +caps read before we sandbox it and save/restore the state. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 2405f3c0d19eb4d516a88aa4e5c54e5f9c6bbea3) +Signed-off-by: Miroslav Rezanina +--- + Makefile | 4 +-- + tools/virtiofsd/passthrough_ll.c | 72 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 74 insertions(+), 2 deletions(-) + +diff --git a/Makefile b/Makefile +index 6879a06..ff05c30 100644 +--- a/Makefile ++++ b/Makefile +@@ -330,7 +330,7 @@ endif + endif + endif + +-ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy) + HELPERS-y += virtiofsd$(EXESUF) + vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json + endif +@@ -682,7 +682,7 @@ rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) + $(call LINK, $^) + + # relies on Linux-specific syscalls +-ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP),yy) ++ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy) + virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a $(COMMON_LDADDS) + $(call LINK, $^) + endif +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index bd8925b..97e7c75 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -39,6 +39,7 @@ + #include "fuse_virtio.h" + #include "fuse_lowlevel.h" + #include ++#include + #include + #include + #include +@@ -139,6 +140,13 @@ static const struct fuse_opt lo_opts[] = { + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + ++static struct { ++ pthread_mutex_t mutex; ++ void *saved; ++} cap; ++/* That we loaded cap-ng in the current thread from the saved */ ++static __thread bool cap_loaded = 0; ++ + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); + + static int is_dot_or_dotdot(const char *name) +@@ -162,6 +170,37 @@ static struct lo_data *lo_data(fuse_req_t req) + return (struct lo_data *)fuse_req_userdata(req); + } + ++/* ++ * Load capng's state from our saved state if the current thread ++ * hadn't previously been loaded. ++ * returns 0 on success ++ */ ++static int load_capng(void) ++{ ++ if (!cap_loaded) { ++ pthread_mutex_lock(&cap.mutex); ++ capng_restore_state(&cap.saved); ++ /* ++ * restore_state free's the saved copy ++ * so make another. ++ */ ++ cap.saved = capng_save_state(); ++ if (!cap.saved) { ++ fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); ++ return -EINVAL; ++ } ++ pthread_mutex_unlock(&cap.mutex); ++ ++ /* ++ * We want to use the loaded state for our pid, ++ * not the original ++ */ ++ capng_setpid(syscall(SYS_gettid)); ++ cap_loaded = true; ++ } ++ return 0; ++} ++ + static void lo_map_init(struct lo_map *map) + { + map->elems = NULL; +@@ -2024,6 +2063,35 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + } + + /* ++ * Capture the capability state, we'll need to restore this for individual ++ * threads later; see load_capng. ++ */ ++static void setup_capng(void) ++{ ++ /* Note this accesses /proc so has to happen before the sandbox */ ++ if (capng_get_caps_process()) { ++ fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n"); ++ exit(1); ++ } ++ pthread_mutex_init(&cap.mutex, NULL); ++ pthread_mutex_lock(&cap.mutex); ++ cap.saved = capng_save_state(); ++ if (!cap.saved) { ++ fuse_log(FUSE_LOG_ERR, "capng_save_state\n"); ++ exit(1); ++ } ++ pthread_mutex_unlock(&cap.mutex); ++} ++ ++static void cleanup_capng(void) ++{ ++ free(cap.saved); ++ cap.saved = NULL; ++ pthread_mutex_destroy(&cap.mutex); ++} ++ ++ ++/* + * Make the source directory our root so symlinks cannot escape and no other + * files are accessible. Assumes unshare(CLONE_NEWNS) was already called. + */ +@@ -2216,12 +2284,16 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + ++ /* Must be before sandbox since it wants /proc */ ++ setup_capng(); ++ + setup_sandbox(&lo, se); + + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + + fuse_session_unmount(se); ++ cleanup_capng(); + err_out3: + fuse_remove_signal_handlers(se); + err_out2: +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch b/kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch new file mode 100755 index 0000000000000000000000000000000000000000..caa4560b4167b6dfbc8aa9e9e33f749262ff39fb --- /dev/null +++ b/kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch @@ -0,0 +1,1111 @@ +From d6a0067e6c08523a8f605f775be980eaf0a23690 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:23 +0100 +Subject: [PATCH 052/116] virtiofsd: check input buffer size in fuse_lowlevel.c + ops +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-49-dgilbert@redhat.com> +Patchwork-id: 93503 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 048/112] virtiofsd: check input buffer size in fuse_lowlevel.c ops +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Each FUSE operation involves parsing the input buffer. Currently the +code assumes the input buffer is large enough for the expected +arguments. This patch uses fuse_mbuf_iter to check the size. + +Most operations are simple to convert. Some are more complicated due to +variable-length inputs or different sizes depending on the protocol +version. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 70995754416eb4491c31607fe380a83cfd25a087) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 581 +++++++++++++++++++++++++++++++--------- + 1 file changed, 456 insertions(+), 125 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 611e8b0..02e1d83 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -27,7 +28,6 @@ + #include + + +-#define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) + #define OFFSET_MAX 0x7fffffffffffffffLL + + struct fuse_pollhandle { +@@ -706,9 +706,14 @@ int fuse_reply_lseek(fuse_req_t req, off_t off) + return send_reply_ok(req, &arg, sizeof(arg)); + } + +-static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.lookup) { + req->se->op.lookup(req, nodeid, name); +@@ -717,9 +722,16 @@ static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_forget(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_forget_in *arg = (struct fuse_forget_in *)inarg; ++ struct fuse_forget_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.forget) { + req->se->op.forget(req, nodeid, arg->nlookup); +@@ -729,20 +741,48 @@ static void do_forget(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg) ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_batch_forget_in *arg = (void *)inarg; +- struct fuse_forget_one *param = (void *)PARAM(arg); +- unsigned int i; ++ struct fuse_batch_forget_in *arg; ++ struct fuse_forget_data *forgets; ++ size_t scount; + + (void)nodeid; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_none(req); ++ return; ++ } ++ ++ /* ++ * Prevent integer overflow. The compiler emits the following warning ++ * unless we use the scount local variable: ++ * ++ * error: comparison is always false due to limited range of data type ++ * [-Werror=type-limits] ++ * ++ * This may be true on 64-bit hosts but we need this check for 32-bit ++ * hosts. ++ */ ++ scount = arg->count; ++ if (scount > SIZE_MAX / sizeof(forgets[0])) { ++ fuse_reply_none(req); ++ return; ++ } ++ ++ forgets = fuse_mbuf_iter_advance(iter, arg->count * sizeof(forgets[0])); ++ if (!forgets) { ++ fuse_reply_none(req); ++ return; ++ } ++ + if (req->se->op.forget_multi) { +- req->se->op.forget_multi(req, arg->count, +- (struct fuse_forget_data *)param); ++ req->se->op.forget_multi(req, arg->count, forgets); + } else if (req->se->op.forget) { ++ unsigned int i; ++ + for (i = 0; i < arg->count; i++) { +- struct fuse_forget_one *forget = ¶m[i]; + struct fuse_req *dummy_req; + + dummy_req = fuse_ll_alloc_req(req->se); +@@ -754,7 +794,7 @@ static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, + dummy_req->ctx = req->ctx; + dummy_req->ch = NULL; + +- req->se->op.forget(dummy_req, forget->nodeid, forget->nlookup); ++ req->se->op.forget(dummy_req, forgets[i].ino, forgets[i].nlookup); + } + fuse_reply_none(req); + } else { +@@ -762,12 +802,19 @@ static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, + } + } + +-static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { + struct fuse_file_info *fip = NULL; + struct fuse_file_info fi; + +- struct fuse_getattr_in *arg = (struct fuse_getattr_in *)inarg; ++ struct fuse_getattr_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (arg->getattr_flags & FUSE_GETATTR_FH) { + memset(&fi, 0, sizeof(fi)); +@@ -782,14 +829,21 @@ static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_setattr_in *arg = (struct fuse_setattr_in *)inarg; +- + if (req->se->op.setattr) { ++ struct fuse_setattr_in *arg; + struct fuse_file_info *fi = NULL; + struct fuse_file_info fi_store; + struct stat stbuf; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&stbuf, 0, sizeof(stbuf)); + convert_attr(arg, &stbuf); + if (arg->valid & FATTR_FH) { +@@ -810,9 +864,16 @@ static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_access(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_access_in *arg = (struct fuse_access_in *)inarg; ++ struct fuse_access_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.access) { + req->se->op.access(req, nodeid, arg->mask); +@@ -821,9 +882,10 @@ static void do_access(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- (void)inarg; ++ (void)iter; + + if (req->se->op.readlink) { + req->se->op.readlink(req, nodeid); +@@ -832,10 +894,18 @@ static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_mknod_in *arg = (struct fuse_mknod_in *)inarg; +- char *name = PARAM(arg); ++ struct fuse_mknod_in *arg; ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + req->ctx.umask = arg->umask; + +@@ -846,22 +916,37 @@ static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *)inarg; ++ struct fuse_mkdir_in *arg; ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + req->ctx.umask = arg->umask; + + if (req->se->op.mkdir) { +- req->se->op.mkdir(req, nodeid, PARAM(arg), arg->mode); ++ req->se->op.mkdir(req, nodeid, name, arg->mode); + } else { + fuse_reply_err(req, ENOSYS); + } + } + +-static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.unlink) { + req->se->op.unlink(req, nodeid, name); +@@ -870,9 +955,15 @@ static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.rmdir) { + req->se->op.rmdir(req, nodeid, name); +@@ -881,10 +972,16 @@ static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; +- char *linkname = ((char *)inarg) + strlen((char *)inarg) + 1; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ const char *linkname = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name || !linkname) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.symlink) { + req->se->op.symlink(req, linkname, nodeid, name); +@@ -893,11 +990,20 @@ static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_rename(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_rename_in *arg = (struct fuse_rename_in *)inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename_in *arg; ++ const char *oldname; ++ const char *newname; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ oldname = fuse_mbuf_iter_advance_str(iter); ++ newname = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !oldname || !newname) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, 0); +@@ -906,11 +1012,20 @@ static void do_rename(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_rename2_in *arg = (struct fuse_rename2_in *)inarg; +- char *oldname = PARAM(arg); +- char *newname = oldname + strlen(oldname) + 1; ++ struct fuse_rename2_in *arg; ++ const char *oldname; ++ const char *newname; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ oldname = fuse_mbuf_iter_advance_str(iter); ++ newname = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !oldname || !newname) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, +@@ -920,24 +1035,38 @@ static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_link(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_link(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_link_in *arg = (struct fuse_link_in *)inarg; ++ struct fuse_link_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.link) { +- req->se->op.link(req, arg->oldnodeid, nodeid, PARAM(arg)); ++ req->se->op.link(req, arg->oldnodeid, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } + } + +-static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_create(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_create_in *arg = (struct fuse_create_in *)inarg; +- + if (req->se->op.create) { ++ struct fuse_create_in *arg; + struct fuse_file_info fi; +- char *name = PARAM(arg); ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; +@@ -950,11 +1079,18 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_open(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_open_in *arg = (struct fuse_open_in *)inarg; ++ struct fuse_open_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + +@@ -965,13 +1101,15 @@ static void do_open(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_read(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_read_in *arg = (struct fuse_read_in *)inarg; +- + if (req->se->op.read) { ++ struct fuse_read_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->lock_owner; +@@ -982,11 +1120,24 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_write(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_write_in *arg; + struct fuse_file_info fi; +- char *param; ++ const char *param; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ param = fuse_mbuf_iter_advance(iter, arg->size); ++ if (!param) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +@@ -994,7 +1145,6 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; +- param = PARAM(arg); + + if (req->se->op.write) { + req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); +@@ -1052,11 +1202,18 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, + se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); + } + +-static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_flush(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_flush_in *arg = (struct fuse_flush_in *)inarg; ++ struct fuse_flush_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.flush = 1; +@@ -1069,19 +1226,26 @@ static void do_flush(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_release(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct fuse_release_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; + fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; + fi.lock_owner = arg->lock_owner; ++ + if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { + fi.flock_release = 1; +- fi.lock_owner = arg->lock_owner; + } + + if (req->se->op.release) { +@@ -1091,11 +1255,19 @@ static void do_release(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_fsync_in *arg; + struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ int datasync; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +@@ -1111,11 +1283,18 @@ static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_open_in *arg = (struct fuse_open_in *)inarg; ++ struct fuse_open_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + +@@ -1126,11 +1305,18 @@ static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_read_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1141,11 +1327,18 @@ static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_read_in *arg = (struct fuse_read_in *)inarg; ++ struct fuse_read_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1156,11 +1349,18 @@ static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_release_in *arg = (struct fuse_release_in *)inarg; ++ struct fuse_release_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; +@@ -1172,11 +1372,19 @@ static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_fsync_in *arg = (struct fuse_fsync_in *)inarg; ++ struct fuse_fsync_in *arg; + struct fuse_file_info fi; +- int datasync = arg->fsync_flags & 1; ++ int datasync; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +@@ -1188,10 +1396,11 @@ static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { + (void)nodeid; +- (void)inarg; ++ (void)iter; + + if (req->se->op.statfs) { + req->se->op.statfs(req, nodeid); +@@ -1204,11 +1413,25 @@ static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *)inarg; +- char *name = PARAM(arg); +- char *value = name + strlen(name) + 1; ++ struct fuse_setxattr_in *arg; ++ const char *name; ++ const char *value; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ value = fuse_mbuf_iter_advance(iter, arg->size); ++ if (!value) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.setxattr) { + req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags); +@@ -1217,20 +1440,36 @@ static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; ++ struct fuse_getxattr_in *arg; ++ const char *name; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ name = fuse_mbuf_iter_advance_str(iter); ++ if (!arg || !name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.getxattr) { +- req->se->op.getxattr(req, nodeid, PARAM(arg), arg->size); ++ req->se->op.getxattr(req, nodeid, name, arg->size); + } else { + fuse_reply_err(req, ENOSYS); + } + } + +-static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *)inarg; ++ struct fuse_getxattr_in *arg; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.listxattr) { + req->se->op.listxattr(req, nodeid, arg->size); +@@ -1239,9 +1478,15 @@ static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- char *name = (char *)inarg; ++ const char *name = fuse_mbuf_iter_advance_str(iter); ++ ++ if (!name) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.removexattr) { + req->se->op.removexattr(req, nodeid, name); +@@ -1265,12 +1510,19 @@ static void convert_fuse_file_lock(struct fuse_file_lock *fl, + flock->l_pid = fl->pid; + } + +-static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock flock; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; +@@ -1284,12 +1536,18 @@ static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, int sleep) ++ struct fuse_mbuf_iter *iter, int sleep) + { +- struct fuse_lk_in *arg = (struct fuse_lk_in *)inarg; ++ struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock flock; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; +@@ -1327,14 +1585,16 @@ static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, + } + } + +-static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- do_setlk_common(req, nodeid, inarg, 0); ++ do_setlk_common(req, nodeid, iter, 0); + } + +-static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- do_setlk_common(req, nodeid, inarg, 1); ++ do_setlk_common(req, nodeid, iter, 1); + } + + static int find_interrupted(struct fuse_session *se, struct fuse_req *req) +@@ -1379,12 +1639,20 @@ static int find_interrupted(struct fuse_session *se, struct fuse_req *req) + return 0; + } + +-static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_interrupt_in *arg = (struct fuse_interrupt_in *)inarg; ++ struct fuse_interrupt_in *arg; + struct fuse_session *se = req->se; + + (void)nodeid; ++ ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", + (unsigned long long)arg->unique); +@@ -1425,9 +1693,15 @@ static struct fuse_req *check_interrupt(struct fuse_session *se, + } + } + +-static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_bmap_in *arg = (struct fuse_bmap_in *)inarg; ++ struct fuse_bmap_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + if (req->se->op.bmap) { + req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); +@@ -1436,18 +1710,34 @@ static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_ioctl_in *arg = (struct fuse_ioctl_in *)inarg; +- unsigned int flags = arg->flags; +- void *in_buf = arg->in_size ? PARAM(arg) : NULL; ++ struct fuse_ioctl_in *arg; ++ unsigned int flags; ++ void *in_buf = NULL; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ flags = arg->flags; + if (flags & FUSE_IOCTL_DIR && !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { + fuse_reply_err(req, ENOTTY); + return; + } + ++ if (arg->in_size) { ++ in_buf = fuse_mbuf_iter_advance(iter, arg->in_size); ++ if (!in_buf) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1468,11 +1758,18 @@ void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) + free(ph); + } + +-static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_poll(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_poll_in *arg = (struct fuse_poll_in *)inarg; ++ struct fuse_poll_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.poll_events = arg->events; +@@ -1496,11 +1793,18 @@ static void do_poll(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_fallocate_in *arg = (struct fuse_fallocate_in *)inarg; ++ struct fuse_fallocate_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1513,12 +1817,17 @@ static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + + static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, +- const void *inarg) ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_copy_file_range_in *arg = +- (struct fuse_copy_file_range_in *)inarg; ++ struct fuse_copy_file_range_in *arg; + struct fuse_file_info fi_in, fi_out; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + memset(&fi_in, 0, sizeof(fi_in)); + fi_in.fh = arg->fh_in; + +@@ -1535,11 +1844,17 @@ static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, + } + } + +-static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_lseek_in *arg = (struct fuse_lseek_in *)inarg; ++ struct fuse_lseek_in *arg; + struct fuse_file_info fi; + ++ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + +@@ -1550,15 +1865,33 @@ static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_init(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { +- struct fuse_init_in *arg = (struct fuse_init_in *)inarg; ++ size_t compat_size = offsetof(struct fuse_init_in, max_readahead); ++ struct fuse_init_in *arg; + struct fuse_init_out outarg; + struct fuse_session *se = req->se; + size_t bufsize = se->bufsize; + size_t outargsize = sizeof(outarg); + + (void)nodeid; ++ ++ /* First consume the old fields... */ ++ arg = fuse_mbuf_iter_advance(iter, compat_size); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ /* ...and now consume the new fields. */ ++ if (arg->major == 7 && arg->minor >= 6) { ++ if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ } ++ + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); + if (arg->major == 7 && arg->minor >= 6) { +@@ -1791,12 +2124,13 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + send_reply_ok(req, &outarg, outargsize); + } + +-static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) ++static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter) + { + struct fuse_session *se = req->se; + + (void)nodeid; +- (void)inarg; ++ (void)iter; + + se->got_destroy = 1; + if (se->op.destroy) { +@@ -1976,7 +2310,7 @@ int fuse_req_interrupted(fuse_req_t req) + } + + static struct { +- void (*func)(fuse_req_t, fuse_ino_t, const void *); ++ void (*func)(fuse_req_t, fuse_ino_t, struct fuse_mbuf_iter *); + const char *name; + } fuse_ll_ops[] = { + [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, +@@ -2060,7 +2394,6 @@ void fuse_session_process_buf_int(struct fuse_session *se, + const struct fuse_buf *buf = bufv->buf; + struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf); + struct fuse_in_header *in; +- const void *inarg; + struct fuse_req *req; + int err; + +@@ -2138,13 +2471,11 @@ void fuse_session_process_buf_int(struct fuse_session *se, + } + } + +- inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { + do_write_buf(req, in->nodeid, &iter, bufv); + } else { +- fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); ++ fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter); + } +- + return; + + reply_err: +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-cleanup-allocated-resource-in-se.patch b/kvm-virtiofsd-cleanup-allocated-resource-in-se.patch new file mode 100755 index 0000000000000000000000000000000000000000..b6de0a95f0847bd98f3081b07f2cf22b86667490 --- /dev/null +++ b/kvm-virtiofsd-cleanup-allocated-resource-in-se.patch @@ -0,0 +1,82 @@ +From 99ff67682ef7c5659bdc9836008541861ae313d5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:56 +0100 +Subject: [PATCH 085/116] virtiofsd: cleanup allocated resource in se +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-82-dgilbert@redhat.com> +Patchwork-id: 93533 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 081/112] virtiofsd: cleanup allocated resource in se +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +This cleans up unfreed resources in se on quiting, including +se->virtio_dev, se->vu_socket_path, se->vu_socketfd. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 61cfc44982e566c33b9d5df17858e4d5ae373873) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 7 +++++++ + tools/virtiofsd/fuse_virtio.c | 7 +++++++ + tools/virtiofsd/fuse_virtio.h | 2 +- + 3 files changed, 15 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 65f91da..440508a 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2532,6 +2532,13 @@ void fuse_session_destroy(struct fuse_session *se) + if (se->fd != -1) { + close(se->fd); + } ++ ++ if (se->vu_socket_path) { ++ virtio_session_close(se); ++ free(se->vu_socket_path); ++ se->vu_socket_path = NULL; ++ } ++ + free(se); + } + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 7a8774a..e7bd772 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -833,3 +833,10 @@ int virtio_session_mount(struct fuse_session *se) + + return 0; + } ++ ++void virtio_session_close(struct fuse_session *se) ++{ ++ close(se->vu_socketfd); ++ free(se->virtio_dev); ++ se->virtio_dev = NULL; ++} +diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h +index cc676b9..1116840 100644 +--- a/tools/virtiofsd/fuse_virtio.h ++++ b/tools/virtiofsd/fuse_virtio.h +@@ -19,7 +19,7 @@ + struct fuse_session; + + int virtio_session_mount(struct fuse_session *se); +- ++void virtio_session_close(struct fuse_session *se); + int virtio_loop(struct fuse_session *se); + + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch b/kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch new file mode 100755 index 0000000000000000000000000000000000000000..d01b000a42a9e1e94763e6c210f421ac9fdc6dfd --- /dev/null +++ b/kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch @@ -0,0 +1,99 @@ +From e00543b0384fba61a9c7274c73e11a25e7ab2946 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:13 +0100 +Subject: [PATCH 102/116] virtiofsd: convert more fprintf and perror to use + fuse log infra +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-99-dgilbert@redhat.com> +Patchwork-id: 93552 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 098/112] virtiofsd: convert more fprintf and perror to use fuse log infra +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eryu Guan + +Signed-off-by: Eryu Guan +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Misono Tomohiro +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit fc1aed0bf96259d0b46b1cfea7497b7762c4ee3d) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_signals.c | 7 +++++-- + tools/virtiofsd/helper.c | 9 ++++++--- + 2 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c +index dc7c8ac..f18625b 100644 +--- a/tools/virtiofsd/fuse_signals.c ++++ b/tools/virtiofsd/fuse_signals.c +@@ -12,6 +12,7 @@ + #include "fuse_i.h" + #include "fuse_lowlevel.h" + ++#include + #include + #include + #include +@@ -47,13 +48,15 @@ static int set_one_signal_handler(int sig, void (*handler)(int), int remove) + sa.sa_flags = 0; + + if (sigaction(sig, NULL, &old_sa) == -1) { +- perror("fuse: cannot get old signal handler"); ++ fuse_log(FUSE_LOG_ERR, "fuse: cannot get old signal handler: %s\n", ++ strerror(errno)); + return -1; + } + + if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && + sigaction(sig, &sa, NULL) == -1) { +- perror("fuse: cannot set signal handler"); ++ fuse_log(FUSE_LOG_ERR, "fuse: cannot set signal handler: %s\n", ++ strerror(errno)); + return -1; + } + return 0; +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 33749bf..f98d8f2 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -208,7 +208,8 @@ int fuse_daemonize(int foreground) + char completed; + + if (pipe(waiter)) { +- perror("fuse_daemonize: pipe"); ++ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: pipe: %s\n", ++ strerror(errno)); + return -1; + } + +@@ -218,7 +219,8 @@ int fuse_daemonize(int foreground) + */ + switch (fork()) { + case -1: +- perror("fuse_daemonize: fork"); ++ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: fork: %s\n", ++ strerror(errno)); + return -1; + case 0: + break; +@@ -228,7 +230,8 @@ int fuse_daemonize(int foreground) + } + + if (setsid() == -1) { +- perror("fuse_daemonize: setsid"); ++ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: setsid: %s\n", ++ strerror(errno)); + return -1; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch b/kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch new file mode 100755 index 0000000000000000000000000000000000000000..8c1022a68f6f0e28f93cb738ff74881531b4d7ff --- /dev/null +++ b/kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch @@ -0,0 +1,57 @@ +From 8e6473e906dfc7d2a62abaf1ec80ff461e4d201d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:12 +0100 +Subject: [PATCH 101/116] virtiofsd: do not always set FUSE_FLOCK_LOCKS +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-98-dgilbert@redhat.com> +Patchwork-id: 93551 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 097/112] virtiofsd: do not always set FUSE_FLOCK_LOCKS +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Peng Tao + +Right now we always enable it regardless of given commandlines. +Fix it by setting the flag relying on the lo->flock bit. + +Signed-off-by: Peng Tao +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e468d4af5f5192ab33283464a9f6933044ce47f7) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ab16135..ccbbec1 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -546,9 +546,14 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); + conn->want |= FUSE_CAP_WRITEBACK_CACHE; + } +- if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); +- conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ if (conn->capable & FUSE_CAP_FLOCK_LOCKS) { ++ if (lo->flock) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); ++ conn->want |= FUSE_CAP_FLOCK_LOCKS; ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n"); ++ conn->want &= ~FUSE_CAP_FLOCK_LOCKS; ++ } + } + + if (conn->capable & FUSE_CAP_POSIX_LOCKS) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-do_read-missing-NULL-check.patch b/kvm-virtiofsd-do_read-missing-NULL-check.patch new file mode 100755 index 0000000000000000000000000000000000000000..4f8e5ef3e502bd43cd5a56efc260f18ac733e57b --- /dev/null +++ b/kvm-virtiofsd-do_read-missing-NULL-check.patch @@ -0,0 +1,49 @@ +From 901c005299b0316bbca7bc190de56f6c7a2a9880 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:11 +0000 +Subject: [PATCH 15/18] virtiofsd: do_read missing NULL check +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-5-dgilbert@redhat.com> +Patchwork-id: 94127 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 4/7] virtiofsd: do_read missing NULL check +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. David Alan Gilbert" + +Missing a NULL check if the argument fetch fails. + +Fixes: Coverity CID 1413119 +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 99ce9a7e60fd12b213b985343ff8fcc172de59fd) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_lowlevel.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 01c418a..704c036 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1116,6 +1116,10 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch b/kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch new file mode 100755 index 0000000000000000000000000000000000000000..569096d0c05861373944e8fb9baf8dbb7dfb1f86 --- /dev/null +++ b/kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch @@ -0,0 +1,67 @@ +From 78152453940967f9ece9fe3ffc5017c669d6ec28 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:36:00 +0100 +Subject: [PATCH 9/9] virtiofsd: drop all capabilities in the wait parent + process +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-8-dgilbert@redhat.com> +Patchwork-id: 96274 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 7/7] virtiofsd: drop all capabilities in the wait parent process +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Michael S. Tsirkin + +From: Stefan Hajnoczi + +All this process does is wait for its child. No capabilities are +needed. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 66502bbca37ca7a3bfa57e82cfc03b89a7a11eae) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 6358874..f41a6b0 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2535,6 +2535,17 @@ static void print_capabilities(void) + } + + /* ++ * Drop all Linux capabilities because the wait parent process only needs to ++ * sit in waitpid(2) and terminate. ++ */ ++static void setup_wait_parent_capabilities(void) ++{ ++ capng_setpid(syscall(SYS_gettid)); ++ capng_clear(CAPNG_SELECT_BOTH); ++ capng_apply(CAPNG_SELECT_BOTH); ++} ++ ++/* + * Move to a new mount, net, and pid namespaces to isolate this process. + */ + static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) +@@ -2567,6 +2578,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + pid_t waited; + int wstatus; + ++ setup_wait_parent_capabilities(); ++ + /* The parent waits for the child */ + do { + waited = waitpid(child, &wstatus, 0); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch b/kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch new file mode 100755 index 0000000000000000000000000000000000000000..3279a5ed4fe0d2330df0d49e9d69318511607057 --- /dev/null +++ b/kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch @@ -0,0 +1,47 @@ +From bc127914b29f2e4163bc7ca786e04ed955d96016 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:00 +0100 +Subject: [PATCH 089/116] virtiofsd: enable PARALLEL_DIROPS during INIT +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-86-dgilbert@redhat.com> +Patchwork-id: 93539 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 085/112] virtiofsd: enable PARALLEL_DIROPS during INIT +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +lookup is a RO operations, PARALLEL_DIROPS can be enabled. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit b7ed733a3841c4d489d3bd6ca7ed23c84db119c2) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index aac282f..70568d2 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2062,6 +2062,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + if (se->conn.want & FUSE_CAP_ASYNC_READ) { + outarg.flags |= FUSE_ASYNC_READ; + } ++ if (se->conn.want & FUSE_CAP_PARALLEL_DIROPS) { ++ outarg.flags |= FUSE_PARALLEL_DIROPS; ++ } + if (se->conn.want & FUSE_CAP_POSIX_LOCKS) { + outarg.flags |= FUSE_POSIX_LOCKS; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch b/kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch new file mode 100755 index 0000000000000000000000000000000000000000..b0f678f63550df4f14308fc236d0f23923b9cbf4 --- /dev/null +++ b/kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch @@ -0,0 +1,167 @@ +From c02ebc7e43f55b9423a065a7c53ba72bdb821c98 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 9 Feb 2021 23:14:54 -0500 +Subject: [PATCH 1/3] virtiofsd: extract lo_do_open() from lo_open() + +RH-Author: Jon Maloy +Message-id: <20210209231456.1555472-2-jmaloy@redhat.com> +Patchwork-id: 101024 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/3] virtiofsd: extract lo_do_open() from lo_open() +Bugzilla: 1919111 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Greg Kurz +RH-Acked-by: Dr. David Alan Gilbert + +From: Stefan Hajnoczi + +Both lo_open() and lo_create() have similar code to open a file. Extract +a common lo_do_open() function from lo_open() that will be used by +lo_create() in a later commit. + +Since lo_do_open() does not otherwise need fuse_req_t req, convert +lo_add_fd_mapping() to use struct lo_data *lo instead. + +Signed-off-by: Stefan Hajnoczi +Message-Id: <20210204150208.367837-2-stefanha@redhat.com> +Reviewed-by: Greg Kurz +Signed-off-by: Dr. David Alan Gilbert + +(cherry-picked from commit 8afaaee976965b7fb90ec225a51d60f35c5f173c) + +Conflict: update_open_flags() takes fewer arguments in this version + than in upstream. Instead of applying commit e12a0edafeb + ("virtiofsd: Add -o allow_direct_io|no_allow_direct_io + options") we keep the old signature, since this seems to + be an unrelated change. + +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + tools/virtiofsd/passthrough_ll.c | 73 ++++++++++++++++++++------------ + 1 file changed, 46 insertions(+), 27 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index f41a6b07c8..518ba11c47 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -439,17 +439,17 @@ static void lo_map_remove(struct lo_map *map, size_t key) + } + + /* Assumes lo->mutex is held */ +-static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd) ++static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd) + { + struct lo_map_elem *elem; + +- elem = lo_map_alloc_elem(&lo_data(req)->fd_map); ++ elem = lo_map_alloc_elem(&lo->fd_map); + if (!elem) { + return -1; + } + + elem->fd = fd; +- return elem - lo_data(req)->fd_map.elems; ++ return elem - lo->fd_map.elems; + } + + /* Assumes lo->mutex is held */ +@@ -1712,6 +1712,38 @@ static void update_open_flags(int writeback, struct fuse_file_info *fi) + fi->flags &= ~O_DIRECT; + } + ++static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, ++ struct fuse_file_info *fi) ++{ ++ char buf[64]; ++ ssize_t fh; ++ int fd; ++ ++ update_open_flags(lo->writeback, fi); ++ ++ sprintf(buf, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); ++ if (fd == -1) { ++ return errno; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_fd_mapping(lo, fd); ++ pthread_mutex_unlock(&lo->mutex); ++ if (fh == -1) { ++ close(fd); ++ return ENOMEM; ++ } ++ ++ fi->fh = fh; ++ if (lo->cache == CACHE_NONE) { ++ fi->direct_io = 1; ++ } else if (lo->cache == CACHE_ALWAYS) { ++ fi->keep_cache = 1; ++ } ++ return 0; ++} ++ + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) + { +@@ -1752,7 +1784,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + ssize_t fh; + + pthread_mutex_lock(&lo->mutex); +- fh = lo_add_fd_mapping(req, fd); ++ fh = lo_add_fd_mapping(lo, fd); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); +@@ -1943,38 +1975,25 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + + static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { +- int fd; +- ssize_t fh; +- char buf[64]; + struct lo_data *lo = lo_data(req); ++ struct lo_inode *inode = lo_inode(req, ino); ++ int err; + + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, + fi->flags); + +- update_open_flags(lo->writeback, fi); +- +- sprintf(buf, "%i", lo_fd(req, ino)); +- fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); +- if (fd == -1) { +- return (void)fuse_reply_err(req, errno); +- } +- +- pthread_mutex_lock(&lo->mutex); +- fh = lo_add_fd_mapping(req, fd); +- pthread_mutex_unlock(&lo->mutex); +- if (fh == -1) { +- close(fd); +- fuse_reply_err(req, ENOMEM); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); + return; + } + +- fi->fh = fh; +- if (lo->cache == CACHE_NONE) { +- fi->direct_io = 1; +- } else if (lo->cache == CACHE_ALWAYS) { +- fi->keep_cache = 1; ++ err = lo_do_open(lo, inode, fi); ++ lo_inode_put(lo, &inode); ++ if (err) { ++ fuse_reply_err(req, err); ++ } else { ++ fuse_reply_open(req, fi); + } +- fuse_reply_open(req, fi); + } + + static void lo_release(fuse_req_t req, fuse_ino_t ino, +-- +2.18.2 + diff --git a/kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch b/kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch new file mode 100755 index 0000000000000000000000000000000000000000..96f91a13642d80ed6ef986f21d34c434f74f04f5 --- /dev/null +++ b/kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch @@ -0,0 +1,111 @@ +From 983b383bc4a92a9f7ecff0332cadefed2f58f502 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:50 +0100 +Subject: [PATCH 079/116] virtiofsd: extract root inode init into setup_root() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-76-dgilbert@redhat.com> +Patchwork-id: 93527 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 075/112] virtiofsd: extract root inode init into setup_root() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Inititialize the root inode in a single place. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Stefan Hajnoczi +dgilbert: +with fix suggested by Misono Tomohiro +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 3ca8a2b1c83eb185c232a4e87abbb65495263756) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 35 +++++++++++++++++++++++++---------- + 1 file changed, 25 insertions(+), 10 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 33bfb4d..9e7191e 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2351,6 +2351,30 @@ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + } + } + ++static void setup_root(struct lo_data *lo, struct lo_inode *root) ++{ ++ int fd, res; ++ struct stat stat; ++ ++ fd = open("/", O_PATH); ++ if (fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source); ++ exit(1); ++ } ++ ++ res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source); ++ exit(1); ++ } ++ ++ root->is_symlink = false; ++ root->fd = fd; ++ root->ino = stat.st_ino; ++ root->dev = stat.st_dev; ++ root->refcount = 2; ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2426,8 +2450,6 @@ int main(int argc, char *argv[]) + if (lo.debug) { + current_log_level = FUSE_LOG_DEBUG; + } +- lo.root.refcount = 2; +- + if (lo.source) { + struct stat stat; + int res; +@@ -2446,7 +2468,6 @@ int main(int argc, char *argv[]) + } else { + lo.source = "/"; + } +- lo.root.is_symlink = false; + if (!lo.timeout_set) { + switch (lo.cache) { + case CACHE_NEVER: +@@ -2466,13 +2487,6 @@ int main(int argc, char *argv[]) + exit(1); + } + +- lo.root.fd = open(lo.source, O_PATH); +- +- if (lo.root.fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", lo.source); +- exit(1); +- } +- + se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); + if (se == NULL) { + goto err_out1; +@@ -2495,6 +2509,7 @@ int main(int argc, char *argv[]) + + setup_sandbox(&lo, se, opts.syslog); + ++ setup_root(&lo, &lo.root); + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch b/kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch new file mode 100755 index 0000000000000000000000000000000000000000..4860bec0e74ba86cc799a236284be2b359a9c35d --- /dev/null +++ b/kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch @@ -0,0 +1,85 @@ +From b3cd18ab58e331d3610cf00f857d6a945f11a030 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:49 +0100 +Subject: [PATCH 078/116] virtiofsd: fail when parent inode isn't known in + lo_do_lookup() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-75-dgilbert@redhat.com> +Patchwork-id: 93529 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 074/112] virtiofsd: fail when parent inode isn't known in lo_do_lookup() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +The Linux file handle APIs (struct export_operations) can access inodes +that are not attached to parents because path name traversal is not +performed. Refuse if there is no parent in lo_do_lookup(). + +Also clean up lo_do_lookup() while we're here. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9de4fab5995d115f8ebfb41d8d94a866d80a1708) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index de12e75..33bfb4d 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -777,6 +777,15 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_data *lo = lo_data(req); + struct lo_inode *inode, *dir = lo_inode(req, parent); + ++ /* ++ * name_to_handle_at() and open_by_handle_at() can reach here with fuse ++ * mount point in guest, but we don't have its inode info in the ++ * ino_map. ++ */ ++ if (!dir) { ++ return ENOENT; ++ } ++ + memset(e, 0, sizeof(*e)); + e->attr_timeout = lo->timeout; + e->entry_timeout = lo->timeout; +@@ -786,7 +795,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + name = "."; + } + +- newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); ++ newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW); + if (newfd == -1) { + goto out_err; + } +@@ -796,7 +805,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out_err; + } + +- inode = lo_find(lo_data(req), &e->attr); ++ inode = lo_find(lo, &e->attr); + if (inode) { + close(newfd); + newfd = -1; +@@ -812,6 +821,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + inode->is_symlink = S_ISLNK(e->attr.st_mode); + inode->refcount = 1; + inode->fd = newfd; ++ newfd = -1; + inode->ino = e->attr.st_ino; + inode->dev = e->attr.st_dev; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-error-handling-in-main.patch b/kvm-virtiofsd-fix-error-handling-in-main.patch new file mode 100755 index 0000000000000000000000000000000000000000..a831992ed5c6d1dc1b5e5ebc7872f13b9065742c --- /dev/null +++ b/kvm-virtiofsd-fix-error-handling-in-main.patch @@ -0,0 +1,63 @@ +From 0ea1c7375d6509367399c706eb9d1e8cf79a5830 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:55 +0100 +Subject: [PATCH 084/116] virtiofsd: fix error handling in main() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-81-dgilbert@redhat.com> +Patchwork-id: 93534 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 080/112] virtiofsd: fix error handling in main() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +Neither fuse_parse_cmdline() nor fuse_opt_parse() goes to the right place +to do cleanup. + +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit c6de804670f2255ce776263124c37f3370dc5ac1) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9ed77a1..af050c6 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2443,13 +2443,14 @@ int main(int argc, char *argv[]) + lo_map_init(&lo.fd_map); + + if (fuse_parse_cmdline(&args, &opts) != 0) { +- return 1; ++ goto err_out1; + } + fuse_set_log_func(log_func); + use_syslog = opts.syslog; + if (use_syslog) { + openlog("virtiofsd", LOG_PID, LOG_DAEMON); + } ++ + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); +@@ -2468,7 +2469,7 @@ int main(int argc, char *argv[]) + } + + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { +- return 1; ++ goto err_out1; + } + + /* +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch b/kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch new file mode 100755 index 0000000000000000000000000000000000000000..420a8a6ec7276066524745f6428e3e94f16645f5 --- /dev/null +++ b/kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch @@ -0,0 +1,44 @@ +From 9c291ca8624318613ede6e4174d08cf45aae8384 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:01 +0100 +Subject: [PATCH 090/116] virtiofsd: fix incorrect error handling in + lo_do_lookup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-87-dgilbert@redhat.com> +Patchwork-id: 93543 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 086/112] virtiofsd: fix incorrect error handling in lo_do_lookup +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eric Ren + +Signed-off-by: Eric Ren +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit fc3f0041b43b6c64aa97b3558a6abe1a10028354) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e8dc5c7..05b5f89 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -814,7 +814,6 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + close(newfd); + newfd = -1; + } else { +- saverr = ENOMEM; + inode = calloc(1, sizeof(struct lo_inode)); + if (!inode) { + goto out_err; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-libfuse-information-leaks.patch b/kvm-virtiofsd-fix-libfuse-information-leaks.patch new file mode 100755 index 0000000000000000000000000000000000000000..90debb09044ec7e4c1096558cfca659386e3d5c4 --- /dev/null +++ b/kvm-virtiofsd-fix-libfuse-information-leaks.patch @@ -0,0 +1,322 @@ +From e0d64e481e5a9fab5ff90d2a8f84afcd3311d13b Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:35 +0100 +Subject: [PATCH 064/116] virtiofsd: fix libfuse information leaks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-61-dgilbert@redhat.com> +Patchwork-id: 93515 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 060/112] virtiofsd: fix libfuse information leaks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Some FUSE message replies contain padding fields that are not +initialized by libfuse. This is fine in traditional FUSE applications +because the kernel is trusted. virtiofsd does not trust the guest and +must not expose uninitialized memory. + +Use C struct initializers to automatically zero out memory. Not all of +these code changes are strictly necessary but they will prevent future +information leaks if the structs are extended. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 3db2876a0153ac7103c077c53090e020faffb3ea) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 150 ++++++++++++++++++++-------------------- + 1 file changed, 76 insertions(+), 74 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 2d6dc5a..6ceb33d 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -44,21 +44,23 @@ static __attribute__((constructor)) void fuse_ll_init_pagesize(void) + + static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) + { +- attr->ino = stbuf->st_ino; +- attr->mode = stbuf->st_mode; +- attr->nlink = stbuf->st_nlink; +- attr->uid = stbuf->st_uid; +- attr->gid = stbuf->st_gid; +- attr->rdev = stbuf->st_rdev; +- attr->size = stbuf->st_size; +- attr->blksize = stbuf->st_blksize; +- attr->blocks = stbuf->st_blocks; +- attr->atime = stbuf->st_atime; +- attr->mtime = stbuf->st_mtime; +- attr->ctime = stbuf->st_ctime; +- attr->atimensec = ST_ATIM_NSEC(stbuf); +- attr->mtimensec = ST_MTIM_NSEC(stbuf); +- attr->ctimensec = ST_CTIM_NSEC(stbuf); ++ *attr = (struct fuse_attr){ ++ .ino = stbuf->st_ino, ++ .mode = stbuf->st_mode, ++ .nlink = stbuf->st_nlink, ++ .uid = stbuf->st_uid, ++ .gid = stbuf->st_gid, ++ .rdev = stbuf->st_rdev, ++ .size = stbuf->st_size, ++ .blksize = stbuf->st_blksize, ++ .blocks = stbuf->st_blocks, ++ .atime = stbuf->st_atime, ++ .mtime = stbuf->st_mtime, ++ .ctime = stbuf->st_ctime, ++ .atimensec = ST_ATIM_NSEC(stbuf), ++ .mtimensec = ST_MTIM_NSEC(stbuf), ++ .ctimensec = ST_CTIM_NSEC(stbuf), ++ }; + } + + static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) +@@ -183,16 +185,16 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out = { ++ .unique = req->unique, ++ .error = error, ++ }; + + if (error <= -1000 || error > 0) { + fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); + error = -ERANGE; + } + +- out.unique = req->unique; +- out.error = error; +- + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + +@@ -277,14 +279,16 @@ size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + static void convert_statfs(const struct statvfs *stbuf, + struct fuse_kstatfs *kstatfs) + { +- kstatfs->bsize = stbuf->f_bsize; +- kstatfs->frsize = stbuf->f_frsize; +- kstatfs->blocks = stbuf->f_blocks; +- kstatfs->bfree = stbuf->f_bfree; +- kstatfs->bavail = stbuf->f_bavail; +- kstatfs->files = stbuf->f_files; +- kstatfs->ffree = stbuf->f_ffree; +- kstatfs->namelen = stbuf->f_namemax; ++ *kstatfs = (struct fuse_kstatfs){ ++ .bsize = stbuf->f_bsize, ++ .frsize = stbuf->f_frsize, ++ .blocks = stbuf->f_blocks, ++ .bfree = stbuf->f_bfree, ++ .bavail = stbuf->f_bavail, ++ .files = stbuf->f_files, ++ .ffree = stbuf->f_ffree, ++ .namelen = stbuf->f_namemax, ++ }; + } + + static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) +@@ -328,12 +332,14 @@ static unsigned int calc_timeout_nsec(double t) + static void fill_entry(struct fuse_entry_out *arg, + const struct fuse_entry_param *e) + { +- arg->nodeid = e->ino; +- arg->generation = e->generation; +- arg->entry_valid = calc_timeout_sec(e->entry_timeout); +- arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout); +- arg->attr_valid = calc_timeout_sec(e->attr_timeout); +- arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout); ++ *arg = (struct fuse_entry_out){ ++ .nodeid = e->ino, ++ .generation = e->generation, ++ .entry_valid = calc_timeout_sec(e->entry_timeout), ++ .entry_valid_nsec = calc_timeout_nsec(e->entry_timeout), ++ .attr_valid = calc_timeout_sec(e->attr_timeout), ++ .attr_valid_nsec = calc_timeout_nsec(e->attr_timeout), ++ }; + convert_stat(&e->attr, &arg->attr); + } + +@@ -362,10 +368,12 @@ size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + fill_entry(&dp->entry_out, e); + + struct fuse_dirent *dirent = &dp->dirent; +- dirent->ino = e->attr.st_ino; +- dirent->off = off; +- dirent->namelen = namelen; +- dirent->type = (e->attr.st_mode & S_IFMT) >> 12; ++ *dirent = (struct fuse_dirent){ ++ .ino = e->attr.st_ino, ++ .off = off, ++ .namelen = namelen, ++ .type = (e->attr.st_mode & S_IFMT) >> 12, ++ }; + memcpy(dirent->name, name, namelen); + memset(dirent->name + namelen, 0, entlen_padded - entlen); + +@@ -496,15 +504,14 @@ static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) + { + struct iovec iov[2]; +- struct fuse_out_header out; ++ struct fuse_out_header out = { ++ .unique = req->unique, ++ }; + int res; + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + +- out.unique = req->unique; +- out.error = 0; +- + res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv); + if (res <= 0) { + fuse_free_req(req); +@@ -2145,14 +2152,14 @@ static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, + static int send_notify_iov(struct fuse_session *se, int notify_code, + struct iovec *iov, int count) + { +- struct fuse_out_header out; ++ struct fuse_out_header out = { ++ .error = notify_code, ++ }; + + if (!se->got_init) { + return -ENOTCONN; + } + +- out.unique = 0; +- out.error = notify_code; + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + +@@ -2162,11 +2169,11 @@ static int send_notify_iov(struct fuse_session *se, int notify_code, + int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) + { + if (ph != NULL) { +- struct fuse_notify_poll_wakeup_out outarg; ++ struct fuse_notify_poll_wakeup_out outarg = { ++ .kh = ph->kh, ++ }; + struct iovec iov[2]; + +- outarg.kh = ph->kh; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + +@@ -2179,17 +2186,17 @@ int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) + int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + off_t off, off_t len) + { +- struct fuse_notify_inval_inode_out outarg; ++ struct fuse_notify_inval_inode_out outarg = { ++ .ino = ino, ++ .off = off, ++ .len = len, ++ }; + struct iovec iov[2]; + + if (!se) { + return -EINVAL; + } + +- outarg.ino = ino; +- outarg.off = off; +- outarg.len = len; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + +@@ -2199,17 +2206,16 @@ int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + const char *name, size_t namelen) + { +- struct fuse_notify_inval_entry_out outarg; ++ struct fuse_notify_inval_entry_out outarg = { ++ .parent = parent, ++ .namelen = namelen, ++ }; + struct iovec iov[3]; + + if (!se) { + return -EINVAL; + } + +- outarg.parent = parent; +- outarg.namelen = namelen; +- outarg.padding = 0; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + iov[2].iov_base = (void *)name; +@@ -2222,18 +2228,17 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + fuse_ino_t child, const char *name, + size_t namelen) + { +- struct fuse_notify_delete_out outarg; ++ struct fuse_notify_delete_out outarg = { ++ .parent = parent, ++ .child = child, ++ .namelen = namelen, ++ }; + struct iovec iov[3]; + + if (!se) { + return -EINVAL; + } + +- outarg.parent = parent; +- outarg.child = child; +- outarg.namelen = namelen; +- outarg.padding = 0; +- + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + iov[2].iov_base = (void *)name; +@@ -2245,24 +2250,21 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv) + { +- struct fuse_out_header out; +- struct fuse_notify_store_out outarg; ++ struct fuse_out_header out = { ++ .error = FUSE_NOTIFY_STORE, ++ }; ++ struct fuse_notify_store_out outarg = { ++ .nodeid = ino, ++ .offset = offset, ++ .size = fuse_buf_size(bufv), ++ }; + struct iovec iov[3]; +- size_t size = fuse_buf_size(bufv); + int res; + + if (!se) { + return -EINVAL; + } + +- out.unique = 0; +- out.error = FUSE_NOTIFY_STORE; +- +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; +- + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(out); + iov[1].iov_base = &outarg; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch b/kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch new file mode 100755 index 0000000000000000000000000000000000000000..6243037b44a2bcfdc0e40c6706951a9e608da135 --- /dev/null +++ b/kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch @@ -0,0 +1,94 @@ +From 9a44d78f5019280b006bb5b3de7164336289d639 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:21 +0100 +Subject: [PATCH 110/116] virtiofsd: fix lo_destroy() resource leaks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-107-dgilbert@redhat.com> +Patchwork-id: 93560 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 106/112] virtiofsd: fix lo_destroy() resource leaks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Now that lo_destroy() is serialized we can call unref_inode() so that +all inode resources are freed. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 28f7a3b026f231bfe8de5fed6a18a8d27b1dfcee) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 41 ++++++++++++++++++++-------------------- + 1 file changed, 20 insertions(+), 21 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 79b8b71..eb001b9 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1371,26 +1371,6 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + } + +-static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) +-{ +- struct lo_inode *inode = value; +- struct lo_data *lo = user_data; +- +- inode->nlookup = 0; +- lo_map_remove(&lo->ino_map, inode->fuse_ino); +- close(inode->fd); +- lo_inode_put(lo, &inode); /* Drop our refcount from lo_do_lookup() */ +- +- return TRUE; +-} +- +-static void unref_all_inodes(struct lo_data *lo) +-{ +- pthread_mutex_lock(&lo->mutex); +- g_hash_table_foreach_remove(lo->inodes, unref_all_inodes_cb, lo); +- pthread_mutex_unlock(&lo->mutex); +-} +- + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +@@ -2477,7 +2457,26 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + static void lo_destroy(void *userdata) + { + struct lo_data *lo = (struct lo_data *)userdata; +- unref_all_inodes(lo); ++ ++ /* ++ * Normally lo->mutex must be taken when traversing lo->inodes but ++ * lo_destroy() is a serialized request so no races are possible here. ++ * ++ * In addition, we cannot acquire lo->mutex since unref_inode() takes it ++ * too and this would result in a recursive lock. ++ */ ++ while (true) { ++ GHashTableIter iter; ++ gpointer key, value; ++ ++ g_hash_table_iter_init(&iter, lo->inodes); ++ if (!g_hash_table_iter_next(&iter, &key, &value)) { ++ break; ++ } ++ ++ struct lo_inode *inode = value; ++ unref_inode_lolocked(lo, inode, inode->nlookup); ++ } + } + + static struct fuse_lowlevel_ops lo_oper = { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fix-memory-leak-on-lo.source.patch b/kvm-virtiofsd-fix-memory-leak-on-lo.source.patch new file mode 100755 index 0000000000000000000000000000000000000000..4d7d6dcdc30e22672d569175134c7a244dd109db --- /dev/null +++ b/kvm-virtiofsd-fix-memory-leak-on-lo.source.patch @@ -0,0 +1,66 @@ +From 9e0f5b64f30c2f841f297e25c2f3a6d82c8a16b8 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:57 +0100 +Subject: [PATCH 086/116] virtiofsd: fix memory leak on lo.source +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-83-dgilbert@redhat.com> +Patchwork-id: 93536 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 082/112] virtiofsd: fix memory leak on lo.source +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Liu Bo + +valgrind reported that lo.source is leaked on quiting, but it was defined +as (const char*) as it may point to a const string "/". + +Signed-off-by: Liu Bo +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit eb68a33b5fc5dde87bd9b99b94e7c33a5d8ea82e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index af050c6..056ebe8 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -115,7 +115,7 @@ struct lo_data { + int writeback; + int flock; + int xattr; +- const char *source; ++ char *source; + double timeout; + int cache; + int timeout_set; +@@ -2497,9 +2497,8 @@ int main(int argc, char *argv[]) + fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); + exit(1); + } +- + } else { +- lo.source = "/"; ++ lo.source = strdup("/"); + } + if (!lo.timeout_set) { + switch (lo.cache) { +@@ -2570,5 +2569,7 @@ err_out1: + close(lo.root.fd); + } + ++ free(lo.source); ++ + return ret ? 1 : 0; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch b/kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch new file mode 100755 index 0000000000000000000000000000000000000000..b17d93c5898c14436b8b0353e2897b9a01beb05e --- /dev/null +++ b/kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch @@ -0,0 +1,56 @@ +From 3b6461ee08654b2cbb6d4e0cc15c02f89a6610d5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:09 +0000 +Subject: [PATCH 13/18] virtiofsd: fv_create_listen_socket error path socket + leak +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-3-dgilbert@redhat.com> +Patchwork-id: 94124 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/7] virtiofsd: fv_create_listen_socket error path socket leak +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. David Alan Gilbert" + +If we fail when bringing up the socket we can leak the listen_fd; +in practice the daemon will exit so it's not really a problem. + +Fixes: Coverity CID 1413121 +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 6fa249027f97e3080f3d9c0fab3f94f8f80828fe) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/fuse_virtio.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 80a6e92..dd1c605 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -916,6 +916,7 @@ static int fv_create_listen_socket(struct fuse_session *se) + old_umask = umask(0077); + if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n"); ++ close(listen_sock); + umask(old_umask); + return -1; + } +@@ -923,6 +924,7 @@ static int fv_create_listen_socket(struct fuse_session *se) + + if (listen(listen_sock, 1) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n"); ++ close(listen_sock); + return -1; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-get-set-features-callbacks.patch b/kvm-virtiofsd-get-set-features-callbacks.patch new file mode 100755 index 0000000000000000000000000000000000000000..fcb5ca29f271f97265750cf7d01ed7ef85086fc8 --- /dev/null +++ b/kvm-virtiofsd-get-set-features-callbacks.patch @@ -0,0 +1,66 @@ +From 59bfe3ad924d00dc9c7a4363fcd3db36ea247988 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:59 +0100 +Subject: [PATCH 028/116] virtiofsd: get/set features callbacks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-25-dgilbert@redhat.com> +Patchwork-id: 93478 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 024/112] virtiofsd: get/set features callbacks +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: "Dr. David Alan Gilbert" + +Add the get/set features callbacks. + +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f2cef5fb9ae20136ca18d16328787b69b3abfa18) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 1928a20..4819e56 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -46,6 +46,17 @@ struct virtio_fs_config { + uint32_t num_queues; + }; + ++/* Callback from libvhost-user */ ++static uint64_t fv_get_features(VuDev *dev) ++{ ++ return 1ULL << VIRTIO_F_VERSION_1; ++} ++ ++/* Callback from libvhost-user */ ++static void fv_set_features(VuDev *dev, uint64_t features) ++{ ++} ++ + /* + * Callback from libvhost-user if there's a new fd we're supposed to listen + * to, typically a queue kick? +@@ -78,7 +89,9 @@ static bool fv_queue_order(VuDev *dev, int qidx) + } + + static const VuDevIface fv_iface = { +- /* TODO: Add other callbacks */ ++ .get_features = fv_get_features, ++ .set_features = fv_set_features, ++ + .queue_is_processed_in_order = fv_queue_order, + }; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch b/kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch new file mode 100755 index 0000000000000000000000000000000000000000..68d20e73481db1cfe76dee9d629bbe6bcecf138b --- /dev/null +++ b/kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch @@ -0,0 +1,589 @@ +From da6ee5c24397d2ca93dfaf275fdd9dafc922da15 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:11 +0100 +Subject: [PATCH 100/116] virtiofsd: introduce inode refcount to prevent + use-after-free +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-97-dgilbert@redhat.com> +Patchwork-id: 93550 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 096/112] virtiofsd: introduce inode refcount to prevent use-after-free +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +If thread A is using an inode it must not be deleted by thread B when +processing a FUSE_FORGET request. + +The FUSE protocol itself already has a counter called nlookup that is +used in FUSE_FORGET messages. We cannot trust this counter since the +untrusted client can manipulate it via FUSE_FORGET messages. + +Introduce a new refcount to keep inodes alive for the required lifespan. +lo_inode_put() must be called to release a reference. FUSE's nlookup +counter holds exactly one reference so that the inode stays alive as +long as the client still wants to remember it. + +Note that the lo_inode->is_symlink field is moved to avoid creating a +hole in the struct due to struct field alignment. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit c241aa9457d88c6a0d027f48fadfed131646bce3) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 169 +++++++++++++++++++++++++++++++++------ + 1 file changed, 146 insertions(+), 23 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e3a6d6b..ab16135 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -97,7 +97,13 @@ struct lo_key { + + struct lo_inode { + int fd; +- bool is_symlink; ++ ++ /* ++ * Atomic reference count for this object. The nlookup field holds a ++ * reference and release it when nlookup reaches 0. ++ */ ++ gint refcount; ++ + struct lo_key key; + + /* +@@ -116,6 +122,8 @@ struct lo_inode { + fuse_ino_t fuse_ino; + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ ++ ++ bool is_symlink; + }; + + struct lo_cred { +@@ -471,6 +479,23 @@ static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) + return elem - lo_data(req)->ino_map.elems; + } + ++static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep) ++{ ++ struct lo_inode *inode = *inodep; ++ ++ if (!inode) { ++ return; ++ } ++ ++ *inodep = NULL; ++ ++ if (g_atomic_int_dec_and_test(&inode->refcount)) { ++ close(inode->fd); ++ free(inode); ++ } ++} ++ ++/* Caller must release refcount using lo_inode_put() */ + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { + struct lo_data *lo = lo_data(req); +@@ -478,6 +503,9 @@ static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->ino_map, ino); ++ if (elem) { ++ g_atomic_int_inc(&elem->inode->refcount); ++ } + pthread_mutex_unlock(&lo->mutex); + + if (!elem) { +@@ -487,10 +515,23 @@ static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + return elem->inode; + } + ++/* ++ * TODO Remove this helper and force callers to hold an inode refcount until ++ * they are done with the fd. This will be done in a later patch to make ++ * review easier. ++ */ + static int lo_fd(fuse_req_t req, fuse_ino_t ino) + { + struct lo_inode *inode = lo_inode(req, ino); +- return inode ? inode->fd : -1; ++ int fd; ++ ++ if (!inode) { ++ return -1; ++ } ++ ++ fd = inode->fd; ++ lo_inode_put(lo_data(req), &inode); ++ return fd; + } + + static void lo_init(void *userdata, struct fuse_conn_info *conn) +@@ -545,6 +586,10 @@ static void lo_getattr(fuse_req_t req, fuse_ino_t ino, + fuse_reply_attr(req, &buf, lo->timeout); + } + ++/* ++ * Increments parent->nlookup and caller must release refcount using ++ * lo_inode_put(&parent). ++ */ + static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, + char path[PATH_MAX], struct lo_inode **parent) + { +@@ -582,6 +627,7 @@ retry: + p = &lo->root; + pthread_mutex_lock(&lo->mutex); + p->nlookup++; ++ g_atomic_int_inc(&p->refcount); + pthread_mutex_unlock(&lo->mutex); + } else { + *last = '\0'; +@@ -625,6 +671,7 @@ retry: + + fail_unref: + unref_inode_lolocked(lo, p, 1); ++ lo_inode_put(lo, &p); + fail: + if (retries) { + retries--; +@@ -663,6 +710,7 @@ fallback: + if (res != -1) { + res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW); + unref_inode_lolocked(lo, parent, 1); ++ lo_inode_put(lo, &parent); + } + + return res; +@@ -780,11 +828,13 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + goto out_err; + } + } ++ lo_inode_put(lo, &inode); + + return lo_getattr(req, ino, fi); + + out_err: + saverr = errno; ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + +@@ -801,6 +851,7 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + if (p) { + assert(p->nlookup > 0); + p->nlookup++; ++ g_atomic_int_inc(&p->refcount); + } + pthread_mutex_unlock(&lo->mutex); + +@@ -820,6 +871,10 @@ static void posix_locks_value_destroy(gpointer data) + free(plock); + } + ++/* ++ * Increments nlookup and caller must release refcount using ++ * lo_inode_put(&parent). ++ */ + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct fuse_entry_param *e) + { +@@ -827,7 +882,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + int res; + int saverr; + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode, *dir = lo_inode(req, parent); ++ struct lo_inode *inode = NULL; ++ struct lo_inode *dir = lo_inode(req, parent); + + /* + * name_to_handle_at() and open_by_handle_at() can reach here with fuse +@@ -868,6 +924,13 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + inode->is_symlink = S_ISLNK(e->attr.st_mode); ++ ++ /* ++ * One for the caller and one for nlookup (released in ++ * unref_inode_lolocked()) ++ */ ++ g_atomic_int_set(&inode->refcount, 2); ++ + inode->nlookup = 1; + inode->fd = newfd; + newfd = -1; +@@ -883,6 +946,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; ++ lo_inode_put(lo, &inode); ++ lo_inode_put(lo, &dir); + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e->ino); +@@ -894,6 +959,8 @@ out_err: + if (newfd != -1) { + close(newfd); + } ++ lo_inode_put(lo, &inode); ++ lo_inode_put(lo, &dir); + return saverr; + } + +@@ -991,6 +1058,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + { + int res; + int saverr; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *dir; + struct fuse_entry_param e; + struct lo_cred old = {}; +@@ -1032,9 +1100,11 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); ++ lo_inode_put(lo, &dir); + return; + + out: ++ lo_inode_put(lo, &dir); + fuse_reply_err(req, saverr); + } + +@@ -1085,6 +1155,7 @@ fallback: + if (res != -1) { + res = linkat(parent->fd, path, dfd, name, 0); + unref_inode_lolocked(lo, parent, 1); ++ lo_inode_put(lo, &parent); + } + + return res; +@@ -1095,6 +1166,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + { + int res; + struct lo_data *lo = lo_data(req); ++ struct lo_inode *parent_inode; + struct lo_inode *inode; + struct fuse_entry_param e; + int saverr; +@@ -1104,17 +1176,18 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + return; + } + ++ parent_inode = lo_inode(req, parent); + inode = lo_inode(req, ino); +- if (!inode) { +- fuse_reply_err(req, EBADF); +- return; ++ if (!parent_inode || !inode) { ++ errno = EBADF; ++ goto out_err; + } + + memset(&e, 0, sizeof(struct fuse_entry_param)); + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; + +- res = linkat_empty_nofollow(lo, inode, lo_fd(req, parent), name); ++ res = linkat_empty_nofollow(lo, inode, parent_inode->fd, name); + if (res == -1) { + goto out_err; + } +@@ -1133,13 +1206,18 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); ++ lo_inode_put(lo, &parent_inode); ++ lo_inode_put(lo, &inode); + return; + + out_err: + saverr = errno; ++ lo_inode_put(lo, &parent_inode); ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + ++/* Increments nlookup and caller must release refcount using lo_inode_put() */ + static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, + const char *name) + { +@@ -1176,6 +1254,7 @@ static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + + fuse_reply_err(req, res == -1 ? errno : 0); + unref_inode_lolocked(lo, inode, 1); ++ lo_inode_put(lo, &inode); + } + + static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, +@@ -1183,8 +1262,10 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + unsigned int flags) + { + int res; +- struct lo_inode *oldinode; +- struct lo_inode *newinode; ++ struct lo_inode *parent_inode; ++ struct lo_inode *newparent_inode; ++ struct lo_inode *oldinode = NULL; ++ struct lo_inode *newinode = NULL; + struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { +@@ -1192,6 +1273,13 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + return; + } + ++ parent_inode = lo_inode(req, parent); ++ newparent_inode = lo_inode(req, newparent); ++ if (!parent_inode || !newparent_inode) { ++ fuse_reply_err(req, EBADF); ++ goto out; ++ } ++ + oldinode = lookup_name(req, parent, name); + newinode = lookup_name(req, newparent, newname); + +@@ -1204,8 +1292,8 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + #ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); + #else +- res = syscall(SYS_renameat2, lo_fd(req, parent), name, +- lo_fd(req, newparent), newname, flags); ++ res = syscall(SYS_renameat2, parent_inode->fd, name, ++ newparent_inode->fd, newname, flags); + if (res == -1 && errno == ENOSYS) { + fuse_reply_err(req, EINVAL); + } else { +@@ -1215,12 +1303,16 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + goto out; + } + +- res = renameat(lo_fd(req, parent), name, lo_fd(req, newparent), newname); ++ res = renameat(parent_inode->fd, name, newparent_inode->fd, newname); + + fuse_reply_err(req, res == -1 ? errno : 0); + out: + unref_inode_lolocked(lo, oldinode, 1); + unref_inode_lolocked(lo, newinode, 1); ++ lo_inode_put(lo, &oldinode); ++ lo_inode_put(lo, &newinode); ++ lo_inode_put(lo, &parent_inode); ++ lo_inode_put(lo, &newparent_inode); + } + + static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) +@@ -1244,6 +1336,7 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + + fuse_reply_err(req, res == -1 ? errno : 0); + unref_inode_lolocked(lo, inode, 1); ++ lo_inode_put(lo, &inode); + } + + static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, +@@ -1265,8 +1358,9 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + g_hash_table_destroy(inode->posix_locks); + pthread_mutex_destroy(&inode->plock_mutex); + pthread_mutex_unlock(&lo->mutex); +- close(inode->fd); +- free(inode); ++ ++ /* Drop our refcount from lo_do_lookup() */ ++ lo_inode_put(lo, &inode); + } else { + pthread_mutex_unlock(&lo->mutex); + } +@@ -1280,6 +1374,7 @@ static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) + inode->nlookup = 0; + lo_map_remove(&lo->ino_map, inode->fuse_ino); + close(inode->fd); ++ lo_inode_put(lo, &inode); /* Drop our refcount from lo_do_lookup() */ + + return TRUE; + } +@@ -1306,6 +1401,7 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + (unsigned long long)nlookup); + + unref_inode_lolocked(lo, inode, nlookup); ++ lo_inode_put(lo, &inode); + } + + static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +@@ -1537,6 +1633,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + err = 0; + error: + lo_dirp_put(&d); ++ lo_inode_put(lo, &dinode); + + /* + * If there's an error, we can only signal it if we haven't stored +@@ -1595,6 +1692,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + { + int fd; + struct lo_data *lo = lo_data(req); ++ struct lo_inode *parent_inode; + struct fuse_entry_param e; + int err; + struct lo_cred old = {}; +@@ -1607,12 +1705,18 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + return; + } + ++ parent_inode = lo_inode(req, parent); ++ if (!parent_inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + err = lo_change_cred(req, &old); + if (err) { + goto out; + } + +- fd = openat(lo_fd(req, parent), name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, ++ fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); + err = fd == -1 ? errno : 0; + lo_restore_cred(&old); +@@ -1625,8 +1729,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); +- fuse_reply_err(req, ENOMEM); +- return; ++ err = ENOMEM; ++ goto out; + } + + fi->fh = fh; +@@ -1639,6 +1743,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + out: ++ lo_inode_put(lo, &parent_inode); ++ + if (err) { + fuse_reply_err(req, err); + } else { +@@ -1712,16 +1818,18 @@ static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + plock = + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + if (!plock) { +- pthread_mutex_unlock(&inode->plock_mutex); +- fuse_reply_err(req, ret); +- return; ++ saverr = ret; ++ goto out; + } + + ret = fcntl(plock->fd, F_OFD_GETLK, lock); + if (ret == -1) { + saverr = errno; + } ++ ++out: + pthread_mutex_unlock(&inode->plock_mutex); ++ lo_inode_put(lo, &inode); + + if (saverr) { + fuse_reply_err(req, saverr); +@@ -1761,9 +1869,8 @@ static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + + if (!plock) { +- pthread_mutex_unlock(&inode->plock_mutex); +- fuse_reply_err(req, ret); +- return; ++ saverr = ret; ++ goto out; + } + + /* TODO: Is it alright to modify flock? */ +@@ -1772,7 +1879,11 @@ static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + if (ret == -1) { + saverr = errno; + } ++ ++out: + pthread_mutex_unlock(&inode->plock_mutex); ++ lo_inode_put(lo, &inode); ++ + fuse_reply_err(req, saverr); + } + +@@ -1898,6 +2009,7 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + pthread_mutex_unlock(&inode->plock_mutex); + + res = close(dup(lo_fi_fd(req, fi))); ++ lo_inode_put(lo_data(req), &inode); + fuse_reply_err(req, res == -1 ? errno : 0); + } + +@@ -2115,11 +2227,14 @@ out_free: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + return; + + out_err: + saverr = errno; + out: ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +@@ -2190,11 +2305,14 @@ out_free: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + return; + + out_err: + saverr = errno; + out: ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +@@ -2243,6 +2361,8 @@ out: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + +@@ -2289,6 +2409,8 @@ out: + if (fd >= 0) { + close(fd); + } ++ ++ lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + } + +@@ -2671,6 +2793,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; + root->nlookup = 2; ++ g_atomic_int_set(&root->refcount, 2); + } + + static guint lo_key_hash(gconstpointer key) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-jail-lo-proc_self_fd.patch b/kvm-virtiofsd-jail-lo-proc_self_fd.patch new file mode 100755 index 0000000000000000000000000000000000000000..df69242e19719c502911e045a77f06891ce8594e --- /dev/null +++ b/kvm-virtiofsd-jail-lo-proc_self_fd.patch @@ -0,0 +1,85 @@ +From 852a0a22d674b0594aecf0912a0885d197f34978 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:57 +0100 +Subject: [PATCH 6/9] virtiofsd: jail lo->proc_self_fd + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-5-dgilbert@redhat.com> +Patchwork-id: 96275 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 4/7] virtiofsd: jail lo->proc_self_fd +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Miklos Szeredi + +While it's not possible to escape the proc filesystem through +lo->proc_self_fd, it is possible to escape to the root of the proc +filesystem itself through "../..". + +Use a temporary mount for opening lo->proc_self_fd, that has it's root at +/proc/self/fd/, preventing access to the ancestor directories. + +Signed-off-by: Miklos Szeredi +Message-Id: <20200429124733.22488-1-mszeredi@redhat.com> +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 397ae982f4df46e7d4b2625c431062c9146f3b83) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 27 +++++++++++++++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 184ad0f..73d8405 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2540,6 +2540,8 @@ static void print_capabilities(void) + static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + { + pid_t child; ++ char template[] = "virtiofsd-XXXXXX"; ++ char *tmpdir; + + /* + * Create a new pid namespace for *child* processes. We'll have to +@@ -2601,12 +2603,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + exit(1); + } + ++ tmpdir = mkdtemp(template); ++ if (!tmpdir) { ++ fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template); ++ exit(1); ++ } ++ ++ if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n", ++ tmpdir); ++ exit(1); ++ } ++ + /* Now we can get our /proc/self/fd directory file descriptor */ +- lo->proc_self_fd = open("/proc/self/fd", O_PATH); ++ lo->proc_self_fd = open(tmpdir, O_PATH); + if (lo->proc_self_fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); ++ fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir); + exit(1); + } ++ ++ if (umount2(tmpdir, MNT_DETACH) < 0) { ++ fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir); ++ exit(1); ++ } ++ ++ if (rmdir(tmpdir) < 0) { ++ fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir); ++ } + } + + /* +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-load_capng-missing-unlock.patch b/kvm-virtiofsd-load_capng-missing-unlock.patch new file mode 100755 index 0000000000000000000000000000000000000000..bc04f6be1f52c0e6d5b269ba318f83258c8aca60 --- /dev/null +++ b/kvm-virtiofsd-load_capng-missing-unlock.patch @@ -0,0 +1,46 @@ +From ece7649025fbdbde48ff0b954e8ec2e42c4a8b3d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:10 +0000 +Subject: [PATCH 14/18] virtiofsd: load_capng missing unlock +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-4-dgilbert@redhat.com> +Patchwork-id: 94126 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 3/7] virtiofsd: load_capng missing unlock +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: "Dr. David Alan Gilbert" + +Missing unlock in error path. + +Fixes: Covertiy CID 1413123 +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 686391112fd42c615bcc4233472887a66a9b5a4a) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e6f2399..c635fc8 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -232,6 +232,7 @@ static int load_capng(void) + */ + cap.saved = capng_save_state(); + if (!cap.saved) { ++ pthread_mutex_unlock(&cap.mutex); + fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); + return -EINVAL; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-make-f-foreground-the-default.patch b/kvm-virtiofsd-make-f-foreground-the-default.patch new file mode 100755 index 0000000000000000000000000000000000000000..d6cb0e3585a6aca5a8f13adfa38cee2e6263c942 --- /dev/null +++ b/kvm-virtiofsd-make-f-foreground-the-default.patch @@ -0,0 +1,76 @@ +From 7f2e1f79a3addb242c3018c7a80e2e57589119f0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:08 +0100 +Subject: [PATCH 037/116] virtiofsd: make -f (foreground) the default +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-34-dgilbert@redhat.com> +Patchwork-id: 93489 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 033/112] virtiofsd: make -f (foreground) the default +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +According to vhost-user.rst "Backend program conventions", backend +programs should run in the foregound by default. Follow the +conventions so libvirt and other management tools can control virtiofsd +in a standard way. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0bbd31753714ac2899efda0f0de31e353e965789) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 676032e..a3645fc 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -29,6 +29,11 @@ + { \ + t, offsetof(struct fuse_cmdline_opts, p), 1 \ + } ++#define FUSE_HELPER_OPT_VALUE(t, p, v) \ ++ { \ ++ t, offsetof(struct fuse_cmdline_opts, p), v \ ++ } ++ + + static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("-h", show_help), +@@ -42,6 +47,7 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), + FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("-f", foreground), ++ FUSE_HELPER_OPT_VALUE("--daemonize", foreground, 0), + FUSE_HELPER_OPT("fsname=", nodefault_subtype), + FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("subtype=", nodefault_subtype), +@@ -131,6 +137,7 @@ void fuse_cmdline_help(void) + " -V --version print version\n" + " -d -o debug enable debug output (implies -f)\n" + " -f foreground operation\n" ++ " --daemonize run in background\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" + " allowed (default: 10)\n"); +@@ -158,6 +165,7 @@ int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) + memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + + opts->max_idle_threads = 10; ++ opts->foreground = 1; + + if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == + -1) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-make-lo_release-atomic.patch b/kvm-virtiofsd-make-lo_release-atomic.patch new file mode 100755 index 0000000000000000000000000000000000000000..6d88549076abbbeab24d649398aa282013501445 --- /dev/null +++ b/kvm-virtiofsd-make-lo_release-atomic.patch @@ -0,0 +1,62 @@ +From 4ebabb66f4132186152edf8e1907fce436bf5c69 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:06 +0100 +Subject: [PATCH 095/116] virtiofsd: make lo_release() atomic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-92-dgilbert@redhat.com> +Patchwork-id: 93545 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 091/112] virtiofsd: make lo_release() atomic +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Hold the lock across both lo_map_get() and lo_map_remove() to prevent +races between two FUSE_RELEASE requests. In this case I don't see a +serious bug but it's safer to do things atomically. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit baed65c060c0e524530bc243eec427fb408bd477) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9414935..690edbc 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1772,14 +1772,18 @@ static void lo_release(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { + struct lo_data *lo = lo_data(req); +- int fd; ++ struct lo_map_elem *elem; ++ int fd = -1; + + (void)ino; + +- fd = lo_fi_fd(req, fi); +- + pthread_mutex_lock(&lo->mutex); +- lo_map_remove(&lo->fd_map, fi->fh); ++ elem = lo_map_get(&lo->fd_map, fi->fh); ++ if (elem) { ++ fd = elem->fd; ++ elem = NULL; ++ lo_map_remove(&lo->fd_map, fi->fh); ++ } + pthread_mutex_unlock(&lo->mutex); + + close(fd); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-move-to-a-new-pid-namespace.patch b/kvm-virtiofsd-move-to-a-new-pid-namespace.patch new file mode 100755 index 0000000000000000000000000000000000000000..9a33d1b4607db7ab91c512bfe3fda5c3910f3a14 --- /dev/null +++ b/kvm-virtiofsd-move-to-a-new-pid-namespace.patch @@ -0,0 +1,223 @@ +From a7a87a751a9893830d031a957a751b7622b71fb2 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:29 +0100 +Subject: [PATCH 058/116] virtiofsd: move to a new pid namespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-55-dgilbert@redhat.com> +Patchwork-id: 93510 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 054/112] virtiofsd: move to a new pid namespace +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +virtiofsd needs access to /proc/self/fd. Let's move to a new pid +namespace so that a compromised process cannot see another other +processes running on the system. + +One wrinkle in this approach: unshare(CLONE_NEWPID) affects *child* +processes and not the current process. Therefore we need to fork the +pid 1 process that will actually run virtiofsd and leave a parent in +waitpid(2). This is not the same thing as daemonization and parent +processes should not notice a difference. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8e1d4ef231d8327be219f7aea7aa15d181375bbc) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 134 +++++++++++++++++++++++++-------------- + 1 file changed, 86 insertions(+), 48 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 27ab328..0947d14 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -51,7 +51,10 @@ + #include + #include + #include ++#include + #include ++#include ++#include + #include + #include + +@@ -1945,24 +1948,95 @@ static void print_capabilities(void) + } + + /* +- * Called after our UNIX domain sockets have been created, now we can move to +- * an empty network namespace to prevent TCP/IP and other network activity in +- * case this process is compromised. ++ * Move to a new mount, net, and pid namespaces to isolate this process. + */ +-static void setup_net_namespace(void) ++static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) + { +- if (unshare(CLONE_NEWNET) != 0) { +- fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNET): %m\n"); ++ pid_t child; ++ ++ /* ++ * Create a new pid namespace for *child* processes. We'll have to ++ * fork in order to enter the new pid namespace. A new mount namespace ++ * is also needed so that we can remount /proc for the new pid ++ * namespace. ++ * ++ * Our UNIX domain sockets have been created. Now we can move to ++ * an empty network namespace to prevent TCP/IP and other network ++ * activity in case this process is compromised. ++ */ ++ if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) { ++ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n"); ++ exit(1); ++ } ++ ++ child = fork(); ++ if (child < 0) { ++ fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n"); ++ exit(1); ++ } ++ if (child > 0) { ++ pid_t waited; ++ int wstatus; ++ ++ /* The parent waits for the child */ ++ do { ++ waited = waitpid(child, &wstatus, 0); ++ } while (waited < 0 && errno == EINTR && !se->exited); ++ ++ /* We were terminated by a signal, see fuse_signals.c */ ++ if (se->exited) { ++ exit(0); ++ } ++ ++ if (WIFEXITED(wstatus)) { ++ exit(WEXITSTATUS(wstatus)); ++ } ++ ++ exit(1); ++ } ++ ++ /* Send us SIGTERM when the parent thread terminates, see prctl(2) */ ++ prctl(PR_SET_PDEATHSIG, SIGTERM); ++ ++ /* ++ * If the mounts have shared propagation then we want to opt out so our ++ * mount changes don't affect the parent mount namespace. ++ */ ++ if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n"); ++ exit(1); ++ } ++ ++ /* The child must remount /proc to use the new pid namespace */ ++ if (mount("proc", "/proc", "proc", ++ MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n"); ++ exit(1); ++ } ++ ++ /* Now we can get our /proc/self/fd directory file descriptor */ ++ lo->proc_self_fd = open("/proc/self/fd", O_PATH); ++ if (lo->proc_self_fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); + exit(1); + } + } + +-/* This magic is based on lxc's lxc_pivot_root() */ +-static void setup_pivot_root(const char *source) ++/* ++ * Make the source directory our root so symlinks cannot escape and no other ++ * files are accessible. Assumes unshare(CLONE_NEWNS) was already called. ++ */ ++static void setup_mounts(const char *source) + { + int oldroot; + int newroot; + ++ if (mount(source, source, NULL, MS_BIND, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); ++ exit(1); ++ } ++ ++ /* This magic is based on lxc's lxc_pivot_root() */ + oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (oldroot < 0) { + fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); +@@ -2009,47 +2083,14 @@ static void setup_pivot_root(const char *source) + close(oldroot); + } + +-static void setup_proc_self_fd(struct lo_data *lo) +-{ +- lo->proc_self_fd = open("/proc/self/fd", O_PATH); +- if (lo->proc_self_fd == -1) { +- fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); +- exit(1); +- } +-} +- +-/* +- * Make the source directory our root so symlinks cannot escape and no other +- * files are accessible. +- */ +-static void setup_mount_namespace(const char *source) +-{ +- if (unshare(CLONE_NEWNS) != 0) { +- fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNS): %m\n"); +- exit(1); +- } +- +- if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { +- fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_PRIVATE): %m\n"); +- exit(1); +- } +- +- if (mount(source, source, NULL, MS_BIND, NULL) < 0) { +- fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); +- exit(1); +- } +- +- setup_pivot_root(source); +-} +- + /* + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. + */ +-static void setup_sandbox(struct lo_data *lo) ++static void setup_sandbox(struct lo_data *lo, struct fuse_session *se) + { +- setup_net_namespace(); +- setup_mount_namespace(lo->source); ++ setup_namespaces(lo, se); ++ setup_mounts(lo->source); + } + + int main(int argc, char *argv[]) +@@ -2173,10 +2214,7 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + +- /* Must be after daemonize to get the right /proc/self/fd */ +- setup_proc_self_fd(&lo); +- +- setup_sandbox(&lo); ++ setup_sandbox(&lo, se); + + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-move-to-an-empty-network-namespace.patch b/kvm-virtiofsd-move-to-an-empty-network-namespace.patch new file mode 100755 index 0000000000000000000000000000000000000000..69a7c20efdd7a47df4fe9ecf8dfad1bd9869bf01 --- /dev/null +++ b/kvm-virtiofsd-move-to-an-empty-network-namespace.patch @@ -0,0 +1,66 @@ +From 19a16f26bdeb6302159736e182a18b06160a3f42 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:28 +0100 +Subject: [PATCH 057/116] virtiofsd: move to an empty network namespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-54-dgilbert@redhat.com> +Patchwork-id: 93508 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 053/112] virtiofsd: move to an empty network namespace +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +If the process is compromised there should be no network access. Use an +empty network namespace to sandbox networking. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit d74830d12ae233186ff74ddf64c552d26bb39e50) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 0570453..27ab328 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1944,6 +1944,19 @@ static void print_capabilities(void) + printf("}\n"); + } + ++/* ++ * Called after our UNIX domain sockets have been created, now we can move to ++ * an empty network namespace to prevent TCP/IP and other network activity in ++ * case this process is compromised. ++ */ ++static void setup_net_namespace(void) ++{ ++ if (unshare(CLONE_NEWNET) != 0) { ++ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNET): %m\n"); ++ exit(1); ++ } ++} ++ + /* This magic is based on lxc's lxc_pivot_root() */ + static void setup_pivot_root(const char *source) + { +@@ -2035,6 +2048,7 @@ static void setup_mount_namespace(const char *source) + */ + static void setup_sandbox(struct lo_data *lo) + { ++ setup_net_namespace(); + setup_mount_namespace(lo->source); + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-only-retain-file-system-capabilities.patch b/kvm-virtiofsd-only-retain-file-system-capabilities.patch new file mode 100755 index 0000000000000000000000000000000000000000..15c8cd8b89e2edac22acb67112add6a0841244ff --- /dev/null +++ b/kvm-virtiofsd-only-retain-file-system-capabilities.patch @@ -0,0 +1,112 @@ +From 8727e4904e7a6588e39f231d837f4527f265e47e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:59 +0100 +Subject: [PATCH 8/9] virtiofsd: only retain file system capabilities + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-7-dgilbert@redhat.com> +Patchwork-id: 96272 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 6/7] virtiofsd: only retain file system capabilities +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Stefan Hajnoczi + +virtiofsd runs as root but only needs a subset of root's Linux +capabilities(7). As a file server its purpose is to create and access +files on behalf of a client. It needs to be able to access files with +arbitrary uid/gid owners. It also needs to be create device nodes. + +Introduce a Linux capabilities(7) whitelist and drop all capabilities +that we don't need, making the virtiofsd process less powerful than a +regular uid root process. + + # cat /proc/PID/status + ... + Before After + CapInh: 0000000000000000 0000000000000000 + CapPrm: 0000003fffffffff 00000000880000df + CapEff: 0000003fffffffff 00000000880000df + CapBnd: 0000003fffffffff 0000000000000000 + CapAmb: 0000000000000000 0000000000000000 + +Note that file capabilities cannot be used to achieve the same effect on +the virtiofsd executable because mount is used during sandbox setup. +Therefore we drop capabilities programmatically at the right point +during startup. + +This patch only affects the sandboxed child process. The parent process +that sits in waitpid(2) still has full root capabilities and will be +addressed in the next patch. + +Signed-off-by: Stefan Hajnoczi +Message-Id: <20200416164907.244868-2-stefanha@redhat.com> +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a59feb483b8fae24d043569ccfcc97ea23d54a02) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 38 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 38 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 614ba55..6358874 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2723,6 +2723,43 @@ static void setup_mounts(const char *source) + } + + /* ++ * Only keep whitelisted capabilities that are needed for file system operation ++ */ ++static void setup_capabilities(void) ++{ ++ pthread_mutex_lock(&cap.mutex); ++ capng_restore_state(&cap.saved); ++ ++ /* ++ * Whitelist file system-related capabilities that are needed for a file ++ * server to act like root. Drop everything else like networking and ++ * sysadmin capabilities. ++ * ++ * Exclusions: ++ * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl ++ * and we don't support that. ++ * 2. CAP_MAC_OVERRIDE is not included because it only seems to be ++ * used by the Smack LSM. Omit it until there is demand for it. ++ */ ++ capng_setpid(syscall(SYS_gettid)); ++ capng_clear(CAPNG_SELECT_BOTH); ++ capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, ++ CAP_CHOWN, ++ CAP_DAC_OVERRIDE, ++ CAP_DAC_READ_SEARCH, ++ CAP_FOWNER, ++ CAP_FSETID, ++ CAP_SETGID, ++ CAP_SETUID, ++ CAP_MKNOD, ++ CAP_SETFCAP); ++ capng_apply(CAPNG_SELECT_BOTH); ++ ++ cap.saved = capng_save_state(); ++ pthread_mutex_unlock(&cap.mutex); ++} ++ ++/* + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. + */ +@@ -2732,6 +2769,7 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, + setup_namespaces(lo, se); + setup_mounts(lo->source); + setup_seccomp(enable_syslog); ++ setup_capabilities(); + } + + /* Set the maximum number of open file descriptors */ +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch b/kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch new file mode 100755 index 0000000000000000000000000000000000000000..f21d7935cdee6ec792822a686252ab1b674dabe0 --- /dev/null +++ b/kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch @@ -0,0 +1,124 @@ +From f2c0b07088966c396ddcee54f4bed97cdb01192f Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 9 Feb 2021 23:14:55 -0500 +Subject: [PATCH 2/3] virtiofsd: optionally return inode pointer from + lo_do_lookup() + +RH-Author: Jon Maloy +Message-id: <20210209231456.1555472-3-jmaloy@redhat.com> +Patchwork-id: 101022 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 2/3] virtiofsd: optionally return inode pointer from lo_do_lookup() +Bugzilla: 1919111 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Greg Kurz +RH-Acked-by: Dr. David Alan Gilbert + +From: Stefan Hajnoczi + +lo_do_lookup() finds an existing inode or allocates a new one. It +increments nlookup so that the inode stays alive until the client +releases it. + +Existing callers don't need the struct lo_inode so the function doesn't +return it. Extend the function to optionally return the inode. The next +commit will need it. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Greg Kurz +Message-Id: <20210204150208.367837-3-stefanha@redhat.com> +Signed-off-by: Dr. David Alan Gilbert + +(cherry-picked from commit 22d2ece71e533310da31f2857ebc4a00d91968b3) +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + tools/virtiofsd/passthrough_ll.c | 29 +++++++++++++++++++++-------- + 1 file changed, 21 insertions(+), 8 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 518ba11c47..e5bd3d73e4 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -878,11 +878,13 @@ static void posix_locks_value_destroy(gpointer data) + } + + /* +- * Increments nlookup and caller must release refcount using +- * lo_inode_put(&parent). ++ * Increments nlookup on the inode on success. unref_inode_lolocked() must be ++ * called eventually to decrement nlookup again. If inodep is non-NULL, the ++ * inode pointer is stored and the caller must call lo_inode_put(). + */ + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, +- struct fuse_entry_param *e) ++ struct fuse_entry_param *e, ++ struct lo_inode **inodep) + { + int newfd; + int res; +@@ -891,6 +893,10 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_inode *inode = NULL; + struct lo_inode *dir = lo_inode(req, parent); + ++ if (inodep) { ++ *inodep = NULL; ++ } ++ + /* + * name_to_handle_at() and open_by_handle_at() can reach here with fuse + * mount point in guest, but we don't have its inode info in the +@@ -953,7 +959,14 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; +- lo_inode_put(lo, &inode); ++ ++ /* Transfer ownership of inode pointer to caller or drop it */ ++ if (inodep) { ++ *inodep = inode; ++ } else { ++ lo_inode_put(lo, &inode); ++ } ++ + lo_inode_put(lo, &dir); + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, +@@ -988,7 +1001,7 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + return; + } + +- err = lo_do_lookup(req, parent, name, &e); ++ err = lo_do_lookup(req, parent, name, &e, NULL); + if (err) { + fuse_reply_err(req, err); + } else { +@@ -1098,7 +1111,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + goto out; + } + +- saverr = lo_do_lookup(req, parent, name, &e); ++ saverr = lo_do_lookup(req, parent, name, &e, NULL); + if (saverr) { + goto out; + } +@@ -1599,7 +1612,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + if (plus) { + if (!is_dot_or_dotdot(name)) { +- err = lo_do_lookup(req, ino, name, &e); ++ err = lo_do_lookup(req, ino, name, &e, NULL); + if (err) { + goto error; + } +@@ -1793,7 +1806,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + fi->fh = fh; +- err = lo_do_lookup(req, parent, name, &e); ++ err = lo_do_lookup(req, parent, name, &e, NULL); + } + if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; +-- +2.18.2 + diff --git a/kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch b/kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch new file mode 100755 index 0000000000000000000000000000000000000000..e3d5773d036ae25b9936e34267c76119ec43e305 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch @@ -0,0 +1,54 @@ +From fe031dbbf5e287f64de9fcc9aec361e8ab492109 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:24 +0100 +Subject: [PATCH 113/116] virtiofsd/passthrough_ll: Pass errno to + fuse_reply_err() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-110-dgilbert@redhat.com> +Patchwork-id: 93559 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 109/112] virtiofsd/passthrough_ll: Pass errno to fuse_reply_err() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Xiao Yang + +lo_copy_file_range() passes -errno to fuse_reply_err() and then fuse_reply_err() +changes it to errno again, so that subsequent fuse_send_reply_iov_nofree() catches +the wrong errno.(i.e. reports "fuse: bad error value: ..."). + +Make fuse_send_reply_iov_nofree() accept the correct -errno by passing errno +directly in lo_copy_file_range(). + +Signed-off-by: Xiao Yang +Reviewed-by: Eryu Guan + +dgilbert: Sent upstream and now Merged as aa1185e153f774f1df65 +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a931b6861e59c78d861017e9c6a9c161ff49a163) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index fc15d61..e6f2399 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2441,7 +2441,7 @@ static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + + res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); + if (res < 0) { +- fuse_reply_err(req, -errno); ++ fuse_reply_err(req, errno); + } else { + fuse_reply_write(req, res); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch b/kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch new file mode 100755 index 0000000000000000000000000000000000000000..ddacdbe8fdf9f54a8ba6cf604f2d0bfd429d583f --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch @@ -0,0 +1,48 @@ +From 83b03fc4a3ecf6086394363488bbebc8d55428c0 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:16 +0100 +Subject: [PATCH 105/116] virtiofsd: passthrough_ll: Use cache_readdir for + directory open +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-102-dgilbert@redhat.com> +Patchwork-id: 93555 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 101/112] virtiofsd: passthrough_ll: Use cache_readdir for directory open +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Misono Tomohiro + +Since keep_cache(FOPEN_KEEP_CACHE) has no effect for directory as +described in fuse_common.h, use cache_readdir(FOPNE_CACHE_DIR) for +diretory open when cache=always mode. + +Signed-off-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9b610b09b49b1aada256097b338d49da805da6ae) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 4c61ac5..79b8b71 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1523,7 +1523,7 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + + fi->fh = fh; + if (lo->cache == CACHE_ALWAYS) { +- fi->keep_cache = 1; ++ fi->cache_readdir = 1; + } + fuse_reply_open(req, fi); + return; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch b/kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch new file mode 100755 index 0000000000000000000000000000000000000000..050657454be00dfc713f2643998f1f210b456742 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch @@ -0,0 +1,238 @@ +From 474d0adafed4d73720d6413b2903d6c4b529e5e6 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:15 +0100 +Subject: [PATCH 044/116] virtiofsd: passthrough_ll: add dirp_map to hide + lo_dirp pointers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-41-dgilbert@redhat.com> +Patchwork-id: 93495 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 040/112] virtiofsd: passthrough_ll: add dirp_map to hide lo_dirp pointers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Do not expose lo_dirp pointers to clients. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit b39bce121bfad8757eec0ee41f14607b883935d3) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 103 +++++++++++++++++++++++++++++---------- + 1 file changed, 76 insertions(+), 27 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index a3ebf74..5f5a72f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -56,27 +56,10 @@ + + #include "passthrough_helpers.h" + +-/* +- * We are re-using pointers to our `struct lo_inode` +- * elements as inodes. This means that we must be able to +- * store uintptr_t values in a fuse_ino_t variable. The following +- * incantation checks this condition at compile time. +- */ +-#if defined(__GNUC__) && \ +- (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \ +- !defined __cplusplus +-_Static_assert(sizeof(fuse_ino_t) >= sizeof(uintptr_t), +- "fuse_ino_t too small to hold uintptr_t values!"); +-#else +-struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { +- unsigned _uintptr_to_must_hold_fuse_ino_t +- : ((sizeof(fuse_ino_t) >= sizeof(uintptr_t)) ? 1 : -1); +-}; +-#endif +- + struct lo_map_elem { + union { + struct lo_inode *inode; ++ struct lo_dirp *dirp; + ssize_t freelist; + }; + bool in_use; +@@ -123,6 +106,7 @@ struct lo_data { + int timeout_set; + struct lo_inode root; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ ++ struct lo_map dirp_map; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +@@ -253,6 +237,20 @@ static void lo_map_remove(struct lo_map *map, size_t key) + } + + /* Assumes lo->mutex is held */ ++static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) ++{ ++ struct lo_map_elem *elem; ++ ++ elem = lo_map_alloc_elem(&lo_data(req)->dirp_map); ++ if (!elem) { ++ return -1; ++ } ++ ++ elem->dirp = dirp; ++ return elem - lo_data(req)->dirp_map.elems; ++} ++ ++/* Assumes lo->mutex is held */ + static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) + { + struct lo_map_elem *elem; +@@ -861,9 +859,19 @@ struct lo_dirp { + off_t offset; + }; + +-static struct lo_dirp *lo_dirp(struct fuse_file_info *fi) ++static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) + { +- return (struct lo_dirp *)(uintptr_t)fi->fh; ++ struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; ++ ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->dirp_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ if (!elem) { ++ return NULL; ++ } ++ ++ return elem->dirp; + } + + static void lo_opendir(fuse_req_t req, fuse_ino_t ino, +@@ -873,6 +881,7 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + struct lo_data *lo = lo_data(req); + struct lo_dirp *d; + int fd; ++ ssize_t fh; + + d = calloc(1, sizeof(struct lo_dirp)); + if (d == NULL) { +@@ -892,7 +901,14 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + d->offset = 0; + d->entry = NULL; + +- fi->fh = (uintptr_t)d; ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_dirp_mapping(req, d); ++ pthread_mutex_unlock(&lo->mutex); ++ if (fh == -1) { ++ goto out_err; ++ } ++ ++ fi->fh = fh; + if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } +@@ -903,6 +919,9 @@ out_errno: + error = errno; + out_err: + if (d) { ++ if (d->dp) { ++ closedir(d->dp); ++ } + if (fd != -1) { + close(fd); + } +@@ -920,17 +939,21 @@ static int is_dot_or_dotdot(const char *name) + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { +- struct lo_dirp *d = lo_dirp(fi); +- char *buf; ++ struct lo_dirp *d; ++ char *buf = NULL; + char *p; + size_t rem = size; +- int err; ++ int err = ENOMEM; + + (void)ino; + ++ d = lo_dirp(req, fi); ++ if (!d) { ++ goto error; ++ } ++ + buf = calloc(1, size); + if (!buf) { +- err = ENOMEM; + goto error; + } + p = buf; +@@ -1028,8 +1051,21 @@ static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, + static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { +- struct lo_dirp *d = lo_dirp(fi); ++ struct lo_data *lo = lo_data(req); ++ struct lo_dirp *d; ++ + (void)ino; ++ ++ d = lo_dirp(req, fi); ++ if (!d) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ pthread_mutex_lock(&lo->mutex); ++ lo_map_remove(&lo->dirp_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ + closedir(d->dp); + free(d); + fuse_reply_err(req, 0); +@@ -1081,8 +1117,18 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { + int res; +- int fd = dirfd(lo_dirp(fi)->dp); ++ struct lo_dirp *d; ++ int fd; ++ + (void)ino; ++ ++ d = lo_dirp(req, fi); ++ if (!d) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ fd = dirfd(d->dp); + if (datasync) { + res = fdatasync(fd); + } else { +@@ -1614,6 +1660,8 @@ int main(int argc, char *argv[]) + root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); + root_elem->inode = &lo.root; + ++ lo_map_init(&lo.dirp_map); ++ + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; + } +@@ -1710,6 +1758,7 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); + + if (lo.root.fd >= 0) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch b/kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch new file mode 100755 index 0000000000000000000000000000000000000000..b8de3d8c2373bf2d9e916987d668ff08315d9e84 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch @@ -0,0 +1,303 @@ +From 03effbc021064bb77d231ae5ca02d1a579c71ee1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:17 +0100 +Subject: [PATCH 046/116] virtiofsd: passthrough_ll: add fallback for racy ops +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-43-dgilbert@redhat.com> +Patchwork-id: 93496 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 042/112] virtiofsd: passthrough_ll: add fallback for racy ops +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +We have two operations that cannot be done race-free on a symlink in +certain cases: utimes and link. + +Add racy fallback for these if the race-free method doesn't work. We do +our best to avoid races even in this case: + + - get absolute path by reading /proc/self/fd/NN symlink + + - lookup parent directory: after this we are safe against renames in + ancestors + + - lookup name in parent directory, and verify that we got to the original + inode, if not retry the whole thing + +Both utimes(2) and link(2) hold i_lock on the inode across the operation, +so a racing rename/delete by this fuse instance is not possible, only from +other entities changing the filesystem. + +If the "norace" option is given, then disable the racy fallbacks. + +Signed-off-by: Miklos Szeredi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 5fe319a7b19c9c328e6e061bffcf1ff6cc8b89ce) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 5 +- + tools/virtiofsd/passthrough_ll.c | 157 +++++++++++++++++++++++++++++++++++---- + 2 files changed, 145 insertions(+), 17 deletions(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index b8ec5ac..5531425 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -142,7 +142,10 @@ void fuse_cmdline_help(void) + " --daemonize run in background\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" +- " allowed (default: 10)\n"); ++ " allowed (default: 10)\n" ++ " -o norace disable racy fallback\n" ++ " default: false\n" ++ ); + } + + static int fuse_helper_opt_proc(void *data, const char *arg, int key, +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9815bfa..ac380ef 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -98,6 +98,7 @@ enum { + struct lo_data { + pthread_mutex_t mutex; + int debug; ++ int norace; + int writeback; + int flock; + int xattr; +@@ -124,10 +125,15 @@ static const struct fuse_opt lo_opts[] = { + { "cache=never", offsetof(struct lo_data, cache), CACHE_NEVER }, + { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, +- ++ { "norace", offsetof(struct lo_data, norace), 1 }, + FUSE_OPT_END + }; + ++static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); ++ ++static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); ++ ++ + static struct lo_data *lo_data(fuse_req_t req) + { + return (struct lo_data *)fuse_req_userdata(req); +@@ -347,23 +353,127 @@ static void lo_getattr(fuse_req_t req, fuse_ino_t ino, + fuse_reply_attr(req, &buf, lo->timeout); + } + +-static int utimensat_empty_nofollow(struct lo_inode *inode, +- const struct timespec *tv) ++static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, ++ char path[PATH_MAX], struct lo_inode **parent) + { +- int res; + char procname[64]; ++ char *last; ++ struct stat stat; ++ struct lo_inode *p; ++ int retries = 2; ++ int res; ++ ++retry: ++ sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ ++ res = readlink(procname, path, PATH_MAX); ++ if (res < 0) { ++ fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__); ++ goto fail_noretry; ++ } ++ ++ if (res >= PATH_MAX) { ++ fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__); ++ goto fail_noretry; ++ } ++ path[res] = '\0'; ++ ++ last = strrchr(path, '/'); ++ if (last == NULL) { ++ /* Shouldn't happen */ ++ fuse_log( ++ FUSE_LOG_WARNING, ++ "%s: INTERNAL ERROR: bad path read from proc\n", __func__); ++ goto fail_noretry; ++ } ++ if (last == path) { ++ p = &lo->root; ++ pthread_mutex_lock(&lo->mutex); ++ p->refcount++; ++ pthread_mutex_unlock(&lo->mutex); ++ } else { ++ *last = '\0'; ++ res = fstatat(AT_FDCWD, last == path ? "/" : path, &stat, 0); ++ if (res == -1) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to stat parent: %m\n", __func__); ++ } ++ goto fail; ++ } ++ p = lo_find(lo, &stat); ++ if (p == NULL) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to find parent\n", __func__); ++ } ++ goto fail; ++ } ++ } ++ last++; ++ res = fstatat(p->fd, last, &stat, AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to stat last\n", __func__); ++ } ++ goto fail_unref; ++ } ++ if (stat.st_dev != inode->dev || stat.st_ino != inode->ino) { ++ if (!retries) { ++ fuse_log(FUSE_LOG_WARNING, ++ "%s: failed to match last\n", __func__); ++ } ++ goto fail_unref; ++ } ++ *parent = p; ++ memmove(path, last, strlen(last) + 1); ++ ++ return 0; ++ ++fail_unref: ++ unref_inode(lo, p, 1); ++fail: ++ if (retries) { ++ retries--; ++ goto retry; ++ } ++fail_noretry: ++ errno = EIO; ++ return -1; ++} ++ ++static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode, ++ const struct timespec *tv) ++{ ++ int res; ++ struct lo_inode *parent; ++ char path[PATH_MAX]; + + if (inode->is_symlink) { +- res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH); + if (res == -1 && errno == EINVAL) { + /* Sorry, no race free way to set times on symlink. */ +- errno = EPERM; ++ if (lo->norace) { ++ errno = EPERM; ++ } else { ++ goto fallback; ++ } + } + return res; + } +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "/proc/self/fd/%i", inode->fd); + +- return utimensat(AT_FDCWD, procname, tv, 0); ++ return utimensat(AT_FDCWD, path, tv, 0); ++ ++fallback: ++ res = lo_parent_and_name(lo, inode, path, &parent); ++ if (res != -1) { ++ res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW); ++ unref_inode(lo, parent, 1); ++ } ++ ++ return res; + } + + static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) +@@ -387,6 +497,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + { + int saverr; + char procname[64]; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + int ifd; + int res; +@@ -459,7 +570,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + if (fi) { + res = futimens(fd, tv); + } else { +- res = utimensat_empty_nofollow(inode, tv); ++ res = utimensat_empty(lo, inode, tv); + } + if (res == -1) { + goto out_err; +@@ -709,24 +820,38 @@ static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, + lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); + } + +-static int linkat_empty_nofollow(struct lo_inode *inode, int dfd, +- const char *name) ++static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode, ++ int dfd, const char *name) + { + int res; +- char procname[64]; ++ struct lo_inode *parent; ++ char path[PATH_MAX]; + + if (inode->is_symlink) { + res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); + if (res == -1 && (errno == ENOENT || errno == EINVAL)) { + /* Sorry, no race free way to hard-link a symlink. */ +- errno = EPERM; ++ if (lo->norace) { ++ errno = EPERM; ++ } else { ++ goto fallback; ++ } + } + return res; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "/proc/self/fd/%i", inode->fd); ++ ++ return linkat(AT_FDCWD, path, dfd, name, AT_SYMLINK_FOLLOW); ++ ++fallback: ++ res = lo_parent_and_name(lo, inode, path, &parent); ++ if (res != -1) { ++ res = linkat(parent->fd, path, dfd, name, 0); ++ unref_inode(lo, parent, 1); ++ } + +- return linkat(AT_FDCWD, procname, dfd, name, AT_SYMLINK_FOLLOW); ++ return res; + } + + static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, +@@ -748,7 +873,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; + +- res = linkat_empty_nofollow(inode, lo_fd(req, parent), name); ++ res = linkat_empty_nofollow(lo, inode, lo_fd(req, parent), name); + if (res == -1) { + goto out_err; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch b/kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch new file mode 100755 index 0000000000000000000000000000000000000000..24b2a6ee2813d682055086c51aa54df9c3d90679 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch @@ -0,0 +1,328 @@ +From 35337e604e9149d6d8fcf74b8b82ac33a8611ebb Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:16 +0100 +Subject: [PATCH 045/116] virtiofsd: passthrough_ll: add fd_map to hide file + descriptors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-42-dgilbert@redhat.com> +Patchwork-id: 93494 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 041/112] virtiofsd: passthrough_ll: add fd_map to hide file descriptors +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Do not expose file descriptor numbers to clients. This prevents the +abuse of internal file descriptors (like stdin/stdout). + +Signed-off-by: Stefan Hajnoczi +Fix from: +Signed-off-by: Xiao Yang +dgilbert: + Added lseek +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 73b4d19dfc4248a74c1f3e511cfa934681d9c602) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 116 +++++++++++++++++++++++++++++++-------- + 1 file changed, 94 insertions(+), 22 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 5f5a72f..9815bfa 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -60,6 +60,7 @@ struct lo_map_elem { + union { + struct lo_inode *inode; + struct lo_dirp *dirp; ++ int fd; + ssize_t freelist; + }; + bool in_use; +@@ -107,6 +108,7 @@ struct lo_data { + struct lo_inode root; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ ++ struct lo_map fd_map; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +@@ -237,6 +239,20 @@ static void lo_map_remove(struct lo_map *map, size_t key) + } + + /* Assumes lo->mutex is held */ ++static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd) ++{ ++ struct lo_map_elem *elem; ++ ++ elem = lo_map_alloc_elem(&lo_data(req)->fd_map); ++ if (!elem) { ++ return -1; ++ } ++ ++ elem->fd = fd; ++ return elem - lo_data(req)->fd_map.elems; ++} ++ ++/* Assumes lo->mutex is held */ + static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) + { + struct lo_map_elem *elem; +@@ -350,6 +366,22 @@ static int utimensat_empty_nofollow(struct lo_inode *inode, + return utimensat(AT_FDCWD, procname, tv, 0); + } + ++static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) ++{ ++ struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; ++ ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->fd_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ ++ if (!elem) { ++ return -1; ++ } ++ ++ return elem->fd; ++} ++ + static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int valid, struct fuse_file_info *fi) + { +@@ -358,6 +390,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + struct lo_inode *inode; + int ifd; + int res; ++ int fd; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -367,9 +400,14 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + + ifd = inode->fd; + ++ /* If fi->fh is invalid we'll report EBADF later */ ++ if (fi) { ++ fd = lo_fi_fd(req, fi); ++ } ++ + if (valid & FUSE_SET_ATTR_MODE) { + if (fi) { +- res = fchmod(fi->fh, attr->st_mode); ++ res = fchmod(fd, attr->st_mode); + } else { + sprintf(procname, "/proc/self/fd/%i", ifd); + res = chmod(procname, attr->st_mode); +@@ -389,7 +427,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + } + if (valid & FUSE_SET_ATTR_SIZE) { + if (fi) { +- res = ftruncate(fi->fh, attr->st_size); ++ res = ftruncate(fd, attr->st_size); + } else { + sprintf(procname, "/proc/self/fd/%i", ifd); + res = truncate(procname, attr->st_size); +@@ -419,7 +457,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + } + + if (fi) { +- res = futimens(fi->fh, tv); ++ res = futimens(fd, tv); + } else { + res = utimensat_empty_nofollow(inode, tv); + } +@@ -1096,7 +1134,18 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + lo_restore_cred(&old); + + if (!err) { +- fi->fh = fd; ++ ssize_t fh; ++ ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_fd_mapping(req, fd); ++ pthread_mutex_unlock(&lo->mutex); ++ if (fh == -1) { ++ close(fd); ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ++ fi->fh = fh; + err = lo_do_lookup(req, parent, name, &e); + } + if (lo->cache == CACHE_NEVER) { +@@ -1140,6 +1189,7 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { + int fd; ++ ssize_t fh; + char buf[64]; + struct lo_data *lo = lo_data(req); + +@@ -1175,7 +1225,16 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + return (void)fuse_reply_err(req, errno); + } + +- fi->fh = fd; ++ pthread_mutex_lock(&lo->mutex); ++ fh = lo_add_fd_mapping(req, fd); ++ pthread_mutex_unlock(&lo->mutex); ++ if (fh == -1) { ++ close(fd); ++ fuse_reply_err(req, ENOMEM); ++ return; ++ } ++ ++ fi->fh = fh; + if (lo->cache == CACHE_NEVER) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { +@@ -1187,9 +1246,18 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + static void lo_release(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { ++ struct lo_data *lo = lo_data(req); ++ int fd; ++ + (void)ino; + +- close(fi->fh); ++ fd = lo_fi_fd(req, fi); ++ ++ pthread_mutex_lock(&lo->mutex); ++ lo_map_remove(&lo->fd_map, fi->fh); ++ pthread_mutex_unlock(&lo->mutex); ++ ++ close(fd); + fuse_reply_err(req, 0); + } + +@@ -1197,7 +1265,7 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + { + int res; + (void)ino; +- res = close(dup(fi->fh)); ++ res = close(dup(lo_fi_fd(req, fi))); + fuse_reply_err(req, res == -1 ? errno : 0); + } + +@@ -1224,7 +1292,7 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + return (void)fuse_reply_err(req, errno); + } + } else { +- fd = fi->fh; ++ fd = lo_fi_fd(req, fi); + } + + if (datasync) { +@@ -1251,7 +1319,7 @@ static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + } + + buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- buf.buf[0].fd = fi->fh; ++ buf.buf[0].fd = lo_fi_fd(req, fi); + buf.buf[0].pos = offset; + + fuse_reply_data(req, &buf); +@@ -1266,7 +1334,7 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); + + out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; +- out_buf.buf[0].fd = fi->fh; ++ out_buf.buf[0].fd = lo_fi_fd(req, fi); + out_buf.buf[0].pos = off; + + if (lo_debug(req)) { +@@ -1303,7 +1371,7 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + (void)ino; + + #ifdef CONFIG_FALLOCATE +- err = fallocate(fi->fh, mode, offset, length); ++ err = fallocate(lo_fi_fd(req, fi), mode, offset, length); + if (err < 0) { + err = errno; + } +@@ -1314,7 +1382,7 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + return; + } + +- err = posix_fallocate(fi->fh, offset, length); ++ err = posix_fallocate(lo_fi_fd(req, fi), offset, length); + #endif + + fuse_reply_err(req, err); +@@ -1326,7 +1394,7 @@ static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + int res; + (void)ino; + +- res = flock(fi->fh, op); ++ res = flock(lo_fi_fd(req, fi), op); + + fuse_reply_err(req, res == -1 ? errno : 0); + } +@@ -1551,17 +1619,19 @@ static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + off_t off_out, struct fuse_file_info *fi_out, + size_t len, int flags) + { ++ int in_fd, out_fd; + ssize_t res; + +- if (lo_debug(req)) +- fuse_log(FUSE_LOG_DEBUG, +- "lo_copy_file_range(ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, ino=%" PRIu64 "/fd=%lu, " +- "off=%lu, size=%zd, flags=0x%x)\n", +- ino_in, fi_in->fh, off_in, ino_out, fi_out->fh, off_out, len, +- flags); ++ in_fd = lo_fi_fd(req, fi_in); ++ out_fd = lo_fi_fd(req, fi_out); ++ ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, " ++ "off=%lu, ino=%" PRIu64 "/fd=%d, " ++ "off=%lu, size=%zd, flags=0x%x)\n", ++ ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags); + +- res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, flags); ++ res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); + if (res < 0) { + fuse_reply_err(req, -errno); + } else { +@@ -1576,7 +1646,7 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + off_t res; + + (void)ino; +- res = lseek(fi->fh, off, whence); ++ res = lseek(lo_fi_fd(req, fi), off, whence); + if (res != -1) { + fuse_reply_lseek(req, res); + } else { +@@ -1661,6 +1731,7 @@ int main(int argc, char *argv[]) + root_elem->inode = &lo.root; + + lo_map_init(&lo.dirp_map); ++ lo_map_init(&lo.fd_map); + + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; +@@ -1758,6 +1829,7 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ lo_map_destroy(&lo.fd_map); + lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch b/kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch new file mode 100755 index 0000000000000000000000000000000000000000..ba8b7300259d199c454add189ff4222ea3e69fad --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch @@ -0,0 +1,395 @@ +From d81396cc3d9815730903b0755c9d2e67d6954d54 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:14 +0100 +Subject: [PATCH 043/116] virtiofsd: passthrough_ll: add ino_map to hide + lo_inode pointers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-40-dgilbert@redhat.com> +Patchwork-id: 93493 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 039/112] virtiofsd: passthrough_ll: add ino_map to hide lo_inode pointers +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Do not expose lo_inode pointers to clients. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 92fb57b83cdbfc4bf53c0c46a3d0bcbc36e64126) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 144 +++++++++++++++++++++++++++++++-------- + 1 file changed, 114 insertions(+), 30 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e83a976..a3ebf74 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -57,8 +57,8 @@ + #include "passthrough_helpers.h" + + /* +- * We are re-using pointers to our `struct lo_inode` and `struct +- * lo_dirp` elements as inodes. This means that we must be able to ++ * We are re-using pointers to our `struct lo_inode` ++ * elements as inodes. This means that we must be able to + * store uintptr_t values in a fuse_ino_t variable. The following + * incantation checks this condition at compile time. + */ +@@ -76,7 +76,7 @@ struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { + + struct lo_map_elem { + union { +- /* Element values will go here... */ ++ struct lo_inode *inode; + ssize_t freelist; + }; + bool in_use; +@@ -97,6 +97,7 @@ struct lo_inode { + ino_t ino; + dev_t dev; + uint64_t refcount; /* protected by lo->mutex */ ++ fuse_ino_t fuse_ino; + }; + + struct lo_cred { +@@ -121,6 +122,7 @@ struct lo_data { + int cache; + int timeout_set; + struct lo_inode root; /* protected by lo->mutex */ ++ struct lo_map ino_map; /* protected by lo->mutex */ + }; + + static const struct fuse_opt lo_opts[] = { +@@ -145,14 +147,14 @@ static struct lo_data *lo_data(fuse_req_t req) + return (struct lo_data *)fuse_req_userdata(req); + } + +-__attribute__((unused)) static void lo_map_init(struct lo_map *map) ++static void lo_map_init(struct lo_map *map) + { + map->elems = NULL; + map->nelems = 0; + map->freelist = -1; + } + +-__attribute__((unused)) static void lo_map_destroy(struct lo_map *map) ++static void lo_map_destroy(struct lo_map *map) + { + free(map->elems); + } +@@ -183,8 +185,7 @@ static int lo_map_grow(struct lo_map *map, size_t new_nelems) + return 1; + } + +-__attribute__((unused)) static struct lo_map_elem * +-lo_map_alloc_elem(struct lo_map *map) ++static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map) + { + struct lo_map_elem *elem; + +@@ -200,8 +201,7 @@ lo_map_alloc_elem(struct lo_map *map) + return elem; + } + +-__attribute__((unused)) static struct lo_map_elem * +-lo_map_reserve(struct lo_map *map, size_t key) ++static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key) + { + ssize_t *prev; + +@@ -222,8 +222,7 @@ lo_map_reserve(struct lo_map *map, size_t key) + return NULL; + } + +-__attribute__((unused)) static struct lo_map_elem * +-lo_map_get(struct lo_map *map, size_t key) ++static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key) + { + if (key >= map->nelems) { + return NULL; +@@ -234,8 +233,7 @@ lo_map_get(struct lo_map *map, size_t key) + return &map->elems[key]; + } + +-__attribute__((unused)) static void lo_map_remove(struct lo_map *map, +- size_t key) ++static void lo_map_remove(struct lo_map *map, size_t key) + { + struct lo_map_elem *elem; + +@@ -254,18 +252,40 @@ __attribute__((unused)) static void lo_map_remove(struct lo_map *map, + map->freelist = key; + } + ++/* Assumes lo->mutex is held */ ++static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) ++{ ++ struct lo_map_elem *elem; ++ ++ elem = lo_map_alloc_elem(&lo_data(req)->ino_map); ++ if (!elem) { ++ return -1; ++ } ++ ++ elem->inode = inode; ++ return elem - lo_data(req)->ino_map.elems; ++} ++ + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { +- if (ino == FUSE_ROOT_ID) { +- return &lo_data(req)->root; +- } else { +- return (struct lo_inode *)(uintptr_t)ino; ++ struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; ++ ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->ino_map, ino); ++ pthread_mutex_unlock(&lo->mutex); ++ ++ if (!elem) { ++ return NULL; + } ++ ++ return elem->inode; + } + + static int lo_fd(fuse_req_t req, fuse_ino_t ino) + { +- return lo_inode(req, ino)->fd; ++ struct lo_inode *inode = lo_inode(req, ino); ++ return inode ? inode->fd : -1; + } + + static bool lo_debug(fuse_req_t req) +@@ -337,10 +357,18 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + { + int saverr; + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); +- int ifd = inode->fd; ++ struct lo_inode *inode; ++ int ifd; + int res; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ ++ ifd = inode->fd; ++ + if (valid & FUSE_SET_ATTR_MODE) { + if (fi) { + res = fchmod(fi->fh, attr->st_mode); +@@ -470,6 +498,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + inode->dev = e->attr.st_dev; + + pthread_mutex_lock(&lo->mutex); ++ inode->fuse_ino = lo_add_inode_mapping(req, inode); + prev = &lo->root; + next = prev->next; + next->prev = inode; +@@ -478,7 +507,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + prev->next = inode; + pthread_mutex_unlock(&lo->mutex); + } +- e->ino = (uintptr_t)inode; ++ e->ino = inode->fuse_ino; + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +@@ -582,10 +611,16 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + { + int res; + int saverr; +- struct lo_inode *dir = lo_inode(req, parent); ++ struct lo_inode *dir; + struct fuse_entry_param e; + struct lo_cred old = {}; + ++ dir = lo_inode(req, parent); ++ if (!dir) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOMEM; + + saverr = lo_change_cred(req, &old); +@@ -663,10 +698,16 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + { + int res; + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + struct fuse_entry_param e; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + memset(&e, 0, sizeof(struct fuse_entry_param)); + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; +@@ -684,7 +725,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + pthread_mutex_lock(&lo->mutex); + inode->refcount++; + pthread_mutex_unlock(&lo->mutex); +- e.ino = (uintptr_t)inode; ++ e.ino = inode->fuse_ino; + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +@@ -750,10 +791,10 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + next->prev = prev; + prev->next = next; + ++ lo_map_remove(&lo->ino_map, inode->fuse_ino); + pthread_mutex_unlock(&lo->mutex); + close(inode->fd); + free(inode); +- + } else { + pthread_mutex_unlock(&lo->mutex); + } +@@ -762,7 +803,12 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) + static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + { + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; ++ ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ return; ++ } + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +@@ -1244,10 +1290,16 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + { + char *value = NULL; + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1306,10 +1358,16 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + { + char *value = NULL; + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1367,10 +1425,16 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + const char *value, size_t size, int flags) + { + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1400,10 +1464,16 @@ out: + static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + { + char procname[64]; +- struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_inode *inode; + ssize_t ret; + int saverr; + ++ inode = lo_inode(req, ino); ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } ++ + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; +@@ -1522,6 +1592,7 @@ int main(int argc, char *argv[]) + struct fuse_session *se; + struct fuse_cmdline_opts opts; + struct lo_data lo = { .debug = 0, .writeback = 0 }; ++ struct lo_map_elem *root_elem; + int ret = -1; + + /* Don't mask creation mode, kernel already did that */ +@@ -1530,8 +1601,19 @@ int main(int argc, char *argv[]) + pthread_mutex_init(&lo.mutex, NULL); + lo.root.next = lo.root.prev = &lo.root; + lo.root.fd = -1; ++ lo.root.fuse_ino = FUSE_ROOT_ID; + lo.cache = CACHE_NORMAL; + ++ /* ++ * Set up the ino map like this: ++ * [0] Reserved (will not be used) ++ * [1] Root inode ++ */ ++ lo_map_init(&lo.ino_map); ++ lo_map_reserve(&lo.ino_map, 0)->in_use = false; ++ root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); ++ root_elem->inode = &lo.root; ++ + if (fuse_parse_cmdline(&args, &opts) != 0) { + return 1; + } +@@ -1628,6 +1710,8 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ lo_map_destroy(&lo.ino_map); ++ + if (lo.root.fd >= 0) { + close(lo.root.fd); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch b/kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch new file mode 100755 index 0000000000000000000000000000000000000000..4751f95183358d76930a0e580eb11cd1be57f9be --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch @@ -0,0 +1,182 @@ +From d56651e227bae83ee0cceb12bd91e3e9f6045ab3 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:13 +0100 +Subject: [PATCH 042/116] virtiofsd: passthrough_ll: add lo_map for ino/fh + indirection +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-39-dgilbert@redhat.com> +Patchwork-id: 93492 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 038/112] virtiofsd: passthrough_ll: add lo_map for ino/fh indirection +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +A layer of indirection is needed because passthrough_ll cannot expose +pointers or file descriptor numbers to untrusted clients. Malicious +clients could send invalid pointers or file descriptors in order to +crash or exploit the file system daemon. + +lo_map provides an integer key->value mapping. This will be used for +ino and fh fields in the patches that follow. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 25c135727b08dca90f00094e522a69170b13dfac) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 124 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 124 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 5e06179..e83a976 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -74,6 +74,21 @@ struct _uintptr_to_must_hold_fuse_ino_t_dummy_struct { + }; + #endif + ++struct lo_map_elem { ++ union { ++ /* Element values will go here... */ ++ ssize_t freelist; ++ }; ++ bool in_use; ++}; ++ ++/* Maps FUSE fh or ino values to internal objects */ ++struct lo_map { ++ struct lo_map_elem *elems; ++ size_t nelems; ++ ssize_t freelist; ++}; ++ + struct lo_inode { + struct lo_inode *next; /* protected by lo->mutex */ + struct lo_inode *prev; /* protected by lo->mutex */ +@@ -130,6 +145,115 @@ static struct lo_data *lo_data(fuse_req_t req) + return (struct lo_data *)fuse_req_userdata(req); + } + ++__attribute__((unused)) static void lo_map_init(struct lo_map *map) ++{ ++ map->elems = NULL; ++ map->nelems = 0; ++ map->freelist = -1; ++} ++ ++__attribute__((unused)) static void lo_map_destroy(struct lo_map *map) ++{ ++ free(map->elems); ++} ++ ++static int lo_map_grow(struct lo_map *map, size_t new_nelems) ++{ ++ struct lo_map_elem *new_elems; ++ size_t i; ++ ++ if (new_nelems <= map->nelems) { ++ return 1; ++ } ++ ++ new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems); ++ if (!new_elems) { ++ return 0; ++ } ++ ++ for (i = map->nelems; i < new_nelems; i++) { ++ new_elems[i].freelist = i + 1; ++ new_elems[i].in_use = false; ++ } ++ new_elems[new_nelems - 1].freelist = -1; ++ ++ map->elems = new_elems; ++ map->freelist = map->nelems; ++ map->nelems = new_nelems; ++ return 1; ++} ++ ++__attribute__((unused)) static struct lo_map_elem * ++lo_map_alloc_elem(struct lo_map *map) ++{ ++ struct lo_map_elem *elem; ++ ++ if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) { ++ return NULL; ++ } ++ ++ elem = &map->elems[map->freelist]; ++ map->freelist = elem->freelist; ++ ++ elem->in_use = true; ++ ++ return elem; ++} ++ ++__attribute__((unused)) static struct lo_map_elem * ++lo_map_reserve(struct lo_map *map, size_t key) ++{ ++ ssize_t *prev; ++ ++ if (!lo_map_grow(map, key + 1)) { ++ return NULL; ++ } ++ ++ for (prev = &map->freelist; *prev != -1; ++ prev = &map->elems[*prev].freelist) { ++ if (*prev == key) { ++ struct lo_map_elem *elem = &map->elems[key]; ++ ++ *prev = elem->freelist; ++ elem->in_use = true; ++ return elem; ++ } ++ } ++ return NULL; ++} ++ ++__attribute__((unused)) static struct lo_map_elem * ++lo_map_get(struct lo_map *map, size_t key) ++{ ++ if (key >= map->nelems) { ++ return NULL; ++ } ++ if (!map->elems[key].in_use) { ++ return NULL; ++ } ++ return &map->elems[key]; ++} ++ ++__attribute__((unused)) static void lo_map_remove(struct lo_map *map, ++ size_t key) ++{ ++ struct lo_map_elem *elem; ++ ++ if (key >= map->nelems) { ++ return; ++ } ++ ++ elem = &map->elems[key]; ++ if (!elem->in_use) { ++ return; ++ } ++ ++ elem->in_use = false; ++ ++ elem->freelist = map->freelist; ++ map->freelist = key; ++} ++ + static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) + { + if (ino == FUSE_ROOT_ID) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch b/kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch new file mode 100755 index 0000000000000000000000000000000000000000..a3f7970aaf9814c41deadc81f84b36f7c1d939eb --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch @@ -0,0 +1,52 @@ +From 86b4f2865f2ebd7e6b3d85beb66a9390eb46eb96 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:45 +0100 +Subject: [PATCH 074/116] virtiofsd: passthrough_ll: add renameat2 support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-71-dgilbert@redhat.com> +Patchwork-id: 93531 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 070/112] virtiofsd: passthrough_ll: add renameat2 support +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit f0ab7d6f78a7d3c1c19fd81a91c9b1199f56c4f6) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 98114a3..18d69ab 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1099,7 +1099,17 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + if (flags) { ++#ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); ++#else ++ res = syscall(SYS_renameat2, lo_fd(req, parent), name, ++ lo_fd(req, newparent), newname, flags); ++ if (res == -1 && errno == ENOSYS) { ++ fuse_reply_err(req, EINVAL); ++ } else { ++ fuse_reply_err(req, res == -1 ? errno : 0); ++ } ++#endif + return; + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch b/kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch new file mode 100755 index 0000000000000000000000000000000000000000..dc87ef204407f027b9f182e55f7ba63a223993cb --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch @@ -0,0 +1,138 @@ +From 079199c53f483f0051f994b195ebb595aec76a39 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:51 +0100 +Subject: [PATCH 080/116] virtiofsd: passthrough_ll: clean up cache related + options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-77-dgilbert@redhat.com> +Patchwork-id: 93530 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 076/112] virtiofsd: passthrough_ll: clean up cache related options +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + + - Rename "cache=never" to "cache=none" to match 9p's similar option. + + - Rename CACHE_NORMAL constant to CACHE_AUTO to match the "cache=auto" + option. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. David Alan Gilbert +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 230e777b5e250759ee0480fcc0e9ccfa2b082fba) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 5 ++++- + tools/virtiofsd/passthrough_ll.c | 20 ++++++++++---------- + 2 files changed, 14 insertions(+), 11 deletions(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 14f5d70..5672024 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -145,6 +145,9 @@ void fuse_cmdline_help(void) + " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" ++ " -o cache= cache mode. could be one of \"auto, " ++ "always, none\"\n" ++ " default: auto\n" + " -o log_level= log level, default to \"info\"\n" + " level could be one of \"debug, " + "info, warn, err\"\n" +@@ -156,7 +159,7 @@ void fuse_cmdline_help(void) + " -o readdirplus|no_readdirplus\n" + " enable/disable readirplus\n" + " default: readdirplus except with " +- "cache=never\n" ++ "cache=none\n" + ); + } + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 9e7191e..b40f287 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -101,8 +101,8 @@ struct lo_cred { + }; + + enum { +- CACHE_NEVER, +- CACHE_NORMAL, ++ CACHE_NONE, ++ CACHE_AUTO, + CACHE_ALWAYS, + }; + +@@ -138,8 +138,8 @@ static const struct fuse_opt lo_opts[] = { + { "no_xattr", offsetof(struct lo_data, xattr), 0 }, + { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, + { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, +- { "cache=never", offsetof(struct lo_data, cache), CACHE_NEVER }, +- { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, ++ { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE }, ++ { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, + { "norace", offsetof(struct lo_data, norace), 1 }, + { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, +@@ -482,7 +482,7 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } +- if ((lo->cache == CACHE_NEVER && !lo->readdirplus_set) || ++ if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || + lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); + conn->want &= ~FUSE_CAP_READDIRPLUS; +@@ -1493,7 +1493,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + fi->fh = fh; + err = lo_do_lookup(req, parent, name, &e); + } +- if (lo->cache == CACHE_NEVER) { ++ if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; +@@ -1578,7 +1578,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + } + + fi->fh = fh; +- if (lo->cache == CACHE_NEVER) { ++ if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; +@@ -2395,7 +2395,7 @@ int main(int argc, char *argv[]) + lo.root.next = lo.root.prev = &lo.root; + lo.root.fd = -1; + lo.root.fuse_ino = FUSE_ROOT_ID; +- lo.cache = CACHE_NORMAL; ++ lo.cache = CACHE_AUTO; + + /* + * Set up the ino map like this: +@@ -2470,11 +2470,11 @@ int main(int argc, char *argv[]) + } + if (!lo.timeout_set) { + switch (lo.cache) { +- case CACHE_NEVER: ++ case CACHE_NONE: + lo.timeout = 0.0; + break; + +- case CACHE_NORMAL: ++ case CACHE_AUTO: + lo.timeout = 1.0; + break; + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch b/kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch new file mode 100755 index 0000000000000000000000000000000000000000..c55eeadeb56a7751d36d7ac2f070b4e0d43a570e --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch @@ -0,0 +1,154 @@ +From f93ea308351cbe2630d7ecf637c3b69894d84a11 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 3 Mar 2020 18:43:13 +0000 +Subject: [PATCH 17/18] virtiofsd: passthrough_ll: cleanup getxattr/listxattr +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200303184314.155564-7-dgilbert@redhat.com> +Patchwork-id: 94125 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 6/7] virtiofsd: passthrough_ll: cleanup getxattr/listxattr +Bugzilla: 1797064 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Ján Tomko + +From: Misono Tomohiro + +This is a cleanup patch to simplify the following xattr fix and +there is no functional changes. + +- Move memory allocation to head of the function +- Unify fgetxattr/flistxattr call for both size == 0 and + size != 0 case +- Remove redundant lo_inode_put call in error path + (Note: second call is ignored now since @inode is already NULL) + +Signed-off-by: Misono Tomohiro +Message-Id: <20200227055927.24566-2-misono.tomohiro@jp.fujitsu.com> +Acked-by: Vivek Goyal +Reviewed-by: Dr. David Alan Gilbert +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 16e15a73089102c3d8846792d514e769300fcc3c) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/passthrough_ll.c | 54 ++++++++++++++++------------------------ + 1 file changed, 22 insertions(+), 32 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c635fc8..50c7273 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -2199,34 +2199,30 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ } ++ + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + ++ ret = fgetxattr(fd, name, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } + if (size) { +- value = malloc(size); +- if (!value) { +- goto out_err; +- } +- +- ret = fgetxattr(fd, name, value, size); +- if (ret == -1) { +- goto out_err; +- } + saverr = 0; + if (ret == 0) { + goto out; + } +- + fuse_reply_buf(req, value, ret); + } else { +- ret = fgetxattr(fd, name, NULL, 0); +- if (ret == -1) { +- goto out_err; +- } +- + fuse_reply_xattr(req, ret); + } + out_free: +@@ -2242,7 +2238,6 @@ out_free: + out_err: + saverr = errno; + out: +- lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +@@ -2277,34 +2272,30 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out; + } + ++ if (size) { ++ value = malloc(size); ++ if (!value) { ++ goto out_err; ++ } ++ } ++ + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + ++ ret = flistxattr(fd, value, size); ++ if (ret == -1) { ++ goto out_err; ++ } + if (size) { +- value = malloc(size); +- if (!value) { +- goto out_err; +- } +- +- ret = flistxattr(fd, value, size); +- if (ret == -1) { +- goto out_err; +- } + saverr = 0; + if (ret == 0) { + goto out; + } +- + fuse_reply_buf(req, value, ret); + } else { +- ret = flistxattr(fd, NULL, 0); +- if (ret == -1) { +- goto out_err; +- } +- + fuse_reply_xattr(req, ret); + } + out_free: +@@ -2320,7 +2311,6 @@ out_free: + out_err: + saverr = errno; + out: +- lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-control-readdirplus.patch b/kvm-virtiofsd-passthrough_ll-control-readdirplus.patch new file mode 100755 index 0000000000000000000000000000000000000000..98d00fcf7152b4c846c498a8bc96e489418c73e8 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-control-readdirplus.patch @@ -0,0 +1,79 @@ +From 0f1d456fad4ba6a696eff8976b9fe8a0f251e1b5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:47 +0100 +Subject: [PATCH 076/116] virtiofsd: passthrough_ll: control readdirplus +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-73-dgilbert@redhat.com> +Patchwork-id: 93524 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 072/112] virtiofsd: passthrough_ll: control readdirplus +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 59aef494be2d8d91055ff3f3a8eb13d9f32873d8) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/helper.c | 4 ++++ + tools/virtiofsd/passthrough_ll.c | 7 ++++++- + 2 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 6d50a46..14f5d70 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -153,6 +153,10 @@ void fuse_cmdline_help(void) + " allowed (default: 10)\n" + " -o norace disable racy fallback\n" + " default: false\n" ++ " -o readdirplus|no_readdirplus\n" ++ " enable/disable readirplus\n" ++ " default: readdirplus except with " ++ "cache=never\n" + ); + } + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 6480c51..8b1784f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -117,6 +117,8 @@ struct lo_data { + double timeout; + int cache; + int timeout_set; ++ int readdirplus_set; ++ int readdirplus_clear; + struct lo_inode root; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ +@@ -140,6 +142,8 @@ static const struct fuse_opt lo_opts[] = { + { "cache=auto", offsetof(struct lo_data, cache), CACHE_NORMAL }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, + { "norace", offsetof(struct lo_data, norace), 1 }, ++ { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, ++ { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, + FUSE_OPT_END + }; + static bool use_syslog = false; +@@ -478,7 +482,8 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } +- if (lo->cache == CACHE_NEVER) { ++ if ((lo->cache == CACHE_NEVER && !lo->readdirplus_set) || ++ lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); + conn->want &= ~FUSE_CAP_READDIRPLUS; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch b/kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch new file mode 100755 index 0000000000000000000000000000000000000000..4b02779d0d315e523f207e5dd2c8294524b3b2d6 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch @@ -0,0 +1,198 @@ +From af14ef1dba9356e566c9c7531b8fd23361c2b16d Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:12 +0100 +Subject: [PATCH 041/116] virtiofsd: passthrough_ll: create new files in + caller's context +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-38-dgilbert@redhat.com> +Patchwork-id: 93488 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 037/112] virtiofsd: passthrough_ll: create new files in caller's context +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Vivek Goyal + +We need to create files in the caller's context. Otherwise after +creating a file, the caller might not be able to do file operations on +that file. + +Changed effective uid/gid to caller's uid/gid, create file and then +switch back to uid/gid 0. + +Use syscall(setresuid, ...) otherwise glibc does some magic to change EUID +in all threads, which is not what we want. + +Signed-off-by: Vivek Goyal +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 929cfb7a9a1b101cdfc9ac19807ecab4c81a13e4) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 96 +++++++++++++++++++++++++++++++++++++--- + 1 file changed, 91 insertions(+), 5 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index cd27c09..5e06179 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -83,6 +84,11 @@ struct lo_inode { + uint64_t refcount; /* protected by lo->mutex */ + }; + ++struct lo_cred { ++ uid_t euid; ++ gid_t egid; ++}; ++ + enum { + CACHE_NEVER, + CACHE_NORMAL, +@@ -383,6 +389,69 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + } + } + ++/* ++ * On some archs, setres*id is limited to 2^16 but they ++ * provide setres*id32 variants that allow 2^32. ++ * Others just let setres*id do 2^32 anyway. ++ */ ++#ifdef SYS_setresgid32 ++#define OURSYS_setresgid SYS_setresgid32 ++#else ++#define OURSYS_setresgid SYS_setresgid ++#endif ++ ++#ifdef SYS_setresuid32 ++#define OURSYS_setresuid SYS_setresuid32 ++#else ++#define OURSYS_setresuid SYS_setresuid ++#endif ++ ++/* ++ * Change to uid/gid of caller so that file is created with ++ * ownership of caller. ++ * TODO: What about selinux context? ++ */ ++static int lo_change_cred(fuse_req_t req, struct lo_cred *old) ++{ ++ int res; ++ ++ old->euid = geteuid(); ++ old->egid = getegid(); ++ ++ res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1); ++ if (res == -1) { ++ return errno; ++ } ++ ++ res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1); ++ if (res == -1) { ++ int errno_save = errno; ++ ++ syscall(OURSYS_setresgid, -1, old->egid, -1); ++ return errno_save; ++ } ++ ++ return 0; ++} ++ ++/* Regain Privileges */ ++static void lo_restore_cred(struct lo_cred *old) ++{ ++ int res; ++ ++ res = syscall(OURSYS_setresuid, -1, old->euid, -1); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid); ++ exit(1); ++ } ++ ++ res = syscall(OURSYS_setresgid, -1, old->egid, -1); ++ if (res == -1) { ++ fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid); ++ exit(1); ++ } ++} ++ + static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + const char *name, mode_t mode, dev_t rdev, + const char *link) +@@ -391,12 +460,21 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + int saverr; + struct lo_inode *dir = lo_inode(req, parent); + struct fuse_entry_param e; ++ struct lo_cred old = {}; + + saverr = ENOMEM; + ++ saverr = lo_change_cred(req, &old); ++ if (saverr) { ++ goto out; ++ } ++ + res = mknod_wrapper(dir->fd, name, link, mode, rdev); + + saverr = errno; ++ ++ lo_restore_cred(&old); ++ + if (res == -1) { + goto out; + } +@@ -794,26 +872,34 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_data *lo = lo_data(req); + struct fuse_entry_param e; + int err; ++ struct lo_cred old = {}; + + if (lo_debug(req)) { + fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", + parent, name); + } + ++ err = lo_change_cred(req, &old); ++ if (err) { ++ goto out; ++ } ++ + fd = openat(lo_fd(req, parent), name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); +- if (fd == -1) { +- return (void)fuse_reply_err(req, errno); +- } ++ err = fd == -1 ? errno : 0; ++ lo_restore_cred(&old); + +- fi->fh = fd; ++ if (!err) { ++ fi->fh = fd; ++ err = lo_do_lookup(req, parent, name, &e); ++ } + if (lo->cache == CACHE_NEVER) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } + +- err = lo_do_lookup(req, parent, name, &e); ++out: + if (err) { + fuse_reply_err(req, err); + } else { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch b/kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch new file mode 100755 index 0000000000000000000000000000000000000000..4a531a302bd349eaaf6da7634be02be751c500dc --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch @@ -0,0 +1,50 @@ +From bbf92338e5e5eed796d511d2bd3c3686b7d1e5fd Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:46 +0100 +Subject: [PATCH 075/116] virtiofsd: passthrough_ll: disable readdirplus on + cache=never +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-72-dgilbert@redhat.com> +Patchwork-id: 93525 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 071/112] virtiofsd: passthrough_ll: disable readdirplus on cache=never +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +...because the attributes sent in the READDIRPLUS reply would be discarded +anyway. + +Signed-off-by: Miklos Szeredi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit ddcbabcb0ea177be3ec3500726b699c7c26ffd93) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 18d69ab..6480c51 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -478,6 +478,10 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } ++ if (lo->cache == CACHE_NEVER) { ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); ++ conn->want &= ~FUSE_CAP_READDIRPLUS; ++ } + } + + static void lo_getattr(fuse_req_t req, fuse_ino_t ino, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch b/kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch new file mode 100755 index 0000000000000000000000000000000000000000..00e11b48b5a1dde9a93582027a470c026b0c8f7b --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch @@ -0,0 +1,143 @@ +From 5e33269d5fbc4ba4614bab4a6b9e0ef759bebcb7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:10 +0100 +Subject: [PATCH 099/116] virtiofsd: passthrough_ll: fix refcounting on + remove/rename +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-96-dgilbert@redhat.com> +Patchwork-id: 93549 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 095/112] virtiofsd: passthrough_ll: fix refcounting on remove/rename +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9257e514d861afa759c36704e1904d43ca3fec88) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 50 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 49 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c819b5f..e3a6d6b 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1140,17 +1140,42 @@ out_err: + fuse_reply_err(req, saverr); + } + ++static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, ++ const char *name) ++{ ++ int res; ++ struct stat attr; ++ ++ res = fstatat(lo_fd(req, parent), name, &attr, ++ AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); ++ if (res == -1) { ++ return NULL; ++ } ++ ++ return lo_find(lo_data(req), &attr); ++} ++ + static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; ++ struct lo_inode *inode; ++ struct lo_data *lo = lo_data(req); ++ + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + ++ inode = lookup_name(req, parent, name); ++ if (!inode) { ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ + res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + + fuse_reply_err(req, res == -1 ? errno : 0); ++ unref_inode_lolocked(lo, inode, 1); + } + + static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, +@@ -1158,12 +1183,23 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + unsigned int flags) + { + int res; ++ struct lo_inode *oldinode; ++ struct lo_inode *newinode; ++ struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { + fuse_reply_err(req, EINVAL); + return; + } + ++ oldinode = lookup_name(req, parent, name); ++ newinode = lookup_name(req, newparent, newname); ++ ++ if (!oldinode) { ++ fuse_reply_err(req, EIO); ++ goto out; ++ } ++ + if (flags) { + #ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); +@@ -1176,26 +1212,38 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_reply_err(req, res == -1 ? errno : 0); + } + #endif +- return; ++ goto out; + } + + res = renameat(lo_fd(req, parent), name, lo_fd(req, newparent), newname); + + fuse_reply_err(req, res == -1 ? errno : 0); ++out: ++ unref_inode_lolocked(lo, oldinode, 1); ++ unref_inode_lolocked(lo, newinode, 1); + } + + static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; ++ struct lo_inode *inode; ++ struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + ++ inode = lookup_name(req, parent, name); ++ if (!inode) { ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ + res = unlinkat(lo_fd(req, parent), name, 0); + + fuse_reply_err(req, res == -1 ? errno : 0); ++ unref_inode_lolocked(lo, inode, 1); + } + + static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-passthrough_ll-use-hashtable.patch b/kvm-virtiofsd-passthrough_ll-use-hashtable.patch new file mode 100755 index 0000000000000000000000000000000000000000..b0be1f9733265db748b7ee47c86b01a78497e6c9 --- /dev/null +++ b/kvm-virtiofsd-passthrough_ll-use-hashtable.patch @@ -0,0 +1,211 @@ +From 44f4434b1305f6ff47b4f63fafcf39bcea9e4ceb Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:52 +0100 +Subject: [PATCH 081/116] virtiofsd: passthrough_ll: use hashtable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-78-dgilbert@redhat.com> +Patchwork-id: 93528 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 077/112] virtiofsd: passthrough_ll: use hashtable +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Improve performance of inode lookup by using a hash table. + +Signed-off-by: Miklos Szeredi +Signed-off-by: Dr. David Alan Gilbert +Signed-off-by: Liu Bo +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit bfc50a6e06b10b2f9dbaf6c1a89dd523322e016f) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 81 ++++++++++++++++++++++------------------ + 1 file changed, 45 insertions(+), 36 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index b40f287..b176a31 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -84,13 +84,15 @@ struct lo_map { + ssize_t freelist; + }; + ++struct lo_key { ++ ino_t ino; ++ dev_t dev; ++}; ++ + struct lo_inode { +- struct lo_inode *next; /* protected by lo->mutex */ +- struct lo_inode *prev; /* protected by lo->mutex */ + int fd; + bool is_symlink; +- ino_t ino; +- dev_t dev; ++ struct lo_key key; + uint64_t refcount; /* protected by lo->mutex */ + fuse_ino_t fuse_ino; + }; +@@ -119,7 +121,8 @@ struct lo_data { + int timeout_set; + int readdirplus_set; + int readdirplus_clear; +- struct lo_inode root; /* protected by lo->mutex */ ++ struct lo_inode root; ++ GHashTable *inodes; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ + struct lo_map fd_map; /* protected by lo->mutex */ +@@ -573,7 +576,7 @@ retry: + } + goto fail_unref; + } +- if (stat.st_dev != inode->dev || stat.st_ino != inode->ino) { ++ if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) { + if (!retries) { + fuse_log(FUSE_LOG_WARNING, + "%s: failed to match last\n", __func__); +@@ -753,19 +756,20 @@ out_err: + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + { + struct lo_inode *p; +- struct lo_inode *ret = NULL; ++ struct lo_key key = { ++ .ino = st->st_ino, ++ .dev = st->st_dev, ++ }; + + pthread_mutex_lock(&lo->mutex); +- for (p = lo->root.next; p != &lo->root; p = p->next) { +- if (p->ino == st->st_ino && p->dev == st->st_dev) { +- assert(p->refcount > 0); +- ret = p; +- ret->refcount++; +- break; +- } ++ p = g_hash_table_lookup(lo->inodes, &key); ++ if (p) { ++ assert(p->refcount > 0); ++ p->refcount++; + } + pthread_mutex_unlock(&lo->mutex); +- return ret; ++ ++ return p; + } + + static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, +@@ -810,8 +814,6 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + close(newfd); + newfd = -1; + } else { +- struct lo_inode *prev, *next; +- + saverr = ENOMEM; + inode = calloc(1, sizeof(struct lo_inode)); + if (!inode) { +@@ -822,17 +824,12 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + inode->refcount = 1; + inode->fd = newfd; + newfd = -1; +- inode->ino = e->attr.st_ino; +- inode->dev = e->attr.st_dev; ++ inode->key.ino = e->attr.st_ino; ++ inode->key.dev = e->attr.st_dev; + + pthread_mutex_lock(&lo->mutex); + inode->fuse_ino = lo_add_inode_mapping(req, inode); +- prev = &lo->root; +- next = prev->next; +- next->prev = inode; +- inode->next = next; +- inode->prev = prev; +- prev->next = inode; ++ g_hash_table_insert(lo->inodes, &inode->key, inode); + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; +@@ -1162,14 +1159,8 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + assert(inode->refcount >= n); + inode->refcount -= n; + if (!inode->refcount) { +- struct lo_inode *prev, *next; +- +- prev = inode->prev; +- next = inode->next; +- next->prev = prev; +- prev->next = next; +- + lo_map_remove(&lo->ino_map, inode->fuse_ino); ++ g_hash_table_remove(lo->inodes, &inode->key); + pthread_mutex_unlock(&lo->mutex); + close(inode->fd); + free(inode); +@@ -1369,7 +1360,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + /* Hide root's parent directory */ + if (dinode == &lo->root && strcmp(name, "..") == 0) { +- e.attr.st_ino = lo->root.ino; ++ e.attr.st_ino = lo->root.key.ino; + e.attr.st_mode = DT_DIR << 12; + } + +@@ -2370,11 +2361,26 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) + + root->is_symlink = false; + root->fd = fd; +- root->ino = stat.st_ino; +- root->dev = stat.st_dev; ++ root->key.ino = stat.st_ino; ++ root->key.dev = stat.st_dev; + root->refcount = 2; + } + ++static guint lo_key_hash(gconstpointer key) ++{ ++ const struct lo_key *lkey = key; ++ ++ return (guint)lkey->ino + (guint)lkey->dev; ++} ++ ++static gboolean lo_key_equal(gconstpointer a, gconstpointer b) ++{ ++ const struct lo_key *la = a; ++ const struct lo_key *lb = b; ++ ++ return la->ino == lb->ino && la->dev == lb->dev; ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2392,7 +2398,7 @@ int main(int argc, char *argv[]) + umask(0); + + pthread_mutex_init(&lo.mutex, NULL); +- lo.root.next = lo.root.prev = &lo.root; ++ lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); + lo.root.fd = -1; + lo.root.fuse_ino = FUSE_ROOT_ID; + lo.cache = CACHE_AUTO; +@@ -2522,6 +2528,9 @@ err_out2: + err_out1: + fuse_opt_free_args(&args); + ++ if (lo.inodes) { ++ g_hash_table_destroy(lo.inodes); ++ } + lo_map_destroy(&lo.fd_map); + lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch b/kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch new file mode 100755 index 0000000000000000000000000000000000000000..68eb03e3b76733c9ef21d89e2f4be2c09b5c1650 --- /dev/null +++ b/kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch @@ -0,0 +1,54 @@ +From feb005dfeb15dd5ac5156c994f323ab4c573b1fc Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:24 +0100 +Subject: [PATCH 053/116] virtiofsd: prevent ".." escape in lo_do_lookup() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-50-dgilbert@redhat.com> +Patchwork-id: 93500 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 049/112] virtiofsd: prevent ".." escape in lo_do_lookup() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 854684bc0b3d63eb90b3abdfe471c2e4271ef176) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e375406..79d5966 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -624,12 +624,17 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + int res; + int saverr; + struct lo_data *lo = lo_data(req); +- struct lo_inode *inode; ++ struct lo_inode *inode, *dir = lo_inode(req, parent); + + memset(e, 0, sizeof(*e)); + e->attr_timeout = lo->timeout; + e->entry_timeout = lo->timeout; + ++ /* Do not allow escaping root directory */ ++ if (dir == &lo->root && strcmp(name, "..") == 0) { ++ name = "."; ++ } ++ + newfd = openat(lo_fd(req, parent), name, O_PATH | O_NOFOLLOW); + if (newfd == -1) { + goto out_err; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch b/kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch new file mode 100755 index 0000000000000000000000000000000000000000..5f97cbf2a16ead0daf6457d3569b8a4eda3e2a7f --- /dev/null +++ b/kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch @@ -0,0 +1,108 @@ +From 97e232e75bbc0032f4a309d248f383384612eafe Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:25 +0100 +Subject: [PATCH 054/116] virtiofsd: prevent ".." escape in lo_do_readdir() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-51-dgilbert@redhat.com> +Patchwork-id: 93507 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 050/112] virtiofsd: prevent ".." escape in lo_do_readdir() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Construct a fake dirent for the root directory's ".." entry. This hides +the parent directory from the FUSE client. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 752272da2b68a2312f0e11fc5303015a6c3ee1ac) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 36 ++++++++++++++++++++++-------------- + 1 file changed, 22 insertions(+), 14 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 79d5966..e3d65c3 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1149,19 +1149,25 @@ out_err: + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { ++ struct lo_data *lo = lo_data(req); + struct lo_dirp *d; ++ struct lo_inode *dinode; + char *buf = NULL; + char *p; + size_t rem = size; +- int err = ENOMEM; ++ int err = EBADF; + +- (void)ino; ++ dinode = lo_inode(req, ino); ++ if (!dinode) { ++ goto error; ++ } + + d = lo_dirp(req, fi); + if (!d) { + goto error; + } + ++ err = ENOMEM; + buf = calloc(1, size); + if (!buf) { + goto error; +@@ -1192,15 +1198,21 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + } + nextoff = d->entry->d_off; + name = d->entry->d_name; ++ + fuse_ino_t entry_ino = 0; ++ struct fuse_entry_param e = (struct fuse_entry_param){ ++ .attr.st_ino = d->entry->d_ino, ++ .attr.st_mode = d->entry->d_type << 12, ++ }; ++ ++ /* Hide root's parent directory */ ++ if (dinode == &lo->root && strcmp(name, "..") == 0) { ++ e.attr.st_ino = lo->root.ino; ++ e.attr.st_mode = DT_DIR << 12; ++ } ++ + if (plus) { +- struct fuse_entry_param e; +- if (is_dot_or_dotdot(name)) { +- e = (struct fuse_entry_param){ +- .attr.st_ino = d->entry->d_ino, +- .attr.st_mode = d->entry->d_type << 12, +- }; +- } else { ++ if (!is_dot_or_dotdot(name)) { + err = lo_do_lookup(req, ino, name, &e); + if (err) { + goto error; +@@ -1210,11 +1222,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); + } else { +- struct stat st = { +- .st_ino = d->entry->d_ino, +- .st_mode = d->entry->d_type << 12, +- }; +- entsize = fuse_add_direntry(req, p, rem, name, &st, nextoff); ++ entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff); + } + if (entsize > rem) { + if (entry_ino != 0) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch b/kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch new file mode 100755 index 0000000000000000000000000000000000000000..be7c120eb02e56c1520f5360977a5b1c85f48abf --- /dev/null +++ b/kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch @@ -0,0 +1,103 @@ +From 249c02ae54739dc5894ee1b2905bbe8f1e79e909 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:20 +0100 +Subject: [PATCH 109/116] virtiofsd: prevent FUSE_INIT/FUSE_DESTROY races +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-106-dgilbert@redhat.com> +Patchwork-id: 93562 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 105/112] virtiofsd: prevent FUSE_INIT/FUSE_DESTROY races +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +When running with multiple threads it can be tricky to handle +FUSE_INIT/FUSE_DESTROY in parallel with other request types or in +parallel with themselves. Serialize FUSE_INIT and FUSE_DESTROY so that +malicious clients cannot trigger race conditions. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Masayoshi Mizuma +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit cdc497c6925be745bc895355bd4674a17a4b2a8b) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_i.h | 1 + + tools/virtiofsd/fuse_lowlevel.c | 18 ++++++++++++++++++ + 2 files changed, 19 insertions(+) + +diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h +index a20854f..1447d86 100644 +--- a/tools/virtiofsd/fuse_i.h ++++ b/tools/virtiofsd/fuse_i.h +@@ -61,6 +61,7 @@ struct fuse_session { + struct fuse_req list; + struct fuse_req interrupts; + pthread_mutex_t lock; ++ pthread_rwlock_t init_rwlock; + int got_destroy; + int broken_splice_nonblock; + uint64_t notify_ctr; +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index dab6a31..79a4031 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2428,6 +2428,19 @@ void fuse_session_process_buf_int(struct fuse_session *se, + req->ctx.pid = in->pid; + req->ch = ch; + ++ /* ++ * INIT and DESTROY requests are serialized, all other request types ++ * run in parallel. This prevents races between FUSE_INIT and ordinary ++ * requests, FUSE_INIT and FUSE_INIT, FUSE_INIT and FUSE_DESTROY, and ++ * FUSE_DESTROY and FUSE_DESTROY. ++ */ ++ if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT || ++ in->opcode == FUSE_DESTROY) { ++ pthread_rwlock_wrlock(&se->init_rwlock); ++ } else { ++ pthread_rwlock_rdlock(&se->init_rwlock); ++ } ++ + err = EIO; + if (!se->got_init) { + enum fuse_opcode expected; +@@ -2485,10 +2498,13 @@ void fuse_session_process_buf_int(struct fuse_session *se, + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter); + } ++ ++ pthread_rwlock_unlock(&se->init_rwlock); + return; + + reply_err: + fuse_reply_err(req, err); ++ pthread_rwlock_unlock(&se->init_rwlock); + } + + #define LL_OPTION(n, o, v) \ +@@ -2531,6 +2547,7 @@ void fuse_session_destroy(struct fuse_session *se) + se->op.destroy(se->userdata); + } + } ++ pthread_rwlock_destroy(&se->init_rwlock); + pthread_mutex_destroy(&se->lock); + free(se->cuse_data); + if (se->fd != -1) { +@@ -2610,6 +2627,7 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + list_init_req(&se->list); + list_init_req(&se->interrupts); + fuse_mutex_init(&se->lock); ++ pthread_rwlock_init(&se->init_rwlock, NULL); + + memcpy(&se->op, op, op_size); + se->owner = getuid(); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch b/kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch new file mode 100755 index 0000000000000000000000000000000000000000..8eabede5914abb1c025b23df42f6549290f8d38a --- /dev/null +++ b/kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch @@ -0,0 +1,149 @@ +From 69c6a829f8136a8c95ccdf480f2fd0173d64b6ec Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:05 +0100 +Subject: [PATCH 094/116] virtiofsd: prevent fv_queue_thread() vs virtio_loop() + races +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-91-dgilbert@redhat.com> +Patchwork-id: 93544 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 090/112] virtiofsd: prevent fv_queue_thread() vs virtio_loop() races +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +We call into libvhost-user from the virtqueue handler thread and the +vhost-user message processing thread without a lock. There is nothing +protecting the virtqueue handler thread if the vhost-user message +processing thread changes the virtqueue or memory table while it is +running. + +This patch introduces a read-write lock. Virtqueue handler threads are +readers. The vhost-user message processing thread is a writer. This +will allow concurrency for multiqueue in the future while protecting +against fv_queue_thread() vs virtio_loop() races. + +Note that the critical sections could be made smaller but it would be +more invasive and require libvhost-user changes. Let's start simple and +improve performance later, if necessary. Another option would be an +RCU-style approach with lighter-weight primitives. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit e7b337326d594b71b07cd6dbb332c49c122c80a4) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 34 +++++++++++++++++++++++++++++++++- + 1 file changed, 33 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index fb8d6d1..f6242f9 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -59,6 +59,18 @@ struct fv_VuDev { + struct fuse_session *se; + + /* ++ * Either handle virtqueues or vhost-user protocol messages. Don't do ++ * both at the same time since that could lead to race conditions if ++ * virtqueues or memory tables change while another thread is accessing ++ * them. ++ * ++ * The assumptions are: ++ * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev. ++ * 2. virtio_loop() reads/writes virtqueues and VuDev. ++ */ ++ pthread_rwlock_t vu_dispatch_rwlock; ++ ++ /* + * The following pair of fields are only accessed in the main + * virtio_loop + */ +@@ -415,6 +427,8 @@ static void *fv_queue_thread(void *opaque) + qi->qidx, qi->kick_fd); + while (1) { + struct pollfd pf[2]; ++ int ret; ++ + pf[0].fd = qi->kick_fd; + pf[0].events = POLLIN; + pf[0].revents = 0; +@@ -461,6 +475,9 @@ static void *fv_queue_thread(void *opaque) + fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); + break; + } ++ /* Mutual exclusion with virtio_loop() */ ++ ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ assert(ret == 0); /* there is no possible error case */ + /* out is from guest, in is too guest */ + unsigned int in_bytes, out_bytes; + vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); +@@ -469,6 +486,7 @@ static void *fv_queue_thread(void *opaque) + "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + ++ + while (1) { + bool allocated_bufv = false; + struct fuse_bufvec bufv; +@@ -597,6 +615,8 @@ static void *fv_queue_thread(void *opaque) + free(elem); + elem = NULL; + } ++ ++ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + } + out: + pthread_mutex_destroy(&ch.lock); +@@ -711,6 +731,8 @@ int virtio_loop(struct fuse_session *se) + + while (!fuse_session_exited(se)) { + struct pollfd pf[1]; ++ bool ok; ++ int ret; + pf[0].fd = se->vu_socketfd; + pf[0].events = POLLIN; + pf[0].revents = 0; +@@ -735,7 +757,15 @@ int virtio_loop(struct fuse_session *se) + } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); +- if (!vu_dispatch(&se->virtio_dev->dev)) { ++ /* Mutual exclusion with fv_queue_thread() */ ++ ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock); ++ assert(ret == 0); /* there is no possible error case */ ++ ++ ok = vu_dispatch(&se->virtio_dev->dev); ++ ++ pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock); ++ ++ if (!ok) { + fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); + break; + } +@@ -877,6 +907,7 @@ int virtio_session_mount(struct fuse_session *se) + + se->vu_socketfd = data_sock; + se->virtio_dev->se = se; ++ pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL); + vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch, + fv_remove_watch, &fv_iface); + +@@ -892,6 +923,7 @@ void virtio_session_close(struct fuse_session *se) + } + + free(se->virtio_dev->qi); ++ pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock); + free(se->virtio_dev); + se->virtio_dev = NULL; + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch b/kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch new file mode 100755 index 0000000000000000000000000000000000000000..5956dcee3f921a7ca056daa9e856929a8c906885 --- /dev/null +++ b/kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch @@ -0,0 +1,314 @@ +From cc9a776fba8ec62c862db55753107f19459dafa8 Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Tue, 9 Feb 2021 23:14:56 -0500 +Subject: [PATCH 3/3] virtiofsd: prevent opening of special files + (CVE-2020-35517) + +RH-Author: Jon Maloy +Message-id: <20210209231456.1555472-4-jmaloy@redhat.com> +Patchwork-id: 101023 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/3] virtiofsd: prevent opening of special files (CVE-2020-35517) +Bugzilla: 1919111 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Greg Kurz +RH-Acked-by: Dr. David Alan Gilbert + +From: Stefan Hajnoczi + +A well-behaved FUSE client does not attempt to open special files with +FUSE_OPEN because they are handled on the client side (e.g. device nodes +are handled by client-side device drivers). + +The check to prevent virtiofsd from opening special files is missing in +a few cases, most notably FUSE_OPEN. A malicious client can cause +virtiofsd to open a device node, potentially allowing the guest to +escape. This can be exploited by a modified guest device driver. It is +not exploitable from guest userspace since the guest kernel will handle +special files inside the guest instead of sending FUSE requests. + +This patch fixes this issue by introducing the lo_inode_open() function +to check the file type before opening it. This is a short-term solution +because it does not prevent a compromised virtiofsd process from opening +device nodes on the host. + +Restructure lo_create() to try O_CREAT | O_EXCL first. Note that O_CREAT +| O_EXCL does not follow symlinks, so O_NOFOLLOW masking is not +necessary here. If the file exists and the user did not specify O_EXCL, +open it via lo_do_open(). + +Reported-by: Alex Xu +Fixes: CVE-2020-35517 +Reviewed-by: Dr. David Alan Gilbert +Reviewed-by: Vivek Goyal +Reviewed-by: Greg Kurz +Signed-off-by: Stefan Hajnoczi +Message-Id: <20210204150208.367837-4-stefanha@redhat.com> +Signed-off-by: Dr. David Alan Gilbert + +(cherry picked from commit a3fdbbc7f271bff7d53d0501b29d910ece0b3789) +Signed-off-by: Jon Maloy +Signed-off-by: Jon Maloy +--- + tools/virtiofsd/passthrough_ll.c | 144 ++++++++++++++++++++----------- + 1 file changed, 92 insertions(+), 52 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e5bd3d73e4..cb0992f2db 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -535,6 +535,38 @@ static int lo_fd(fuse_req_t req, fuse_ino_t ino) + return fd; + } + ++/* ++ * Open a file descriptor for an inode. Returns -EBADF if the inode is not a ++ * regular file or a directory. ++ * ++ * Use this helper function instead of raw openat(2) to prevent security issues ++ * when a malicious client opens special files such as block device nodes. ++ * Symlink inodes are also rejected since symlinks must already have been ++ * traversed on the client side. ++ */ ++static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode, ++ int open_flags) ++{ ++ g_autofree char *fd_str = g_strdup_printf("%d", inode->fd); ++ int fd; ++ ++ if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) { ++ return -EBADF; ++ } ++ ++ /* ++ * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier ++ * that the inode is not a special file but if an external process races ++ * with us then symlinks are traversed here. It is not possible to escape ++ * the shared directory since it is mounted as "/" though. ++ */ ++ fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW); ++ if (fd < 0) { ++ return -errno; ++ } ++ return fd; ++} ++ + static void lo_init(void *userdata, struct fuse_conn_info *conn) + { + struct lo_data *lo = (struct lo_data *)userdata; +@@ -788,9 +820,9 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + if (fi) { + truncfd = fd; + } else { +- sprintf(procname, "%i", ifd); +- truncfd = openat(lo->proc_self_fd, procname, O_RDWR); ++ truncfd = lo_inode_open(lo, inode, O_RDWR); + if (truncfd < 0) { ++ errno = -truncfd; + goto out_err; + } + } +@@ -894,7 +926,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct lo_inode *dir = lo_inode(req, parent); + + if (inodep) { +- *inodep = NULL; ++ *inodep = NULL; /* in case there is an error */ + } + + /* +@@ -1725,19 +1757,26 @@ static void update_open_flags(int writeback, struct fuse_file_info *fi) + fi->flags &= ~O_DIRECT; + } + ++/* ++ * Open a regular file, set up an fd mapping, and fill out the struct ++ * fuse_file_info for it. If existing_fd is not negative, use that fd instead ++ * opening a new one. Takes ownership of existing_fd. ++ * ++ * Returns 0 on success or a positive errno. ++ */ + static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, +- struct fuse_file_info *fi) ++ int existing_fd, struct fuse_file_info *fi) + { +- char buf[64]; + ssize_t fh; +- int fd; ++ int fd = existing_fd; + + update_open_flags(lo->writeback, fi); + +- sprintf(buf, "%i", inode->fd); +- fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); +- if (fd == -1) { +- return errno; ++ if (fd < 0) { ++ fd = lo_inode_open(lo, inode, fi->flags); ++ if (fd < 0) { ++ return -fd; ++ } + } + + pthread_mutex_lock(&lo->mutex); +@@ -1760,9 +1799,10 @@ static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) + { +- int fd; ++ int fd = -1; + struct lo_data *lo = lo_data(req); + struct lo_inode *parent_inode; ++ struct lo_inode *inode = NULL; + struct fuse_entry_param e; + int err; + struct lo_cred old = {}; +@@ -1788,36 +1828,38 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + + update_open_flags(lo->writeback, fi); + +- fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, +- mode); ++ /* Try to create a new file but don't open existing files */ ++ fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode); + err = fd == -1 ? errno : 0; +- lo_restore_cred(&old); + +- if (!err) { +- ssize_t fh; ++ lo_restore_cred(&old); + +- pthread_mutex_lock(&lo->mutex); +- fh = lo_add_fd_mapping(lo, fd); +- pthread_mutex_unlock(&lo->mutex); +- if (fh == -1) { +- close(fd); +- err = ENOMEM; +- goto out; +- } ++ /* Ignore the error if file exists and O_EXCL was not given */ ++ if (err && (err != EEXIST || (fi->flags & O_EXCL))) { ++ goto out; ++ } + +- fi->fh = fh; +- err = lo_do_lookup(req, parent, name, &e, NULL); ++ err = lo_do_lookup(req, parent, name, &e, &inode); ++ if (err) { ++ goto out; + } +- if (lo->cache == CACHE_NONE) { +- fi->direct_io = 1; +- } else if (lo->cache == CACHE_ALWAYS) { +- fi->keep_cache = 1; ++ ++ err = lo_do_open(lo, inode, fd, fi); ++ fd = -1; /* lo_do_open() takes ownership of fd */ ++ if (err) { ++ /* Undo lo_do_lookup() nlookup ref */ ++ unref_inode_lolocked(lo, inode, 1); + } + + out: ++ lo_inode_put(lo, &inode); + lo_inode_put(lo, &parent_inode); + + if (err) { ++ if (fd >= 0) { ++ close(fd); ++ } ++ + fuse_reply_err(req, err); + } else { + fuse_reply_create(req, &e, fi); +@@ -1831,7 +1873,6 @@ static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, + pid_t pid, int *err) + { + struct lo_inode_plock *plock; +- char procname[64]; + int fd; + + plock = +@@ -1848,12 +1889,10 @@ static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, + } + + /* Open another instance of file which can be used for ofd locks. */ +- sprintf(procname, "%i", inode->fd); +- + /* TODO: What if file is not writable? */ +- fd = openat(lo->proc_self_fd, procname, O_RDWR); +- if (fd == -1) { +- *err = errno; ++ fd = lo_inode_open(lo, inode, O_RDWR); ++ if (fd < 0) { ++ *err = -fd; + free(plock); + return NULL; + } +@@ -2000,7 +2039,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + return; + } + +- err = lo_do_open(lo, inode, fi); ++ err = lo_do_open(lo, inode, -1, fi); + lo_inode_put(lo, &inode); + if (err) { + fuse_reply_err(req, err); +@@ -2056,39 +2095,40 @@ static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { ++ struct lo_inode *inode = lo_inode(req, ino); ++ struct lo_data *lo = lo_data(req); + int res; + int fd; +- char *buf; + + fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, + (void *)fi); + +- if (!fi) { +- struct lo_data *lo = lo_data(req); +- +- res = asprintf(&buf, "%i", lo_fd(req, ino)); +- if (res == -1) { +- return (void)fuse_reply_err(req, errno); +- } ++ if (!inode) { ++ fuse_reply_err(req, EBADF); ++ return; ++ } + +- fd = openat(lo->proc_self_fd, buf, O_RDWR); +- free(buf); +- if (fd == -1) { +- return (void)fuse_reply_err(req, errno); ++ if (!fi) { ++ fd = lo_inode_open(lo, inode, O_RDWR); ++ if (fd < 0) { ++ res = -fd; ++ goto out; + } + } else { + fd = lo_fi_fd(req, fi); + } + + if (datasync) { +- res = fdatasync(fd); ++ res = fdatasync(fd) == -1 ? errno : 0; + } else { +- res = fsync(fd); ++ res = fsync(fd) == -1 ? errno : 0; + } + if (!fi) { + close(fd); + } +- fuse_reply_err(req, res == -1 ? errno : 0); ++out: ++ lo_inode_put(lo, &inode); ++ fuse_reply_err(req, res); + } + + static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, +-- +2.18.2 + diff --git a/kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch b/kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch new file mode 100755 index 0000000000000000000000000000000000000000..acafa412e494e8237284285dd1374f836c639ab1 --- /dev/null +++ b/kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch @@ -0,0 +1,147 @@ +From 2e58ff6978f8433fc8672d2e357c6f0f5f36d24f Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:07 +0100 +Subject: [PATCH 096/116] virtiofsd: prevent races with lo_dirp_put() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-93-dgilbert@redhat.com> +Patchwork-id: 93546 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 092/112] virtiofsd: prevent races with lo_dirp_put() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Introduce lo_dirp_put() so that FUSE_RELEASEDIR does not cause +use-after-free races with other threads that are accessing lo_dirp. + +Also make lo_releasedir() atomic to prevent FUSE_RELEASEDIR racing with +itself. This prevents double-frees. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit acefdde73b403576a241ebd8dbe8431ddc0d9442) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 41 ++++++++++++++++++++++++++++++++++------ + 1 file changed, 35 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 690edbc..2d703b5 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1284,11 +1284,28 @@ static void lo_readlink(fuse_req_t req, fuse_ino_t ino) + } + + struct lo_dirp { ++ gint refcount; + DIR *dp; + struct dirent *entry; + off_t offset; + }; + ++static void lo_dirp_put(struct lo_dirp **dp) ++{ ++ struct lo_dirp *d = *dp; ++ ++ if (!d) { ++ return; ++ } ++ *dp = NULL; ++ ++ if (g_atomic_int_dec_and_test(&d->refcount)) { ++ closedir(d->dp); ++ free(d); ++ } ++} ++ ++/* Call lo_dirp_put() on the return value when no longer needed */ + static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) + { + struct lo_data *lo = lo_data(req); +@@ -1296,6 +1313,9 @@ static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->dirp_map, fi->fh); ++ if (elem) { ++ g_atomic_int_inc(&elem->dirp->refcount); ++ } + pthread_mutex_unlock(&lo->mutex); + if (!elem) { + return NULL; +@@ -1331,6 +1351,7 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + d->offset = 0; + d->entry = NULL; + ++ g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */ + pthread_mutex_lock(&lo->mutex); + fh = lo_add_dirp_mapping(req, d); + pthread_mutex_unlock(&lo->mutex); +@@ -1364,7 +1385,7 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { + struct lo_data *lo = lo_data(req); +- struct lo_dirp *d; ++ struct lo_dirp *d = NULL; + struct lo_inode *dinode; + char *buf = NULL; + char *p; +@@ -1454,6 +1475,8 @@ static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + + err = 0; + error: ++ lo_dirp_put(&d); ++ + /* + * If there's an error, we can only signal it if we haven't stored + * any entries yet - otherwise we'd end up with wrong lookup +@@ -1484,22 +1507,25 @@ static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) + { + struct lo_data *lo = lo_data(req); ++ struct lo_map_elem *elem; + struct lo_dirp *d; + + (void)ino; + +- d = lo_dirp(req, fi); +- if (!d) { ++ pthread_mutex_lock(&lo->mutex); ++ elem = lo_map_get(&lo->dirp_map, fi->fh); ++ if (!elem) { ++ pthread_mutex_unlock(&lo->mutex); + fuse_reply_err(req, EBADF); + return; + } + +- pthread_mutex_lock(&lo->mutex); ++ d = elem->dirp; + lo_map_remove(&lo->dirp_map, fi->fh); + pthread_mutex_unlock(&lo->mutex); + +- closedir(d->dp); +- free(d); ++ lo_dirp_put(&d); /* paired with lo_opendir() */ ++ + fuse_reply_err(req, 0); + } + +@@ -1710,6 +1736,9 @@ static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + } else { + res = fsync(fd); + } ++ ++ lo_dirp_put(&d); ++ + fuse_reply_err(req, res == -1 ? errno : 0); + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch b/kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch new file mode 100755 index 0000000000000000000000000000000000000000..056559d177149628c67ee2c43af719dbc9ba93f8 --- /dev/null +++ b/kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch @@ -0,0 +1,469 @@ +From 5c9bbd00e8f8c944d9e8e22e7d1cf08cb8fddd6b Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:37 +0100 +Subject: [PATCH 066/116] virtiofsd: print log only when priority is high + enough +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-63-dgilbert@redhat.com> +Patchwork-id: 93518 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 062/112] virtiofsd: print log only when priority is high enough +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eryu Guan + +Introduce "-o log_level=" command line option to specify current log +level (priority), valid values are "debug info warn err", e.g. + + ./virtiofsd -o log_level=debug ... + +So only log priority higher than "debug" will be printed to +stderr/syslog. And the default level is info. + +The "-o debug"/"-d" options are kept, and imply debug log level. + +Signed-off-by: Eryu Guan +dgilbert: Reworked for libfuse's log_func +Signed-off-by: Dr. David Alan Gilbert +with fix by: +Signed-off-by: Xiao Yang +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit d240314a1a18a1d914af1b5763fe8c9a572e6409) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 75 ++++++++++--------------- + tools/virtiofsd/fuse_lowlevel.h | 1 + + tools/virtiofsd/helper.c | 8 ++- + tools/virtiofsd/passthrough_ll.c | 118 ++++++++++++++++----------------------- + 4 files changed, 87 insertions(+), 115 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 6ceb33d..a7a1968 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -158,19 +158,17 @@ static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct fuse_out_header *out = iov[0].iov_base; + + out->len = iov_length(iov, count); +- if (se->debug) { +- if (out->unique == 0) { +- fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, +- out->len); +- } else if (out->error) { +- fuse_log(FUSE_LOG_DEBUG, +- " unique: %llu, error: %i (%s), outsize: %i\n", +- (unsigned long long)out->unique, out->error, +- strerror(-out->error), out->len); +- } else { +- fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", +- (unsigned long long)out->unique, out->len); +- } ++ if (out->unique == 0) { ++ fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, ++ out->len); ++ } else if (out->error) { ++ fuse_log(FUSE_LOG_DEBUG, ++ " unique: %llu, error: %i (%s), outsize: %i\n", ++ (unsigned long long)out->unique, out->error, ++ strerror(-out->error), out->len); ++ } else { ++ fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", ++ (unsigned long long)out->unique, out->len); + } + + if (fuse_lowlevel_is_virtio(se)) { +@@ -1662,10 +1660,8 @@ static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, + return; + } + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", +- (unsigned long long)arg->unique); +- } ++ fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", ++ (unsigned long long)arg->unique); + + req->u.i.unique = arg->unique; + +@@ -1901,13 +1897,10 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + } + } + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); +- if (arg->major == 7 && arg->minor >= 6) { +- fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); +- fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", +- arg->max_readahead); +- } ++ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); ++ if (arg->major == 7 && arg->minor >= 6) { ++ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); ++ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead); + } + se->conn.proto_major = arg->major; + se->conn.proto_minor = arg->minor; +@@ -2116,19 +2109,14 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, + outarg.congestion_threshold = se->conn.congestion_threshold; + outarg.time_gran = se->conn.time_gran; + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, +- outarg.minor); +- fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); +- fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", +- outarg.max_readahead); +- fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); +- fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", +- outarg.max_background); +- fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", +- outarg.congestion_threshold); +- fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); +- } ++ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); ++ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); ++ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); ++ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); ++ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background); ++ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", ++ outarg.congestion_threshold); ++ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); + + send_reply_ok(req, &outarg, outargsize); + } +@@ -2407,14 +2395,11 @@ void fuse_session_process_buf_int(struct fuse_session *se, + in = fuse_mbuf_iter_advance(&iter, sizeof(*in)); + assert(in); /* caller guarantees the input buffer is large enough */ + +- if (se->debug) { +- fuse_log(FUSE_LOG_DEBUG, +- "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, " +- "pid: %u\n", +- (unsigned long long)in->unique, +- opname((enum fuse_opcode)in->opcode), in->opcode, +- (unsigned long long)in->nodeid, buf->size, in->pid); +- } ++ fuse_log( ++ FUSE_LOG_DEBUG, ++ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", ++ (unsigned long long)in->unique, opname((enum fuse_opcode)in->opcode), ++ in->opcode, (unsigned long long)in->nodeid, buf->size, in->pid); + + req = fuse_ll_alloc_req(se); + if (req == NULL) { +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index f2750bc..138041e 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1796,6 +1796,7 @@ struct fuse_cmdline_opts { + int show_help; + int print_capabilities; + int syslog; ++ int log_level; + unsigned int max_idle_threads; + }; + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 9692ef9..6d50a46 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -34,7 +34,6 @@ + t, offsetof(struct fuse_cmdline_opts, p), v \ + } + +- + static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("-h", show_help), + FUSE_HELPER_OPT("--help", show_help), +@@ -55,6 +54,10 @@ static const struct fuse_opt fuse_helper_opts[] = { + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_HELPER_OPT("--syslog", syslog), ++ FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), ++ FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), ++ FUSE_HELPER_OPT_VALUE("log_level=warn", log_level, FUSE_LOG_WARNING), ++ FUSE_HELPER_OPT_VALUE("log_level=err", log_level, FUSE_LOG_ERR), + FUSE_OPT_END + }; + +@@ -142,6 +145,9 @@ void fuse_cmdline_help(void) + " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" ++ " -o log_level= log level, default to \"info\"\n" ++ " level could be one of \"debug, " ++ "info, warn, err\"\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" + " allowed (default: 10)\n" +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 0372aca..ff6910f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -37,6 +37,7 @@ + + #include "qemu/osdep.h" + #include "fuse_virtio.h" ++#include "fuse_log.h" + #include "fuse_lowlevel.h" + #include + #include +@@ -140,6 +141,7 @@ static const struct fuse_opt lo_opts[] = { + FUSE_OPT_END + }; + static bool use_syslog = false; ++static int current_log_level; + + static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + +@@ -458,11 +460,6 @@ static int lo_fd(fuse_req_t req, fuse_ino_t ino) + return inode ? inode->fd : -1; + } + +-static bool lo_debug(fuse_req_t req) +-{ +- return lo_data(req)->debug != 0; +-} +- + static void lo_init(void *userdata, struct fuse_conn_info *conn) + { + struct lo_data *lo = (struct lo_data *)userdata; +@@ -472,15 +469,11 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) + } + + if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { +- if (lo->debug) { +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); + conn->want |= FUSE_CAP_WRITEBACK_CACHE; + } + if (lo->flock && conn->capable & FUSE_CAP_FLOCK_LOCKS) { +- if (lo->debug) { +- fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } + } +@@ -823,10 +816,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + } + e->ino = inode->fuse_ino; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long)parent, name, (unsigned long long)e->ino); +- } ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, ++ name, (unsigned long long)e->ino); + + return 0; + +@@ -843,10 +834,8 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + struct fuse_entry_param e; + int err; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent, ++ name); + + /* + * Don't use is_safe_path_component(), allow "." and ".." for NFS export +@@ -971,10 +960,8 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long)parent, name, (unsigned long long)e.ino); +- } ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, ++ name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + return; +@@ -1074,10 +1061,8 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + pthread_mutex_unlock(&lo->mutex); + e.ino = inode->fuse_ino; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", +- (unsigned long long)parent, name, (unsigned long long)e.ino); +- } ++ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, ++ name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + return; +@@ -1171,11 +1156,9 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + return; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +- (unsigned long long)ino, (unsigned long long)inode->refcount, +- (unsigned long long)nlookup); +- } ++ fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", ++ (unsigned long long)ino, (unsigned long long)inode->refcount, ++ (unsigned long long)nlookup); + + unref_inode(lo, inode, nlookup); + } +@@ -1445,10 +1428,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + int err; + struct lo_cred old = {}; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", +- parent, name); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent, ++ name); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); +@@ -1525,10 +1506,8 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + char buf[64]; + struct lo_data *lo = lo_data(req); + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, +- fi->flags); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, ++ fi->flags); + + /* + * With writeback cache, kernel may send read requests even +@@ -1644,12 +1623,10 @@ static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + { + struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_read(ino=%" PRIu64 ", size=%zd, " +- "off=%lu)\n", +- ino, size, (unsigned long)offset); +- } ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_read(ino=%" PRIu64 ", size=%zd, " ++ "off=%lu)\n", ++ ino, size, (unsigned long)offset); + + buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + buf.buf[0].fd = lo_fi_fd(req, fi); +@@ -1671,11 +1648,9 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + out_buf.buf[0].fd = lo_fi_fd(req, fi); + out_buf.buf[0].pos = off; + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_write(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, +- out_buf.buf[0].size, (unsigned long)off); +- } ++ fuse_log(FUSE_LOG_DEBUG, ++ "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, ++ out_buf.buf[0].size, (unsigned long)off); + + /* + * If kill_priv is set, drop CAP_FSETID which should lead to kernel +@@ -1774,11 +1749,8 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ino, name, +- size); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", ++ ino, name, size); + + if (inode->is_symlink) { + /* Sorry, no race free way to getxattr on symlink. */ +@@ -1852,10 +1824,8 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", +- ino, size); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, ++ size); + + if (inode->is_symlink) { + /* Sorry, no race free way to listxattr on symlink. */ +@@ -1929,11 +1899,8 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, +- "lo_setxattr(ino=%" PRIu64 ", name=%s value=%s size=%zd)\n", +- ino, name, value, size); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 ++ ", name=%s value=%s size=%zd)\n", ino, name, value, size); + + if (inode->is_symlink) { + /* Sorry, no race free way to setxattr on symlink. */ +@@ -1978,10 +1945,8 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + goto out; + } + +- if (lo_debug(req)) { +- fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", +- ino, name); +- } ++ fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, ++ name); + + if (inode->is_symlink) { + /* Sorry, no race free way to setxattr on symlink. */ +@@ -2303,6 +2268,10 @@ static void setup_nofile_rlimit(void) + + static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) + { ++ if (current_log_level < level) { ++ return; ++ } ++ + if (use_syslog) { + int priority = LOG_ERR; + switch (level) { +@@ -2401,8 +2370,19 @@ int main(int argc, char *argv[]) + return 1; + } + ++ /* ++ * log_level is 0 if not configured via cmd options (0 is LOG_EMERG, ++ * and we don't use this log level). ++ */ ++ if (opts.log_level != 0) { ++ current_log_level = opts.log_level; ++ } + lo.debug = opts.debug; ++ if (lo.debug) { ++ current_log_level = FUSE_LOG_DEBUG; ++ } + lo.root.refcount = 2; ++ + if (lo.source) { + struct stat stat; + int res; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-process-requests-in-a-thread-pool.patch b/kvm-virtiofsd-process-requests-in-a-thread-pool.patch new file mode 100755 index 0000000000000000000000000000000000000000..87fff99ad560c93a5e6634109a50f45d4c9d1902 --- /dev/null +++ b/kvm-virtiofsd-process-requests-in-a-thread-pool.patch @@ -0,0 +1,533 @@ +From b0db5e666aaa43eadff3e60a1ada704f33b03074 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:19 +0100 +Subject: [PATCH 108/116] virtiofsd: process requests in a thread pool +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-105-dgilbert@redhat.com> +Patchwork-id: 93554 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 104/112] virtiofsd: process requests in a thread pool +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Introduce a thread pool so that fv_queue_thread() just pops +VuVirtqElements and hands them to the thread pool. For the time being +only one worker thread is allowed since passthrough_ll.c is not +thread-safe yet. Future patches will lift this restriction so that +multiple FUSE requests can be processed in parallel. + +The main new concept is struct FVRequest, which contains both +VuVirtqElement and struct fuse_chan. We now have fv_VuDev for a device, +fv_QueueInfo for a virtqueue, and FVRequest for a request. Some of +fv_QueueInfo's fields are moved into FVRequest because they are +per-request. The name FVRequest conforms to QEMU coding style and I +expect the struct fv_* types will be renamed in a future refactoring. + +This patch series is not optimal. fbuf reuse is dropped so each request +does malloc(se->bufsize), but there is no clean and cheap way to keep +this with a thread pool. The vq_lock mutex is held for longer than +necessary, especially during the eventfd_write() syscall. Performance +can be improved in the future. + +prctl(2) had to be added to the seccomp whitelist because glib invokes +it. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Misono Tomohiro +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit a3d756c5aecccc4c0e51060a7e2f1c87bf8f1180) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 359 +++++++++++++++++++++++------------------- + 1 file changed, 201 insertions(+), 158 deletions(-) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index f6242f9..0dcf2ef 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -37,17 +38,28 @@ + struct fv_VuDev; + struct fv_QueueInfo { + pthread_t thread; ++ /* ++ * This lock protects the VuVirtq preventing races between ++ * fv_queue_thread() and fv_queue_worker(). ++ */ ++ pthread_mutex_t vq_lock; ++ + struct fv_VuDev *virtio_dev; + + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; + int kill_fd; /* For killing the thread */ ++}; + +- /* The element for the command currently being processed */ +- VuVirtqElement *qe; ++/* A FUSE request */ ++typedef struct { ++ VuVirtqElement elem; ++ struct fuse_chan ch; ++ ++ /* Used to complete requests that involve no reply */ + bool reply_sent; +-}; ++} FVRequest; + + /* + * We pass the dev element into libvhost-user +@@ -191,8 +203,11 @@ static void copy_iov(struct iovec *src_iov, int src_count, + int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count) + { +- VuVirtqElement *elem; +- VuVirtq *q; ++ FVRequest *req = container_of(ch, FVRequest, ch); ++ struct fv_QueueInfo *qi = ch->qi; ++ VuDev *dev = &se->virtio_dev->dev; ++ VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ VuVirtqElement *elem = &req->elem; + int ret = 0; + + assert(count >= 1); +@@ -205,11 +220,7 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + + /* unique == 0 is notification, which we don't support */ + assert(out->unique); +- /* For virtio we always have ch */ +- assert(ch); +- assert(!ch->qi->reply_sent); +- elem = ch->qi->qe; +- q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; +@@ -236,9 +247,15 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + } + + copy_iov(iov, count, in_sg, in_num, tosend_len); +- vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); +- vu_queue_notify(&se->virtio_dev->dev, q); +- ch->qi->reply_sent = true; ++ ++ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ pthread_mutex_lock(&qi->vq_lock); ++ vu_queue_push(dev, q, elem, tosend_len); ++ vu_queue_notify(dev, q); ++ pthread_mutex_unlock(&qi->vq_lock); ++ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ ++ req->reply_sent = true; + + err: + return ret; +@@ -254,9 +271,12 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count, struct fuse_bufvec *buf, + size_t len) + { ++ FVRequest *req = container_of(ch, FVRequest, ch); ++ struct fv_QueueInfo *qi = ch->qi; ++ VuDev *dev = &se->virtio_dev->dev; ++ VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ VuVirtqElement *elem = &req->elem; + int ret = 0; +- VuVirtqElement *elem; +- VuVirtq *q; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); +@@ -275,11 +295,7 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + /* unique == 0 is notification which we don't support */ + assert(out->unique); + +- /* For virtio we always have ch */ +- assert(ch); +- assert(!ch->qi->reply_sent); +- elem = ch->qi->qe; +- q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx]; ++ assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; +@@ -395,33 +411,175 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + + ret = 0; + +- vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len); +- vu_queue_notify(&se->virtio_dev->dev, q); ++ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ pthread_mutex_lock(&qi->vq_lock); ++ vu_queue_push(dev, q, elem, tosend_len); ++ vu_queue_notify(dev, q); ++ pthread_mutex_unlock(&qi->vq_lock); ++ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + + err: + if (ret == 0) { +- ch->qi->reply_sent = true; ++ req->reply_sent = true; + } + + return ret; + } + ++/* Process one FVRequest in a thread pool */ ++static void fv_queue_worker(gpointer data, gpointer user_data) ++{ ++ struct fv_QueueInfo *qi = user_data; ++ struct fuse_session *se = qi->virtio_dev->se; ++ struct VuDev *dev = &qi->virtio_dev->dev; ++ FVRequest *req = data; ++ VuVirtqElement *elem = &req->elem; ++ struct fuse_buf fbuf = {}; ++ bool allocated_bufv = false; ++ struct fuse_bufvec bufv; ++ struct fuse_bufvec *pbufv; ++ ++ assert(se->bufsize > sizeof(struct fuse_in_header)); ++ ++ /* ++ * An element contains one request and the space to send our response ++ * They're spread over multiple descriptors in a scatter/gather set ++ * and we can't trust the guest to keep them still; so copy in/out. ++ */ ++ fbuf.mem = malloc(se->bufsize); ++ assert(fbuf.mem); ++ ++ fuse_mutex_init(&req->ch.lock); ++ req->ch.fd = -1; ++ req->ch.qi = qi; ++ ++ /* The 'out' part of the elem is from qemu */ ++ unsigned int out_num = elem->out_num; ++ struct iovec *out_sg = elem->out_sg; ++ size_t out_len = iov_size(out_sg, out_num); ++ fuse_log(FUSE_LOG_DEBUG, ++ "%s: elem %d: with %d out desc of length %zd\n", ++ __func__, elem->index, out_num, out_len); ++ ++ /* ++ * The elem should contain a 'fuse_in_header' (in to fuse) ++ * plus the data based on the len in the header. ++ */ ++ if (out_len < sizeof(struct fuse_in_header)) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", ++ __func__, elem->index); ++ assert(0); /* TODO */ ++ } ++ if (out_len > se->bufsize) { ++ fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__, ++ elem->index); ++ assert(0); /* TODO */ ++ } ++ /* Copy just the first element and look at it */ ++ copy_from_iov(&fbuf, 1, out_sg); ++ ++ pbufv = NULL; /* Compiler thinks an unitialised path */ ++ if (out_num > 2 && ++ out_sg[0].iov_len == sizeof(struct fuse_in_header) && ++ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE && ++ out_sg[1].iov_len == sizeof(struct fuse_write_in)) { ++ /* ++ * For a write we don't actually need to copy the ++ * data, we can just do it straight out of guest memory ++ * but we must still copy the headers in case the guest ++ * was nasty and changed them while we were using them. ++ */ ++ fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); ++ ++ /* copy the fuse_write_in header afte rthe fuse_in_header */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len; ++ ++ /* Allocate the bufv, with space for the rest of the iov */ ++ pbufv = malloc(sizeof(struct fuse_bufvec) + ++ sizeof(struct fuse_buf) * (out_num - 2)); ++ if (!pbufv) { ++ fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", ++ __func__); ++ goto out; ++ } ++ ++ allocated_bufv = true; ++ pbufv->count = 1; ++ pbufv->buf[0] = fbuf; ++ ++ size_t iovindex, pbufvindex; ++ iovindex = 2; /* 2 headers, separate iovs */ ++ pbufvindex = 1; /* 2 headers, 1 fusebuf */ ++ ++ for (; iovindex < out_num; iovindex++, pbufvindex++) { ++ pbufv->count++; ++ pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ ++ pbufv->buf[pbufvindex].flags = 0; ++ pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; ++ pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; ++ } ++ } else { ++ /* Normal (non fast write) path */ ++ ++ /* Copy the rest of the buffer */ ++ fbuf.mem += out_sg->iov_len; ++ copy_from_iov(&fbuf, out_num - 1, out_sg + 1); ++ fbuf.mem -= out_sg->iov_len; ++ fbuf.size = out_len; ++ ++ /* TODO! Endianness of header */ ++ ++ /* TODO: Add checks for fuse_session_exited */ ++ bufv.buf[0] = fbuf; ++ bufv.count = 1; ++ pbufv = &bufv; ++ } ++ pbufv->idx = 0; ++ pbufv->off = 0; ++ fuse_session_process_buf_int(se, pbufv, &req->ch); ++ ++out: ++ if (allocated_bufv) { ++ free(pbufv); ++ } ++ ++ /* If the request has no reply, still recycle the virtqueue element */ ++ if (!req->reply_sent) { ++ struct VuVirtq *q = vu_get_queue(dev, qi->qidx); ++ ++ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__, ++ elem->index); ++ ++ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ pthread_mutex_lock(&qi->vq_lock); ++ vu_queue_push(dev, q, elem, 0); ++ vu_queue_notify(dev, q); ++ pthread_mutex_unlock(&qi->vq_lock); ++ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); ++ } ++ ++ pthread_mutex_destroy(&req->ch.lock); ++ free(fbuf.mem); ++ free(req); ++} ++ + /* Thread function for individual queues, created when a queue is 'started' */ + static void *fv_queue_thread(void *opaque) + { + struct fv_QueueInfo *qi = opaque; + struct VuDev *dev = &qi->virtio_dev->dev; + struct VuVirtq *q = vu_get_queue(dev, qi->qidx); +- struct fuse_session *se = qi->virtio_dev->se; +- struct fuse_chan ch; +- struct fuse_buf fbuf; ++ GThreadPool *pool; + +- fbuf.mem = NULL; +- fbuf.flags = 0; +- +- fuse_mutex_init(&ch.lock); +- ch.fd = (int)0xdaff0d111; +- ch.qi = qi; ++ pool = g_thread_pool_new(fv_queue_worker, qi, 1 /* TODO max_threads */, ++ TRUE, NULL); ++ if (!pool) { ++ fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__); ++ return NULL; ++ } + + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); +@@ -478,6 +636,7 @@ static void *fv_queue_thread(void *opaque) + /* Mutual exclusion with virtio_loop() */ + ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + assert(ret == 0); /* there is no possible error case */ ++ pthread_mutex_lock(&qi->vq_lock); + /* out is from guest, in is too guest */ + unsigned int in_bytes, out_bytes; + vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); +@@ -486,141 +645,22 @@ static void *fv_queue_thread(void *opaque) + "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + +- + while (1) { +- bool allocated_bufv = false; +- struct fuse_bufvec bufv; +- struct fuse_bufvec *pbufv; +- +- /* +- * An element contains one request and the space to send our +- * response They're spread over multiple descriptors in a +- * scatter/gather set and we can't trust the guest to keep them +- * still; so copy in/out. +- */ +- VuVirtqElement *elem = vu_queue_pop(dev, q, sizeof(VuVirtqElement)); +- if (!elem) { ++ FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest)); ++ if (!req) { + break; + } + +- qi->qe = elem; +- qi->reply_sent = false; ++ req->reply_sent = false; + +- if (!fbuf.mem) { +- fbuf.mem = malloc(se->bufsize); +- assert(fbuf.mem); +- assert(se->bufsize > sizeof(struct fuse_in_header)); +- } +- /* The 'out' part of the elem is from qemu */ +- unsigned int out_num = elem->out_num; +- struct iovec *out_sg = elem->out_sg; +- size_t out_len = iov_size(out_sg, out_num); +- fuse_log(FUSE_LOG_DEBUG, +- "%s: elem %d: with %d out desc of length %zd\n", __func__, +- elem->index, out_num, out_len); +- +- /* +- * The elem should contain a 'fuse_in_header' (in to fuse) +- * plus the data based on the len in the header. +- */ +- if (out_len < sizeof(struct fuse_in_header)) { +- fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", +- __func__, elem->index); +- assert(0); /* TODO */ +- } +- if (out_len > se->bufsize) { +- fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", +- __func__, elem->index); +- assert(0); /* TODO */ +- } +- /* Copy just the first element and look at it */ +- copy_from_iov(&fbuf, 1, out_sg); +- +- if (out_num > 2 && +- out_sg[0].iov_len == sizeof(struct fuse_in_header) && +- ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE && +- out_sg[1].iov_len == sizeof(struct fuse_write_in)) { +- /* +- * For a write we don't actually need to copy the +- * data, we can just do it straight out of guest memory +- * but we must still copy the headers in case the guest +- * was nasty and changed them while we were using them. +- */ +- fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); +- +- /* copy the fuse_write_in header after the fuse_in_header */ +- fbuf.mem += out_sg->iov_len; +- copy_from_iov(&fbuf, 1, out_sg + 1); +- fbuf.mem -= out_sg->iov_len; +- fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len; +- +- /* Allocate the bufv, with space for the rest of the iov */ +- allocated_bufv = true; +- pbufv = malloc(sizeof(struct fuse_bufvec) + +- sizeof(struct fuse_buf) * (out_num - 2)); +- if (!pbufv) { +- vu_queue_unpop(dev, q, elem, 0); +- free(elem); +- fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", +- __func__); +- goto out; +- } +- +- pbufv->count = 1; +- pbufv->buf[0] = fbuf; +- +- size_t iovindex, pbufvindex; +- iovindex = 2; /* 2 headers, separate iovs */ +- pbufvindex = 1; /* 2 headers, 1 fusebuf */ +- +- for (; iovindex < out_num; iovindex++, pbufvindex++) { +- pbufv->count++; +- pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ +- pbufv->buf[pbufvindex].flags = 0; +- pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; +- pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; +- } +- } else { +- /* Normal (non fast write) path */ +- +- /* Copy the rest of the buffer */ +- fbuf.mem += out_sg->iov_len; +- copy_from_iov(&fbuf, out_num - 1, out_sg + 1); +- fbuf.mem -= out_sg->iov_len; +- fbuf.size = out_len; +- +- /* TODO! Endianness of header */ +- +- /* TODO: Add checks for fuse_session_exited */ +- bufv.buf[0] = fbuf; +- bufv.count = 1; +- pbufv = &bufv; +- } +- pbufv->idx = 0; +- pbufv->off = 0; +- fuse_session_process_buf_int(se, pbufv, &ch); +- +- if (allocated_bufv) { +- free(pbufv); +- } +- +- if (!qi->reply_sent) { +- fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", +- __func__, elem->index); +- /* I think we've still got to recycle the element */ +- vu_queue_push(dev, q, elem, 0); +- vu_queue_notify(dev, q); +- } +- qi->qe = NULL; +- free(elem); +- elem = NULL; ++ g_thread_pool_push(pool, req, NULL); + } + ++ pthread_mutex_unlock(&qi->vq_lock); + pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + } +-out: +- pthread_mutex_destroy(&ch.lock); +- free(fbuf.mem); ++ ++ g_thread_pool_free(pool, FALSE, TRUE); + + return NULL; + } +@@ -643,6 +683,7 @@ static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) + fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n", + __func__, qidx, ret); + } ++ pthread_mutex_destroy(&ourqi->vq_lock); + close(ourqi->kill_fd); + ourqi->kick_fd = -1; + free(vud->qi[qidx]); +@@ -696,6 +737,8 @@ static void fv_queue_set_started(VuDev *dev, int qidx, bool started) + + ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE); + assert(ourqi->kill_fd != -1); ++ pthread_mutex_init(&ourqi->vq_lock, NULL); ++ + if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", + __func__, qidx); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-remove-mountpoint-dummy-argument.patch b/kvm-virtiofsd-remove-mountpoint-dummy-argument.patch new file mode 100755 index 0000000000000000000000000000000000000000..181e32dac0cfc4f13e4d0311672acbe20d00b84b --- /dev/null +++ b/kvm-virtiofsd-remove-mountpoint-dummy-argument.patch @@ -0,0 +1,159 @@ +From a8a1835a82510be7d2d6edcc28a60e506a2cedad Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:46 +0100 +Subject: [PATCH 015/116] virtiofsd: remove mountpoint dummy argument +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-12-dgilbert@redhat.com> +Patchwork-id: 93466 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 011/112] virtiofsd: remove mountpoint dummy argument +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Classic FUSE file system daemons take a mountpoint argument but +virtiofsd exposes a vhost-user UNIX domain socket instead. The +mountpoint argument is not used by virtiofsd but the user is still +required to pass a dummy argument on the command-line. + +Remove the mountpoint argument to clean up the command-line. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 67aab02272f6cb47c56420f60b370c184961b5ca) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 2 +- + tools/virtiofsd/fuse_lowlevel.h | 4 +--- + tools/virtiofsd/helper.c | 20 +++----------------- + tools/virtiofsd/passthrough_ll.c | 12 ++---------- + 4 files changed, 7 insertions(+), 31 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 5c9cb52..2f32c68 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2455,7 +2455,7 @@ out1: + return NULL; + } + +-int fuse_session_mount(struct fuse_session *se, const char *mountpoint) ++int fuse_session_mount(struct fuse_session *se) + { + int fd; + +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index adb9054..8d8909b 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1863,7 +1863,6 @@ struct fuse_cmdline_opts { + int foreground; + int debug; + int nodefault_subtype; +- char *mountpoint; + int show_version; + int show_help; + unsigned int max_idle_threads; +@@ -1924,12 +1923,11 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + /** + * Mount a FUSE file system. + * +- * @param mountpoint the mount point path + * @param se session object + * + * @return 0 on success, -1 on failure. + **/ +-int fuse_session_mount(struct fuse_session *se, const char *mountpoint); ++int fuse_session_mount(struct fuse_session *se); + + /** + * Enter a single threaded, blocking event loop. +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 5711dd2..5e6f205 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -140,27 +140,13 @@ void fuse_cmdline_help(void) + static int fuse_helper_opt_proc(void *data, const char *arg, int key, + struct fuse_args *outargs) + { ++ (void)data; + (void)outargs; +- struct fuse_cmdline_opts *opts = data; + + switch (key) { + case FUSE_OPT_KEY_NONOPT: +- if (!opts->mountpoint) { +- if (fuse_mnt_parse_fuse_fd(arg) != -1) { +- return fuse_opt_add_opt(&opts->mountpoint, arg); +- } +- +- char mountpoint[PATH_MAX] = ""; +- if (realpath(arg, mountpoint) == NULL) { +- fuse_log(FUSE_LOG_ERR, "fuse: bad mount point `%s': %s\n", arg, +- strerror(errno)); +- return -1; +- } +- return fuse_opt_add_opt(&opts->mountpoint, mountpoint); +- } else { +- fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); +- return -1; +- } ++ fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); ++ return -1; + + default: + /* Pass through unknown options */ +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index c5850ef..9377718 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -1297,7 +1297,7 @@ int main(int argc, char *argv[]) + return 1; + } + if (opts.show_help) { +- printf("usage: %s [options] \n\n", argv[0]); ++ printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); + fuse_lowlevel_help(); + ret = 0; +@@ -1308,13 +1308,6 @@ int main(int argc, char *argv[]) + goto err_out1; + } + +- if (opts.mountpoint == NULL) { +- printf("usage: %s [options] \n", argv[0]); +- printf(" %s --help\n", argv[0]); +- ret = 1; +- goto err_out1; +- } +- + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { + return 1; + } +@@ -1374,7 +1367,7 @@ int main(int argc, char *argv[]) + goto err_out2; + } + +- if (fuse_session_mount(se, opts.mountpoint) != 0) { ++ if (fuse_session_mount(se) != 0) { + goto err_out3; + } + +@@ -1393,7 +1386,6 @@ err_out3: + err_out2: + fuse_session_destroy(se); + err_out1: +- free(opts.mountpoint); + fuse_opt_free_args(&args); + + if (lo.root.fd >= 0) { +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-remove-unused-notify-reply-support.patch b/kvm-virtiofsd-remove-unused-notify-reply-support.patch new file mode 100755 index 0000000000000000000000000000000000000000..98fb96849604252cead5af6dfefff17b367724f0 --- /dev/null +++ b/kvm-virtiofsd-remove-unused-notify-reply-support.patch @@ -0,0 +1,294 @@ +From e5534c0d4b866f61dbafa8d2422a24ab956189c1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:47 +0100 +Subject: [PATCH 016/116] virtiofsd: remove unused notify reply support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-13-dgilbert@redhat.com> +Patchwork-id: 93467 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 012/112] virtiofsd: remove unused notify reply support +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Notify reply support is unused by virtiofsd. The code would need to be +updated to validate input buffer sizes. Remove this unused code since +changes to it are untestable. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 64c6f408a29ef03e9b8da9f5a5d8fd511b0d801e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 147 +--------------------------------------- + tools/virtiofsd/fuse_lowlevel.h | 47 ------------- + 2 files changed, 1 insertion(+), 193 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 2f32c68..eb0ec49 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -31,12 +31,6 @@ + #define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) + #define OFFSET_MAX 0x7fffffffffffffffLL + +-#define container_of(ptr, type, member) \ +- ({ \ +- const typeof(((type *)0)->member) *__mptr = (ptr); \ +- (type *)((char *)__mptr - offsetof(type, member)); \ +- }) +- + struct fuse_pollhandle { + uint64_t kh; + struct fuse_session *se; +@@ -1862,52 +1856,6 @@ static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + send_reply_ok(req, NULL, 0); + } + +-static void list_del_nreq(struct fuse_notify_req *nreq) +-{ +- struct fuse_notify_req *prev = nreq->prev; +- struct fuse_notify_req *next = nreq->next; +- prev->next = next; +- next->prev = prev; +-} +- +-static void list_add_nreq(struct fuse_notify_req *nreq, +- struct fuse_notify_req *next) +-{ +- struct fuse_notify_req *prev = next->prev; +- nreq->next = next; +- nreq->prev = prev; +- prev->next = nreq; +- next->prev = nreq; +-} +- +-static void list_init_nreq(struct fuse_notify_req *nreq) +-{ +- nreq->next = nreq; +- nreq->prev = nreq; +-} +- +-static void do_notify_reply(fuse_req_t req, fuse_ino_t nodeid, +- const void *inarg, const struct fuse_buf *buf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_notify_req *nreq; +- struct fuse_notify_req *head; +- +- pthread_mutex_lock(&se->lock); +- head = &se->notify_list; +- for (nreq = head->next; nreq != head; nreq = nreq->next) { +- if (nreq->unique == req->unique) { +- list_del_nreq(nreq); +- break; +- } +- } +- pthread_mutex_unlock(&se->lock); +- +- if (nreq != head) { +- nreq->reply(nreq, req, nodeid, inarg, buf); +- } +-} +- + static int send_notify_iov(struct fuse_session *se, int notify_code, + struct iovec *iov, int count) + { +@@ -2059,95 +2007,6 @@ int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + return res; + } + +-struct fuse_retrieve_req { +- struct fuse_notify_req nreq; +- void *cookie; +-}; +- +-static void fuse_ll_retrieve_reply(struct fuse_notify_req *nreq, fuse_req_t req, +- fuse_ino_t ino, const void *inarg, +- const struct fuse_buf *ibuf) +-{ +- struct fuse_session *se = req->se; +- struct fuse_retrieve_req *rreq = +- container_of(nreq, struct fuse_retrieve_req, nreq); +- const struct fuse_notify_retrieve_in *arg = inarg; +- struct fuse_bufvec bufv = { +- .buf[0] = *ibuf, +- .count = 1, +- }; +- +- if (!(bufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- bufv.buf[0].mem = PARAM(arg); +- } +- +- bufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct fuse_notify_retrieve_in); +- +- if (bufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, "fuse: retrieve reply: buffer size too small\n"); +- fuse_reply_none(req); +- goto out; +- } +- bufv.buf[0].size = arg->size; +- +- if (se->op.retrieve_reply) { +- se->op.retrieve_reply(req, rreq->cookie, ino, arg->offset, &bufv); +- } else { +- fuse_reply_none(req); +- } +-out: +- free(rreq); +-} +- +-int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie) +-{ +- struct fuse_notify_retrieve_out outarg; +- struct iovec iov[2]; +- struct fuse_retrieve_req *rreq; +- int err; +- +- if (!se) { +- return -EINVAL; +- } +- +- if (se->conn.proto_major < 6 || se->conn.proto_minor < 15) { +- return -ENOSYS; +- } +- +- rreq = malloc(sizeof(*rreq)); +- if (rreq == NULL) { +- return -ENOMEM; +- } +- +- pthread_mutex_lock(&se->lock); +- rreq->cookie = cookie; +- rreq->nreq.unique = se->notify_ctr++; +- rreq->nreq.reply = fuse_ll_retrieve_reply; +- list_add_nreq(&rreq->nreq, &se->notify_list); +- pthread_mutex_unlock(&se->lock); +- +- outarg.notify_unique = rreq->nreq.unique; +- outarg.nodeid = ino; +- outarg.offset = offset; +- outarg.size = size; +- outarg.padding = 0; +- +- iov[1].iov_base = &outarg; +- iov[1].iov_len = sizeof(outarg); +- +- err = send_notify_iov(se, FUSE_NOTIFY_RETRIEVE, iov, 2); +- if (err) { +- pthread_mutex_lock(&se->lock); +- list_del_nreq(&rreq->nreq); +- pthread_mutex_unlock(&se->lock); +- free(rreq); +- } +- +- return err; +-} +- + void *fuse_req_userdata(fuse_req_t req) + { + return req->se->userdata; +@@ -2226,7 +2085,7 @@ static struct { + [FUSE_POLL] = { do_poll, "POLL" }, + [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, + [FUSE_DESTROY] = { do_destroy, "DESTROY" }, +- [FUSE_NOTIFY_REPLY] = { (void *)1, "NOTIFY_REPLY" }, ++ [FUSE_NOTIFY_REPLY] = { NULL, "NOTIFY_REPLY" }, + [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, + [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS" }, + [FUSE_RENAME2] = { do_rename2, "RENAME2" }, +@@ -2333,8 +2192,6 @@ void fuse_session_process_buf_int(struct fuse_session *se, + inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { + do_write_buf(req, in->nodeid, inarg, buf); +- } else if (in->opcode == FUSE_NOTIFY_REPLY) { +- do_notify_reply(req, in->nodeid, inarg, buf); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + } +@@ -2437,8 +2294,6 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, + + list_init_req(&se->list); + list_init_req(&se->interrupts); +- list_init_nreq(&se->notify_list); +- se->notify_ctr = 1; + fuse_mutex_init(&se->lock); + + memcpy(&se->op, op, op_size); +diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h +index 8d8909b..12a84b4 100644 +--- a/tools/virtiofsd/fuse_lowlevel.h ++++ b/tools/virtiofsd/fuse_lowlevel.h +@@ -1085,21 +1085,6 @@ struct fuse_lowlevel_ops { + off_t off, struct fuse_file_info *fi); + + /** +- * Callback function for the retrieve request +- * +- * Valid replies: +- * fuse_reply_none +- * +- * @param req request handle +- * @param cookie user data supplied to fuse_lowlevel_notify_retrieve() +- * @param ino the inode number supplied to fuse_lowlevel_notify_retrieve() +- * @param offset the offset supplied to fuse_lowlevel_notify_retrieve() +- * @param bufv the buffer containing the returned data +- */ +- void (*retrieve_reply)(fuse_req_t req, void *cookie, fuse_ino_t ino, +- off_t offset, struct fuse_bufvec *bufv); +- +- /** + * Forget about multiple inodes + * + * See description of the forget function for more +@@ -1726,38 +1711,6 @@ int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv, + enum fuse_buf_copy_flags flags); +-/** +- * Retrieve data from the kernel buffers +- * +- * Retrieve data in the kernel buffers belonging to the given inode. +- * If successful then the retrieve_reply() method will be called with +- * the returned data. +- * +- * Only present pages are returned in the retrieve reply. Retrieving +- * stops when it finds a non-present page and only data prior to that +- * is returned. +- * +- * If this function returns an error, then the retrieve will not be +- * completed and no reply will be sent. +- * +- * This function doesn't change the dirty state of pages in the kernel +- * buffer. For dirty pages the write() method will be called +- * regardless of having been retrieved previously. +- * +- * Added in FUSE protocol version 7.15. If the kernel does not support +- * this (or a newer) version, the function will return -ENOSYS and do +- * nothing. +- * +- * @param se the session object +- * @param ino the inode number +- * @param size the number of bytes to retrieve +- * @param offset the starting offset into the file to retrieve from +- * @param cookie user data to supply to the reply callback +- * @return zero for success, -errno for failure +- */ +-int fuse_lowlevel_notify_retrieve(struct fuse_session *se, fuse_ino_t ino, +- size_t size, off_t offset, void *cookie); +- + + /* + * Utility functions +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch b/kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch new file mode 100755 index 0000000000000000000000000000000000000000..97a0db35bee81cd2dd9887ac098b128ee7620caa --- /dev/null +++ b/kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch @@ -0,0 +1,139 @@ +From e01a6e68d799ed2af0ca3b04d75818ba62b18682 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:08 +0100 +Subject: [PATCH 097/116] virtiofsd: rename inode->refcount to inode->nlookup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-94-dgilbert@redhat.com> +Patchwork-id: 93547 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 093/112] virtiofsd: rename inode->refcount to inode->nlookup +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +This reference counter plays a specific role in the FUSE protocol. It's +not a generic object reference counter and the FUSE kernel code calls it +"nlookup". + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 1222f015558fc34cea02aa3a5a92de608c82cec8) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 37 +++++++++++++++++++++++++------------ + 1 file changed, 25 insertions(+), 12 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 2d703b5..c819b5f 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -99,7 +99,20 @@ struct lo_inode { + int fd; + bool is_symlink; + struct lo_key key; +- uint64_t refcount; /* protected by lo->mutex */ ++ ++ /* ++ * This counter keeps the inode alive during the FUSE session. ++ * Incremented when the FUSE inode number is sent in a reply ++ * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is ++ * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc. ++ * ++ * Note that this value is untrusted because the client can manipulate ++ * it arbitrarily using FUSE_FORGET requests. ++ * ++ * Protected by lo->mutex. ++ */ ++ uint64_t nlookup; ++ + fuse_ino_t fuse_ino; + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ +@@ -568,7 +581,7 @@ retry: + if (last == path) { + p = &lo->root; + pthread_mutex_lock(&lo->mutex); +- p->refcount++; ++ p->nlookup++; + pthread_mutex_unlock(&lo->mutex); + } else { + *last = '\0'; +@@ -786,8 +799,8 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) + pthread_mutex_lock(&lo->mutex); + p = g_hash_table_lookup(lo->inodes, &key); + if (p) { +- assert(p->refcount > 0); +- p->refcount++; ++ assert(p->nlookup > 0); ++ p->nlookup++; + } + pthread_mutex_unlock(&lo->mutex); + +@@ -855,7 +868,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + } + + inode->is_symlink = S_ISLNK(e->attr.st_mode); +- inode->refcount = 1; ++ inode->nlookup = 1; + inode->fd = newfd; + newfd = -1; + inode->key.ino = e->attr.st_ino; +@@ -1112,7 +1125,7 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + } + + pthread_mutex_lock(&lo->mutex); +- inode->refcount++; ++ inode->nlookup++; + pthread_mutex_unlock(&lo->mutex); + e.ino = inode->fuse_ino; + +@@ -1193,9 +1206,9 @@ static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + } + + pthread_mutex_lock(&lo->mutex); +- assert(inode->refcount >= n); +- inode->refcount -= n; +- if (!inode->refcount) { ++ assert(inode->nlookup >= n); ++ inode->nlookup -= n; ++ if (!inode->nlookup) { + lo_map_remove(&lo->ino_map, inode->fuse_ino); + g_hash_table_remove(lo->inodes, &inode->key); + if (g_hash_table_size(inode->posix_locks)) { +@@ -1216,7 +1229,7 @@ static int unref_all_inodes_cb(gpointer key, gpointer value, gpointer user_data) + struct lo_inode *inode = value; + struct lo_data *lo = user_data; + +- inode->refcount = 0; ++ inode->nlookup = 0; + lo_map_remove(&lo->ino_map, inode->fuse_ino); + close(inode->fd); + +@@ -1241,7 +1254,7 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + } + + fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", +- (unsigned long long)ino, (unsigned long long)inode->refcount, ++ (unsigned long long)ino, (unsigned long long)inode->nlookup, + (unsigned long long)nlookup); + + unref_inode_lolocked(lo, inode, nlookup); +@@ -2609,7 +2622,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) + root->fd = fd; + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; +- root->refcount = 2; ++ root->nlookup = 2; + } + + static guint lo_key_hash(gconstpointer key) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch b/kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch new file mode 100755 index 0000000000000000000000000000000000000000..95858f80fff407f7d6d18d67d9965e1cefefd19a --- /dev/null +++ b/kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch @@ -0,0 +1,94 @@ +From cfa4550f926e7a07757853f94273f2d1589cb9d3 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:48 +0100 +Subject: [PATCH 077/116] virtiofsd: rename unref_inode() to + unref_inode_lolocked() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-74-dgilbert@redhat.com> +Patchwork-id: 93526 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 073/112] virtiofsd: rename unref_inode() to unref_inode_lolocked() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Miklos Szeredi + +Signed-off-by: Miklos Szeredi +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 95d2715791c60b5dc2d22e4eb7b83217273296fa) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 8b1784f..de12e75 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -148,8 +148,8 @@ static const struct fuse_opt lo_opts[] = { + }; + static bool use_syslog = false; + static int current_log_level; +- +-static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); ++static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, ++ uint64_t n); + + static struct { + pthread_mutex_t mutex; +@@ -586,7 +586,7 @@ retry: + return 0; + + fail_unref: +- unref_inode(lo, p, 1); ++ unref_inode_lolocked(lo, p, 1); + fail: + if (retries) { + retries--; +@@ -624,7 +624,7 @@ fallback: + res = lo_parent_and_name(lo, inode, path, &parent); + if (res != -1) { + res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW); +- unref_inode(lo, parent, 1); ++ unref_inode_lolocked(lo, parent, 1); + } + + return res; +@@ -1027,7 +1027,7 @@ fallback: + res = lo_parent_and_name(lo, inode, path, &parent); + if (res != -1) { + res = linkat(parent->fd, path, dfd, name, 0); +- unref_inode(lo, parent, 1); ++ unref_inode_lolocked(lo, parent, 1); + } + + return res; +@@ -1141,7 +1141,8 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + fuse_reply_err(req, res == -1 ? errno : 0); + } + +-static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) ++static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, ++ uint64_t n) + { + if (!inode) { + return; +@@ -1181,7 +1182,7 @@ static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) + (unsigned long long)ino, (unsigned long long)inode->refcount, + (unsigned long long)nlookup); + +- unref_inode(lo, inode, nlookup); ++ unref_inode_lolocked(lo, inode, nlookup); + } + + static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-sandbox-mount-namespace.patch b/kvm-virtiofsd-sandbox-mount-namespace.patch new file mode 100755 index 0000000000000000000000000000000000000000..ab6f751c176595f428dac270ab9ae9c3a919e5e7 --- /dev/null +++ b/kvm-virtiofsd-sandbox-mount-namespace.patch @@ -0,0 +1,166 @@ +From c7ae38df696e4be432fd418c670dcea892b910a7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:27 +0100 +Subject: [PATCH 056/116] virtiofsd: sandbox mount namespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-53-dgilbert@redhat.com> +Patchwork-id: 93504 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 052/112] virtiofsd: sandbox mount namespace +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Use a mount namespace with the shared directory tree mounted at "/" and +no other mounts. + +This prevents symlink escape attacks because symlink targets are +resolved only against the shared directory and cannot go outside it. + +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Peng Tao +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 5baa3b8e95064c2434bd9e2f312edd5e9ae275dc) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 89 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 89 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e2e2211..0570453 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1943,6 +1944,58 @@ static void print_capabilities(void) + printf("}\n"); + } + ++/* This magic is based on lxc's lxc_pivot_root() */ ++static void setup_pivot_root(const char *source) ++{ ++ int oldroot; ++ int newroot; ++ ++ oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); ++ if (oldroot < 0) { ++ fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); ++ exit(1); ++ } ++ ++ newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC); ++ if (newroot < 0) { ++ fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source); ++ exit(1); ++ } ++ ++ if (fchdir(newroot) < 0) { ++ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); ++ exit(1); ++ } ++ ++ if (syscall(__NR_pivot_root, ".", ".") < 0) { ++ fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n"); ++ exit(1); ++ } ++ ++ if (fchdir(oldroot) < 0) { ++ fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n"); ++ exit(1); ++ } ++ ++ if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n"); ++ exit(1); ++ } ++ ++ if (umount2(".", MNT_DETACH) < 0) { ++ fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n"); ++ exit(1); ++ } ++ ++ if (fchdir(newroot) < 0) { ++ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); ++ exit(1); ++ } ++ ++ close(newroot); ++ close(oldroot); ++} ++ + static void setup_proc_self_fd(struct lo_data *lo) + { + lo->proc_self_fd = open("/proc/self/fd", O_PATH); +@@ -1952,6 +2005,39 @@ static void setup_proc_self_fd(struct lo_data *lo) + } + } + ++/* ++ * Make the source directory our root so symlinks cannot escape and no other ++ * files are accessible. ++ */ ++static void setup_mount_namespace(const char *source) ++{ ++ if (unshare(CLONE_NEWNS) != 0) { ++ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWNS): %m\n"); ++ exit(1); ++ } ++ ++ if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_PRIVATE): %m\n"); ++ exit(1); ++ } ++ ++ if (mount(source, source, NULL, MS_BIND, NULL) < 0) { ++ fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); ++ exit(1); ++ } ++ ++ setup_pivot_root(source); ++} ++ ++/* ++ * Lock down this process to prevent access to other processes or files outside ++ * source directory. This reduces the impact of arbitrary code execution bugs. ++ */ ++static void setup_sandbox(struct lo_data *lo) ++{ ++ setup_mount_namespace(lo->source); ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2052,6 +2138,7 @@ int main(int argc, char *argv[]) + } + + lo.root.fd = open(lo.source, O_PATH); ++ + if (lo.root.fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(\"%s\", O_PATH): %m\n", lo.source); + exit(1); +@@ -2075,6 +2162,8 @@ int main(int argc, char *argv[]) + /* Must be after daemonize to get the right /proc/self/fd */ + setup_proc_self_fd(&lo); + ++ setup_sandbox(&lo); ++ + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch b/kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch new file mode 100755 index 0000000000000000000000000000000000000000..e54248c48c029bd1909577d08578a327374b59ce --- /dev/null +++ b/kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch @@ -0,0 +1,93 @@ +From 4cc435b3a8a9a419cc85ee883d5184f810f91e52 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:34 +0100 +Subject: [PATCH 063/116] virtiofsd: set maximum RLIMIT_NOFILE limit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-60-dgilbert@redhat.com> +Patchwork-id: 93516 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 059/112] virtiofsd: set maximum RLIMIT_NOFILE limit +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +virtiofsd can exceed the default open file descriptor limit easily on +most systems. Take advantage of the fact that it runs as root to raise +the limit. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 01a6dc95ec7f71eeff9963fe3cb03d85225fba3e) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 32 ++++++++++++++++++++++++++++++++ + 1 file changed, 32 insertions(+) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index d53cb1e..c281d81 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -2268,6 +2269,35 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se) + setup_seccomp(); + } + ++/* Raise the maximum number of open file descriptors */ ++static void setup_nofile_rlimit(void) ++{ ++ const rlim_t max_fds = 1000000; ++ struct rlimit rlim; ++ ++ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { ++ fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); ++ exit(1); ++ } ++ ++ if (rlim.rlim_cur >= max_fds) { ++ return; /* nothing to do */ ++ } ++ ++ rlim.rlim_cur = max_fds; ++ rlim.rlim_max = max_fds; ++ ++ if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { ++ /* Ignore SELinux denials */ ++ if (errno == EPERM) { ++ return; ++ } ++ ++ fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n"); ++ exit(1); ++ } ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); +@@ -2389,6 +2419,8 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + ++ setup_nofile_rlimit(); ++ + /* Must be before sandbox since it wants /proc */ + setup_capng(); + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch b/kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch new file mode 100755 index 0000000000000000000000000000000000000000..ce74f4dc4d2b77b8eb822669956180b6c4d6fc56 --- /dev/null +++ b/kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch @@ -0,0 +1,88 @@ +From 301f19f2ebd617e43e3a8e7bdcf694de580fe689 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 5 May 2020 16:35:56 +0100 +Subject: [PATCH 5/9] virtiofsd: stay below fs.file-max sysctl value + (CVE-2020-10717) + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200505163600.22956-4-dgilbert@redhat.com> +Patchwork-id: 96271 +O-Subject: [RHEL-AV-8.2.1 qemu-kvm PATCH 3/7] virtiofsd: stay below fs.file-max sysctl value (CVE-2020-10717) +Bugzilla: 1817445 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Max Reitz +RH-Acked-by: Michael S. Tsirkin + +From: Stefan Hajnoczi + +The system-wide fs.file-max sysctl value determines how many files can +be open. It defaults to a value calculated based on the machine's RAM +size. Previously virtiofsd would try to set RLIMIT_NOFILE to 1,000,000 +and this allowed the FUSE client to exhaust the number of open files +system-wide on Linux hosts with less than 10 GB of RAM! + +Take fs.file-max into account when choosing the default RLIMIT_NOFILE +value. + +Fixes: CVE-2020-10717 +Reported-by: Yuval Avrahami +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Dr. David Alan Gilbert +Message-Id: <20200501140644.220940-3-stefanha@redhat.com> +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8c1d353d107b4fc344e27f2f08ea7fa25de2eea2) +Signed-off-by: Danilo C. L. de Paula +--- + tools/virtiofsd/helper.c | 26 +++++++++++++++++++++++++- + 1 file changed, 25 insertions(+), 1 deletion(-) + +diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c +index 9b3eddc..5b222ea 100644 +--- a/tools/virtiofsd/helper.c ++++ b/tools/virtiofsd/helper.c +@@ -176,7 +176,8 @@ void fuse_cmdline_help(void) + " default: no_xattr\n" + " --rlimit-nofile= set maximum number of file descriptors\n" + " (0 leaves rlimit unchanged)\n" +- " default: 1,000,000 if the current rlimit is lower\n" ++ " default: min(1000000, fs.file-max - 16384)\n" ++ " if the current rlimit is lower\n" + ); + } + +@@ -199,9 +200,32 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, + + static unsigned long get_default_rlimit_nofile(void) + { ++ g_autofree gchar *file_max_str = NULL; ++ const rlim_t reserved_fds = 16384; /* leave at least this many fds free */ + rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ ++ rlim_t file_max; + struct rlimit rlim; + ++ /* ++ * Reduce max_fds below the system-wide maximum, if necessary. This ++ * ensures there are fds available for other processes so we don't ++ * cause resource exhaustion. ++ */ ++ if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str, ++ NULL, NULL)) { ++ fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n"); ++ exit(1); ++ } ++ file_max = g_ascii_strtoull(file_max_str, NULL, 10); ++ if (file_max < 2 * reserved_fds) { ++ fuse_log(FUSE_LOG_ERR, ++ "The fs.file-max sysctl is too low (%lu) to allow a " ++ "reasonable number of open files.\n", ++ (unsigned long)file_max); ++ exit(1); ++ } ++ max_fds = MIN(file_max - reserved_fds, max_fds); ++ + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { + fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch b/kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch new file mode 100755 index 0000000000000000000000000000000000000000..be6b244e6de85a96668d93b10f6606af8a41a8ec --- /dev/null +++ b/kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch @@ -0,0 +1,72 @@ +From 06a24b54c94345b436d888a48b92fafa967c3d58 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:25 +0100 +Subject: [PATCH 114/116] virtiofsd: stop all queue threads on exit in + virtio_loop() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-111-dgilbert@redhat.com> +Patchwork-id: 93564 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 110/112] virtiofsd: stop all queue threads on exit in virtio_loop() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Eryu Guan + +On guest graceful shutdown, virtiofsd receives VHOST_USER_GET_VRING_BASE +request from VMM and shuts down virtqueues by calling fv_set_started(), +which joins fv_queue_thread() threads. So when virtio_loop() returns, +there should be no thread is still accessing data in fuse session and/or +virtio dev. + +But on abnormal exit, e.g. guest got killed for whatever reason, +vhost-user socket is closed and virtio_loop() breaks out the main loop +and returns to main(). But it's possible fv_queue_worker()s are still +working and accessing fuse session and virtio dev, which results in +crash or use-after-free. + +Fix it by stopping fv_queue_thread()s before virtio_loop() returns, +to make sure there's no-one could access fuse session and virtio dev. + +Reported-by: Qingming Su +Signed-off-by: Eryu Guan +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9883df8ccae6d744a0c8d9cbf9d62b1797d70ebd) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_virtio.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c +index 9f65823..80a6e92 100644 +--- a/tools/virtiofsd/fuse_virtio.c ++++ b/tools/virtiofsd/fuse_virtio.c +@@ -815,6 +815,19 @@ int virtio_loop(struct fuse_session *se) + } + } + ++ /* ++ * Make sure all fv_queue_thread()s quit on exit, as we're about to ++ * free virtio dev and fuse session, no one should access them anymore. ++ */ ++ for (int i = 0; i < se->virtio_dev->nqueues; i++) { ++ if (!se->virtio_dev->qi[i]) { ++ continue; ++ } ++ ++ fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i); ++ fv_queue_cleanup_thread(se->virtio_dev, i); ++ } ++ + fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); + + return 0; +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch b/kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch new file mode 100755 index 0000000000000000000000000000000000000000..f595ffa912752a57d50afef2e75f33481d6a8e4f --- /dev/null +++ b/kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch @@ -0,0 +1,83 @@ +From 1744329bcba4a3e1a82cec3b1a34b3fbf0a9d7cf Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:54 +0100 +Subject: [PATCH 083/116] virtiofsd: support nanosecond resolution for file + timestamp +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-80-dgilbert@redhat.com> +Patchwork-id: 93535 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 079/112] virtiofsd: support nanosecond resolution for file timestamp +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Jiufei Xue + +Define HAVE_STRUCT_STAT_ST_ATIM to 1 if `st_atim' is member of `struct +stat' which means support nanosecond resolution for the file timestamp +fields. + +Signed-off-by: Jiufei Xue +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 8a792b034d4b315251fd842bb4c73a133aa1368f) +Signed-off-by: Miroslav Rezanina +--- + configure | 16 ++++++++++++++++ + tools/virtiofsd/fuse_misc.h | 1 + + 2 files changed, 17 insertions(+) + +diff --git a/configure b/configure +index 7831618..5120c14 100755 +--- a/configure ++++ b/configure +@@ -5218,6 +5218,19 @@ if compile_prog "" "" ; then + strchrnul=yes + fi + ++######################################### ++# check if we have st_atim ++ ++st_atim=no ++cat > $TMPC << EOF ++#include ++#include ++int main(void) { return offsetof(struct stat, st_atim); } ++EOF ++if compile_prog "" "" ; then ++ st_atim=yes ++fi ++ + ########################################## + # check if trace backend exists + +@@ -6919,6 +6932,9 @@ fi + if test "$strchrnul" = "yes" ; then + echo "HAVE_STRCHRNUL=y" >> $config_host_mak + fi ++if test "$st_atim" = "yes" ; then ++ echo "HAVE_STRUCT_STAT_ST_ATIM=y" >> $config_host_mak ++fi + if test "$byteswap_h" = "yes" ; then + echo "CONFIG_BYTESWAP_H=y" >> $config_host_mak + fi +diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h +index f252baa..5c618ce 100644 +--- a/tools/virtiofsd/fuse_misc.h ++++ b/tools/virtiofsd/fuse_misc.h +@@ -7,6 +7,7 @@ + */ + + #include ++#include "config-host.h" + + /* + * Versioned symbols cannot be used in some cases because it +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch b/kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch new file mode 100755 index 0000000000000000000000000000000000000000..1bae1bf95ddad352bf3eecf229f26e07adab78fe --- /dev/null +++ b/kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch @@ -0,0 +1,82 @@ +From 7bc27a767bc8c78b1bca46bbe5e1d53dcd7173b4 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:18 +0100 +Subject: [PATCH 107/116] virtiofsd: use fuse_buf_writev to replace + fuse_buf_write for better performance +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-104-dgilbert@redhat.com> +Patchwork-id: 93558 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 103/112] virtiofsd: use fuse_buf_writev to replace fuse_buf_write for better performance +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: piaojun + +fuse_buf_writev() only handles the normal write in which src is buffer +and dest is fd. Specially if src buffer represents guest physical +address that can't be mapped by the daemon process, IO must be bounced +back to the VMM to do it by fuse_buf_copy(). + +Signed-off-by: Jun Piao +Suggested-by: Dr. David Alan Gilbert +Suggested-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit c465bba2c90a810f6e71e4f2646b1b4ee4b478de) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/buffer.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c +index 37befeb..27c1377 100644 +--- a/tools/virtiofsd/buffer.c ++++ b/tools/virtiofsd/buffer.c +@@ -34,7 +34,6 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv) + return size; + } + +-__attribute__((unused)) + static ssize_t fuse_buf_writev(struct fuse_buf *out_buf, + struct fuse_bufvec *in_buf) + { +@@ -262,12 +261,29 @@ static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) + + ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) + { +- size_t copied = 0; ++ size_t copied = 0, i; + + if (dstv == srcv) { + return fuse_buf_size(dstv); + } + ++ /* ++ * use writev to improve bandwidth when all the ++ * src buffers already mapped by the daemon ++ * process ++ */ ++ for (i = 0; i < srcv->count; i++) { ++ if (srcv->buf[i].flags & FUSE_BUF_IS_FD) { ++ break; ++ } ++ } ++ if ((i == srcv->count) && (dstv->count == 1) && ++ (dstv->idx == 0) && ++ (dstv->buf[0].flags & FUSE_BUF_IS_FD)) { ++ dstv->buf[0].pos += dstv->off; ++ return fuse_buf_writev(&dstv->buf[0], srcv); ++ } ++ + for (;;) { + const struct fuse_buf *src = fuse_bufvec_current(srcv); + const struct fuse_buf *dst = fuse_bufvec_current(dstv); +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch b/kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch new file mode 100755 index 0000000000000000000000000000000000000000..feffb5ec037d2cedc26dd08802bf636104272988 --- /dev/null +++ b/kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch @@ -0,0 +1,56 @@ +From 1724f54070d33d8070ba2d22c8fac87ea65814c1 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:02:04 +0100 +Subject: [PATCH 093/116] virtiofsd: use fuse_lowlevel_is_virtio() in + fuse_session_destroy() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-90-dgilbert@redhat.com> +Patchwork-id: 93540 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 089/112] virtiofsd: use fuse_lowlevel_is_virtio() in fuse_session_destroy() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +vu_socket_path is NULL when --fd=FDNUM was used. Use +fuse_lowlevel_is_virtio() instead. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 620e9d8d9cee6df7fe71168dea950dba0cc21a4a) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 70568d2..dab6a31 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -2537,12 +2537,13 @@ void fuse_session_destroy(struct fuse_session *se) + close(se->fd); + } + +- if (se->vu_socket_path) { ++ if (fuse_lowlevel_is_virtio(se)) { + virtio_session_close(se); +- free(se->vu_socket_path); +- se->vu_socket_path = NULL; + } + ++ free(se->vu_socket_path); ++ se->vu_socket_path = NULL; ++ + free(se); + } + +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch b/kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch new file mode 100755 index 0000000000000000000000000000000000000000..f250ed7e60f8812b4c822d8a5d61eaa6f00b73c0 --- /dev/null +++ b/kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch @@ -0,0 +1,390 @@ +From bce5070d1aada88154b811a08eec1586ab24fce5 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:26 +0100 +Subject: [PATCH 055/116] virtiofsd: use /proc/self/fd/ O_PATH file descriptor +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-52-dgilbert@redhat.com> +Patchwork-id: 93506 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 051/112] virtiofsd: use /proc/self/fd/ O_PATH file descriptor +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Sandboxing will remove /proc from the mount namespace so we can no +longer build string paths into "/proc/self/fd/...". + +Keep an O_PATH file descriptor so we can still re-open fds via +/proc/self/fd. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 9f59d175e2ca96f0b87f534dba69ea547dd35945) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 130 +++++++++++++++++++++++++++++++-------- + 1 file changed, 103 insertions(+), 27 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index e3d65c3..e2e2211 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -110,6 +110,9 @@ struct lo_data { + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ + struct lo_map fd_map; /* protected by lo->mutex */ ++ ++ /* An O_PATH file descriptor to /proc/self/fd/ */ ++ int proc_self_fd; + }; + + static const struct fuse_opt lo_opts[] = { +@@ -379,9 +382,9 @@ static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, + int res; + + retry: +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); + +- res = readlink(procname, path, PATH_MAX); ++ res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX); + if (res < 0) { + fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__); + goto fail_noretry; +@@ -477,9 +480,9 @@ static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode, + } + return res; + } +- sprintf(path, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "%i", inode->fd); + +- return utimensat(AT_FDCWD, path, tv, 0); ++ return utimensat(lo->proc_self_fd, path, tv, 0); + + fallback: + res = lo_parent_and_name(lo, inode, path, &parent); +@@ -535,8 +538,8 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + if (fi) { + res = fchmod(fd, attr->st_mode); + } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = chmod(procname, attr->st_mode); ++ sprintf(procname, "%i", ifd); ++ res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); + } + if (res == -1) { + goto out_err; +@@ -552,11 +555,23 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + } + } + if (valid & FUSE_SET_ATTR_SIZE) { ++ int truncfd; ++ + if (fi) { +- res = ftruncate(fd, attr->st_size); ++ truncfd = fd; + } else { +- sprintf(procname, "/proc/self/fd/%i", ifd); +- res = truncate(procname, attr->st_size); ++ sprintf(procname, "%i", ifd); ++ truncfd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (truncfd < 0) { ++ goto out_err; ++ } ++ } ++ ++ res = ftruncate(truncfd, attr->st_size); ++ if (!fi) { ++ saverr = errno; ++ close(truncfd); ++ errno = saverr; + } + if (res == -1) { + goto out_err; +@@ -874,9 +889,9 @@ static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode, + return res; + } + +- sprintf(path, "/proc/self/fd/%i", inode->fd); ++ sprintf(path, "%i", inode->fd); + +- return linkat(AT_FDCWD, path, dfd, name, AT_SYMLINK_FOLLOW); ++ return linkat(lo->proc_self_fd, path, dfd, name, AT_SYMLINK_FOLLOW); + + fallback: + res = lo_parent_and_name(lo, inode, path, &parent); +@@ -1404,8 +1419,8 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) + fi->flags &= ~O_APPEND; + } + +- sprintf(buf, "/proc/self/fd/%i", lo_fd(req, ino)); +- fd = open(buf, fi->flags & ~O_NOFOLLOW); ++ sprintf(buf, "%i", lo_fd(req, ino)); ++ fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); + if (fd == -1) { + return (void)fuse_reply_err(req, errno); + } +@@ -1458,7 +1473,6 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) + { + int res; +- (void)ino; + int fd; + char *buf; + +@@ -1466,12 +1480,14 @@ static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + (void *)fi); + + if (!fi) { +- res = asprintf(&buf, "/proc/self/fd/%i", lo_fd(req, ino)); ++ struct lo_data *lo = lo_data(req); ++ ++ res = asprintf(&buf, "%i", lo_fd(req, ino)); + if (res == -1) { + return (void)fuse_reply_err(req, errno); + } + +- fd = open(buf, O_RDWR); ++ fd = openat(lo->proc_self_fd, buf, O_RDWR); + free(buf); + if (fd == -1) { + return (void)fuse_reply_err(req, errno); +@@ -1587,11 +1603,13 @@ static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + size_t size) + { ++ struct lo_data *lo = lo_data(req); + char *value = NULL; + char procname[64]; + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -1616,7 +1634,11 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } + + if (size) { + value = malloc(size); +@@ -1624,7 +1646,7 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out_err; + } + +- ret = getxattr(procname, name, value, size); ++ ret = fgetxattr(fd, name, value, size); + if (ret == -1) { + goto out_err; + } +@@ -1635,7 +1657,7 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + + fuse_reply_buf(req, value, ret); + } else { +- ret = getxattr(procname, name, NULL, 0); ++ ret = fgetxattr(fd, name, NULL, 0); + if (ret == -1) { + goto out_err; + } +@@ -1644,6 +1666,10 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + } + out_free: + free(value); ++ ++ if (fd >= 0) { ++ close(fd); ++ } + return; + + out_err: +@@ -1655,11 +1681,13 @@ out: + + static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + { ++ struct lo_data *lo = lo_data(req); + char *value = NULL; + char procname[64]; + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -1683,7 +1711,11 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDONLY); ++ if (fd < 0) { ++ goto out_err; ++ } + + if (size) { + value = malloc(size); +@@ -1691,7 +1723,7 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + goto out_err; + } + +- ret = listxattr(procname, value, size); ++ ret = flistxattr(fd, value, size); + if (ret == -1) { + goto out_err; + } +@@ -1702,7 +1734,7 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + + fuse_reply_buf(req, value, ret); + } else { +- ret = listxattr(procname, NULL, 0); ++ ret = flistxattr(fd, NULL, 0); + if (ret == -1) { + goto out_err; + } +@@ -1711,6 +1743,10 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) + } + out_free: + free(value); ++ ++ if (fd >= 0) { ++ close(fd); ++ } + return; + + out_err: +@@ -1724,9 +1760,11 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + const char *value, size_t size, int flags) + { + char procname[64]; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -1751,21 +1789,31 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } + +- ret = setxattr(procname, name, value, size, flags); ++ ret = fsetxattr(fd, name, value, size, flags); + saverr = ret == -1 ? errno : 0; + + out: ++ if (fd >= 0) { ++ close(fd); ++ } + fuse_reply_err(req, saverr); + } + + static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + { + char procname[64]; ++ struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; ++ int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { +@@ -1789,12 +1837,20 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) + goto out; + } + +- sprintf(procname, "/proc/self/fd/%i", inode->fd); ++ sprintf(procname, "%i", inode->fd); ++ fd = openat(lo->proc_self_fd, procname, O_RDWR); ++ if (fd < 0) { ++ saverr = errno; ++ goto out; ++ } + +- ret = removexattr(procname, name); ++ ret = fremovexattr(fd, name); + saverr = ret == -1 ? errno : 0; + + out: ++ if (fd >= 0) { ++ close(fd); ++ } + fuse_reply_err(req, saverr); + } + +@@ -1887,12 +1943,25 @@ static void print_capabilities(void) + printf("}\n"); + } + ++static void setup_proc_self_fd(struct lo_data *lo) ++{ ++ lo->proc_self_fd = open("/proc/self/fd", O_PATH); ++ if (lo->proc_self_fd == -1) { ++ fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); ++ exit(1); ++ } ++} ++ + int main(int argc, char *argv[]) + { + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); + struct fuse_session *se; + struct fuse_cmdline_opts opts; +- struct lo_data lo = { .debug = 0, .writeback = 0 }; ++ struct lo_data lo = { ++ .debug = 0, ++ .writeback = 0, ++ .proc_self_fd = -1, ++ }; + struct lo_map_elem *root_elem; + int ret = -1; + +@@ -2003,6 +2072,9 @@ int main(int argc, char *argv[]) + + fuse_daemonize(opts.foreground); + ++ /* Must be after daemonize to get the right /proc/self/fd */ ++ setup_proc_self_fd(&lo); ++ + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + +@@ -2018,6 +2090,10 @@ err_out1: + lo_map_destroy(&lo.dirp_map); + lo_map_destroy(&lo.ino_map); + ++ if (lo.proc_self_fd >= 0) { ++ close(lo.proc_self_fd); ++ } ++ + if (lo.root.fd >= 0) { + close(lo.root.fd); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch b/kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch new file mode 100755 index 0000000000000000000000000000000000000000..d60a9027f4d08aac8ff9c67d251ebe71a44f39bc --- /dev/null +++ b/kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch @@ -0,0 +1,137 @@ +From 6877a6c456178d6c1ca9a0ffaabaa7e51105b2ac Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:22 +0100 +Subject: [PATCH 051/116] virtiofsd: validate input buffer sizes in + do_write_buf() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-48-dgilbert@redhat.com> +Patchwork-id: 93501 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 047/112] virtiofsd: validate input buffer sizes in do_write_buf() +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +There is a small change in behavior: if fuse_write_in->size doesn't +match the input buffer size then the request is failed. Previously +write requests with 1 fuse_buf element would truncate to +fuse_write_in->size. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Sergio Lopez +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 0ba8c3c6fce8fe949d59c1fd84d98d220ef9e759) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/fuse_lowlevel.c | 49 +++++++++++++++++++++++++---------------- + 1 file changed, 30 insertions(+), 19 deletions(-) + +diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c +index 7e10995..611e8b0 100644 +--- a/tools/virtiofsd/fuse_lowlevel.c ++++ b/tools/virtiofsd/fuse_lowlevel.c +@@ -1003,8 +1003,8 @@ static void do_write(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) + } + } + +-static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, +- struct fuse_bufvec *ibufv) ++static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, ++ struct fuse_mbuf_iter *iter, struct fuse_bufvec *ibufv) + { + struct fuse_session *se = req->se; + struct fuse_bufvec *pbufv = ibufv; +@@ -1012,28 +1012,27 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + .buf[0] = ibufv->buf[0], + .count = 1, + }; +- struct fuse_write_in *arg = (struct fuse_write_in *)inarg; ++ struct fuse_write_in *arg; ++ size_t arg_size = sizeof(*arg); + struct fuse_file_info fi; + + memset(&fi, 0, sizeof(fi)); ++ ++ arg = fuse_mbuf_iter_advance(iter, arg_size); ++ if (!arg) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ ++ fi.lock_owner = arg->lock_owner; ++ fi.flags = arg->flags; + fi.fh = arg->fh; + fi.writepage = arg->write_flags & FUSE_WRITE_CACHE; + + if (ibufv->count == 1) { +- fi.lock_owner = arg->lock_owner; +- fi.flags = arg->flags; +- if (!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)) { +- tmpbufv.buf[0].mem = PARAM(arg); +- } +- tmpbufv.buf[0].size -= +- sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in); +- if (tmpbufv.buf[0].size < arg->size) { +- fuse_log(FUSE_LOG_ERR, +- "fuse: do_write_buf: buffer size too small\n"); +- fuse_reply_err(req, EIO); +- return; +- } +- tmpbufv.buf[0].size = arg->size; ++ assert(!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)); ++ tmpbufv.buf[0].mem = ((char *)arg) + arg_size; ++ tmpbufv.buf[0].size -= sizeof(struct fuse_in_header) + arg_size; + pbufv = &tmpbufv; + } else { + /* +@@ -1043,6 +1042,13 @@ static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, const void *inarg, + ibufv->buf[0].size = 0; + } + ++ if (fuse_buf_size(pbufv) != arg->size) { ++ fuse_log(FUSE_LOG_ERR, ++ "fuse: do_write_buf: buffer size doesn't match arg->size\n"); ++ fuse_reply_err(req, EIO); ++ return; ++ } ++ + se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); + } + +@@ -2052,12 +2058,17 @@ void fuse_session_process_buf_int(struct fuse_session *se, + struct fuse_chan *ch) + { + const struct fuse_buf *buf = bufv->buf; ++ struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf); + struct fuse_in_header *in; + const void *inarg; + struct fuse_req *req; + int err; + +- in = buf->mem; ++ /* The first buffer must be a memory buffer */ ++ assert(!(buf->flags & FUSE_BUF_IS_FD)); ++ ++ in = fuse_mbuf_iter_advance(&iter, sizeof(*in)); ++ assert(in); /* caller guarantees the input buffer is large enough */ + + if (se->debug) { + fuse_log(FUSE_LOG_DEBUG, +@@ -2129,7 +2140,7 @@ void fuse_session_process_buf_int(struct fuse_session *se, + + inarg = (void *)&in[1]; + if (in->opcode == FUSE_WRITE && se->op.write_buf) { +- do_write_buf(req, in->nodeid, inarg, bufv); ++ do_write_buf(req, in->nodeid, &iter, bufv); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, inarg); + } +-- +1.8.3.1 + diff --git a/kvm-virtiofsd-validate-path-components.patch b/kvm-virtiofsd-validate-path-components.patch new file mode 100755 index 0000000000000000000000000000000000000000..b35aed7d91b35466b1e6bb7538cef0863b3986c6 --- /dev/null +++ b/kvm-virtiofsd-validate-path-components.patch @@ -0,0 +1,164 @@ +From 69ac47502848c37ca3ede00f432c0675d9eef42c Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:01:18 +0100 +Subject: [PATCH 047/116] virtiofsd: validate path components +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-44-dgilbert@redhat.com> +Patchwork-id: 93498 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 043/112] virtiofsd: validate path components +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Stefan Hajnoczi + +Several FUSE requests contain single path components. A correct FUSE +client sends well-formed path components but there is currently no input +validation in case something went wrong or the client is malicious. + +Refuse ".", "..", and paths containing '/' when we expect a path +component. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Dr. David Alan Gilbert +(cherry picked from commit 25dae28c58d7e706b5d5db99042c9db3cef2e657) +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 59 ++++++++++++++++++++++++++++++++++++---- + 1 file changed, 53 insertions(+), 6 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index ac380ef..e375406 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -133,6 +133,21 @@ static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n); + + static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); + ++static int is_dot_or_dotdot(const char *name) ++{ ++ return name[0] == '.' && ++ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); ++} ++ ++/* Is `path` a single path component that is not "." or ".."? */ ++static int is_safe_path_component(const char *path) ++{ ++ if (strchr(path, '/')) { ++ return 0; ++ } ++ ++ return !is_dot_or_dotdot(path); ++} + + static struct lo_data *lo_data(fuse_req_t req) + { +@@ -681,6 +696,15 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) + parent, name); + } + ++ /* ++ * Don't use is_safe_path_component(), allow "." and ".." for NFS export ++ * support. ++ */ ++ if (strchr(name, '/')) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + err = lo_do_lookup(req, parent, name, &e); + if (err) { + fuse_reply_err(req, err); +@@ -762,6 +786,11 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + struct fuse_entry_param e; + struct lo_cred old = {}; + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + dir = lo_inode(req, parent); + if (!dir) { + fuse_reply_err(req, EBADF); +@@ -863,6 +892,11 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + struct fuse_entry_param e; + int saverr; + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); +@@ -904,6 +938,10 @@ out_err: + static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } + + res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + +@@ -916,6 +954,11 @@ static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + { + int res; + ++ if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + if (flags) { + fuse_reply_err(req, EINVAL); + return; +@@ -930,6 +973,11 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) + { + int res; + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + res = unlinkat(lo_fd(req, parent), name, 0); + + fuse_reply_err(req, res == -1 ? errno : 0); +@@ -1093,12 +1141,6 @@ out_err: + fuse_reply_err(req, error); + } + +-static int is_dot_or_dotdot(const char *name) +-{ +- return name[0] == '.' && +- (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); +-} +- + static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) + { +@@ -1248,6 +1290,11 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + parent, name); + } + ++ if (!is_safe_path_component(name)) { ++ fuse_reply_err(req, EINVAL); ++ return; ++ } ++ + err = lo_change_cred(req, &old); + if (err) { + goto out; +-- +1.8.3.1 + diff --git a/kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch b/kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch new file mode 100755 index 0000000000000000000000000000000000000000..20add81977e2993841e5a1dd9f3e1bc59d678b8c --- /dev/null +++ b/kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch @@ -0,0 +1,56 @@ +From 247987aa987b7332eb501e00c440079b9e8e1fe7 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Mon, 27 Jan 2020 19:00:52 +0100 +Subject: [PATCH 021/116] vitriofsd/passthrough_ll: fix fallocate() ifdefs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200127190227.40942-18-dgilbert@redhat.com> +Patchwork-id: 93471 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 017/112] vitriofsd/passthrough_ll: fix fallocate() ifdefs +Bugzilla: 1694164 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Sergio Lopez Pascual + +From: Xiao Yang + +1) Use correct CONFIG_FALLOCATE macro to check if fallocate() is supported.(i.e configure + script sets CONFIG_FALLOCATE intead of HAVE_FALLOCATE if fallocate() is supported) +2) Replace HAVE_POSIX_FALLOCATE with CONFIG_POSIX_FALLOCATE. + +Signed-off-by: Xiao Yang +Signed-off-by: Dr. David Alan Gilbert + Merged from two of Xiao Yang's patches +(cherry picked from commit 9776457ca6f05d5900e27decb1dba2ffddf95a22) + +Signed-off-by: Miroslav Rezanina +--- + tools/virtiofsd/passthrough_ll.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c +index 322a889..6c4da18 100644 +--- a/tools/virtiofsd/passthrough_ll.c ++++ b/tools/virtiofsd/passthrough_ll.c +@@ -975,13 +975,13 @@ static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + int err = EOPNOTSUPP; + (void)ino; + +-#ifdef HAVE_FALLOCATE ++#ifdef CONFIG_FALLOCATE + err = fallocate(fi->fh, mode, offset, length); + if (err < 0) { + err = errno; + } + +-#elif defined(HAVE_POSIX_FALLOCATE) ++#elif defined(CONFIG_POSIX_FALLOCATE) + if (mode) { + fuse_reply_err(req, EOPNOTSUPP); + return; +-- +1.8.3.1 + diff --git a/kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch b/kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch new file mode 100755 index 0000000000000000000000000000000000000000..dbcf2a7e2c91a8c0d10ce1187cc2476f092628d3 --- /dev/null +++ b/kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch @@ -0,0 +1,63 @@ +From ad50e0e2d310277f06a9c512fe6e31da183ead6e Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:34 -0500 +Subject: [PATCH 1/4] x86/cpu: Enable AVX512_VP2INTERSECT cpu feature + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-2-dgilbert@redhat.com> +Patchwork-id: 101203 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 1/4] x86/cpu: Enable AVX512_VP2INTERSECT cpu feature +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Cathy Zhang + +AVX512_VP2INTERSECT compute vector pair intersection to a pair +of mask registers, which is introduced with intel Tiger Lake, +defining as CPUID.(EAX=7,ECX=0):EDX[bit 08]. + +Refer to the following release spec: +https://software.intel.com/sites/default/files/managed/c5/15/\ +architecture-instruction-set-extensions-programming-reference.pdf + +Signed-off-by: Cathy Zhang +Message-Id: <1586760758-13638-1-git-send-email-cathy.zhang@intel.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 353f98c9ad52ff4b8cfe553c90be04f747a14c98) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 2 +- + target/i386/cpu.h | 2 ++ + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index ff39fc9905..67dab94aa5 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1078,7 +1078,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .feat_names = { + NULL, NULL, "avx512-4vnniw", "avx512-4fmaps", + NULL, NULL, NULL, NULL, +- NULL, NULL, "md-clear", NULL, ++ "avx512-vp2intersect", NULL, "md-clear", NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL /* pconfig */, NULL, + NULL, NULL, NULL, NULL, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index f3da25cb8a..8e2e52ed31 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -770,6 +770,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_7_0_EDX_AVX512_4VNNIW (1U << 2) + /* AVX512 Multiply Accumulation Single Precision */ + #define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) ++/* AVX512 Vector Pair Intersection to a Pair of Mask Registers */ ++#define CPUID_7_0_EDX_AVX512_VP2INTERSECT (1U << 8) + /* Speculation Control */ + #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) + /* Single Thread Indirect Branch Predictors */ +-- +2.27.0 + diff --git a/kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch b/kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch new file mode 100755 index 0000000000000000000000000000000000000000..9ef6d0475f542873892654dd6d3d64caad970214 --- /dev/null +++ b/kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch @@ -0,0 +1,91 @@ +From 655e723a5190206302f6cc4f2e794563b8e1c226 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Wed, 24 Feb 2021 11:30:36 -0500 +Subject: [PATCH 3/4] x86/cpu: Populate SVM CPUID feature bits + +RH-Author: Dr. David Alan Gilbert +Message-id: <20210224113037.15599-4-dgilbert@redhat.com> +Patchwork-id: 101200 +O-Subject: [RHEL-8.4.0 qemu-kvm PATCH 3/4] x86/cpu: Populate SVM CPUID feature bits +Bugzilla: 1790620 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Sergio Lopez Pascual +RH-Acked-by: Peter Xu + +From: Wei Huang + +Newer AMD CPUs will add CPUID_0x8000000A_EDX[28] bit, which indicates +that SVM instructions (VMRUN/VMSAVE/VMLOAD) will trigger #VMEXIT before +CPU checking their EAX against reserved memory regions. This change will +allow the hypervisor to avoid intercepting #GP and emulating SVM +instructions. KVM turns on this CPUID bit for nested VMs. In order to +support it, let us populate this bit, along with other SVM feature bits, +in FEAT_SVM. + +Signed-off-by: Wei Huang +Message-Id: <20210126202456.589932-1-wei.huang2@amd.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit 5447089c2b3b084b51670af36fc86ee3979e04be) +Signed-off-by: Danilo C. L. de Paula +--- + target/i386/cpu.c | 6 +++--- + target/i386/cpu.h | 24 ++++++++++++++---------- + 2 files changed, 17 insertions(+), 13 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index f6a9ed84b3..7227c803c3 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1026,11 +1026,11 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "npt", "lbrv", "svm-lock", "nrip-save", + "tsc-scale", "vmcb-clean", "flushbyasid", "decodeassists", + NULL, NULL, "pause-filter", NULL, +- "pfthreshold", NULL, NULL, NULL, +- NULL, NULL, NULL, NULL, +- NULL, NULL, NULL, NULL, ++ "pfthreshold", "avic", NULL, "v-vmsave-vmload", ++ "vgif", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, ++ "svme-addr-chk", NULL, NULL, NULL, + }, + .cpuid = { .eax = 0x8000000A, .reg = R_EDX, }, + .tcg_features = TCG_SVM_FEATURES, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index f5a4efcec6..e1b67910c2 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -667,16 +667,20 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; + #define CPUID_EXT3_PERFCORE (1U << 23) + #define CPUID_EXT3_PERFNB (1U << 24) + +-#define CPUID_SVM_NPT (1U << 0) +-#define CPUID_SVM_LBRV (1U << 1) +-#define CPUID_SVM_SVMLOCK (1U << 2) +-#define CPUID_SVM_NRIPSAVE (1U << 3) +-#define CPUID_SVM_TSCSCALE (1U << 4) +-#define CPUID_SVM_VMCBCLEAN (1U << 5) +-#define CPUID_SVM_FLUSHASID (1U << 6) +-#define CPUID_SVM_DECODEASSIST (1U << 7) +-#define CPUID_SVM_PAUSEFILTER (1U << 10) +-#define CPUID_SVM_PFTHRESHOLD (1U << 12) ++#define CPUID_SVM_NPT (1U << 0) ++#define CPUID_SVM_LBRV (1U << 1) ++#define CPUID_SVM_SVMLOCK (1U << 2) ++#define CPUID_SVM_NRIPSAVE (1U << 3) ++#define CPUID_SVM_TSCSCALE (1U << 4) ++#define CPUID_SVM_VMCBCLEAN (1U << 5) ++#define CPUID_SVM_FLUSHASID (1U << 6) ++#define CPUID_SVM_DECODEASSIST (1U << 7) ++#define CPUID_SVM_PAUSEFILTER (1U << 10) ++#define CPUID_SVM_PFTHRESHOLD (1U << 12) ++#define CPUID_SVM_AVIC (1U << 13) ++#define CPUID_SVM_V_VMSAVE_VMLOAD (1U << 15) ++#define CPUID_SVM_VGIF (1U << 16) ++#define CPUID_SVM_SVME_ADDR_CHK (1U << 28) + + /* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ + #define CPUID_7_0_EBX_FSGSBASE (1U << 0) +-- +2.27.0 + diff --git a/kvm-x86.conf b/kvm-x86.conf new file mode 100755 index 0000000000000000000000000000000000000000..3f7842a134f8514cdffa7bb05b418a3a73fd9a35 --- /dev/null +++ b/kvm-x86.conf @@ -0,0 +1,12 @@ +# Setting modprobe kvm_intel/kvm_amd nested = 1 +# only enables Nested Virtualization until the next reboot or +# module reload. Uncomment the option applicable +# to your system below to enable the feature permanently. +# +# User changes in this file are preserved across upgrades. +# +# For Intel +#options kvm_intel nested=1 +# +# For AMD +#options kvm_amd nested=1 diff --git a/kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch b/kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch new file mode 100755 index 0000000000000000000000000000000000000000..aabe041f83d0aa420caad20c13309a59ef7c8b17 --- /dev/null +++ b/kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch @@ -0,0 +1,76 @@ +From f38f51d422e82d1241b678960dd6a033ffa398da Mon Sep 17 00:00:00 2001 +From: Jon Maloy +Date: Wed, 21 Apr 2021 22:30:05 -0400 +Subject: [PATCH 6/7] xhci: fix valid.max_access_size to access address + registers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jon Maloy +Message-id: <20210421223006.19650-6-jmaloy@redhat.com> +Patchwork-id: 101483 +O-Subject: [RHEL-8.5.0 qemu-kvm PATCH v2 5/6] xhci: fix valid.max_access_size to access address registers +Bugzilla: 1842478 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laszlo Ersek + +From: Laurent Vivier + +QEMU XHCI advertises AC64 (64-bit addressing) but doesn't allow +64-bit mode access in "runtime" and "operational" MemoryRegionOps. + +Set the max_access_size based on sizeof(dma_addr_t) as AC64 is set. + +XHCI specs: +"If the xHC supports 64-bit addressing (AC64 = ‘1’), then software +should write 64-bit registers using only Qword accesses. If a +system is incapable of issuing Qword accesses, then writes to the +64-bit address fields shall be performed using 2 Dword accesses; +low Dword-first, high-Dword second. If the xHC supports 32-bit +addressing (AC64 = ‘0’), then the high Dword of registers containing +64-bit address fields are unused and software should write addresses +using only Dword accesses" + +The problem has been detected with SLOF, as linux kernel always accesses +registers using 32-bit access even if AC64 is set and revealed by +5d971f9e6725 ("memory: Revert "memory: accept mismatching sizes in memory_region_access_valid"") + +Suggested-by: Alexey Kardashevskiy +Signed-off-by: Laurent Vivier +Message-id: 20200721083322.90651-1-lvivier@redhat.com +Signed-off-by: Gerd Hoffmann + +(cherry picked from commit 8e67fda2dd6202ccec093fda561107ba14830a17) +Signed-off-by: Jon Maloy +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/hcd-xhci.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index 646c78cde9..ab449bb003 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -3183,7 +3183,7 @@ static const MemoryRegionOps xhci_oper_ops = { + .read = xhci_oper_read, + .write = xhci_oper_write, + .valid.min_access_size = 4, +- .valid.max_access_size = 4, ++ .valid.max_access_size = sizeof(dma_addr_t), + .endianness = DEVICE_LITTLE_ENDIAN, + }; + +@@ -3199,7 +3199,7 @@ static const MemoryRegionOps xhci_runtime_ops = { + .read = xhci_runtime_read, + .write = xhci_runtime_write, + .valid.min_access_size = 4, +- .valid.max_access_size = 4, ++ .valid.max_access_size = sizeof(dma_addr_t), + .endianness = DEVICE_LITTLE_ENDIAN, + }; + +-- +2.27.0 + diff --git a/kvm-xhci-recheck-slot-status.patch b/kvm-xhci-recheck-slot-status.patch new file mode 100755 index 0000000000000000000000000000000000000000..8bcbc2c0720a9433f7c9ebe080f319e06ba57aac --- /dev/null +++ b/kvm-xhci-recheck-slot-status.patch @@ -0,0 +1,77 @@ +From ab87c0ed2a8f0a626099261a3028bc34cfac3929 Mon Sep 17 00:00:00 2001 +From: "Dr. David Alan Gilbert" +Date: Tue, 14 Jan 2020 20:23:31 +0000 +Subject: [PATCH 5/5] xhci: recheck slot status +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Dr. David Alan Gilbert +Message-id: <20200114202331.51831-3-dgilbert@redhat.com> +Patchwork-id: 93345 +O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH 2/2] xhci: recheck slot status +Bugzilla: 1790844 +RH-Acked-by: Peter Xu +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Gerd Hoffmann + +From: Gerd Hoffmann + +Factor out slot status check into a helper function. Add an additional +check after completing transfers. This is needed in case a guest +queues multiple transfers in a row and a device unplug happens while +qemu processes them. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1786413 +Signed-off-by: Gerd Hoffmann +Reviewed-by: Philippe Mathieu-Daudé +Message-id: 20200107083606.12393-1-kraxel@redhat.com +(cherry picked from commit 236846a019c4f7aa3111026fc9a1fe09684c8978) +Signed-off-by: Danilo C. L. de Paula +--- + hw/usb/hcd-xhci.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index d2b9744..646c78c 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -1861,6 +1861,13 @@ static void xhci_kick_ep(XHCIState *xhci, unsigned int slotid, + xhci_kick_epctx(epctx, streamid); + } + ++static bool xhci_slot_ok(XHCIState *xhci, int slotid) ++{ ++ return (xhci->slots[slotid - 1].uport && ++ xhci->slots[slotid - 1].uport->dev && ++ xhci->slots[slotid - 1].uport->dev->attached); ++} ++ + static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) + { + XHCIState *xhci = epctx->xhci; +@@ -1878,9 +1885,7 @@ static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) + + /* If the device has been detached, but the guest has not noticed this + yet the 2 above checks will succeed, but we must NOT continue */ +- if (!xhci->slots[epctx->slotid - 1].uport || +- !xhci->slots[epctx->slotid - 1].uport->dev || +- !xhci->slots[epctx->slotid - 1].uport->dev->attached) { ++ if (!xhci_slot_ok(xhci, epctx->slotid)) { + return; + } + +@@ -1987,6 +1992,10 @@ static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) + } else { + xhci_fire_transfer(xhci, xfer, epctx); + } ++ if (!xhci_slot_ok(xhci, epctx->slotid)) { ++ /* surprise removal -> stop processing */ ++ break; ++ } + if (xfer->complete) { + /* update ring dequeue ptr */ + xhci_set_ep_state(xhci, epctx, stctx, epctx->state); +-- +1.8.3.1 + diff --git a/kvm-xics-Don-t-deassert-outputs.patch b/kvm-xics-Don-t-deassert-outputs.patch new file mode 100755 index 0000000000000000000000000000000000000000..08ed72430a2f77b13f4716e9b3acab3f81932bd4 --- /dev/null +++ b/kvm-xics-Don-t-deassert-outputs.patch @@ -0,0 +1,52 @@ +From 99b6ee4b7f63ea49e5b73f61bbf68f67252f27da Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Tue, 21 Jan 2020 05:16:12 +0000 +Subject: [PATCH 02/15] xics: Don't deassert outputs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: David Gibson +Message-id: <20200121051613.388295-3-dgibson@redhat.com> +Patchwork-id: 93430 +O-Subject: [RHEL-AV-8.2 qemu-kvm PATCH 2/3] xics: Don't deassert outputs +Bugzilla: 1776638 +RH-Acked-by: Philippe Mathieu-Daudé +RH-Acked-by: Laurent Vivier +RH-Acked-by: Thomas Huth + +From: Greg Kurz + +The correct way to do this is to deassert the input pins on the CPU side. +This is the case since a previous change. + +Signed-off-by: Greg Kurz +Message-Id: <157548862298.3650476.1228720391270249433.stgit@bahia.lan> +Signed-off-by: David Gibson +(cherry picked from commit 4febcdd88f08422a66a1aa0dc55e1472abed3c4b) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1776638 + +Signed-off-by: David Gibson +Signed-off-by: Danilo C. L. de Paula +--- + hw/intc/xics.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/hw/intc/xics.c b/hw/intc/xics.c +index e7ac9ba..72c5dca 100644 +--- a/hw/intc/xics.c ++++ b/hw/intc/xics.c +@@ -289,9 +289,6 @@ void icp_reset(ICPState *icp) + icp->pending_priority = 0xff; + icp->mfrr = 0xff; + +- /* Make all outputs are deasserted */ +- qemu_set_irq(icp->output, 0); +- + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + +-- +1.8.3.1 + diff --git a/kvm.conf b/kvm.conf new file mode 100755 index 0000000000000000000000000000000000000000..24e60e98e318505aad0b49a92c893e9cb52799e8 --- /dev/null +++ b/kvm.conf @@ -0,0 +1,3 @@ +# +# User changes in this file are preserved across upgrades. +# diff --git a/qemu-ga.sysconfig b/qemu-ga.sysconfig new file mode 100755 index 0000000000000000000000000000000000000000..67bad0c684e9c0365da1f21b55c1f5981bbeeac9 --- /dev/null +++ b/qemu-ga.sysconfig @@ -0,0 +1,19 @@ +# This is a systemd environment file, not a shell script. +# It provides settings for "/lib/systemd/system/qemu-guest-agent.service". + +# Comma-separated blacklist of RPCs to disable, or empty list to enable all. +# +# You can get the list of RPC commands using "qemu-ga --blacklist='?'". +# There should be no spaces between commas and commands in the blacklist. +BLACKLIST_RPC=guest-file-open,guest-file-close,guest-file-read,guest-file-write,guest-file-seek,guest-file-flush,guest-exec,guest-exec-status + +# Fsfreeze hook script specification. +# +# FSFREEZE_HOOK_PATHNAME=/dev/null : disables the feature. +# +# FSFREEZE_HOOK_PATHNAME=/path/to/executable : enables the feature with the +# specified binary or shell script. +# +# FSFREEZE_HOOK_PATHNAME= : enables the feature with the +# default value (invoke "qemu-ga --help" to interrogate). +FSFREEZE_HOOK_PATHNAME=/etc/qemu-ga/fsfreeze-hook diff --git a/qemu-guest-agent.service b/qemu-guest-agent.service new file mode 100755 index 0000000000000000000000000000000000000000..b33e951d7ddc15809eca1d88bf9df0585eef1130 --- /dev/null +++ b/qemu-guest-agent.service @@ -0,0 +1,20 @@ +[Unit] +Description=QEMU Guest Agent +BindsTo=dev-virtio\x2dports-org.qemu.guest_agent.0.device +After=dev-virtio\x2dports-org.qemu.guest_agent.0.device +IgnoreOnIsolate=True + +[Service] +UMask=0077 +EnvironmentFile=/etc/sysconfig/qemu-ga +ExecStart=/usr/bin/qemu-ga \ + --method=virtio-serial \ + --path=/dev/virtio-ports/org.qemu.guest_agent.0 \ + --blacklist=${BLACKLIST_RPC} \ + -F${FSFREEZE_HOOK_PATHNAME} +StandardError=syslog +Restart=always +RestartSec=0 + +[Install] +WantedBy=dev-virtio\x2dports-org.qemu.guest_agent.0.device diff --git a/qemu-kvm.spec b/qemu-kvm.spec new file mode 100755 index 0000000000000000000000000000000000000000..1e341d31c643a4f3f2c074b239befb439c4a6936 --- /dev/null +++ b/qemu-kvm.spec @@ -0,0 +1,4271 @@ +%global SLOF_gittagdate 20191022 +%global SLOF_gittagcommit 899d9883 + +%global have_usbredir 1 +%global have_spice 1 +%global have_opengl 1 +%global have_fdt 0 +%global have_gluster 1 +%global have_kvm_setup 0 +%global have_memlock_limits 0 + +%ifnarch %{ix86} x86_64 + %global have_usbredir 0 +%endif + +%ifnarch s390x + %global have_librdma 1 +%else + %global have_librdma 0 +%endif + +%ifarch %{ix86} + %global kvm_target i386 +%endif +%ifarch x86_64 + %global kvm_target x86_64 +%else + %global have_spice 0 + %global have_opengl 0 + %global have_gluster 0 +%endif +%ifarch %{power64} + %global kvm_target ppc64 + %global have_fdt 1 + %global have_kvm_setup 1 + %global have_memlock_limits 1 +%endif +%ifarch s390x + %global kvm_target s390x + %global have_kvm_setup 1 +%endif +%ifarch ppc + %global kvm_target ppc + %global have_fdt 1 +%endif +%ifarch aarch64 + %global kvm_target aarch64 + %global have_fdt 1 +%endif + +#Versions of various parts: + +%global requires_all_modules \ +Requires: %{name}-block-curl = %{epoch}:%{version}-%{release} \ +%if %{have_gluster} \ +Requires: %{name}-block-gluster = %{epoch}:%{version}-%{release} \ +%endif \ +Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \ +Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \ +Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release} + +# Macro to properly setup RHEL/RHEV conflict handling +%define rhev_ma_conflicts() \ +Obsoletes: %1-ma <= %{epoch}:%{version}-%{release} \ +Obsoletes: %1-rhev <= %{epoch}:%{version}-%{release} + +Summary: QEMU is a machine emulator and virtualizer +Name: qemu-kvm +Version: 4.2.0 +Release: 59%{?dist}.2 +# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped +Epoch: 15 +License: GPLv2 and GPLv2+ and CC-BY +Group: Development/Tools +URL: http://www.qemu.org/ +ExclusiveArch: x86_64 %{power64} aarch64 s390x + + +Source0: http://wiki.qemu.org/download/qemu-4.2.0.tar.xz + +# KSM control scripts +Source4: ksm.service +Source5: ksm.sysconfig +Source6: ksmctl.c +Source7: ksmtuned.service +Source8: ksmtuned +Source9: ksmtuned.conf +Source10: qemu-guest-agent.service +Source11: 99-qemu-guest-agent.rules +Source12: bridge.conf +Source13: qemu-ga.sysconfig +Source21: kvm-setup +Source22: kvm-setup.service +Source23: 85-kvm.preset +Source26: vhost.conf +Source27: kvm.conf +Source28: 95-kvm-memlock.conf +Source30: kvm-s390x.conf +Source31: kvm-x86.conf +Source32: qemu-pr-helper.service +Source33: qemu-pr-helper.socket +Source34: 81-kvm-rhel.rules +Source35: udev-kvm-check.c +Source36: README.tests + + +Patch0005: 0005-Initial-redhat-build.patch +Patch0006: 0006-Enable-disable-devices-for-RHEL.patch +Patch0007: 0007-Machine-type-related-general-changes.patch +Patch0008: 0008-Add-aarch64-machine-types.patch +Patch0009: 0009-Add-ppc64-machine-types.patch +Patch0010: 0010-Add-s390x-machine-types.patch +Patch0011: 0011-Add-x86_64-machine-types.patch +Patch0012: 0012-Enable-make-check.patch +Patch0013: 0013-vfio-cap-number-of-devices-that-can-be-assigned.patch +Patch0014: 0014-Add-support-statement-to-help-output.patch +Patch0015: 0015-globally-limit-the-maximum-number-of-CPUs.patch +Patch0016: 0016-Add-support-for-simpletrace.patch +Patch0017: 0017-Use-qemu-kvm-in-documentation-instead-of-qemu-system.patch +Patch0018: 0018-usb-xhci-Fix-PCI-capability-order.patch +Patch0019: 0019-virtio-scsi-Reject-scsi-cd-if-data-plane-enabled-RHE.patch +Patch0020: 0020-BZ1653590-Require-at-least-64kiB-pages-for-downstrea.patch +Patch0021: 0021-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch +# For bz#1741345 - Remove the "cpu64-rhel6" CPU from qemu-kvm +Patch22: kvm-i386-Remove-cpu64-rhel6-CPU-model.patch +# For bz#1772774 - qemu-kvm core dump during migration+reboot ( Assertion `mem->dirty_bmap' failed ) +Patch23: kvm-Reallocate-dirty_bmap-when-we-change-a-slot.patch +# For bz#1733893 - Boot a guest with "-prom-env 'auto-boot?=false'", SLOF failed to enter the boot entry after input "boot" followed by "0 > " on VNC +Patch24: kvm-spapr-Don-t-trigger-a-CAS-reboot-for-XICS-XIVE-mode-.patch +# For bz#1782678 - qemu core dump after hot-unplugging the XXV710/XL710 PF +Patch25: kvm-vfio-pci-Don-t-remove-irqchip-notifier-if-not-regist.patch +# For bz#1789301 - virtio-blk/scsi: fix notification suppression during AioContext polling +Patch26: kvm-virtio-don-t-enable-notifications-during-polling.patch +# For bz#1790844 - USB related fixes +Patch27: kvm-usbredir-Prevent-recursion-in-usbredir_write.patch +# For bz#1790844 - USB related fixes +Patch28: kvm-xhci-recheck-slot-status.patch +# For bz#1791568 - CVE-2020-7039 qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-av-8.2.0] +Patch29: kvm-tcp_emu-Fix-oob-access.patch +# For bz#1791568 - CVE-2020-7039 qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-av-8.2.0] +Patch30: kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch +# For bz#1791568 - CVE-2020-7039 qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-av-8.2.0] +Patch31: kvm-slirp-use-correct-size-while-emulating-commands.patch +# For bz#1559846 - Nested KVM: limit VMX features according to CPU models - Fast Train +Patch32: kvm-RHEL-hw-i386-disable-nested-PERF_GLOBAL_CTRL-MSR-sup.patch +# For bz#1725084 - aarch64: support dumping SVE registers +Patch33: kvm-target-arm-arch_dump-Add-SVE-notes.patch +# For bz#1779041 - netkvm: no connectivity Windows guest with q35 + hugepages + vhost + hv_synic +Patch34: kvm-vhost-Add-names-to-section-rounded-warning.patch +# For bz#1779041 - netkvm: no connectivity Windows guest with q35 + hugepages + vhost + hv_synic +Patch35: kvm-vhost-Only-align-sections-for-vhost-user.patch +# For bz#1779041 - netkvm: no connectivity Windows guest with q35 + hugepages + vhost + hv_synic +Patch36: kvm-vhost-coding-style-fix.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch37: kvm-virtio-fs-fix-MSI-X-nvectors-calculation.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch38: kvm-vhost-user-fs-remove-vhostfd-property.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch39: kvm-build-rename-CONFIG_LIBCAP-to-CONFIG_LIBCAP_NG.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch40: kvm-virtiofsd-Pull-in-upstream-headers.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch41: kvm-virtiofsd-Pull-in-kernel-s-fuse.h.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch42: kvm-virtiofsd-Add-auxiliary-.c-s.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch43: kvm-virtiofsd-Add-fuse_lowlevel.c.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch44: kvm-virtiofsd-Add-passthrough_ll.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch45: kvm-virtiofsd-Trim-down-imported-files.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch46: kvm-virtiofsd-Format-imported-files-to-qemu-style.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch47: kvm-virtiofsd-remove-mountpoint-dummy-argument.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch48: kvm-virtiofsd-remove-unused-notify-reply-support.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch49: kvm-virtiofsd-Remove-unused-enum-fuse_buf_copy_flags.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch50: kvm-virtiofsd-Fix-fuse_daemonize-ignored-return-values.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch51: kvm-virtiofsd-Fix-common-header-and-define-for-QEMU-buil.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch52: kvm-virtiofsd-Trim-out-compatibility-code.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch53: kvm-vitriofsd-passthrough_ll-fix-fallocate-ifdefs.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch54: kvm-virtiofsd-Make-fsync-work-even-if-only-inode-is-pass.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch55: kvm-virtiofsd-Add-options-for-virtio.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch56: kvm-virtiofsd-add-o-source-PATH-to-help-output.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch57: kvm-virtiofsd-Open-vhost-connection-instead-of-mounting.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch58: kvm-virtiofsd-Start-wiring-up-vhost-user.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch59: kvm-virtiofsd-Add-main-virtio-loop.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch60: kvm-virtiofsd-get-set-features-callbacks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch61: kvm-virtiofsd-Start-queue-threads.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch62: kvm-virtiofsd-Poll-kick_fd-for-queue.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch63: kvm-virtiofsd-Start-reading-commands-from-queue.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch64: kvm-virtiofsd-Send-replies-to-messages.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch65: kvm-virtiofsd-Keep-track-of-replies.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch66: kvm-virtiofsd-Add-Makefile-wiring-for-virtiofsd-contrib.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch67: kvm-virtiofsd-Fast-path-for-virtio-read.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch68: kvm-virtiofsd-add-fd-FDNUM-fd-passing-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch69: kvm-virtiofsd-make-f-foreground-the-default.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch70: kvm-virtiofsd-add-vhost-user.json-file.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch71: kvm-virtiofsd-add-print-capabilities-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch72: kvm-virtiofs-Add-maintainers-entry.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch73: kvm-virtiofsd-passthrough_ll-create-new-files-in-caller-.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch74: kvm-virtiofsd-passthrough_ll-add-lo_map-for-ino-fh-indir.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch75: kvm-virtiofsd-passthrough_ll-add-ino_map-to-hide-lo_inod.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch76: kvm-virtiofsd-passthrough_ll-add-dirp_map-to-hide-lo_dir.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch77: kvm-virtiofsd-passthrough_ll-add-fd_map-to-hide-file-des.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch78: kvm-virtiofsd-passthrough_ll-add-fallback-for-racy-ops.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch79: kvm-virtiofsd-validate-path-components.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch80: kvm-virtiofsd-Plumb-fuse_bufvec-through-to-do_write_buf.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch81: kvm-virtiofsd-Pass-write-iov-s-all-the-way-through.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch82: kvm-virtiofsd-add-fuse_mbuf_iter-API.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch83: kvm-virtiofsd-validate-input-buffer-sizes-in-do_write_bu.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch84: kvm-virtiofsd-check-input-buffer-size-in-fuse_lowlevel.c.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch85: kvm-virtiofsd-prevent-.-escape-in-lo_do_lookup.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch86: kvm-virtiofsd-prevent-.-escape-in-lo_do_readdir.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch87: kvm-virtiofsd-use-proc-self-fd-O_PATH-file-descriptor.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch88: kvm-virtiofsd-sandbox-mount-namespace.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch89: kvm-virtiofsd-move-to-an-empty-network-namespace.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch90: kvm-virtiofsd-move-to-a-new-pid-namespace.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch91: kvm-virtiofsd-add-seccomp-whitelist.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch92: kvm-virtiofsd-Parse-flag-FUSE_WRITE_KILL_PRIV.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch93: kvm-virtiofsd-cap-ng-helpers.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch94: kvm-virtiofsd-Drop-CAP_FSETID-if-client-asked-for-it.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch95: kvm-virtiofsd-set-maximum-RLIMIT_NOFILE-limit.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch96: kvm-virtiofsd-fix-libfuse-information-leaks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch97: kvm-virtiofsd-add-syslog-command-line-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch98: kvm-virtiofsd-print-log-only-when-priority-is-high-enoug.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch99: kvm-virtiofsd-Add-ID-to-the-log-with-FUSE_LOG_DEBUG-leve.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch100: kvm-virtiofsd-Add-timestamp-to-the-log-with-FUSE_LOG_DEB.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch101: kvm-virtiofsd-Handle-reinit.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch102: kvm-virtiofsd-Handle-hard-reboot.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch103: kvm-virtiofsd-Kill-threads-when-queues-are-stopped.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch104: kvm-vhost-user-Print-unexpected-slave-message-types.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch105: kvm-contrib-libvhost-user-Protect-slave-fd-with-mutex.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch106: kvm-virtiofsd-passthrough_ll-add-renameat2-support.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch107: kvm-virtiofsd-passthrough_ll-disable-readdirplus-on-cach.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch108: kvm-virtiofsd-passthrough_ll-control-readdirplus.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch109: kvm-virtiofsd-rename-unref_inode-to-unref_inode_lolocked.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch110: kvm-virtiofsd-fail-when-parent-inode-isn-t-known-in-lo_d.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch111: kvm-virtiofsd-extract-root-inode-init-into-setup_root.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch112: kvm-virtiofsd-passthrough_ll-clean-up-cache-related-opti.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch113: kvm-virtiofsd-passthrough_ll-use-hashtable.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch114: kvm-virtiofsd-Clean-up-inodes-on-destroy.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch115: kvm-virtiofsd-support-nanosecond-resolution-for-file-tim.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch116: kvm-virtiofsd-fix-error-handling-in-main.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch117: kvm-virtiofsd-cleanup-allocated-resource-in-se.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch118: kvm-virtiofsd-fix-memory-leak-on-lo.source.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch119: kvm-virtiofsd-add-helper-for-lo_data-cleanup.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch120: kvm-virtiofsd-Prevent-multiply-running-with-same-vhost_u.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch121: kvm-virtiofsd-enable-PARALLEL_DIROPS-during-INIT.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch122: kvm-virtiofsd-fix-incorrect-error-handling-in-lo_do_look.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch123: kvm-Virtiofsd-fix-memory-leak-on-fuse-queueinfo.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch124: kvm-virtiofsd-Support-remote-posix-locks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch125: kvm-virtiofsd-use-fuse_lowlevel_is_virtio-in-fuse_sessio.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch126: kvm-virtiofsd-prevent-fv_queue_thread-vs-virtio_loop-rac.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch127: kvm-virtiofsd-make-lo_release-atomic.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch128: kvm-virtiofsd-prevent-races-with-lo_dirp_put.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch129: kvm-virtiofsd-rename-inode-refcount-to-inode-nlookup.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch130: kvm-libvhost-user-Fix-some-memtable-remap-cases.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch131: kvm-virtiofsd-passthrough_ll-fix-refcounting-on-remove-r.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch132: kvm-virtiofsd-introduce-inode-refcount-to-prevent-use-af.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch133: kvm-virtiofsd-do-not-always-set-FUSE_FLOCK_LOCKS.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch134: kvm-virtiofsd-convert-more-fprintf-and-perror-to-use-fus.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch135: kvm-virtiofsd-Reset-O_DIRECT-flag-during-file-open.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch136: kvm-virtiofsd-Fix-data-corruption-with-O_APPEND-write-in.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch137: kvm-virtiofsd-passthrough_ll-Use-cache_readdir-for-direc.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch138: kvm-virtiofsd-add-definition-of-fuse_buf_writev.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch139: kvm-virtiofsd-use-fuse_buf_writev-to-replace-fuse_buf_wr.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch140: kvm-virtiofsd-process-requests-in-a-thread-pool.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch141: kvm-virtiofsd-prevent-FUSE_INIT-FUSE_DESTROY-races.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch142: kvm-virtiofsd-fix-lo_destroy-resource-leaks.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch143: kvm-virtiofsd-add-thread-pool-size-NUM-option.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch144: kvm-virtiofsd-Convert-lo_destroy-to-take-the-lo-mutex-lo.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch145: kvm-virtiofsd-passthrough_ll-Pass-errno-to-fuse_reply_er.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch146: kvm-virtiofsd-stop-all-queue-threads-on-exit-in-virtio_l.patch +# For bz#1694164 - virtio-fs: host<->guest shared file system (qemu) +Patch147: kvm-virtiofsd-add-some-options-to-the-help-message.patch +# For bz#1776638 - Guest failed to boot up after system_reset 20 times +Patch148: kvm-ppc-Deassert-the-external-interrupt-pin-in-KVM-on-re.patch +# For bz#1776638 - Guest failed to boot up after system_reset 20 times +Patch149: kvm-xics-Don-t-deassert-outputs.patch +# For bz#1776638 - Guest failed to boot up after system_reset 20 times +Patch150: kvm-ppc-Don-t-use-CPUPPCState-irq_input_state-with-moder.patch +# For bz#1787395 - qemu-trace-stap list : TypeError: startswith first arg must be bytes or a tuple of bytes, not str +Patch151: kvm-trace-update-qemu-trace-stap-to-Python-3.patch +# For bz#1794503 - CVE-2020-1711 qemu-kvm: QEMU: block: iscsi: OOB heap access via an unexpected response of iSCSI Server [rhel-av-8.2.0] +Patch153: kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch +# For bz#1787444 - Broken postcopy migration with vTPM device +Patch154: kvm-tpm-ppi-page-align-PPI-RAM.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch155: kvm-target-arm-kvm-trivial-Clean-up-header-documentation.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch156: kvm-target-arm-kvm64-kvm64-cpus-have-timer-registers.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch157: kvm-tests-arm-cpu-features-Check-feature-default-values.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch158: kvm-target-arm-kvm-Implement-virtual-time-adjustment.patch +# For bz#1647366 - aarch64: Add support for the kvm-no-adjvtime ARM CPU feature +Patch159: kvm-target-arm-cpu-Add-the-kvm-no-adjvtime-CPU-property.patch +# For bz#1529231 - [q35] VM hangs after migration with 200 vCPUs +Patch160: kvm-migration-Define-VMSTATE_INSTANCE_ID_ANY.patch +# For bz#1529231 - [q35] VM hangs after migration with 200 vCPUs +Patch161: kvm-migration-Change-SaveStateEntry.instance_id-into-uin.patch +# For bz#1529231 - [q35] VM hangs after migration with 200 vCPUs +Patch162: kvm-apic-Use-32bit-APIC-ID-for-migration-instance-ID.patch +# For bz#1779078 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: Host CPU does not provide required features: hle, rtm) +# For bz#1787291 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: Host CPU does not provide required features: hle, rtm) [rhel-8.1.0.z] +# For bz#1779078 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: Host CPU does not provide required features: hle, rtm) +# For bz#1779078 - RHVH 4.4: Failed to run VM on 4.3/4.4 engine (Exit message: the CPU is incompatible with host CPU: Host CPU does not provide required features: hle, rtm) +Patch163: kvm-i386-Resolve-CPU-models-to-v1-by-default.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch164: kvm-iotests-Support-job-complete-in-run_job.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch165: kvm-iotests-Create-VM.blockdev_create.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch166: kvm-block-Activate-recursively-even-for-already-active-n.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch167: kvm-hmp-Allow-using-qdev-ID-for-qemu-io-command.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch168: kvm-iotests-Test-external-snapshot-with-VM-state.patch +# For bz#1781637 - qemu crashed when do mem and disk snapshot +Patch169: kvm-iotests.py-Let-wait_migration-wait-even-more.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch170: kvm-blockdev-fix-coding-style-issues-in-drive_backup_pre.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch171: kvm-blockdev-unify-qmp_drive_backup-and-drive-backup-tra.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch172: kvm-blockdev-unify-qmp_blockdev_backup-and-blockdev-back.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch173: kvm-blockdev-honor-bdrv_try_set_aio_context-context-requ.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch174: kvm-backup-top-Begin-drain-earlier.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch175: kvm-block-backup-top-Don-t-acquire-context-while-droppin.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch176: kvm-blockdev-Acquire-AioContext-on-dirty-bitmap-function.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch177: kvm-blockdev-Return-bs-to-the-proper-context-on-snapshot.patch +# For bz#1745606 - Qemu hang when do incremental live backup in transaction mode without bitmap +# For bz#1746217 - Src qemu hang when do storage vm migration during guest installation +# For bz#1773517 - Src qemu hang when do storage vm migration with dataplane enable +# For bz#1779036 - Qemu coredump when do snapshot in transaction mode with one snapshot path not exist +# For bz#1782111 - Qemu hang when do full backup on multi-disks with one job's 'job-id' missed in transaction mode(data plane enable) +# For bz#1782175 - Qemu core dump when add persistent bitmap(data plane enable) +# For bz#1783965 - Qemu core dump when do backup with sync: bitmap and no bitmap provided +Patch178: kvm-iotests-Test-handling-of-AioContexts-with-some-block.patch +# For bz#1801320 - aarch64: backport query-cpu-model-expansion and adjvtime document fixes +Patch179: kvm-target-arm-monitor-query-cpu-model-expansion-crashed.patch +# For bz#1801320 - aarch64: backport query-cpu-model-expansion and adjvtime document fixes +Patch180: kvm-docs-arm-cpu-features-Make-kvm-no-adjvtime-comment-c.patch +# For bz#1796240 - Enable hw accelerated cache-count-flush by default for POWER9 DD2.3 cpus +Patch181: kvm-spapr-Enable-DD2.3-accelerated-count-cache-flush-in-.patch +# For bz#1798994 - CVE-2020-8608 qemu-kvm: QEMU: Slirp: potential OOB access due to unsafe snprintf() usages [rhel-av-8.2.0] +Patch182: kvm-util-add-slirp_fmt-helpers.patch +# For bz#1798994 - CVE-2020-8608 qemu-kvm: QEMU: Slirp: potential OOB access due to unsafe snprintf() usages [rhel-av-8.2.0] +Patch183: kvm-tcp_emu-fix-unsafe-snprintf-usages.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch184: kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch185: kvm-virtio-make-virtio_delete_queue-idempotent.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch186: kvm-virtio-reset-region-cache-when-on-queue-deletion.patch +# For bz#1791590 - [Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device +Patch187: kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch +# For bz#1805334 - vhost-user/50-qemu-gpu.json is not valid JSON +Patch188: kvm-vhost-user-gpu-Drop-trailing-json-comma.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch189: kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch190: kvm-target-i386-add-a-ucode-rev-property.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch191: kvm-target-i386-kvm-initialize-microcode-revision-from-K.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch192: kvm-target-i386-fix-TCG-UCODE_REV-access.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch193: kvm-target-i386-check-for-availability-of-MSR_IA32_UCODE.patch +# For bz#1791648 - [RFE] Passthrough host CPU microcode version to KVM guest if using CPU passthrough +Patch194: kvm-target-i386-enable-monitor-and-ucode-revision-with-c.patch +# For bz#1703907 - [upstream]QEMU coredump when converting to qcow2: external data file images on block devices with copy_offloading +Patch195: kvm-qcow2-Fix-qcow2_alloc_cluster_abort-for-external-dat.patch +# For bz#1794692 - Mirror block job stops making progress +Patch196: kvm-mirror-Store-MirrorOp.co-for-debuggability.patch +# For bz#1794692 - Mirror block job stops making progress +Patch197: kvm-mirror-Don-t-let-an-operation-wait-for-itself.patch +# For bz#1782529 - Windows Update Enablement with default smbios strings in qemu +Patch198: kvm-hw-smbios-set-new-default-SMBIOS-fields-for-Windows-.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch199: kvm-migration-multifd-clean-pages-after-filling-packet.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch200: kvm-migration-Make-sure-that-we-don-t-call-write-in-case.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch201: kvm-migration-multifd-fix-nullptr-access-in-terminating-.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch202: kvm-migration-multifd-fix-destroyed-mutex-access-in-term.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch203: kvm-multifd-Make-sure-that-we-don-t-do-any-IO-after-an-e.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch204: kvm-qemu-file-Don-t-do-IO-after-shutdown.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch205: kvm-migration-Don-t-send-data-if-we-have-stopped.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch206: kvm-migration-Create-migration_is_running.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch207: kvm-migration-multifd-fix-nullptr-access-in-multifd_send.patch +# For bz#1738451 - qemu on src host core dump after set multifd-channels and do migration twice (first migration execute migrate_cancel) +Patch208: kvm-migration-Maybe-VM-is-paused-when-migration-is-cance.patch +# For bz#1797064 - virtiofsd: Fixes +Patch209: kvm-virtiofsd-Remove-fuse_req_getgroups.patch +# For bz#1797064 - virtiofsd: Fixes +Patch210: kvm-virtiofsd-fv_create_listen_socket-error-path-socket-.patch +# For bz#1797064 - virtiofsd: Fixes +Patch211: kvm-virtiofsd-load_capng-missing-unlock.patch +# For bz#1797064 - virtiofsd: Fixes +Patch212: kvm-virtiofsd-do_read-missing-NULL-check.patch +# For bz#1797064 - virtiofsd: Fixes +Patch213: kvm-tools-virtiofsd-fuse_lowlevel-Fix-fuse_out_header-er.patch +# For bz#1797064 - virtiofsd: Fixes +Patch214: kvm-virtiofsd-passthrough_ll-cleanup-getxattr-listxattr.patch +# For bz#1797064 - virtiofsd: Fixes +Patch215: kvm-virtiofsd-Fix-xattr-operations.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch216: kvm-block-nbd-Fix-hang-in-.bdrv_close.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch217: kvm-block-Generic-file-creation-fallback.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch218: kvm-file-posix-Drop-hdev_co_create_opts.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch219: kvm-iscsi-Drop-iscsi_co_create_opts.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch220: kvm-iotests-Add-test-for-image-creation-fallback.patch +# For bz#1640894 - Fix generic file creation fallback for qemu-img nvme:// image creation support +Patch221: kvm-block-Fix-leak-in-bdrv_create_file_fallback.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch222: kvm-iotests-Use-complete_and_wait-in-155.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch223: kvm-block-Introduce-bdrv_reopen_commit_post-step.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch224: kvm-block-qcow2-Move-bitmap-reopen-into-bdrv_reopen_comm.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch225: kvm-iotests-Refactor-blockdev-reopen-test-for-iothreads.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch226: kvm-block-bdrv_reopen-with-backing-file-in-different-Aio.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch227: kvm-block-Versioned-x-blockdev-reopen-API-with-feature-f.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch228: kvm-block-Make-bdrv_get_cumulative_perm-public.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch229: kvm-block-Relax-restrictions-for-blockdev-snapshot.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch230: kvm-iotests-Fix-run_job-with-use_log-False.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch231: kvm-iotests-Test-mirror-with-temporarily-disabled-target.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch232: kvm-block-Fix-cross-AioContext-blockdev-snapshot.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch233: kvm-iotests-Add-iothread-cases-to-155.patch +# For bz#1790482 - bitmaps in backing images can't be modified +# For bz#1805143 - allow late/lazy opening of backing chain for shallow blockdev-mirror +Patch234: kvm-qapi-Add-allow-write-only-overlay-feature-for-blockd.patch +# For bz#1809380 - guest hang during reboot process after migration from RHEl7.8 to RHEL8.2.0. +Patch235: kvm-exec-rom_reset-Free-rom-data-during-inmigrate-skip.patch +# For bz#1814336 - [POWER9] QEMU migration-test triggers a kernel warning +Patch236: kvm-migration-Rate-limit-inside-host-pages.patch +# For bz#1811670 - Unneeded qemu-guest-agent dependency on pixman +Patch237: kvm-build-sys-do-not-make-qemu-ga-link-with-pixman.patch +# For bz#1816007 - qemu-img convert failed to convert with block device as target +Patch238: kvm-block-pass-BlockDriver-reference-to-the-.bdrv_co_cre.patch +# For bz#1816007 - qemu-img convert failed to convert with block device as target +Patch239: kvm-block-trickle-down-the-fallback-image-creation-funct.patch +# For bz#1794692 - Mirror block job stops making progress +Patch240: kvm-Revert-mirror-Don-t-let-an-operation-wait-for-itself.patch +# For bz#1794692 - Mirror block job stops making progress +Patch241: kvm-mirror-Wait-only-for-in-flight-operations.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch242: kvm-job-take-each-job-s-lock-individually-in-job_txn_app.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch243: kvm-replication-assert-we-own-context-before-job_cancel_.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch244: kvm-backup-don-t-acquire-aio_context-in-backup_clean.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch245: kvm-block-backend-Reorder-flush-pdiscard-function-defini.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch246: kvm-block-Increase-BB.in_flight-for-coroutine-and-sync-i.patch +# For bz#1817621 - Crash and deadlock with block jobs when using io-threads +Patch247: kvm-block-Fix-blk-in_flight-during-blk_wait_while_draine.patch +# For bz#1822682 - QEMU-4.2 fails to start a VM on Azure +Patch248: kvm-target-i386-do-not-set-unsupported-VMX-secondary-exe.patch +# For bz#1790899 - [RFE] QEMU devices should have the option to enable/disable hotplug/unplug +Patch249: kvm-pcie_root_port-Add-hotplug-disabling-option.patch +# For bz#1816793 - 'edid' compat handling missing for virtio-gpu-ccw +Patch250: kvm-compat-disable-edid-for-virtio-gpu-ccw.patch +# For bz#1820531 - qmp command query-pci get wrong result after hotplug device under hotplug=off controller +Patch251: kvm-hw-pci-pcie-Forbid-hot-plug-if-it-s-disabled-on-the-.patch +# For bz#1820531 - qmp command query-pci get wrong result after hotplug device under hotplug=off controller +Patch252: kvm-hw-pci-pcie-Replace-PCI_DEVICE-casts-with-existing-v.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch253: kvm-tools-virtiofsd-passthrough_ll-Fix-double-close.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch254: kvm-virtiofsd-add-rlimit-nofile-NUM-option.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch255: kvm-virtiofsd-stay-below-fs.file-max-sysctl-value-CVE-20.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch256: kvm-virtiofsd-jail-lo-proc_self_fd.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch257: kvm-virtiofsd-Show-submounts.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch258: kvm-virtiofsd-only-retain-file-system-capabilities.patch +# For bz#1817445 - CVE-2020-10717 virt:8.2/qemu-kvm: QEMU: virtiofsd: guest may open maximum file descriptor to cause DoS [rhel-av-8] +Patch259: kvm-virtiofsd-drop-all-capabilities-in-the-wait-parent-p.patch +# For bz#1775462 - Creating luks-inside-qcow2 images with cluster_size=2k/4k will get a corrupted image +Patch260: kvm-block-always-fill-entire-LUKS-header-space-with-zero.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch261: kvm-numa-remove-not-needed-check.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch262: kvm-numa-properly-check-if-numa-is-supported.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch263: kvm-numa-Extend-CLI-to-provide-initiator-information-for.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch264: kvm-numa-Extend-CLI-to-provide-memory-latency-and-bandwi.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch265: kvm-numa-Extend-CLI-to-provide-memory-side-cache-informa.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch266: kvm-hmat-acpi-Build-Memory-Proximity-Domain-Attributes-S.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch267: kvm-hmat-acpi-Build-System-Locality-Latency-and-Bandwidt.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch268: kvm-hmat-acpi-Build-Memory-Side-Cache-Information-Struct.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch269: kvm-tests-numa-Add-case-for-QMP-build-HMAT.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch270: kvm-tests-bios-tables-test-add-test-cases-for-ACPI-HMAT.patch +# For bz#1600217 - [Intel 8.2.1 FEAT] KVM ACPI HMAT support - qemu-kvm Fast Train +Patch271: kvm-ACPI-add-expected-files-for-HMAT-tests-acpihmat.patch +# For bz#1813940 - CVE-2020-10702 virt:8.1/qemu-kvm: qemu: weak signature generation in Pointer Authentication support for ARM [rhel-av-8] +Patch272: kvm-target-arm-Fix-PAuth-sbox-functions.patch +# For bz#1749737 - CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-av-8] +Patch273: kvm-Don-t-leak-memory-when-reallocation-fails.patch +# For bz#1749737 - CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-av-8] +Patch274: kvm-Replace-remaining-malloc-free-user-with-glib.patch +# For bz#1839030 - RFE: enable the "memfd" memory backend +Patch275: kvm-Revert-RHEL-disable-hostmem-memfd.patch +# For bz#1827630 - volume creation leaving uncleaned stuff behind on error (vol-clone/libvirt/qemu-kvm) +Patch276: kvm-block-introducing-bdrv_co_delete_file-interface.patch +# For bz#1827630 - volume creation leaving uncleaned stuff behind on error (vol-clone/libvirt/qemu-kvm) +Patch277: kvm-block.c-adding-bdrv_co_delete_file.patch +# For bz#1827630 - volume creation leaving uncleaned stuff behind on error (vol-clone/libvirt/qemu-kvm) +Patch278: kvm-crypto.c-cleanup-created-file-when-block_crypto_co_c.patch +# For bz#1513681 - [Intel 8.2.1 Feat] qemu-kvm PT VMX -- Fast Train +Patch279: kvm-target-i386-set-the-CPUID-level-to-0x14-on-old-machi.patch +# For bz#1841038 - qemu-img: /var/tmp/v2vovl56bced.qcow2: CURL: Error opening file: Server does not support 'range' (byte ranges) with HTTP/2 server in VMware ESXi 7 +Patch280: kvm-block-curl-HTTP-header-fields-allow-whitespace-aroun.patch +# For bz#1841038 - qemu-img: /var/tmp/v2vovl56bced.qcow2: CURL: Error opening file: Server does not support 'range' (byte ranges) with HTTP/2 server in VMware ESXi 7 +Patch281: kvm-block-curl-HTTP-header-field-names-are-case-insensit.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch282: kvm-MAINTAINERS-fix-qcow2-bitmap.c-under-Dirty-Bitmaps-h.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch283: kvm-iotests-Let-_make_test_img-parse-its-parameters.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch284: kvm-qemu_img-add-cvtnum_full-to-print-error-reports.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch285: kvm-block-Make-it-easier-to-learn-which-BDS-support-bitm.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch286: kvm-blockdev-Promote-several-bitmap-functions-to-non-sta.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch287: kvm-blockdev-Split-off-basic-bitmap-operations-for-qemu-.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch288: kvm-qemu-img-Add-bitmap-sub-command.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch289: kvm-iotests-Fix-test-178.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch290: kvm-qcow2-Expose-bitmaps-size-during-measure.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch291: kvm-qemu-img-Factor-out-code-for-merging-bitmaps.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch292: kvm-qemu-img-Add-convert-bitmaps-option.patch +# For bz#1779893 - RFE: Copy bitmaps with qemu-img convert +# For bz#1779904 - RFE: ability to estimate bitmap space utilization for qcow2 +Patch293: kvm-iotests-Add-test-291-to-for-qemu-img-bitmap-coverage.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch294: kvm-iotests-Add-more-skip_if_unsupported-statements-to-t.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch295: kvm-iotests-don-t-use-format-for-drive_add.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch296: kvm-iotests-055-refactor-compressed-backup-to-vmdk.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch297: kvm-iotests-055-skip-vmdk-target-tests-if-vmdk-is-not-wh.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch298: kvm-backup-Improve-error-for-bdrv_getlength-failure.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch299: kvm-backup-Make-sure-that-source-and-target-size-match.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch300: kvm-iotests-Backup-with-different-source-target-size.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch301: kvm-iotests-109-Don-t-mirror-with-mismatched-size.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch302: kvm-iotests-229-Use-blkdebug-to-inject-an-error.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch303: kvm-mirror-Make-sure-that-source-and-target-size-match.patch +# For bz#1778593 - Qemu coredump when backup to a existing small size image +Patch304: kvm-iotests-Mirror-with-different-source-target-size.patch +# For bz#1841068 - RFE: please support the "ramfb" display device model +Patch305: kvm-enable-ramfb.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch306: kvm-block-Add-flags-to-BlockDriver.bdrv_co_truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch307: kvm-block-Add-flags-to-bdrv-_co-_truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch308: kvm-block-backend-Add-flags-to-blk_truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch309: kvm-qcow2-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch310: kvm-raw-format-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch311: kvm-file-posix-Support-BDRV_REQ_ZERO_WRITE-for-truncate.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch312: kvm-block-truncate-Don-t-make-backing-file-data-visible.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch313: kvm-iotests-Add-qemu_io_log.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch314: kvm-iotests-Filter-testfiles-out-in-filter_img_info.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch315: kvm-iotests-Test-committing-to-short-backing-file.patch +# For bz#1780574 - Data corruption with resizing short overlay over longer backing files +Patch316: kvm-qcow2-Forward-ZERO_WRITE-flag-for-full-preallocation.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch317: kvm-i386-Add-MSR-feature-bit-for-MDS-NO.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch318: kvm-i386-Add-macro-for-stibp.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch319: kvm-target-i386-Add-new-bit-definitions-of-MSR_IA32_ARCH.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch320: kvm-i386-Add-new-CPU-model-Cooperlake.patch +# For bz#1769912 - [Intel 8.2.1 Feature] introduce Cooper Lake cpu model - qemu-kvm Fast Train +Patch321: kvm-target-i386-Add-missed-features-to-Cooperlake-CPU-mo.patch +# For bz#1845384 - CVE-2020-10761 virt:8.2/qemu-kvm: QEMU: nbd: reachable assertion failure in nbd_negotiate_send_rep_verr via remote client [rhel-av-8] +Patch322: kvm-nbd-server-Avoid-long-error-message-assertions-CVE-2.patch +# For bz#1845384 - CVE-2020-10761 virt:8.2/qemu-kvm: QEMU: nbd: reachable assertion failure in nbd_negotiate_send_rep_verr via remote client [rhel-av-8] +Patch323: kvm-block-Call-attention-to-truncation-of-long-NBD-expor.patch +# For bz#1820531 - qmp command query-pci get wrong result after hotplug device under hotplug=off controller +Patch324: kvm-hw-pci-pcie-Move-hot-plug-capability-check-to-pre_pl.patch +# For bz#1840342 - [Intel 8.2.1 Bug] qemu-kvm Add ARCH_CAPABILITIES to Icelake-Server cpu model - Fast Train +Patch325: kvm-target-i386-Add-ARCH_CAPABILITIES-related-bits-into-.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch327: kvm-linux-headers-update-kvm.h.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch328: kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch329: kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch330: kvm-s390x-Move-initial-reset.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch331: kvm-s390x-Move-clear-reset.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch332: kvm-s390x-Beautify-diag308-handling.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch333: kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch334: kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch335: kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch336: kvm-pc-bios-s390x-Fix-reset-psw-mask.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch337: kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch338: kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch339: kvm-s390x-Add-missing-vcpu-reset-functions.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch340: kvm-s390-sclp-improve-special-wait-psw-logic.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch341: kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch342: kvm-s390-ipl-sync-back-loadparm.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch343: kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch344: kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch345: kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch346: kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch347: kvm-s390x-protvirt-Support-unpack-facility.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch348: kvm-s390x-protvirt-Add-migration-blocker.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch349: kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch350: kvm-s390x-protvirt-KVM-intercept-changes.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch351: kvm-s390x-Add-SIDA-memory-ops.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch352: kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch353: kvm-s390x-protvirt-SCLP-interpretation.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch354: kvm-s390x-protvirt-Set-guest-IPL-PSW.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch355: kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch356: kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch357: kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch358: kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch359: kvm-s390x-Add-unpack-facility-feature-to-GA1.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch360: kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch361: kvm-s390x-pv-Retry-ioctls-on-EINTR.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch362: kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch +# For bz#1828317 - [IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part +Patch363: kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch +# For bz#1756946 - [zKVM] Re-enable KVM_CAP_S390_AIS for new machine types +Patch364: kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch +# For bz#1756946 - [zKVM] Re-enable KVM_CAP_S390_AIS for new machine types +Patch365: kvm-introduce-kvm_kernel_irqchip_-functions.patch +# For bz#1756946 - [zKVM] Re-enable KVM_CAP_S390_AIS for new machine types +Patch366: kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch +# For bz#1823275 - RHEL8.1 - GPU Numa nodes not visible in guest post the pass-through. +Patch367: kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch368: kvm-vfio-ccw-Fix-error-message.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch369: kvm-vfio-ccw-allow-non-prefetch-ORBs.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch370: kvm-linux-headers-support-vfio-ccw-features.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch371: kvm-vfio-ccw-Refactor-cleanup-of-regions.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch372: kvm-vfio-ccw-Add-support-for-the-schib-region.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch373: kvm-vfio-ccw-Refactor-ccw-irq-handler.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch374: kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch375: kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch +# For bz#1660916 - [IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part +Patch376: kvm-config-enable-VFIO_CCW.patch +Patch377: kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch +Patch378: kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch +# For bz#1838070 - CVE-2020-1983 virt:rhel/qemu-kvm: QEMU: slirp: use-after-free in ip_reass() function in ip_input.c [rhel-8] +Patch379: kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch +# For bz#1835390 - qemu promote host does not support 'EDX.npt' and 'EDX.nrip-save' when test with Q35 machine type on EPYC host +Patch380: kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch +# For bz#1854092 - kvm-unit-tests: tcg smp FAIL +Patch381: kvm-s390x-sigp-Fix-sense-running-reporting.patch +# For bz#1854092 - kvm-unit-tests: tcg smp FAIL +Patch382: kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch +Patch383: kvm-virtio-net-fix-removal-of-failover-device.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch384: kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch385: kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch386: kvm-iotests-026-Test-EIO-on-allocation-in-a-data-file.patch +# For bz#1807057 - qcow2_alloc_cluster_abort() frees preallocated zero clusters +Patch387: kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch +# For bz#1780385 - [RFE] AMD EPYC-Rome support for KVM / QEMU guest +Patch388: kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch +# For bz#1689341 - QEMU should report an error and return failure if AMD SEV is not enabled in the kernel +Patch389: kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch +# For bz#1689341 - QEMU should report an error and return failure if AMD SEV is not enabled in the kernel +Patch390: kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch +# For bz#1863034 - RHEL8.3 Beta - Secure Execution: Unable to start Qemu with "-no-reboot" option (qemu-kvm) +Patch391: kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch +# For bz#1869710 - CVE-2020-14364 qemu-kvm: QEMU: usb: out-of-bounds r/w access issue while processing usb packets [rhel-8.3.0] +Patch392: kvm-usb-fix-setup_len-init-CVE-2020-14364.patch +# For bz#1755075 - [qemu-guest-agent] fsinfo doesn't return disk info on s390x +Patch393: kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch +# For bz#1755075 - [qemu-guest-agent] fsinfo doesn't return disk info on s390x +Patch394: kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch +# For bz#1755075 - [qemu-guest-agent] fsinfo doesn't return disk info on s390x +Patch395: kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch +# For bz#1874780 - -prom-env does not validate input +Patch396: kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch397: kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch398: kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch399: kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch400: kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch401: kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch402: kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch403: kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch404: kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch +# For bz#1846975 - Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous +Patch405: kvm-aio-posix-completely-stop-polling-when-disabled.patch +# For bz#1884531 - qemu-ga aborts after guest-shutdown command +Patch406: kvm-qga-fix-assert-regression-on-guest-shutdown.patch +# For bz#1857733 - [IBM 8.4 FEAT] KVM: Add support for virtio-fs on s390x - qemu part +Patch407: kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch +# For bz#1857733 - [IBM 8.4 FEAT] KVM: Add support for virtio-fs on s390x - qemu part +Patch408: kvm-virtio-add-vhost-user-fs-ccw-device.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch410: kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch411: kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch412: kvm-s390-sclp-rework-sclp-boundary-checks.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch413: kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch414: kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch415: kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch416: kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch417: kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch418: kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch419: kvm-s390-guest-support-for-diagnose-0x318.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch420: kvm-s390x-pv-Remove-sclp-boundary-checks.patch +# For bz#1798506 - [IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part +Patch421: kvm-s390x-pv-Fix-diag318-PV-fencing.patch +# For bz#1659412 - [IBM 8.4 FEAT] KVM enablement for enhanced hardware diagnose data of guest kernel on s390x - qemu part +Patch422: kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch +# For bz#1898700 - qemu-kvm for RHEL-8.4 doesn't build due to a possible incompatibility with systemtap-sdt-devel-4.4-1 +Patch423: kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch +# For bz#1860994 - CVE-2020-16092 virt:rhel/qemu-kvm: QEMU: reachable assertion failure in net_tx_pkt_add_raw_fragment() in hw/net/net_tx_pkt.c [rhel-8] +Patch424: kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch +# For bz#1880546 - qemu use SCMP_ACT_TRAP even SCMP_ACT_KILL_PROCESS is available +Patch425: kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch426: kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch427: kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch428: kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch +# For bz#1903135 - RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm) +Patch429: kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch430: kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch431: kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch +# For bz#1902237 - CVE-2020-29129 CVE-2020-29130 virt:rhel/qemu-kvm: QEMU: slirp: out-of-bounds access while processing ARP/NCSI packets [rhel-8] +Patch432: kvm-slirp-check-pkt_len-before-reading-protocol-header.patch +# For bz#1905386 - RHEL8.3 - s390x/s390-virtio-ccw: Reset PCI devices during subsystem reset (qemu-kvm) +Patch433: kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch434: kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch435: kvm-error-Fix-examples-in-error.h-s-big-comment.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch436: kvm-error-Improve-error.h-s-big-comment.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch437: kvm-error-Document-Error-API-usage-rules.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch438: kvm-error-New-macro-ERRP_GUARD.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch439: kvm-qga-add-command-guest-get-disks.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch440: kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch441: kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch442: kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch +# For bz#1859494 - Report logical_name for disks without mounted file-system +Patch443: kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch +# For bz#1910267 - There is no soft link '/etc/qemu-kvm/fsfreeze-hook' +Patch444: kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch +# For bz#1910326 - Incorrect hostname returned by qga command 'guest-get-host-name' +Patch445: kvm-qga-rename-Error-parameter-to-more-common-errp.patch +# For bz#1910326 - Incorrect hostname returned by qga command 'guest-get-host-name' +Patch446: kvm-util-Introduce-qemu_get_host_name.patch +# For bz#1910326 - Incorrect hostname returned by qga command 'guest-get-host-name' +Patch447: kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch449: kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch450: kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch451: kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch452: kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch453: kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch454: kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch455: kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch456: kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch457: kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch458: kvm-memory-Add-IOMMUTLBEvent.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch459: kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch460: kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch +# For bz#1843852 - qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed. +Patch461: kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch +# For bz#1904393 - CVE-2020-27821 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in msix_table_mmio_write() in hw/pci/msix.c [rhel-8] +Patch462: kvm-memory-clamp-cached-translation-in-case-it-points-to.patch +# For bz#1898628 - CVE-2020-25723 virt:rhel/qemu-kvm: QEMU: assertion failure through usb_packet_unmap() in hw/usb/hcd-ehci.c [rhel-8] +Patch463: kvm-hw-ehci-check-return-value-of-usb_packet_map.patch +# For bz#1903070 - CVE-2020-25707 CVE-2020-28916 virt:rhel/qemu-kvm: various flaws [rhel-8] +Patch464: kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch465: kvm-linux-headers-add-vfio-DMA-available-capability.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch466: kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch467: kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch468: kvm-vfio-Find-DMA-available-capability.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch469: kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch470: kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch +# For bz#1905391 - RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm) +Patch471: kvm-s390x-fix-build-for-without-default-devices.patch +# For bz#1918054 - CVE-2020-10756 virt:rhel/qemu-kvm: QEMU: slirp: networking out-of-bounds read information disclosure vulnerability [rhel-8.4.0] +Patch472: kvm-Drop-bogus-IPv6-messages.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch473: kvm-spapr-Improve-handling-of-fdt-buffer-size.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch474: kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch475: kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch476: kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch477: kvm-spapr-Allow-memory-unplug-to-always-succeed.patch +# For bz#1901837 - Failed to hotunplug pc-dimm device +Patch478: kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch +# For bz#1834281 - qemu-img convert abort when converting image with unaligned size +Patch479: kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch +# For bz#1834281 - qemu-img convert abort when converting image with unaligned size +Patch480: kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch +# For bz#1912974 - CVE-2020-11947 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in iscsi_aio_ioctl_cb() in block/iscsi.c may lead to information disclosure [rhel-8] +Patch481: kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch +# For bz#1919111 - CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0] +Patch482: kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch +# For bz#1919111 - CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0] +Patch483: kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch +# For bz#1919111 - CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0] +Patch484: kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch +# For bz#1912891 - [ppc64le] --disk cdimage.iso,bus=usb fails to boot +Patch486: kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train +Patch487: kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train +Patch488: kvm-target-i386-add-fast-short-REP-MOV-support.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train +Patch489: kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch +# For bz#1790620 - [RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train +Patch490: kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch +# For bz#1917451 - CVE-2020-29443 virt:rhel/qemu-kvm: QEMU: ide: atapi: OOB access while processing read commands [rhel-8.4.0] +Patch491: kvm-ide-atapi-check-logical-block-address-and-read-size-.patch +# For bz#1892350 - CVE-2020-27617 virt:rhel/qemu-kvm: QEMU: net: an assert failure via eth_get_gso_type [rhel-8.5.0] +Patch492: kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch +# For bz#1930092 - CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0] +Patch493: kvm-e1000-fail-early-for-evil-descriptor.patch +# For bz#1859175 - CVE-2020-15859 virt:rhel/qemu-kvm: QEMU: net: e1000e: use-after-free while sending packets [rhel-8] +Patch494: kvm-net-forbid-the-reentrant-RX.patch +# For bz#1855250 - qemu-img convert uses possibly slow pre-zeroing on block storage +Patch495: kvm-qemu-img-convert-Don-t-pre-zero-images.patch +# For bz#1932823 - after upgrade from 4.3 to 4.4 audio stops working in guests after couple of seconds +Patch496: kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch +# For bz#1925430 - CVE-2021-20221 virt:rhel/qemu-kvm: qemu: out-of-bound heap buffer access via an interrupt ID field [rhel-8.5.0] +Patch497: kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch498: kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch499: kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch500: kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch501: kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch502: kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch +# For bz#1842478 - CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0] +Patch503: kvm-softmmu-memory-Log-invalid-memory-accesses.patch +# For bz#1940450 - RHEL8.5 - Mediated Device already in use by same domain we are booting (vfio-ccw/Multipath Testing) (kvm) - qemu-kvm part (also has kernel and libvirt parts) +Patch504: kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch +# For bz#1940450 - RHEL8.5 - Mediated Device already in use by same domain we are booting (vfio-ccw/Multipath Testing) (kvm) - qemu-kvm part (also has kernel and libvirt parts) +Patch505: kvm-vfio-ccw-Connect-the-device-request-notifier.patch +# For bz#1942880 - RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm) +Patch506: kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch +# For bz#1942880 - RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm) +Patch507: kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch +# For bz#1942880 - RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm) +Patch508: kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch +# For bz#1877163 - [FJ 8.3 Bug] The progress bar of the "virt-clone --nonsparse" command shows the progress rate exceeding 100%. +Patch509: kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch +# For bz#1944861 - Qemu-img convert fails when source image is on gpfs +Patch510: kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch +# For bz#1969768 - [ppc64le] Hotplug vcpu device hit call trace:[qemu output] KVM: unknown exit, hardware reason 7fff9ce87ed8 +Patch511: kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch +# For bz#1969768 - [ppc64le] Hotplug vcpu device hit call trace:[qemu output] KVM: unknown exit, hardware reason 7fff9ce87ed8 +Patch512: kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch +# For bz#1967914 - [virtio-fs] virtiofsd quit when coping file to a folder in virtio-fs mounted volume(windows guest) +Patch513: kvm-virtiofsd-Whitelist-fchmod.patch +# For bz#1957866 - RHEL8.4 - EEH capability disabled on KVM guest and recovery of PCI passthru device fails(CX5 / mlx5_core) (qemu-kvm) +Patch514: kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch +# For bz#1970912 - Deployment fails with "Invalid or missing agent token received" +Patch515: kvm-Compress-lines-for-immediate-return.patch +# For bz#1970912 - Deployment fails with "Invalid or missing agent token received" +Patch516: kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch517: kvm-net-introduce-qemu_receive_packet.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch518: kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch519: kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch520: kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch521: kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch522: kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch523: kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch524: kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch +# For bz#1932917 - CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow +Patch525: kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch526: kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch527: kvm-qga-add-ssh-add-remove-authorized-keys.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch528: kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch +# For bz#1967716 - RFE: rebuild guest agent to include public ssh injection api support +Patch529: kvm-qga-add-ssh-get-authorized-keys.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch530: kvm-Add-mtod_check.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch531: kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +Patch532: kvm-bootp-check-bootp_input-buffer-size.patch +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +Patch533: kvm-upd6-check-udp6_input-buffer-size.patch +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +Patch534: kvm-tftp-check-tftp_input-buffer-size.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch535: kvm-tftp-introduce-a-header-structure.patch +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch536: kvm-udp-check-upd_input-buffer-size.patch +# For bz#1970819 - CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8] +# For bz#1970835 - CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8] +# For bz#1970843 - CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8] +# For bz#1970853 - CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8] +Patch537: kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch +# For bz#1982134 - QEMU core dump while booting guest with a non-exist fd on tap +Patch538: kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch +# For bz#1982134 - QEMU core dump while booting guest with a non-exist fd on tap +Patch539: kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch +# For bz#1969848 - qemu-img convert hangs on aarch64 +Patch540: kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch +# For bz#1969848 - qemu-img convert hangs on aarch64 +Patch541: kvm-async-use-explicit-memory-barriers.patch +# For bz#1967496 - [virtio-fs] nfs/xfstest generic/089 generic/478 generic/632 failed +Patch542: kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch +# For bz#1967496 - [virtio-fs] nfs/xfstest generic/089 generic/478 generic/632 failed +Patch543: kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch +# For bz#1994041 - qemu-kvm scsi: change default passthrough timeout to non-infinite +Patch544: kvm-scsi-make-io_timeout-configurable.patch +# For bz#2025605 - CVE-2021-3930 virt:rhel/qemu-kvm: QEMU: off-by-one error in mode_sense_page() in hw/scsi/scsi-disk.c [rhel-8.5.0.z] +Patch545: kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch +# For bz#2025011 - CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0.z] +Patch546: kvm-e1000-fix-tx-re-entrancy-problem.patch +# For bz#2048627 - CVE-2022-0358 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privilege escalation via CVE-2018-13405 [rhel-8.5.0.z] +Patch547: kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch + +BuildRequires: wget +BuildRequires: rpm-build +BuildRequires: zlib-devel +BuildRequires: glib2-devel +BuildRequires: which +BuildRequires: gnutls-devel +BuildRequires: cyrus-sasl-devel +BuildRequires: libtool +BuildRequires: libaio-devel +BuildRequires: rsync +BuildRequires: python3-devel +BuildRequires: pciutils-devel +BuildRequires: libiscsi-devel +BuildRequires: ncurses-devel +BuildRequires: libattr-devel +BuildRequires: libusbx-devel >= 1.0.22 +%if %{have_usbredir} +BuildRequires: usbredir-devel >= 0.7.1 +%endif +BuildRequires: texinfo +BuildRequires: python3-sphinx +%if %{have_spice} +BuildRequires: spice-protocol >= 0.12.12 +BuildRequires: spice-server-devel >= 0.12.8 +BuildRequires: libcacard-devel +# For smartcard NSS support +BuildRequires: nss-devel +%endif +BuildRequires: libseccomp-devel >= 2.4.0 +# For network block driver +BuildRequires: libcurl-devel +BuildRequires: libssh-devel +BuildRequires: librados-devel +BuildRequires: librbd-devel +%if %{have_gluster} +# For gluster block driver +BuildRequires: glusterfs-api-devel +BuildRequires: glusterfs-devel +%endif +# We need both because the 'stap' binary is probed for by configure +BuildRequires: systemtap +BuildRequires: systemtap-sdt-devel +# For VNC PNG support +BuildRequires: libpng-devel +# For uuid generation +BuildRequires: libuuid-devel +# For BlueZ device support +BuildRequires: bluez-libs-devel +# For Braille device support +BuildRequires: brlapi-devel +# For test suite +BuildRequires: check-devel +# For virtiofs +BuildRequires: libcap-ng-devel +# Hard requirement for version >= 1.3 +BuildRequires: pixman-devel +# Documentation requirement +BuildRequires: perl-podlators +BuildRequires: texinfo +BuildRequires: python3-sphinx +# For rdma +%if 0%{?have_librdma} +BuildRequires: rdma-core-devel +%endif +%if %{have_fdt} +BuildRequires: libfdt-devel >= 1.6.0 +%endif +# iasl and cpp for acpi generation (not a hard requirement as we can use +# pre-compiled files, but it's better to use this) +%ifarch %{ix86} x86_64 +BuildRequires: iasl +BuildRequires: cpp +%endif +# For compressed guest memory dumps +BuildRequires: lzo-devel snappy-devel +# For NUMA memory binding +%ifnarch s390x +BuildRequires: numactl-devel +%endif +BuildRequires: libgcrypt-devel +# qemu-pr-helper multipath support (requires libudev too) +BuildRequires: device-mapper-multipath-devel +BuildRequires: systemd-devel +# used by qemu-bridge-helper and qemu-pr-helper +BuildRequires: libcap-ng-devel + +BuildRequires: diffutils +%ifarch x86_64 +BuildRequires: libpmem-devel +Requires: libpmem +%endif + +# qemu-keymap +BuildRequires: pkgconfig(xkbcommon) + +# For s390-pgste flag +%ifarch s390x +BuildRequires: binutils >= 2.27-16 +%endif + +%if %{have_opengl} +BuildRequires: pkgconfig(epoxy) +BuildRequires: pkgconfig(libdrm) +BuildRequires: pkgconfig(gbm) +Requires: mesa-libGL +Requires: mesa-libEGL +Requires: mesa-dri-drivers +%endif + +BuildRequires: perl-Test-Harness + +Requires: qemu-kvm-core = %{epoch}:%{version}-%{release} +%rhev_ma_conflicts qemu-kvm + +%{requires_all_modules} + +%define qemudocdir %{_docdir}/%{name} + +%description +qemu-kvm is an open source virtualizer that provides hardware +emulation for the KVM hypervisor. qemu-kvm acts as a virtual +machine monitor together with the KVM kernel modules, and emulates the +hardware for a full system such as a PC and its associated peripherals. + + +%package -n qemu-kvm-core +Summary: qemu-kvm core components +Requires: qemu-img = %{epoch}:%{version}-%{release} +%ifarch %{ix86} x86_64 +Requires: seabios-bin >= 1.10.2-1 +Requires: sgabios-bin +Requires: edk2-ovmf +%endif +%ifarch aarch64 +Requires: edk2-aarch64 +%endif + +%ifnarch aarch64 s390x +Requires: seavgabios-bin >= 1.12.0-3 +Requires: ipxe-roms-qemu >= 20170123-1 +%endif +%ifarch %{power64} +Requires: SLOF >= %{SLOF_gittagdate}-1.git%{SLOF_gittagcommit} +%endif +Requires: %{name}-common = %{epoch}:%{version}-%{release} +Requires: libseccomp >= 2.4.0 +# For compressed guest memory dumps +Requires: lzo snappy +%if %{have_kvm_setup} +Requires(post): systemd-units +Requires(preun): systemd-units + %ifarch %{power64} +Requires: powerpc-utils + %endif +%endif +Requires: libusbx >= 1.0.19 +%if %{have_usbredir} +Requires: usbredir >= 0.7.1 +%endif +%if %{have_fdt} +Requires: libfdt >= 1.6.0 +%endif + +%rhev_ma_conflicts qemu-kvm + +%description -n qemu-kvm-core +qemu-kvm is an open source virtualizer that provides hardware +emulation for the KVM hypervisor. qemu-kvm acts as a virtual +machine monitor together with the KVM kernel modules, and emulates the +hardware for a full system such as a PC and its associated peripherals. + + +%package -n qemu-img +Summary: QEMU command line tool for manipulating disk images +Group: Development/Tools + +%rhev_ma_conflicts qemu-img + +%description -n qemu-img +This package provides a command line tool for manipulating disk images. + +%package -n qemu-kvm-common +Summary: QEMU common files needed by all QEMU targets +Group: Development/Tools +Requires(post): /usr/bin/getent +Requires(post): /usr/sbin/groupadd +Requires(post): /usr/sbin/useradd +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units + +%rhev_ma_conflicts qemu-kvm-common + +%description -n qemu-kvm-common +qemu-kvm is an open source virtualizer that provides hardware emulation for +the KVM hypervisor. + +This package provides documentation and auxiliary programs used with qemu-kvm. + + +%package -n qemu-guest-agent +Summary: QEMU guest agent +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units + +%description -n qemu-guest-agent +qemu-kvm is an open source virtualizer that provides hardware emulation for +the KVM hypervisor. + +This package provides an agent to run inside guests, which communicates +with the host over a virtio-serial channel named "org.qemu.guest_agent.0" + +This package does not need to be installed on the host OS. + +%package tests +Summary: tests for the qemu-kvm package +Requires: %{name} = %{epoch}:%{version}-%{release} + +%define testsdir %{_libdir}/%{name}/tests-src + +%description tests +The qemu-kvm-tests rpm contains tests that can be used to verify +the functionality of the installed qemu-kvm package + +Install this package if you want access to the avocado_qemu +tests, or qemu-iotests. + +%package block-curl +Summary: QEMU CURL block driver +Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release} + +%description block-curl +This package provides the additional CURL block driver for QEMU. + +Install this package if you want to access remote disks over +http, https, ftp and other transports provided by the CURL library. + + +%if %{have_gluster} +%package block-gluster +Summary: QEMU Gluster block driver +Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release} +%description block-gluster +This package provides the additional Gluster block driver for QEMU. + +Install this package if you want to access remote Gluster storage. +%endif + + +%package block-iscsi +Summary: QEMU iSCSI block driver +Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release} + +%description block-iscsi +This package provides the additional iSCSI block driver for QEMU. + +Install this package if you want to access iSCSI volumes. + + +%package block-rbd +Summary: QEMU Ceph/RBD block driver +Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release} + +%description block-rbd +This package provides the additional Ceph/RBD block driver for QEMU. + +Install this package if you want to access remote Ceph volumes +using the rbd protocol. + + +%package block-ssh +Summary: QEMU SSH block driver +Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release} + +%description block-ssh +This package provides the additional SSH block driver for QEMU. + +Install this package if you want to access remote disks using +the Secure Shell (SSH) protocol. + + +%prep +%setup -n qemu-%{version} +%autopatch -p1 + +%build +%global buildarch %{kvm_target}-softmmu + +# --build-id option is used for giving info to the debug packages. +buildldflags="VL_LDFLAGS=-Wl,--build-id" + +%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle + +%if 0%{have_gluster} + %global block_drivers_list %{block_drivers_list},gluster +%endif + +./configure \ + --prefix="%{_prefix}" \ + --libdir="%{_libdir}" \ + --sysconfdir="%{_sysconfdir}" \ + --interp-prefix=%{_prefix}/qemu-%M \ + --localstatedir="%{_localstatedir}" \ + --docdir="%{qemudocdir}" \ + --libexecdir="%{_libexecdir}" \ + --extra-ldflags="-Wl,--build-id -Wl,-z,relro -Wl,-z,now" \ + --extra-cflags="%{optflags}" \ + --with-pkgversion="%{name}-%{version}-%{release}" \ + --with-confsuffix=/"%{name}" \ + --firmwarepath=%{_prefix}/share/qemu-firmware \ +%if 0%{have_fdt} + --enable-fdt \ +%else + --disable-fdt \ + %endif +%if 0%{have_gluster} + --enable-glusterfs \ +%else + --disable-glusterfs \ +%endif + --enable-guest-agent \ +%ifnarch s390x + --enable-numa \ +%else + --disable-numa \ +%endif + --enable-rbd \ +%if 0%{have_librdma} + --enable-rdma \ +%else + --disable-rdma \ +%endif + --disable-pvrdma \ + --enable-seccomp \ +%if 0%{have_spice} + --enable-spice \ + --enable-smartcard \ +%else + --disable-spice \ + --disable-smartcard \ +%endif +%if 0%{have_opengl} + --enable-opengl \ +%else + --disable-opengl \ +%endif +%if 0%{have_usbredir} + --enable-usb-redir \ +%else + --disable-usb-redir \ +%endif + --disable-tcmalloc \ +%ifarch x86_64 + --enable-libpmem \ +%else + --disable-libpmem \ +%endif + --enable-vhost-user \ +%ifarch s390x + --enable-vhost-user-fs \ +%endif +%ifarch %{ix86} x86_64 + --enable-avx2 \ +%else + --disable-avx2 \ +%endif + --python=%{__python3} \ + --target-list="%{buildarch}" \ + --block-drv-rw-whitelist=%{block_drivers_list} \ + --audio-drv-list= \ + --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \ + --with-coroutine=ucontext \ + --tls-priority=@QEMU,SYSTEM \ + --disable-bluez \ + --disable-brlapi \ + --enable-cap-ng \ + --enable-coroutine-pool \ + --enable-curl \ + --disable-curses \ + --disable-debug-tcg \ + --enable-docs \ + --disable-gtk \ + --enable-kvm \ + --enable-libiscsi \ + --disable-libnfs \ + --enable-libssh \ + --enable-libusb \ + --disable-bzip2 \ + --enable-linux-aio \ + --disable-live-block-migration \ + --enable-lzo \ + --enable-pie \ + --disable-qom-cast-debug \ + --disable-sdl \ + --enable-snappy \ + --disable-sparse \ + --disable-strip \ + --enable-tpm \ + --enable-trace-backend=dtrace \ + --disable-vde \ + --disable-vhost-scsi \ + --disable-vxhs \ + --disable-virtfs \ + --disable-vnc-jpeg \ + --disable-vte \ + --enable-vnc-png \ + --enable-vnc-sasl \ + --enable-werror \ + --disable-xen \ + --disable-xfsctl \ + --enable-gnutls \ + --enable-gcrypt \ + --disable-nettle \ + --enable-attr \ + --disable-bsd-user \ + --disable-cocoa \ + --enable-debug-info \ + --disable-guest-agent-msi \ + --disable-hax \ + --disable-jemalloc \ + --disable-linux-user \ + --enable-modules \ + --disable-netmap \ + --disable-replication \ + --enable-system \ + --enable-tools \ + --disable-user \ + --enable-vhost-net \ + --enable-vhost-vsock \ + --enable-vnc \ + --enable-mpath \ + --disable-xen-pci-passthrough \ + --enable-tcg \ + --with-git=git \ + --disable-sanitizers \ + --disable-hvf \ + --disable-whpx \ + --enable-malloc-trim \ + --disable-membarrier \ + --disable-vhost-crypto \ + --disable-libxml2 \ + --enable-capstone \ + --disable-git-update \ + --disable-crypto-afalg \ + --disable-debug-mutex \ + --disable-bochs \ + --disable-cloop \ + --disable-dmg \ + --disable-qcow1 \ + --disable-vdi \ + --disable-vvfat \ + --disable-qed \ + --disable-parallels \ + --disable-sheepdog \ + --disable-auth-pam \ + --enable-iconv \ + --disable-lzfse \ + --enable-vhost-kernel \ + --disable-virglrenderer \ + --without-default-devices + +echo "config-host.mak contents:" +echo "===" +cat config-host.mak +echo "===" + +make V=1 %{?_smp_mflags} $buildldflags + +# Setup back compat qemu-kvm binary +%{__python3} scripts/tracetool.py --backend dtrace --format stap \ + --group=all --binary %{_libexecdir}/qemu-kvm --probe-prefix qemu.kvm \ + trace-events-all > qemu-kvm.stp + +%{__python3} scripts/tracetool.py --backends=dtrace --format=log-stap \ + --group=all --binary %{_libexecdir}/qemu-kvm --probe-prefix qemu.kvm \ + trace-events-all > qemu-kvm-log.stp + +%{__python3} scripts/tracetool.py --backend dtrace --format simpletrace-stap \ + --group=all --binary %{_libexecdir}/qemu-kvm --probe-prefix qemu.kvm \ + trace-events-all > qemu-kvm-simpletrace.stp + +cp -a %{kvm_target}-softmmu/qemu-system-%{kvm_target} qemu-kvm + +gcc %{SOURCE6} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o ksmctl +gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check + +%ifarch s390x + # Copy the built new images into place for "make check": + cp pc-bios/s390-ccw/s390-ccw.img pc-bios/s390-ccw/s390-netboot.img pc-bios/ +%endif + +%install +%define _udevdir %(pkg-config --variable=udevdir udev) +%define _udevrulesdir %{_udevdir}/rules.d + +install -D -p -m 0644 %{SOURCE4} $RPM_BUILD_ROOT%{_unitdir}/ksm.service +install -D -p -m 0644 %{SOURCE5} $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ksm +install -D -p -m 0755 ksmctl $RPM_BUILD_ROOT%{_libexecdir}/ksmctl + +install -D -p -m 0644 %{SOURCE7} $RPM_BUILD_ROOT%{_unitdir}/ksmtuned.service +install -D -p -m 0755 %{SOURCE8} $RPM_BUILD_ROOT%{_sbindir}/ksmtuned +install -D -p -m 0644 %{SOURCE9} $RPM_BUILD_ROOT%{_sysconfdir}/ksmtuned.conf +install -D -p -m 0644 %{SOURCE26} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/vhost.conf +%ifarch s390x + install -D -p -m 0644 %{SOURCE30} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/kvm.conf +%else +%ifarch %{ix86} x86_64 + install -D -p -m 0644 %{SOURCE31} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/kvm.conf +%else + install -D -p -m 0644 %{SOURCE27} $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/kvm.conf +%endif +%endif + +mkdir -p $RPM_BUILD_ROOT%{_bindir}/ +mkdir -p $RPM_BUILD_ROOT%{_udevrulesdir}/ +mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name} + +# Create new directories and put them all under tests-src +mkdir -p $RPM_BUILD_ROOT%{testsdir}/python +mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests +mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/acceptance +mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests +mkdir -p $RPM_BUILD_ROOT%{testsdir}/scripts/qmp + +install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir} +install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir} + +install -m 0644 scripts/dump-guest-memory.py \ + $RPM_BUILD_ROOT%{_datadir}/%{name} + +# Install avocado_qemu tests +cp -R tests/acceptance/* $RPM_BUILD_ROOT%{testsdir}/tests/acceptance/ + +# Install qemu.py and qmp/ scripts required to run avocado_qemu tests +cp -R python/qemu $RPM_BUILD_ROOT%{testsdir}/python +cp -R scripts/qmp/* $RPM_BUILD_ROOT%{testsdir}/scripts/qmp +install -p -m 0755 tests/Makefile.include $RPM_BUILD_ROOT%{testsdir}/tests/ + +# Install qemu-iotests +cp -R tests/qemu-iotests/* $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests/ +# Avoid ambiguous 'python' interpreter name +find $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests/* -maxdepth 1 -type f -exec sed -i -e '1 s+/usr/bin/env \(python\|python3\)+%{__python3}+' {} \; +find $RPM_BUILD_ROOT%{testsdir}/scripts/qmp/* -maxdepth 1 -type f -exec sed -i -e '1 s+/usr/bin/env \(python\|python3\)+%{__python3}+' {} \; +find $RPM_BUILD_ROOT%{testsdir}/scripts/qmp/* -maxdepth 1 -type f -exec sed -i -e '1 s+/usr/bin/\(python\|python3\)+%{__python3}+' {} \; + +install -p -m 0644 %{SOURCE36} $RPM_BUILD_ROOT%{testsdir}/README + +make DESTDIR=$RPM_BUILD_ROOT \ + sharedir="%{_datadir}/%{name}" \ + datadir="%{_datadir}/%{name}" \ + install + +mkdir -p $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset + +# Install qemu-guest-agent service and udev rules +install -m 0644 %{_sourcedir}/qemu-guest-agent.service %{buildroot}%{_unitdir} +install -m 0644 %{_sourcedir}/qemu-ga.sysconfig %{buildroot}%{_sysconfdir}/sysconfig/qemu-ga +install -m 0644 %{_sourcedir}/99-qemu-guest-agent.rules %{buildroot}%{_udevrulesdir} + +# - the fsfreeze hook script: +install -D --preserve-timestamps \ + scripts/qemu-guest-agent/fsfreeze-hook \ + $RPM_BUILD_ROOT%{_sysconfdir}/qemu-ga/fsfreeze-hook +# Workaround for the missing /etc/qemu-kvm/fsfreeze-hook +# Please, do not carry this over to RHEL-9 +mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/qemu-kvm/ +ln -s %{_sysconfdir}/qemu-ga/fsfreeze-hook \ + $RPM_BUILD_ROOT%{_sysconfdir}/qemu-kvm/fsfreeze-hook + +# - the directory for user scripts: +mkdir $RPM_BUILD_ROOT%{_sysconfdir}/qemu-ga/fsfreeze-hook.d + +# - and the fsfreeze script samples: +mkdir --parents $RPM_BUILD_ROOT%{_datadir}/%{name}/qemu-ga/fsfreeze-hook.d/ +install --preserve-timestamps --mode=0644 \ + scripts/qemu-guest-agent/fsfreeze-hook.d/*.sample \ + $RPM_BUILD_ROOT%{_datadir}/%{name}/qemu-ga/fsfreeze-hook.d/ + +# - Install dedicated log directory: +mkdir -p -v $RPM_BUILD_ROOT%{_localstatedir}/log/qemu-ga/ + +mkdir -p $RPM_BUILD_ROOT%{_bindir} +install -c -m 0755 qemu-ga ${RPM_BUILD_ROOT}%{_bindir}/qemu-ga + +mkdir -p $RPM_BUILD_ROOT%{_mandir}/man8 + +install -m 0755 qemu-kvm $RPM_BUILD_ROOT%{_libexecdir}/ +install -m 0644 qemu-kvm.stp $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/ +install -m 0644 qemu-kvm-log.stp $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/ +install -m 0644 qemu-kvm-simpletrace.stp $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/ + +rm $RPM_BUILD_ROOT/%{_datadir}/applications/qemu.desktop +rm $RPM_BUILD_ROOT%{_bindir}/qemu-system-%{kvm_target} +rm $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/qemu-system-%{kvm_target}.stp +rm $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/qemu-system-%{kvm_target}-simpletrace.stp +rm $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/qemu-system-%{kvm_target}-log.stp +rm $RPM_BUILD_ROOT%{_bindir}/elf2dmp + +# Install simpletrace +install -m 0755 scripts/simpletrace.py $RPM_BUILD_ROOT%{_datadir}/%{name}/simpletrace.py +# Avoid ambiguous 'python' interpreter name +sed -i -e '1 s/python/python3/' $RPM_BUILD_ROOT%{_datadir}/%{name}/simpletrace.py +mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool +install -m 0644 -t $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool scripts/tracetool/*.py +mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/backend +install -m 0644 -t $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/backend scripts/tracetool/backend/*.py +mkdir -p $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/format +install -m 0644 -t $RPM_BUILD_ROOT%{_datadir}/%{name}/tracetool/format scripts/tracetool/format/*.py + +mkdir -p $RPM_BUILD_ROOT%{qemudocdir} +install -p -m 0644 -t ${RPM_BUILD_ROOT}%{qemudocdir} Changelog README.rst README.systemtap COPYING COPYING.LIB LICENSE docs/interop/qmp-spec.txt +chmod -x ${RPM_BUILD_ROOT}%{_mandir}/man1/* +chmod -x ${RPM_BUILD_ROOT}%{_mandir}/man8/* + +install -D -p -m 0644 qemu.sasl $RPM_BUILD_ROOT%{_sysconfdir}/sasl2/%{name}.conf + +# Provided by package openbios +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/openbios-ppc +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/openbios-sparc32 +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/openbios-sparc64 +# Provided by package SLOF +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/slof.bin + +# Remove unpackaged files. +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/palcode-clipper +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/petalogix*.dtb +rm -f ${RPM_BUILD_ROOT}%{_datadir}/%{name}/bamboo.dtb +rm -f ${RPM_BUILD_ROOT}%{_datadir}/%{name}/ppc_rom.bin +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/s390-zipl.rom +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/u-boot.e500 +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/qemu_vga.ndrv +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/skiboot.lid + +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/s390-ccw.img +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/s390-netboot.img +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/hppa-firmware.img +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/canyonlands.dtb +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/u-boot-sam460-20100605.bin + +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/firmware +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/edk2-*.fd +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/edk2-licenses.txt + +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/opensbi-riscv32-virt-fw_jump.bin +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/opensbi-riscv64-sifive_u-fw_jump.bin +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/opensbi-riscv64-virt-fw_jump.bin +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/qemu-nsis.bmp + +rm -rf ${RPM_BUILD_ROOT}%{_libdir}/qemu-kvm/ui-spice-app.so + +%ifarch s390x + # Use the s390-*.imgs that we've just built, not the pre-built ones + install -m 0644 pc-bios/s390-ccw/s390-ccw.img $RPM_BUILD_ROOT%{_datadir}/%{name}/ + install -m 0644 pc-bios/s390-ccw/s390-netboot.img $RPM_BUILD_ROOT%{_datadir}/%{name}/ +%endif + +%ifnarch x86_64 + rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/kvmvapic.bin + rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/linuxboot.bin + rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/multiboot.bin + rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/pvh.bin +%endif + +# Remove sparc files +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/QEMU,tcx.bin +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/QEMU,cgthree.bin + +# Remove ivshmem example programs +rm -rf ${RPM_BUILD_ROOT}%{_bindir}/ivshmem-client +rm -rf ${RPM_BUILD_ROOT}%{_bindir}/ivshmem-server + +# Remove efi roms +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/efi*.rom + +# Provided by package ipxe +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/pxe*rom +# Provided by package vgabios +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/vgabios*bin +# Provided by package seabios +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/bios*.bin +# Provided by package sgabios +rm -rf ${RPM_BUILD_ROOT}%{_datadir}/%{name}/sgabios.bin + +# the pxe gpxe images will be symlinks to the images on +# /usr/share/ipxe, as QEMU doesn't know how to look +# for other paths, yet. +pxe_link() { + ln -s ../ipxe.efi/$2.rom %{buildroot}%{_datadir}/%{name}/efi-$1.rom +} + +%ifnarch aarch64 s390x +pxe_link e1000 8086100e +pxe_link ne2k_pci 10ec8029 +pxe_link pcnet 10222000 +pxe_link rtl8139 10ec8139 +pxe_link virtio 1af41000 +pxe_link e1000e 808610d3 +%endif + +rom_link() { + ln -s $1 %{buildroot}%{_datadir}/%{name}/$2 +} + +%ifnarch aarch64 s390x + rom_link ../seavgabios/vgabios-isavga.bin vgabios.bin + rom_link ../seavgabios/vgabios-cirrus.bin vgabios-cirrus.bin + rom_link ../seavgabios/vgabios-qxl.bin vgabios-qxl.bin + rom_link ../seavgabios/vgabios-stdvga.bin vgabios-stdvga.bin + rom_link ../seavgabios/vgabios-vmware.bin vgabios-vmware.bin + rom_link ../seavgabios/vgabios-virtio.bin vgabios-virtio.bin + rom_link ../seavgabios/vgabios-ramfb.bin vgabios-ramfb.bin + rom_link ../seavgabios/vgabios-bochs-display.bin vgabios-bochs-display.bin +%endif +%ifarch x86_64 + rom_link ../seabios/bios.bin bios.bin + rom_link ../seabios/bios-256k.bin bios-256k.bin + rom_link ../sgabios/sgabios.bin sgabios.bin +%endif + +%if 0%{have_kvm_setup} + install -D -p -m 755 %{SOURCE21} $RPM_BUILD_ROOT%{_prefix}/lib/systemd/kvm-setup + install -D -p -m 644 %{SOURCE22} $RPM_BUILD_ROOT%{_unitdir}/kvm-setup.service + install -D -p -m 644 %{SOURCE23} $RPM_BUILD_ROOT%{_presetdir}/85-kvm.preset +%endif + +%if 0%{have_memlock_limits} + install -D -p -m 644 %{SOURCE28} $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/95-kvm-memlock.conf +%endif + +# Install rules to use the bridge helper with libvirt's virbr0 +install -D -m 0644 %{SOURCE12} $RPM_BUILD_ROOT%{_sysconfdir}/%{name}/bridge.conf + +# Install qemu-pr-helper service +install -m 0644 %{_sourcedir}/qemu-pr-helper.service %{buildroot}%{_unitdir} +install -m 0644 %{_sourcedir}/qemu-pr-helper.socket %{buildroot}%{_unitdir} + +find $RPM_BUILD_ROOT -name '*.la' -or -name '*.a' | xargs rm -f + +# We need to make the block device modules executable else +# RPM won't pick up their dependencies. +chmod +x $RPM_BUILD_ROOT%{_libdir}/qemu-kvm/block-*.so + +# Remove buildinfo +rm -rf $RPM_BUILD_ROOT%{qemudocdir}/interop/.buildinfo + +# Remove spec +rm -rf $RPM_BUILD_ROOT%{qemudocdir}/specs + +%check +export DIFF=diff; make check V=1 + +%post -n qemu-kvm-core +# load kvm modules now, so we can make sure no reboot is needed. +# If there's already a kvm module installed, we don't mess with it +%udev_rules_update +sh %{_sysconfdir}/sysconfig/modules/kvm.modules &> /dev/null || : + udevadm trigger --subsystem-match=misc --sysname-match=kvm --action=add || : +%if %{have_kvm_setup} + systemctl daemon-reload # Make sure it sees the new presets and unitfile + %systemd_post kvm-setup.service + if systemctl is-enabled kvm-setup.service > /dev/null; then + systemctl start kvm-setup.service + fi +%endif + +%if %{have_kvm_setup} +%preun -n qemu-kvm-core +%systemd_preun kvm-setup.service +%endif + +%post -n qemu-kvm-common +%systemd_post ksm.service +%systemd_post ksmtuned.service + +getent group kvm >/dev/null || groupadd -g 36 -r kvm +getent group qemu >/dev/null || groupadd -g 107 -r qemu +getent passwd qemu >/dev/null || \ +useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ + -c "qemu user" qemu + +%preun -n qemu-kvm-common +%systemd_preun ksm.service +%systemd_preun ksmtuned.service + +%postun -n qemu-kvm-common +%systemd_postun_with_restart ksm.service +%systemd_postun_with_restart ksmtuned.service + +%post -n qemu-guest-agent +%systemd_post qemu-guest-agent.service +%preun -n qemu-guest-agent +%systemd_preun qemu-guest-agent.service +%postun -n qemu-guest-agent +%systemd_postun_with_restart qemu-guest-agent.service + +%files +# Deliberately empty + + +%files -n qemu-kvm-common +%defattr(-,root,root) +%dir %{qemudocdir} +%doc %{qemudocdir}/Changelog +%doc %{qemudocdir}/README.rst +%doc %{qemudocdir}/qemu-doc.html +%doc %{qemudocdir}/COPYING +%doc %{qemudocdir}/COPYING.LIB +%doc %{qemudocdir}/LICENSE +%doc %{qemudocdir}/README.systemtap +%doc %{qemudocdir}/qmp-spec.txt +%doc %{qemudocdir}/qemu-doc.txt +%doc %{qemudocdir}/qemu-ga-ref.html +%doc %{qemudocdir}/qemu-ga-ref.txt +%doc %{qemudocdir}/qemu-qmp-ref.html +%doc %{qemudocdir}/qemu-qmp-ref.txt +%doc %{qemudocdir}/interop/* +%{_mandir}/man7/qemu-qmp-ref.7* +%{_mandir}/man7/qemu-cpu-models.7* +%{_bindir}/qemu-keymap +%{_bindir}/qemu-pr-helper +%{_bindir}/qemu-edid +%{_bindir}/qemu-trace-stap +%{_unitdir}/qemu-pr-helper.service +%{_unitdir}/qemu-pr-helper.socket +%{_mandir}/man7/qemu-ga-ref.7* + +%dir %{_datadir}/%{name}/ +%{_datadir}/%{name}/keymaps/ +%{_mandir}/man1/%{name}.1* +%{_mandir}/man1/qemu-trace-stap.1* +%{_mandir}/man7/qemu-block-drivers.7* +%attr(4755, -, -) %{_libexecdir}/qemu-bridge-helper +%config(noreplace) %{_sysconfdir}/sasl2/%{name}.conf +%{_unitdir}/ksm.service +%{_libexecdir}/ksmctl +%config(noreplace) %{_sysconfdir}/sysconfig/ksm +%{_unitdir}/ksmtuned.service +%{_sbindir}/ksmtuned +%{_udevdir}/udev-kvm-check +%{_udevrulesdir}/81-kvm-rhel.rules +%ghost %{_sysconfdir}/kvm +%config(noreplace) %{_sysconfdir}/ksmtuned.conf +%dir %{_sysconfdir}/%{name} +%config(noreplace) %{_sysconfdir}/%{name}/bridge.conf +%config(noreplace) %{_sysconfdir}/modprobe.d/vhost.conf +%config(noreplace) %{_sysconfdir}/modprobe.d/kvm.conf +%{_datadir}/%{name}/simpletrace.py* +%{_datadir}/%{name}/tracetool/*.py* +%{_datadir}/%{name}/tracetool/backend/*.py* +%{_datadir}/%{name}/tracetool/format/*.py* + +%files -n qemu-kvm-core +%defattr(-,root,root) +%ifarch x86_64 + %{_datadir}/%{name}/bios.bin + %{_datadir}/%{name}/bios-256k.bin + %{_datadir}/%{name}/linuxboot.bin + %{_datadir}/%{name}/multiboot.bin + %{_datadir}/%{name}/kvmvapic.bin + %{_datadir}/%{name}/sgabios.bin + %{_datadir}/%{name}/pvh.bin +%endif +%ifarch s390x + %{_datadir}/%{name}/s390-ccw.img + %{_datadir}/%{name}/s390-netboot.img +%endif +%ifnarch aarch64 s390x + %{_datadir}/%{name}/vgabios.bin + %{_datadir}/%{name}/vgabios-cirrus.bin + %{_datadir}/%{name}/vgabios-qxl.bin + %{_datadir}/%{name}/vgabios-stdvga.bin + %{_datadir}/%{name}/vgabios-vmware.bin + %{_datadir}/%{name}/vgabios-virtio.bin + %{_datadir}/%{name}/vgabios-ramfb.bin + %{_datadir}/%{name}/vgabios-bochs-display.bin + %{_datadir}/%{name}/efi-e1000.rom + %{_datadir}/%{name}/efi-e1000e.rom + %{_datadir}/%{name}/efi-virtio.rom + %{_datadir}/%{name}/efi-pcnet.rom + %{_datadir}/%{name}/efi-rtl8139.rom + %{_datadir}/%{name}/efi-ne2k_pci.rom +%endif +%{_datadir}/icons/* +%{_datadir}/%{name}/linuxboot_dma.bin +%{_datadir}/%{name}/dump-guest-memory.py* +%{_libexecdir}/qemu-kvm +%{_datadir}/systemtap/tapset/qemu-kvm.stp +%{_datadir}/systemtap/tapset/qemu-kvm-log.stp +%{_datadir}/%{name}/trace-events-all +%{_datadir}/systemtap/tapset/qemu-kvm-simpletrace.stp +%{_datadir}/%{name}/systemtap/script.d/qemu_kvm.stp +%{_datadir}/%{name}/systemtap/conf.d/qemu_kvm.conf +%if 0%{have_kvm_setup} + %{_prefix}/lib/systemd/kvm-setup + %{_unitdir}/kvm-setup.service + %{_presetdir}/85-kvm.preset +%endif +%if 0%{have_memlock_limits} + %{_sysconfdir}/security/limits.d/95-kvm-memlock.conf +%endif +%{_libexecdir}/virtiofsd +%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json + +%files -n qemu-img +%defattr(-,root,root) +%{_bindir}/qemu-img +%{_bindir}/qemu-io +%{_bindir}/qemu-nbd +%{_mandir}/man1/qemu-img.1* +%{_mandir}/man8/qemu-nbd.8* + +%files -n qemu-guest-agent +%defattr(-,root,root,-) +%doc COPYING README.rst +%{_bindir}/qemu-ga +%{_mandir}/man8/qemu-ga.8* +%{_unitdir}/qemu-guest-agent.service +%{_udevrulesdir}/99-qemu-guest-agent.rules +%config(noreplace) %{_sysconfdir}/sysconfig/qemu-ga +%{_sysconfdir}/qemu-ga +%{_sysconfdir}/qemu-kvm/fsfreeze-hook +%{_datadir}/%{name}/qemu-ga +%dir %{_localstatedir}/log/qemu-ga + +%files tests +%{testsdir} + +%files block-curl +%{_libdir}/qemu-kvm/block-curl.so + +%if %{have_gluster} +%files block-gluster +%{_libdir}/qemu-kvm/block-gluster.so +%endif + +%files block-iscsi +%{_libdir}/qemu-kvm/block-iscsi.so + +%files block-rbd +%{_libdir}/qemu-kvm/block-rbd.so + +%files block-ssh +%{_libdir}/qemu-kvm/block-ssh.so + + +%changelog +* Tue Feb 08 2022 Jon Maloy - 4.2.0-59.el8_5.2 +- kvm-virtiofsd-Drop-membership-of-all-supplementary-group.patch [bz#2048627] +- Resolves: bz#2048627 + (CVE-2022-0358 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privilege escalation via CVE-2018-13405 [rhel-8.5.0.z]) + +* Thu Nov 25 2021 Jon Maloy - 4.2.0-59.el8_5 +- kvm-hw-scsi-scsi-disk-MODE_PAGE_ALLS-not-allowed-in-MODE.patch [bz#2025605] +- kvm-e1000-fix-tx-re-entrancy-problem.patch [bz#2025011] +- Resolves: bz#2025605 + (CVE-2021-3930 virt:rhel/qemu-kvm: QEMU: off-by-one error in mode_sense_page() in hw/scsi/scsi-disk.c [rhel-8.5.0.z]) +- Resolves: bz#2025011 + (CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0.z]) + +* Fri Oct 01 2021 Jon Maloy - 4.2.0-59 +- kvm-scsi-make-io_timeout-configurable.patch [bz#1994041] +- Resolves: bz#1994041 + (qemu-kvm scsi: change default passthrough timeout to non-infinite) + +* Wed Aug 18 2021 Danilo Cesar Lemes de Paula - 4.2.0-58.el8 +- kvm-virtiofsd-Disable-remote-posix-locks-by-default.patch [bz#1967496] +- kvm-virtiofsd-Fix-the-help-message-of-posix-lock.patch [bz#1967496] +- Resolves: bz#1967496 + ([virtio-fs] nfs/xfstest generic/089 generic/478 generic/632 failed) + +* Wed Aug 04 2021 Miroslav Rezanina - 4.2.0-57 +- kvm-aio-wait-delegate-polling-of-main-AioContext-if-BQL-.patch [bz#1969848] +- kvm-async-use-explicit-memory-barriers.patch [bz#1969848] +- Resolves: bz#1969848 + (qemu-img convert hangs on aarch64) + +* Thu Jul 29 2021 Miroslav Rezanina - 4.2.0-56 +- kvm-glib-compat-add-g_unix_get_passwd_entry_qemu.patch [bz#1967716] +- kvm-qga-add-ssh-add-remove-authorized-keys.patch [bz#1967716] +- kvm-qga-add-reset-argument-to-ssh-add-authorized-keys.patch [bz#1967716] +- kvm-qga-add-ssh-get-authorized-keys.patch [bz#1967716] +- kvm-Add-mtod_check.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-bootp-limit-vendor-specific-area-to-input-packet-mem.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-bootp-check-bootp_input-buffer-size.patch [bz#1970819] +- kvm-upd6-check-udp6_input-buffer-size.patch [bz#1970835] +- kvm-tftp-check-tftp_input-buffer-size.patch [bz#1970843] +- kvm-tftp-introduce-a-header-structure.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-udp-check-upd_input-buffer-size.patch [bz#1970853] +- kvm-Fix-DHCP-broken-in-libslirp-v4.6.0.patch [bz#1970819 bz#1970835 bz#1970843 bz#1970853] +- kvm-net-check-if-the-file-descriptor-is-valid-before-usi.patch [bz#1982134] +- kvm-net-detect-errors-from-probing-vnet-hdr-flag-for-TAP.patch [bz#1982134] +- Resolves: bz#1967716 + (RFE: rebuild guest agent to include public ssh injection api support) +- Resolves: bz#1970819 + (CVE-2021-3592 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (bootp) [rhel-8]) +- Resolves: bz#1970835 + (CVE-2021-3593 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp6) [rhel-8]) +- Resolves: bz#1970843 + (CVE-2021-3595 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (tftp) [rhel-8]) +- Resolves: bz#1970853 + (CVE-2021-3594 virt:rhel/qemu-kvm: QEMU: slirp: invalid pointer initialization may lead to information disclosure (udp) [rhel-8]) +- Resolves: bz#1982134 + (QEMU core dump while booting guest with a non-exist fd on tap) + +* Fri Jul 23 2021 Danilo Cesar Lemes de Paula - 4.2.0-55.el8 +- kvm-net-introduce-qemu_receive_packet.patch [bz#1932917] +- kvm-e1000-switch-to-use-qemu_receive_packet-for-loopback.patch [bz#1932917] +- kvm-dp8393x-switch-to-use-qemu_receive_packet-for-loopba.patch [bz#1932917] +- kvm-sungem-switch-to-use-qemu_receive_packet-for-loopbac.patch [bz#1932917] +- kvm-tx_pkt-switch-to-use-qemu_receive_packet_iov-for-loo.patch [bz#1932917] +- kvm-rtl8139-switch-to-use-qemu_receive_packet-for-loopba.patch [bz#1932917] +- kvm-pcnet-switch-to-use-qemu_receive_packet-for-loopback.patch [bz#1932917] +- kvm-cadence_gem-switch-to-use-qemu_receive_packet-for-lo.patch [bz#1932917] +- kvm-lan9118-switch-to-use-qemu_receive_packet-for-loopba.patch [bz#1932917] +- Resolves: bz#1932917 + (CVE-2021-3416 virt:rhel/qemu-kvm: QEMU: net: infinite loop in loopback mode may lead to stack overflow) + +* Thu Jul 22 2021 Danilo Cesar Lemes de Paula - 4.2.0-54.el8 +- kvm-redhat-Fix-unversioned-Obsoletes-warning.patch [bz#1967329] +- Resolves: bz#1967329 + (Make qemu-kvm use versioned obsoletes for qemu-kvm-ma and qemu-kvm-rhev) + +* Fri Jul 02 2021 Danilo Cesar Lemes de Paula - 4.2.0-53.el8 +- kvm-virtiofsd-Whitelist-fchmod.patch [bz#1967914] +- kvm-spapr-Fix-EEH-capability-issue-on-KVM-guest-for-PCI-.patch [bz#1957866] +- kvm-Compress-lines-for-immediate-return.patch [bz#1970912] +- kvm-file-posix-Handle-EINVAL-fallocate-return-value.patch [bz#1970912] +- Resolves: bz#1967914 + ([virtio-fs] virtiofsd quit when coping file to a folder in virtio-fs mounted volume(windows guest)) +- Resolves: bz#1957866 + (RHEL8.4 - EEH capability disabled on KVM guest and recovery of PCI passthru device fails(CX5 / mlx5_core) (qemu-kvm)) +- Resolves: bz#1970912 + (Deployment fails with "Invalid or missing agent token received") + +* Fri Jun 11 2021 Danilo Cesar Lemes de Paula - 4.2.0-52.el8 +- kvm-file-posix-Mitigate-file-fragmentation-with-extent-s.patch [bz#1877163] +- kvm-block-file-posix-Fix-problem-with-fallocate-PUNCH_HO.patch [bz#1944861] +- kvm-spapr-Remove-stale-comment-about-power-saving-LPCR-b.patch [bz#1969768] +- kvm-spapr-Set-LPCR-to-current-AIL-mode-when-starting-a-n.patch [bz#1969768] +- Resolves: bz#1877163 + ([FJ 8.3 Bug] The progress bar of the "virt-clone --nonsparse" command shows the progress rate exceeding 100%.) +- Resolves: bz#1944861 + (Qemu-img convert fails when source image is on gpfs) +- Resolves: bz#1969768 + ([ppc64le] Hotplug vcpu device hit call trace:[qemu output] KVM: unknown exit, hardware reason 7fff9ce87ed8) + +* Tue May 25 2021 Danilo Cesar Lemes de Paula - 4.2.0-51.el8 +- kvm-linux-headers-Add-VFIO_CCW_REQ_IRQ_INDEX.patch [bz#1940450] +- kvm-vfio-ccw-Connect-the-device-request-notifier.patch [bz#1940450] +- kvm-pc-bios-s390-ccw-fix-off-by-one-error.patch [bz#1942880] +- kvm-pc-bios-s390-ccw-break-loop-if-a-null-block-number-i.patch [bz#1942880] +- kvm-pc-bios-s390-ccw-don-t-try-to-read-the-next-block-if.patch [bz#1942880] +- Resolves: bz#1940450 + (RHEL8.5 - Mediated Device already in use by same domain we are booting (vfio-ccw/Multipath Testing) (kvm) - qemu-kvm part (also has kernel and libvirt parts)) +- Resolves: bz#1942880 + (RHEL8.4 Nightly[0322] - KVM guest fails to find zipl boot menu index (qemu-kvm)) + +* Wed May 05 2021 Danilo Cesar Lemes de Paula - 4.2.0-50.el8 +- kvm-hw-intc-arm_gic-Fix-interrupt-ID-in-GICD_SGIR-regist.patch [bz#1925430] +- kvm-libqos-usb-hcd-ehci-use-32-bit-write-for-config-regi.patch [bz#1842478] +- kvm-libqos-pci-pc-use-32-bit-write-for-EJ-register.patch [bz#1842478] +- kvm-memory-Revert-memory-accept-mismatching-sizes-in-mem.patch [bz#1842478] +- kvm-acpi-accept-byte-and-word-access-to-core-ACPI-regist.patch [bz#1842478] +- kvm-xhci-fix-valid.max_access_size-to-access-address-reg.patch [bz#1842478] +- kvm-softmmu-memory-Log-invalid-memory-accesses.patch [bz#1842478] +- Resolves: bz#1925430 + (CVE-2021-20221 virt:rhel/qemu-kvm: qemu: out-of-bound heap buffer access via an interrupt ID field [rhel-8.5.0]) +- Resolves: bz#1842478 + (CVE-2020-13754 virt:rhel/qemu-kvm: QEMU: msix: OOB access during mmio operations may lead to DoS [rhel-8.5.0]) + +* Wed Apr 28 2021 Danilo Cesar Lemes de Paula - 4.2.0-49.el8 +- kvm-net-remove-an-assert-call-in-eth_get_gso_type.patch [bz#1892350] +- kvm-e1000-fail-early-for-evil-descriptor.patch [bz#1930092] +- kvm-net-forbid-the-reentrant-RX.patch [bz#1859175] +- kvm-qemu-img-convert-Don-t-pre-zero-images.patch [bz#1855250] +- kvm-audio-audio_generic_get_buffer_in-should-honor-size.patch [bz#1932823] +- Resolves: bz#1892350 + (CVE-2020-27617 virt:rhel/qemu-kvm: QEMU: net: an assert failure via eth_get_gso_type [rhel-8.5.0]) +- Resolves: bz#1930092 + (CVE-2021-20257 virt:rhel/qemu-kvm: QEMU: net: e1000: infinite loop while processing transmit descriptors [rhel-8.5.0]) +- Resolves: bz#1859175 + (CVE-2020-15859 virt:rhel/qemu-kvm: QEMU: net: e1000e: use-after-free while sending packets [rhel-8]) +- Resolves: bz#1855250 + (qemu-img convert uses possibly slow pre-zeroing on block storage) +- Resolves: bz#1932823 + (after upgrade from 4.3 to 4.4 audio stops working in guests after couple of seconds) + +* Tue Mar 16 2021 Danilo Cesar Lemes de Paula - 4.2.0-48.el8 +- kvm-ide-atapi-check-logical-block-address-and-read-size-.patch [bz#1917451] +- Resolves: bz#1917451 + (CVE-2020-29443 virt:rhel/qemu-kvm: QEMU: ide: atapi: OOB access while processing read commands [rhel-8.4.0]) + +* Mon Mar 08 2021 Danilo Cesar Lemes de Paula - 4.2.0-47.el8 +- kvm-x86-cpu-Enable-AVX512_VP2INTERSECT-cpu-feature.patch [bz#1790620] +- kvm-target-i386-add-fast-short-REP-MOV-support.patch [bz#1790620] +- kvm-x86-cpu-Populate-SVM-CPUID-feature-bits.patch [bz#1790620] +- kvm-i386-Add-the-support-for-AMD-EPYC-3rd-generation-pro.patch [bz#1790620] +- Resolves: bz#1790620 + ([RFE] AMD Milan - Add KVM/support for EPYC-Milan CPU Model - Slow Train) + +* Wed Mar 03 2021 Danilo Cesar Lemes de Paula - 4.2.0-46.el8 +- kvm-redhat-makes-qemu-respect-system-s-crypto-profile.patch [bz#1902960] +- kvm-spapr-Adjust-firmware-path-of-PCI-devices.patch [bz#1912891] +- Resolves: bz#1902960 + (QEMU doesn't honour system crypto policies) +- Resolves: bz#1912891 + ([ppc64le] --disk cdimage.iso,bus=usb fails to boot) + +* Wed Feb 10 2021 Jon Maloy - 4.2.0-45.el8 +- kvm-virtiofsd-extract-lo_do_open-from-lo_open.patch [bz#1919111] +- kvm-virtiofsd-optionally-return-inode-pointer-from-lo_do.patch [bz#1919111] +- kvm-virtiofsd-prevent-opening-of-special-files-CVE-2020-.patch [bz#1919111] +- Resolves: bz#1919111 + (CVE-2020-35517 virt:rhel/qemu-kvm: QEMU: virtiofsd: potential privileged host device access from guest [rhel-8.4.0]) + +* Tue Feb 02 2021 Jon Maloy - 4.2.0-44.el8 +- kvm-spapr-Improve-handling-of-fdt-buffer-size.patch [bz#1901837] +- kvm-spapr-Fold-h_cas_compose_response-into-h_client_arch.patch [bz#1901837] +- kvm-spapr-Don-t-use-spapr_drc_needed-in-CAS-code.patch [bz#1901837] +- kvm-spapr-Fix-handling-of-unplugged-devices-during-CAS-a.patch [bz#1901837] +- kvm-spapr-Allow-memory-unplug-to-always-succeed.patch [bz#1901837] +- kvm-spapr-Improve-handling-of-memory-unplug-with-old-gue.patch [bz#1901837] +- kvm-block-Require-aligned-image-size-to-avoid-assertion-.patch [bz#1834281] +- kvm-file-posix-Allow-byte-aligned-O_DIRECT-with-NFS.patch [bz#1834281] +- kvm-block-iscsi-fix-heap-buffer-overflow-in-iscsi_aio_io.patch [bz#1912974] +- Resolves: bz#1834281 + (qemu-img convert abort when converting image with unaligned size) +- Resolves: bz#1901837 + (Failed to hotunplug pc-dimm device) +- Resolves: bz#1912974 + (CVE-2020-11947 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in iscsi_aio_ioctl_cb() in block/iscsi.c may lead to information disclosure [rhel-8]) + +* Wed Jan 27 2021 Danilo Cesar Lemes de Paula - 4.2.0-43.el8 +- kvm-Drop-bogus-IPv6-messages.patch [bz#1918054] +- Resolves: bz#1918054 + (CVE-2020-10756 virt:rhel/qemu-kvm: QEMU: slirp: networking out-of-bounds read information disclosure vulnerability [rhel-8.4.0]) + +* Thu Jan 21 2021 Danilo Cesar Lemes de Paula - 4.2.0-42.el8 +- kvm-linux-headers-add-vfio-DMA-available-capability.patch [bz#1905391] +- kvm-s390x-pci-Move-header-files-to-include-hw-s390x.patch [bz#1905391] +- kvm-vfio-Create-shared-routine-for-scanning-info-capabil.patch [bz#1905391] +- kvm-vfio-Find-DMA-available-capability.patch [bz#1905391] +- kvm-s390x-pci-Add-routine-to-get-the-vfio-dma-available-.patch [bz#1905391] +- kvm-s390x-pci-Honor-DMA-limits-set-by-vfio.patch [bz#1905391] +- kvm-s390x-fix-build-for-without-default-devices.patch [bz#1905391] +- Resolves: bz#1905391 + (RHEL8.4 - s390x/pci: Honor vfio DMA limiting (qemu-kvm)) + +* Mon Jan 18 2021 Danilo Cesar Lemes de Paula - 4.2.0-41.el8 +- kvm-udev-kvm-check-remove-the-exceeded-subscription-limi.patch [bz#1909244] +- kvm-hw-arm-smmu-common-Factorize-some-code-in-smmu_ptw_6.patch [bz#1843852] +- kvm-hw-arm-smmu-common-Add-IOTLB-helpers.patch [bz#1843852] +- kvm-hw-arm-smmu-Introduce-smmu_get_iotlb_key.patch [bz#1843852] +- kvm-hw-arm-smmu-Introduce-SMMUTLBEntry-for-PTW-and-IOTLB.patch [bz#1843852] +- kvm-hw-arm-smmu-common-Manage-IOTLB-block-entries.patch [bz#1843852] +- kvm-hw-arm-smmuv3-Introduce-smmuv3_s1_range_inval-helper.patch [bz#1843852] +- kvm-hw-arm-smmuv3-Get-prepared-for-range-invalidation.patch [bz#1843852] +- kvm-hw-arm-smmuv3-Fix-potential-integer-overflow-CID-143.patch [bz#1843852] +- kvm-memory-Rename-memory_region_notify_one-to-memory_reg.patch [bz#1843852] +- kvm-memory-Add-IOMMUTLBEvent.patch [bz#1843852] +- kvm-memory-Add-IOMMU_NOTIFIER_DEVIOTLB_UNMAP-IOMMUTLBNot.patch [bz#1843852] +- kvm-intel_iommu-Skip-page-walking-on-device-iotlb-invali.patch [bz#1843852] +- kvm-memory-Skip-bad-range-assertion-if-notifier-is-DEVIO.patch [bz#1843852] +- kvm-memory-clamp-cached-translation-in-case-it-points-to.patch [bz#1904393] +- kvm-hw-ehci-check-return-value-of-usb_packet_map.patch [bz#1898628] +- kvm-hw-net-e1000e-advance-desc_offset-in-case-of-null-de.patch [bz#1903070] +- Resolves: bz#1909244 + (Remove KVM guest count and limit info message) +- Resolves: bz#1843852 + (qemu core dumped: qemu-kvm: /builddir/build/BUILD/qemu-4.2.0/memory.c:1928: memory_region_notify_one: Assertion `entry->iova >= notifier->start && entry_end <= notifier->end' failed.) +- Resolves: bz#1904393 + (CVE-2020-27821 virt:rhel/qemu-kvm: QEMU: heap buffer overflow in msix_table_mmio_write() in hw/pci/msix.c [rhel-8]) +- Resolves: bz#1898628 + (CVE-2020-25723 virt:rhel/qemu-kvm: QEMU: assertion failure through usb_packet_unmap() in hw/usb/hcd-ehci.c [rhel-8]) +- Resolves: bz#1903070 + (CVE-2020-25707 CVE-2020-28916 virt:rhel/qemu-kvm: various flaws [rhel-8]) + +* Mon Jan 04 2021 Danilo Cesar Lemes de Paula - 4.2.0-40.el8 +- kvm-redhat-link-etc-qemu-ga-fsfreeze-hook-to-etc-qemu-kv.patch [bz#1910267] +- kvm-qga-rename-Error-parameter-to-more-common-errp.patch [bz#1910326] +- kvm-util-Introduce-qemu_get_host_name.patch [bz#1910326] +- kvm-qga-Use-qemu_get_host_name-instead-of-g_get_host_nam.patch [bz#1910326] +- kvm-redhat-add-un-pre-install-systemd-hooks-for-qemu-ga.patch [bz#1910220] +- Resolves: bz#1910267 + (There is no soft link '/etc/qemu-kvm/fsfreeze-hook') +- Resolves: bz#1910326 + (Incorrect hostname returned by qga command 'guest-get-host-name') +- Resolves: bz#1910220 + (qemu-ga service still active and can work after qemu-guest-agent been removed) + +* Wed Dec 23 2020 Danilo Cesar Lemes de Paula - 4.2.0-39.el8 +- kvm-ppc-spapr-Add-hotremovable-flag-on-DIMM-LMBs-on-drme.patch [bz#1901837] +- kvm-ppc-spapr-re-assert-IRQs-during-event-scan-if-there-.patch [bz#1901837] +- kvm-slirp-check-pkt_len-before-reading-protocol-header.patch [bz#1902237] +- kvm-s390x-s390-virtio-ccw-Reset-PCI-devices-during-subsy.patch [bz#1905386] +- kvm-qapi-enable-use-of-g_autoptr-with-QAPI-types.patch [bz#1859494] +- kvm-error-Fix-examples-in-error.h-s-big-comment.patch [bz#1859494] +- kvm-error-Improve-error.h-s-big-comment.patch [bz#1859494] +- kvm-error-Document-Error-API-usage-rules.patch [bz#1859494] +- kvm-error-New-macro-ERRP_GUARD.patch [bz#1859494] +- kvm-qga-add-command-guest-get-disks.patch [bz#1859494] +- kvm-qga-add-implementation-of-guest-get-disks-for-Linux.patch [bz#1859494] +- kvm-qga-add-implementation-of-guest-get-disks-for-Window.patch [bz#1859494] +- kvm-qga-fix-missing-closedir-in-qmp_guest_get_disks.patch [bz#1859494] +- kvm-qga-update-schema-for-guest-get-disks-dependents-fie.patch [bz#1859494] +- Resolves: bz#1859494 + (Report logical_name for disks without mounted file-system) +- Resolves: bz#1901837 + (Failed to hotunplug pc-dimm device) +- Resolves: bz#1902237 + (CVE-2020-29129 CVE-2020-29130 virt:rhel/qemu-kvm: QEMU: slirp: out-of-bounds access while processing ARP/NCSI packets [rhel-8]) +- Resolves: bz#1905386 + (RHEL8.3 - s390x/s390-virtio-ccw: Reset PCI devices during subsystem reset (qemu-kvm)) + +* Fri Dec 11 2020 Danilo Cesar Lemes de Paula - 4.2.0-38.el8 +- kvm-seccomp-fix-killing-of-whole-process-instead-of-thre.patch [bz#1880546] +- kvm-pc-bios-s390x-Rename-PSW_MASK_ZMODE-to-PSW_MASK_64.patch [bz#1903135] +- kvm-pc-bios-s390x-Use-PSW-masks-where-possible-and-intro.patch [bz#1903135] +- kvm-pc-bios-s390x-Ensure-Read-IPL-memory-is-clean.patch [bz#1903135] +- kvm-pc-bios-s390x-Clear-out-leftover-S390EP-string.patch [bz#1903135] +- Resolves: bz#1880546 + (qemu use SCMP_ACT_TRAP even SCMP_ACT_KILL_PROCESS is available) +- Resolves: bz#1903135 + (RHEL8.3 - KVM Distro install to vfio_ccw backed DASD gets error at the reboot step (qemu-kvm)) + +* Mon Nov 23 2020 Danilo Cesar Lemes de Paula - 4.2.0-37.el8 +- kvm-hw-net-net_tx_pkt-fix-assertion-failure-in-net_tx_pk.patch [bz#1860994] +- Resolves: bz#1860994 + (CVE-2020-16092 virt:rhel/qemu-kvm: QEMU: reachable assertion failure in net_tx_pkt_add_raw_fragment() in hw/net/net_tx_pkt.c [rhel-8]) + +* Fri Nov 20 2020 Danilo Cesar Lemes de Paula - 4.2.0-36.el8 +- kvm-qga-fix-assert-regression-on-guest-shutdown.patch [bz#1884531] +- kvm-libvhost-user-handle-endianness-as-mandated-by-the-s.patch [bz#1857733] +- kvm-virtio-add-vhost-user-fs-ccw-device.patch [bz#1857733] +- kvm-Ensure-vhost-user-fs-is-enabled-on-s390x.patch [bz#1857733] +- kvm-s390x-sclp.c-remove-unneeded-label-in-sclp_service_c.patch [bz#1798506] +- kvm-s390-sclp-get-machine-once-during-read-scp-cpu-info.patch [bz#1798506] +- kvm-s390-sclp-rework-sclp-boundary-checks.patch [bz#1798506] +- kvm-s390-sclp-read-sccb-from-mem-based-on-provided-lengt.patch [bz#1798506] +- kvm-s390-sclp-check-sccb-len-before-filling-in-data.patch [bz#1798506] +- kvm-s390-sclp-use-cpu-offset-to-locate-cpu-entries.patch [bz#1798506] +- kvm-s390-sclp-add-extended-length-sccb-support-for-kvm-g.patch [bz#1798506] +- kvm-linux-headers-Partial-update-against-Linux-5.9-rc4.patch [bz#1798506] +- kvm-misc-Replace-zero-length-arrays-with-flexible-array-.patch [bz#1798506] +- kvm-s390-guest-support-for-diagnose-0x318.patch [bz#1798506] +- kvm-s390x-pv-Remove-sclp-boundary-checks.patch [bz#1798506] +- kvm-s390x-pv-Fix-diag318-PV-fencing.patch [bz#1798506] +- kvm-s390-kvm-fix-diag318-propagation-and-reset-functiona.patch [bz#1659412] +- kvm-trace-use-STAP_SDT_V2-to-work-around-symbol-visibili.patch [bz#1898700] +- Resolves: bz#1659412 + ([IBM 8.4 FEAT] KVM enablement for enhanced hardware diagnose data of guest kernel on s390x - qemu part) +- Resolves: bz#1798506 + ([IBM 8.4 FEAT] KVM: Support extended-length SCCBs - qemu part) +- Resolves: bz#1857733 + ([IBM 8.4 FEAT] KVM: Add support for virtio-fs on s390x - qemu part) +- Resolves: bz#1884531 + (qemu-ga aborts after guest-shutdown command) +- Resolves: bz#1898700 + (qemu-kvm for RHEL-8.4 doesn't build due to a possible incompatibility with systemtap-sdt-devel-4.4-1) + +* Mon Oct 19 2020 Danilo Cesar Lemes de Paula - 4.2.0-35.el8 +- kvm-qga-commands-posix-Rework-build_guest_fsinfo_for_rea.patch [bz#1755075] +- kvm-qga-commands-posix-Move-the-udev-code-from-the-pci-t.patch [bz#1755075] +- kvm-qga-commands-posix-Support-fsinfo-for-non-PCI-virtio.patch [bz#1755075] +- kvm-nvram-Exit-QEMU-if-NVRAM-cannot-contain-all-prom-env.patch [bz#1874780] +- kvm-pc-bios-s390-ccw-Makefile-Compile-with-std-gnu99-fwr.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Move-ipl-related-code-from-main-int.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Introduce-ENODEV-define-and-remove-.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Move-the-inner-logic-of-find_subch-.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Do-not-bail-out-early-if-not-findin.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Scan-through-all-devices-if-no-boot.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-Allow-booting-in-case-the-first-vir.patch [bz#1846975] +- kvm-pc-bios-s390-ccw-main-Remove-superfluous-call-to-ena.patch [bz#1846975] +- kvm-aio-posix-completely-stop-polling-when-disabled.patch [bz#1846975] +- kvm-Remove-explicit-glusterfs-api-dependency.patch [bz#1872854] +- Resolves: bz#1755075 + ([qemu-guest-agent] fsinfo doesn't return disk info on s390x) +- Resolves: bz#1846975 + (Failed to boot up a s390x guest with virtio-blk-ccw if attaching a virtio-scsi-ccw bus in previous) +- Resolves: bz#1872854 + (move the glusterfs dependency out of qemu-kvm-core to the glusterfs module) +- Resolves: bz#1874780 + (-prom-env does not validate input) + +* Tue Sep 08 2020 Danilo Cesar Lemes de Paula - 4.2.0-34.el8 +- kvm-usb-fix-setup_len-init-CVE-2020-14364.patch [bz#1869710] +- Resolves: bz#1869710 + (CVE-2020-14364 qemu-kvm: QEMU: usb: out-of-bounds r/w access issue while processing usb packets [rhel-8.3.0]) + +* Wed Aug 19 2020 Danilo Cesar Lemes de Paula - 4.2.0-33.el8 +- kvm-Require-libfdt-1.6.0.patch [bz#1867847] +- Resolves: bz#1867847 + ([ppc] virt module 7629: /usr/libexec/qemu-kvm: undefined symbol: fdt_check_full, version LIBFDT_1.2) + +* Mon Aug 10 2020 Danilo Cesar Lemes de Paula - 4.2.0-32.el8 +- kvm-i386-Add-2nd-Generation-AMD-EPYC-processors.patch [bz#1780385] +- kvm-target-i386-sev-provide-proper-error-reporting-for-q.patch [bz#1689341] +- kvm-target-i386-sev-fail-query-sev-capabilities-if-QEMU-.patch [bz#1689341] +- kvm-s390x-protvirt-allow-to-IPL-secure-guests-with-no-re.patch [bz#1863034] +- Resolves: bz#1689341 + (QEMU should report an error and return failure if AMD SEV is not enabled in the kernel) +- Resolves: bz#1780385 + ([RFE] AMD EPYC-Rome support for KVM / QEMU guest) +- Resolves: bz#1863034 + (RHEL8.3 Beta - Secure Execution: Unable to start Qemu with "-no-reboot" option (qemu-kvm)) + +* Wed Jul 22 2020 Danilo Cesar Lemes de Paula - 4.2.0-31.el8 +- kvm-qcow2-Fix-alloc_cluster_abort-for-pre-existing-clust.patch [bz#1807057] +- kvm-iotests-026-Test-EIO-on-preallocated-zero-cluster.patch [bz#1807057] +- kvm-iotests-026-Test-EIO-on-allocation-in-a-data-file.patch [bz#1807057] +- kvm-iotests-026-Move-v3-exclusive-test-to-new-file.patch [bz#1807057] +- Resolves: bz#1807057 + (qcow2_alloc_cluster_abort() frees preallocated zero clusters) + +* Tue Jul 07 2020 Danilo Cesar Lemes de Paula - 4.2.0-30.el8 +- kvm-i386-Mask-SVM-features-if-nested-SVM-is-disabled.patch [bz#1835390] +- kvm-s390x-sigp-Fix-sense-running-reporting.patch [bz#1854092] +- kvm-s390x-tcg-clear-local-interrupts-on-reset-normal.patch [bz#1854092] +- kvm-virtio-net-fix-removal-of-failover-device.patch [] +- Resolves: bz#1835390 + (qemu promote host does not support 'EDX.npt' and 'EDX.nrip-save' when test with Q35 machine type on EPYC host) +- Resolves: bz#1854092 + (kvm-unit-tests: tcg smp FAIL) + +* Sun Jun 28 2020 Danilo Cesar Lemes de Paula - 4.2.0-29.el8 +- kvm-vfio-ccw-Fix-error-message.patch [bz#1660916] +- kvm-vfio-ccw-allow-non-prefetch-ORBs.patch [bz#1660916] +- kvm-linux-headers-support-vfio-ccw-features.patch [bz#1660916] +- kvm-vfio-ccw-Refactor-cleanup-of-regions.patch [bz#1660916] +- kvm-vfio-ccw-Add-support-for-the-schib-region.patch [bz#1660916] +- kvm-vfio-ccw-Refactor-ccw-irq-handler.patch [bz#1660916] +- kvm-s390x-css-Refactor-the-css_queue_crw-routine.patch [bz#1660916] +- kvm-vfio-ccw-Add-support-for-the-CRW-region-and-IRQ.patch [bz#1660916] +- kvm-config-enable-VFIO_CCW.patch [bz#1660916] +- kvm-virtio-blk-Refactor-the-code-that-processes-queued-r.patch [] +- kvm-virtio-blk-On-restart-process-queued-requests-in-the.patch [] +- kvm-Fix-use-afte-free-in-ip_reass-CVE-2020-1983.patch [bz#1838070] +- Resolves: bz#1660916 + ([IBM 8.3 FEAT] KVM s390x: DASD passthrough support - qemu part) +- Resolves: bz#1838070 + (CVE-2020-1983 virt:rhel/qemu-kvm: QEMU: slirp: use-after-free in ip_reass() function in ip_input.c [rhel-8]) + +* Fri Jun 19 2020 Danilo Cesar Lemes de Paula - 4.2.0-28.el8 +- kvm-redhat-Install-the-s390-netboot.img-that-we-ve-built.patch [bz#1828317] +- kvm-linux-headers-update-kvm.h.patch [bz#1828317] +- kvm-s390x-Don-t-do-a-normal-reset-on-the-initial-cpu.patch [bz#1828317] +- kvm-s390x-Move-reset-normal-to-shared-reset-handler.patch [bz#1828317] +- kvm-s390x-Move-initial-reset.patch [bz#1828317] +- kvm-s390x-Move-clear-reset.patch [bz#1828317] +- kvm-s390x-Beautify-diag308-handling.patch [bz#1828317] +- kvm-s390x-kvm-Make-kvm_sclp_service_call-void.patch [bz#1828317] +- kvm-s390x-Fix-cpu-normal-reset-ri-clearing.patch [bz#1828317] +- kvm-tests-boot-sector-Fix-the-bad-s390x-assembler-code.patch [bz#1828317] +- kvm-pc-bios-s390x-Fix-reset-psw-mask.patch [bz#1828317] +- kvm-s390x-Properly-fetch-and-test-the-short-psw-on-diag3.patch [bz#1828317] +- kvm-s390x-Rename-and-use-constants-for-short-PSW-address.patch [bz#1828317] +- kvm-s390x-Add-missing-vcpu-reset-functions.patch [bz#1828317] +- kvm-s390-sclp-improve-special-wait-psw-logic.patch [bz#1828317] +- kvm-pc-bios-s390x-Save-iplb-location-in-lowcore.patch [bz#1828317] +- kvm-s390-ipl-sync-back-loadparm.patch [bz#1828317] +- kvm-s390-ipl-fix-off-by-one-in-update_machine_ipl_proper.patch [bz#1828317] +- kvm-s390x-ipl-Consolidate-iplb-validity-check-into-one-f.patch [bz#1828317] +- kvm-vhost-correctly-turn-on-VIRTIO_F_IOMMU_PLATFORM.patch [bz#1828317] +- kvm-s390x-Move-diagnose-308-subcodes-and-rcs-into-ipl.h.patch [bz#1828317] +- kvm-s390x-protvirt-Support-unpack-facility.patch [bz#1828317] +- kvm-s390x-protvirt-Add-migration-blocker.patch [bz#1828317] +- kvm-s390x-protvirt-Inhibit-balloon-when-switching-to-pro.patch [bz#1828317] +- kvm-s390x-protvirt-KVM-intercept-changes.patch [bz#1828317] +- kvm-s390x-Add-SIDA-memory-ops.patch [bz#1828317] +- kvm-s390x-protvirt-Move-STSI-data-over-SIDAD.patch [bz#1828317] +- kvm-s390x-protvirt-SCLP-interpretation.patch [bz#1828317] +- kvm-s390x-protvirt-Set-guest-IPL-PSW.patch [bz#1828317] +- kvm-s390x-protvirt-Move-diag-308-data-over-SIDA.patch [bz#1828317] +- kvm-s390x-protvirt-Disable-address-checks-for-PV-guest-I.patch [bz#1828317] +- kvm-s390x-protvirt-Move-IO-control-structures-over-SIDA.patch [bz#1828317] +- kvm-s390x-protvirt-Handle-SIGP-store-status-correctly.patch [bz#1828317] +- kvm-s390x-Add-unpack-facility-feature-to-GA1.patch [bz#1828317] +- kvm-s390x-protvirt-Fix-stray-error_report_err-in-s390_ma.patch [bz#1828317] +- kvm-s390x-pv-Retry-ioctls-on-EINTR.patch [bz#1828317] +- kvm-s390x-s390-virtio-ccw-Fix-build-on-systems-without-K.patch [bz#1828317] +- kvm-s390x-pv-Fix-KVM_PV_PREP_RESET-command-wrapper-name.patch [bz#1828317] +- kvm-spapr-Pass-the-maximum-number-of-vCPUs-to-the-KVM-in.patch [bz#1756946] +- kvm-introduce-kvm_kernel_irqchip_-functions.patch [bz#1756946] +- kvm-target-s390x-kvm-Enable-adapter-interruption-suppres.patch [bz#1756946] +- kvm-vfio-nvlink-Remove-exec-permission-to-avoid-SELinux-.patch [bz#1823275] +- Resolves: bz#1756946 + ([zKVM] Re-enable KVM_CAP_S390_AIS for new machine types) +- Resolves: bz#1823275 + (RHEL8.1 - GPU Numa nodes not visible in guest post the pass-through.) +- Resolves: bz#1828317 + ([IBM 8.3 FEAT] s390x: Base KVM setup for secure guests - qemu part) + +* Fri Jun 19 2020 Danilo C. L. de Paula - 4.2.0 +- Resolves: bz#1810193 +(Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + +* Tue Jun 09 2020 Danilo C. L. de Paula - 4.2.0-25 +- Resolves: bz#1810193 + (Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + Another sync + +* Thu Jun 04 2020 Danilo C. L. de Paula - 4.2.0-23.el8 +- Resolves: bz#1810193 + (Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + Another syncronization + +* Mon Apr 27 2020 Danilo C. L. de Paula - 4.2.0 +- Resolves: bz#1810193 + (Upgrade components in virt:rhel module:stream for RHEL-8.3 release) + +* Fri Feb 21 2020 Danilo Cesar Lemes de Paula - 2.12.0-99.el8 +- kvm-slirp-disable-tcp_emu.patch [bz#1791677] +- kvm-target-i386-kvm-initialize-feature-MSRs-very-early.patch [bz#1790308] +- Resolves: bz#1790308 + (qemu-kvm core dump when do L1 guest live migration with L2 guest running) +- Resolves: bz#1791677 + (QEMU: Slirp: disable emulation of tcp programs like ftp IRC etc. [rhel-8]) + +* Mon Feb 10 2020 Danilo Cesar Lemes de Paula - 2.12.0-98.el8 +- kvm-iscsi-Avoid-potential-for-get_status-overflow.patch [bz#1794501] +- kvm-iscsi-Cap-block-count-from-GET-LBA-STATUS-CVE-2020-1.patch [bz#1794501] +- kvm-clean-up-callback-when-del-virtqueue.patch [bz#1708480] +- kvm-virtio-add-ability-to-delete-vq-through-a-pointer.patch [bz#1708480] +- kvm-virtio-reset-region-cache-when-on-queue-deletion.patch [bz#1708480] +- kvm-virtio-net-delete-also-control-queue-when-TX-RX-dele.patch [bz#1708480] +- Resolves: bz#1708480 + ([Q35] No "DEVICE_DELETED" event in qmp after unplug virtio-net-pci device) +- Resolves: bz#1794501 + (CVE-2020-1711 qemu-kvm: QEMU: block: iscsi: OOB heap access via an unexpected response of iSCSI Server [rhel-8.2.0]) + +* Fri Jan 24 2020 Miroslav Rezanina - 2.12.0-97.el8 +- kvm-exec-Fix-MAP_RAM-for-cached-access.patch [bz#1769613] +- kvm-virtio-Return-true-from-virtio_queue_empty-if-broken.patch [bz#1769613] +- kvm-usbredir-Prevent-recursion-in-usbredir_write.patch [bz#1752320] +- kvm-xhci-recheck-slot-status.patch [bz#1752320] +- kvm-tcp_emu-Fix-oob-access.patch [bz#1791566] +- kvm-slirp-use-correct-size-while-emulating-IRC-commands.patch [bz#1791566] +- kvm-slirp-use-correct-size-while-emulating-commands.patch [bz#1791566] +- Resolves: bz#1752320 + (vm gets stuck when migrate vm back and forth with remote-viewer trying to connect) +- Resolves: bz#1769613 + ([SEV] kexec mays hang at "[sda] Synchronizing SCSI cache " before switching to new kernel) +- Resolves: bz#1791566 + (CVE-2020-7039 virt:rhel/qemu-kvm: QEMU: slirp: OOB buffer access while emulating tcp protocols in tcp_emu() [rhel-8.2.0]) + +* Tue Jan 07 2020 Danilo Cesar Lemes de Paula - 2.12.0-96.el8 +- kvm-i386-Remove-cpu64-rhel6-CPU-model.patch [bz#1741346] +- Resolves: bz#1741346 + (Remove the "cpu64-rhel6" CPU from qemu-kvm) + +* Thu Jan 02 2020 Danilo Cesar Lemes de Paula - 2.12.0-95.el8 +- kvm-virtio-gpu-block-both-2d-and-3d-rendering.patch [bz#1674324] +- kvm-x86-Intel-AVX512_BF16-feature-enabling.patch [bz#1642541] +- Resolves: bz#1642541 + ([Intel 8.2 Feature] qemu-kvm Enable BFloat16 data type support) +- Resolves: bz#1674324 + (With , qemu either refuses to start completely or spice-server crashes afterwards) + +* Wed Dec 18 2019 Danilo Cesar Lemes de Paula - 2.12.0-94.el8 +- kvm-util-mmap-alloc-Add-a-is_pmem-parameter-to-qemu_ram_.patch [bz#1539282] +- kvm-mmap-alloc-unfold-qemu_ram_mmap.patch [bz#1539282] +- kvm-mmap-alloc-fix-hugetlbfs-misaligned-length-in-ppc64.patch [bz#1539282] +- kvm-util-mmap-alloc-support-MAP_SYNC-in-qemu_ram_mmap.patch [bz#1539282] +- kvm-x86-cpu-Enable-MOVDIRI-cpu-feature.patch [bz#1634827] +- kvm-x86-cpu-Enable-MOVDIR64B-cpu-feature.patch [bz#1634827] +- kvm-add-call-to-qemu_add_opts-for-overcommit-option.patch [bz#1634827] +- kvm-support-overcommit-cpu-pm-on-off.patch [bz#1634827] +- kvm-i386-cpu-make-cpu-host-support-monitor-mwait.patch [] +- kvm-x86-cpu-Add-support-for-UMONITOR-UMWAIT-TPAUSE.patch [bz#1634827] +- kvm-target-i386-Add-support-for-save-load-IA32_UMWAIT_CO.patch [bz#1634827] +- Resolves: bz#1539282 + ([Intel 8.2 Feature][Crystal Ridge] Support MAP_SYNC - qemu-kvm) +- Resolves: bz#1634827 + ([Intel 8.2 Feat] KVM Enable SnowRidge Accelerator Interface Architecture (AIA) - qemu) + +* Wed Dec 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-93.el8 +- kvm-target-i386-Export-TAA_NO-bit-to-guests.patch [bz#1771971] +- kvm-target-i386-add-support-for-MSR_IA32_TSX_CTRL.patch [bz#1771971] +- Resolves: bz#1771971 + (CVE-2019-11135 virt:rhel/qemu-kvm: hw: TSX Transaction Asynchronous Abort (TAA) [rhel-8.2.0]) + +* Mon Dec 02 2019 Danilo Cesar Lemes de Paula - 2.12.0-92.el8 +- kvm-x86-cpu-use-FeatureWordArray-to-define-filtered_feat.patch [bz#1689270] +- kvm-i386-Add-x-force-features-option-for-testing.patch [bz#1689270] +- kvm-target-i386-define-a-new-MSR-based-feature-word-FEAT.patch [bz#1689270] +- kvm-i386-display-known-CPUID-features-linewrapped-in-alp.patch [bz#1689270] +- kvm-target-i386-kvm-kvm_get_supported_msrs-cleanup.patch [bz#1689270] +- kvm-target-i386-handle-filtered_features-in-a-new-functi.patch [bz#1689270] +- kvm-target-i386-introduce-generic-feature-dependency-mec.patch [bz#1689270] +- kvm-target-i386-expand-feature-words-to-64-bits.patch [bz#1689270] +- kvm-target-i386-add-VMX-definitions.patch [bz#1689270] +- kvm-vmxcap-correct-the-name-of-the-variables.patch [bz#1689270] +- kvm-target-i386-add-VMX-features.patch [bz#1689270] +- kvm-target-i386-work-around-KVM_GET_MSRS-bug-for-seconda.patch [bz#1689270] +- kvm-target-i386-adjust-for-missing-VMX-features.patch [bz#1689270] +- kvm-target-i386-add-VMX-features-to-named-CPU-models.patch [bz#1689270] +- kvm-target-i386-add-VMX-features-to-named-CPU-models-RHE.patch [bz#1689270] +- kvm-vhost-fix-vhost_log-size-overflow-during-migration.patch [bz#1776808] +- Resolves: bz#1689270 + (Nested KVM: limit VMX features according to CPU models - Slow Train) +- Resolves: bz#1776808 + (qemu-kvm crashes when Windows VM is migrated with multiqueue) + +* Wed Nov 27 2019 Danilo Cesar Lemes de Paula - 2.12.0-91.el8 +- kvm-qapi-fill-in-CpuInfoFast.arch-in-query-cpus-fast.patch [bz#1730969] +- kvm-curl-Keep-pointer-to-the-CURLState-in-CURLSocket.patch [bz#1744602] +- kvm-curl-Keep-socket-until-the-end-of-curl_sock_cb.patch [bz#1744602] +- kvm-curl-Check-completion-in-curl_multi_do.patch [bz#1744602] +- kvm-curl-Pass-CURLSocket-to-curl_multi_do.patch [bz#1744602] +- kvm-curl-Report-only-ready-sockets.patch [bz#1744602] +- kvm-curl-Handle-success-in-multi_check_completion.patch [bz#1744602] +- kvm-curl-Check-curl_multi_add_handle-s-return-code.patch [bz#1744602] +- Resolves: bz#1730969 + ([ppc] qmp: The 'arch' value returned by the command 'query-cpus-fast' does not match) +- Resolves: bz#1744602 + (qemu-img gets stuck when stream-converting from http) + +* Tue Nov 12 2019 Danilo Cesar Lemes de Paula - 2.12.0-90.el8 +- kvm-i386-Don-t-print-warning-if-phys-bits-was-set-automa.patch [bz#1719127] +- kvm-Disable-CONFIG_I2C-and-CONFIG_IOH3420.patch [bz#1693140] +- kvm-usb-drop-unnecessary-usb_device_post_load-checks.patch [bz#1757482] +- kvm-pc-bios-s390-ccw-define-loadparm-length.patch [bz#1664376] +- kvm-pc-bios-s390-ccw-net-Use-diag308-to-reset-machine-be.patch [bz#1664376] +- kvm-s390-bios-decouple-cio-setup-from-virtio.patch [bz#1664376] +- kvm-s390-bios-decouple-common-boot-logic-from-virtio.patch [bz#1664376] +- kvm-s390-bios-Clean-up-cio.h.patch [bz#1664376] +- kvm-s390-bios-Decouple-channel-i-o-logic-from-virtio.patch [bz#1664376] +- kvm-s390-bios-Map-low-core-memory.patch [bz#1664376] +- kvm-s390-bios-ptr2u32-and-u32toptr.patch [bz#1664376] +- kvm-s390-bios-Support-for-running-format-0-1-channel-pro.patch [bz#1664376] +- kvm-s390-bios-cio-error-handling.patch [bz#1664376] +- kvm-s390-bios-Extend-find_dev-for-non-virtio-devices.patch [bz#1664376] +- kvm-s390-bios-Factor-finding-boot-device-out-of-virtio-c.patch [bz#1664376] +- kvm-s390-bios-Refactor-virtio-to-run-channel-programs-vi.patch [bz#1664376] +- kvm-s390-bios-Use-control-unit-type-to-determine-boot-me.patch [bz#1664376] +- kvm-s390-bios-Add-channel-command-codes-structs-needed-f.patch [bz#1664376] +- kvm-s390-bios-Support-booting-from-real-dasd-device.patch [bz#1664376] +- kvm-s390-bios-Use-control-unit-type-to-find-bootable-dev.patch [bz#1664376] +- kvm-s390x-vfio-ap-Implement-hot-plug-unplug-of-vfio-ap-d.patch [bz#1660906] +- Resolves: bz#1660906 + ([IBM 8.2 FEAT] KVM s390x: Crypto Passthrough Hotplug - qemu part) +- Resolves: bz#1664376 + ([IBM 8.2 FEAT] CCW IPL Support (kvm) - qemu part) +- Resolves: bz#1693140 + (aarch64: qemu: remove smbus_eeprom and i2c from config) +- Resolves: bz#1719127 + ([Intel 8.2 Bug] warning shown when boot VM with “–cpu host” or “–cpu other mode” on ICX platform (physical)) +- Resolves: bz#1757482 + (Fail to migrate a rhel6.10-mt7.6 guest with dimm device) + +* Mon Oct 14 2019 Danilo Cesar Lemes de Paula - 2.12.0-89.el8 +- kvm-accel-use-g_strsplit-for-parsing-accelerator-names.patch [bz#1749022] +- kvm-opts-don-t-silently-truncate-long-parameter-keys.patch [bz#1749022] +- kvm-opts-don-t-silently-truncate-long-option-values.patch [bz#1749022] +- kvm-i386-fix-regression-parsing-multiboot-initrd-modules.patch [bz#1749022] +- kvm-i386-only-parse-the-initrd_filename-once-for-multibo.patch [bz#1749022] +- kvm-opts-remove-redundant-check-for-NULL-parameter.patch [bz#1749022] +- kvm-Using-ip_deq-after-m_free-might-read-pointers-from-a.patch [bz#1749724] +- kvm-virtio-blk-Cancel-the-pending-BH-when-the-dataplane-.patch [bz#1708459] +- kvm-s390x-cpumodel-Rework-CPU-feature-definition.patch [bz#1660909] +- kvm-s390x-cpumodel-Set-up-CPU-model-for-AQIC-interceptio.patch [bz#1660909] +- kvm-ccid-Fix-dwProtocols-advertisement-of-T-0.patch [bz#1746361] +- kvm-s390-PCI-fix-IOMMU-region-init.patch [bz#1754643] +- kvm-fw_cfg-Improve-error-message-when-can-t-load-splash-.patch [bz#1607367] +- kvm-fw_cfg-Fix-boot-bootsplash-error-checking.patch [bz#1607367] +- kvm-fw_cfg-Fix-boot-reboot-timeout-error-checking.patch [bz#1607367] +- kvm-hw-nvram-fw_cfg-Store-reboot-timeout-as-little-endia.patch [bz#1607367] +- kvm-intel_iommu-Correct-caching-mode-error-message.patch [bz#1738440] +- kvm-intel_iommu-Sanity-check-vfio-pci-config-on-machine-.patch [bz#1738440] +- kvm-qdev-machine-Introduce-hotplug_allowed-hook.patch [bz#1738440] +- kvm-pc-q35-Disallow-vfio-pci-hotplug-without-VT-d-cachin.patch [bz#1738440] +- kvm-intel_iommu-Remove-the-caching-mode-check-during-fla.patch [bz#1738440] +- kvm-pseries-do-not-allow-memory-less-cpu-less-NUMA-node.patch [bz#1651474] +- Resolves: bz#1607367 + (After boot failed, guest should not reboot when set reboot-timeout < -1) +- Resolves: bz#1651474 + (RHEL8.0 Beta - [4.18.0-32.el8.ppc64le] Guest VM crashes during vcpu hotplug with specific numa configuration (kvm)) +- Resolves: bz#1660909 + ([IBM 8.2 FEAT] KVM s390x: Crypto Passthrough Interrupt Support - qemu part) +- Resolves: bz#1708459 + (qemu-kvm core dumped when repeat "system_reset" multiple times during guest boot) +- Resolves: bz#1738440 + (For intel-iommu, qemu shows conflict behaviors between booting a guest with vfio and hot plugging vfio device) +- Resolves: bz#1746361 + (ccid: Fix incorrect dwProtocol advertisement of T=0) +- Resolves: bz#1749022 + (Please backport 950c4e6c94b1 ("opts: don't silently truncate long option values", 2018-05-09)) +- Resolves: bz#1749724 + (CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-8]) +- Resolves: bz#1754643 + (RHEL8.1 Snapshot3 - Passthrough PCI card goes into error state if used in domain (kvm)) + +* Fri Sep 13 2019 Danilo Cesar Lemes de Paula - 2.12.0-88.el8 +- Revert fix for bz#1749724 - this got delayed to 8.2 + (CVE-2019-15890 qemu-kvm: QEMU: Slirp: use-after-free during packet reassembly [rhel-8]) + +* Tue Sep 03 2019 Danilo Cesar Lemes de Paula - 2.12.0-86.el8 +- kvm-Do-not-run-iotests-on-brew-build.patch [bz#1742819] +- kvm-target-ppc-spapr-Add-workaround-option-to-SPAPR_CAP_.patch [bz#1744415] +- kvm-target-ppc-spapr-Add-SPAPR_CAP_CCF_ASSIST.patch [bz#1744415] +- kvm-i386-x86_cpu_list_feature_names-function.patch [bz#1747185] +- kvm-i386-unavailable-features-QOM-property.patch [bz#1747185] +- kvm-file-posix-Handle-undetectable-alignment.patch [bz#1738839] +- kvm-iotests-Tweak-221-sizing-for-different-hole-granular.patch [bz#1738839] +- kvm-iotests-Filter-175-s-allocation-information.patch [bz#1738839] +- kvm-block-posix-Always-allocate-the-first-block.patch [bz#1738839] +- kvm-iotests-Test-allocate_first_block-with-O_DIRECT.patch [bz#1738839] +- Resolves: bz#1738839 + (I/O error when virtio-blk disk is backed by a raw image on 4k disk) +- Resolves: bz#1742819 + (Remove iotests from qemu-kvm builds [RHEL 8.1.0]) +- Resolves: bz#1744415 + (Backport support for count cache flush Spectre v2 mitigation [slow train]) +- Resolves: bz#1747185 + ("filtered-features" QOM property is not available) + +* Mon Aug 19 2019 Danilo Cesar Lemes de Paula - 2.12.0-85.el8 +- kvm-console-Avoid-segfault-in-screendump.patch [bz#1684383] +- kvm-usb-hub-clear-suspend-on-detach.patch [bz#1619661] +- kvm-qemu-img-fix-regression-copying-secrets-during-conve.patch [bz#1727821] +- Resolves: bz#1619661 + (the attach hub on one hub still exits in device manager after unhotplug) +- Resolves: bz#1684383 + (qemu crashed when take screenshot for 2nd head of virtio video device if the display not opened by virt-viewer) +- Resolves: bz#1727821 + (Failed to convert a source image to the qcow2 image encrypted by luks) + +* Fri Aug 16 2019 Danilo Cesar Lemes de Paula - 2.12.0-84.el8 +- kvm-vnc-detect-and-optimize-pageflips.patch [bz#1727033] +- kvm-block-backend-Make-blk_inc-dec_in_flight-public.patch [bz#1716349] +- kvm-virtio-blk-Increase-in_flight-for-request-restart-BH.patch [bz#1716349] +- kvm-block-Fix-AioContext-switch-for-drained-node.patch [bz#1716349] +- kvm-test-bdrv-drain-AioContext-switch-in-drained-section.patch [bz#1716349] +- kvm-block-Use-normal-drain-for-bdrv_set_aio_context.patch [bz#1716349] +- kvm-block-Fix-AioContext-switch-for-bs-drv-NULL.patch [bz#1716347] +- kvm-iothread-fix-crash-with-invalid-properties.patch [bz#1687541] +- kvm-iothread-replace-init_done_cond-with-a-semaphore.patch [bz#1687541] +- kvm-RHEL-disable-hostmem-memfd.patch [bz#1740797] +- Resolves: bz#1687541 + (qemu aborted when start guest with a big iothreads) +- Resolves: bz#1716347 + (Qemu Core dump when quit vm that's in status "paused(io-error)" with data plane enabled) +- Resolves: bz#1716349 + (qemu with iothreads enabled crashes on resume after enospc pause for disk extension) +- Resolves: bz#1727033 + (vnc server should detect page-flips and avoid sending fullscreen updates then.) +- Resolves: bz#1740797 + (Disable memfd in QEMU) + +* Thu Aug 01 2019 Danilo Cesar Lemes de Paula - 2.12.0-83.el8 +- kvm-hw-block-pflash_cfi01-Add-missing-DeviceReset-handle.patch [bz#1707192] +- kvm-block-file-posix-Unaligned-O_DIRECT-block-status.patch [bz#1678979] +- kvm-iotests-Test-unaligned-raw-images-with-O_DIRECT.patch [bz#1678979] +- kvm-nbd-client-Lower-min_block-for-block-status-unaligne.patch [bz#1678979] +- kvm-nbd-client-Reject-inaccessible-tail-of-inconsistent-.patch [bz#1678979] +- kvm-nbd-client-Support-qemu-img-convert-from-unaligned-s.patch [bz#1678979] +- kvm-block-Add-bdrv_get_request_alignment.patch [bz#1678979] +- kvm-nbd-server-Advertise-actual-minimum-block-size.patch [bz#1678979] +- kvm-slirp-check-sscanf-result-when-emulating-ident.patch [bz#1727642] +- kvm-slirp-fix-big-little-endian-conversion-in-ident-prot.patch [bz#1727642] +- kvm-slirp-ensure-there-is-enough-space-in-mbuf-to-null-t.patch [bz#1727642] +- kvm-slirp-don-t-manipulate-so_rcv-in-tcp_emu.patch [bz#1727642] +- kvm-tap-set-vhostfd-passed-from-qemu-cli-to-non-blocking.patch [bz#1732642] +- kvm-Fix-heap-overflow-in-ip_reass-on-big-packet-input.patch [bz#1734751] +- Resolves: bz#1678979 + (qemu-img convert abort when converting image with unaligned size (qemu-img: block/io.c:2134: bdrv_co_block_status: Assertion `*pnum && (((*pnum) % (align)) == 0) && align > offset - aligned_offset\' failed)) +- Resolves: bz#1707192 + (implement missing reset handler for cfi.pflash01 - slow train) +- Resolves: bz#1727642 + (CVE-2019-6778 qemu-kvm: QEMU: slirp: heap buffer overflow in tcp_emu()) +- Resolves: bz#1732642 + (enable the virtio-net frontend to work with the vhost-net backend in SEV guests) +- Resolves: bz#1734751 + (CVE-2019-14378 qemu-kvm: QEMU: slirp: heap buffer overflow during packet reassembly [rhel-8.1.0]) + +* Tue Jul 23 2019 Danilo Cesar Lemes de Paula - 2.12.0-82.el8 +- kvm-i386-Add-new-model-of-Cascadelake-Server.patch [bz#1629906] +- kvm-i386-Update-stepping-of-Cascadelake-Server.patch [bz#1629906] +- kvm-target-i386-Disable-MPX-support-on-named-CPU-models.patch [bz#1629906] +- kvm-i386-remove-the-INTEL_PT-CPUID-bit-from-named-CPU-NEW.patch [bz#1629906] +- kvm-i386-Disable-OSPKE-on-CPU-model-definitions-NEW.patch [bz#1629906] +- kvm-block-ssh-Convert-from-DPRINTF-macro-to-trace-events.patch [bz#1513367] +- kvm-block-ssh-Do-not-report-read-write-flush-errors-to-t.patch [bz#1513367] +- kvm-qemu-iotests-Fix-paths-for-NFS.patch [bz#1513367] +- kvm-qemu-iotests-Filter-NFS-paths.patch [bz#1513367] +- kvm-iotests-Filter-SSH-paths.patch [bz#1513367] +- kvm-block-ssh-Implement-.bdrv_refresh_filename.patch [bz#1513367] +- kvm-iotests-Use-Python-byte-strings-where-appropriate.patch [bz#1513367] +- kvm-iotests-Unify-log-outputs-between-Python-2-and-3.patch [bz#1513367] +- kvm-ssh-switch-from-libssh2-to-libssh.patch [bz#1513367] +- kvm-redhat-switch-from-libssh2-to-libssh.patch [bz#1513367] +- kvm-block-gluster-limit-the-transfer-size-to-512-MiB.patch [bz#1728657] +- kvm-s390-cpumodel-fix-description-for-the-new-vector-fac.patch [bz#1729975] +- kvm-s390x-cpumodel-remove-esort-from-the-default-model.patch [bz#1729975] +- kvm-s390x-cpumodel-also-change-name-of-vxbeh.patch [bz#1729975] +- kvm-s390x-cpumodel-change-internal-name-of-vxpdeh-to-mat.patch [bz#1729975] +- kvm-target-i386-sev-Do-not-unpin-ram-device-memory-regio.patch [bz#1728958] +- kvm-i386-Save-EFER-for-32-bit-targets.patch [bz#1689269] +- kvm-target-i386-rename-HF_SVMI_MASK-to-HF_GUEST_MASK.patch [bz#1689269] +- kvm-target-i386-kvm-add-VMX-migration-blocker.patch [bz#1689269] +- kvm-target-i386-kvm-just-return-after-migrate_add_blocke.patch [bz#1689269] +- kvm-target-i386-kvm-Delete-VMX-migration-blocker-on-vCPU.patch [bz#1689269] +- kvm-Introduce-kvm_arch_destroy_vcpu.patch [bz#1689269] +- kvm-target-i386-kvm-Use-symbolic-constant-for-DB-BP-exce.patch [bz#1689269] +- kvm-target-i386-kvm-Re-inject-DB-to-guest-with-updated-D.patch [bz#1689269] +- kvm-target-i386-kvm-Block-migration-for-vCPUs-exposed-wi.patch [bz#1689269] +- kvm-target-i386-kvm-do-not-initialize-padding-fields.patch [bz#1689269] +- kvm-linux-headers-synchronize-generic-and-x86-KVM-header.patch [bz#1689269] +- kvm-vmstate-Add-support-for-kernel-integer-types.patch [bz#1689269] +- kvm-target-i386-kvm-Add-support-for-save-and-restore-nes.patch [bz#1689269] +- kvm-target-i386-kvm-Add-support-for-KVM_CAP_EXCEPTION_PA.patch [bz#1689269] +- kvm-target-i386-kvm-Add-nested-migration-blocker-only-wh.patch [bz#1689269] +- kvm-target-i386-kvm-Demand-nested-migration-kernel-capab.patch [bz#1689269] +- kvm-target-i386-skip-KVM_GET-SET_NESTED_STATE-if-VMX-dis.patch [bz#1689269] +- kvm-i386-kvm-Do-not-sync-nested-state-during-runtime.patch [bz#1689269] +- Resolves: bz#1513367 + (qemu with libssh) +- Resolves: bz#1629906 + ([Intel 8.1 Feat] qemu-kvm Introduce Cascade Lake (CLX) cpu model) +- Resolves: bz#1689269 + (Nested KVM: support for migration of nested hypervisors - Slow Train) +- Resolves: bz#1728657 + ('qemu-io write' to a raw image over libgfapi fails) +- Resolves: bz#1728958 + (Hot unplug vfio-pci NIC devices from sev guest will cause qemu-kvm: sev_ram_block_removed: failed to unregister region) +- Resolves: bz#1729975 + (RHEL 8.1 Pre-Beta - Fix for hardware CPU Model) + +* Mon Jul 08 2019 Miroslav Rezanina - 2.12.0-81.el8 +- kvm-target-i386-add-MDS-NO-feature.patch [bz#1714792] +- kvm-virtio-gpu-pass-down-VirtIOGPU-pointer-to-a-bunch-of.patch [bz#1531543] +- kvm-virtio-gpu-add-iommu-support.patch [bz#1531543] +- kvm-virtio-gpu-fix-unmap-in-error-path.patch [bz#1531543] +- Resolves: bz#1531543 + ([RFE] add iommu support to virtio-gpu) +- Resolves: bz#1714792 + ([Intel 8.1 FEAT] MDS_NO exposure to guest) + +* Tue Jul 02 2019 Danilo Cesar Lemes de Paula - 2.12.0-80.el8 +- kvm-qxl-check-release-info-object.patch [bz#1712705] +- kvm-iotests-Make-182-do-without-device_add.patch [bz#1707598] +- Resolves: bz#1707598 + (qemu-iotest 182 fails without device hotplugging support) +- Resolves: bz#1712705 + (CVE-2019-12155 qemu-kvm: QEMU: qxl: null pointer dereference while releasing spice resources [rhel-8]) + +* Fri Jun 28 2019 Danilo de Paula - 15:2.12.0-79 +- Rebuild all virt packages to fix RHEL's upgrade path +- Resolves: rhbz#1695587 + (Ensure modular RPM upgrade path) + +* Thu Jun 20 2019 Miroslav Rezanina - 2.12.0-78.el8 +- kvm-gluster-Handle-changed-glfs_ftruncate-signature.patch [bz#1721983] +- kvm-gluster-the-glfs_io_cbk-callback-function-pointer-ad.patch [bz#1721983] +- Resolves: bz#1721983 + (qemu-kvm can't be build with new gluster version (6.0.6)) + +* Thu Jun 13 2019 Danilo Cesar Lemes de Paula - 2.12.0-77.el8 +- kvm-i386-Make-arch_capabilities-migratable.patch [bz#1709970] +- kvm-spapr-Fix-ibm-max-associativity-domains-property-num.patch [bz#1710662] +- kvm-linux-headers-Update-for-NVLink2-passthrough-downstr.patch [bz#1710662] +- kvm-pci-Move-NVIDIA-vendor-id-to-the-rest-of-ids.patch [bz#1710662] +- kvm-vfio-quirks-Add-common-quirk-alloc-helper.patch [bz#1710662] +- kvm-vfio-Make-vfio_get_region_info_cap-public.patch [bz#1710662] +- kvm-spapr-Support-NVIDIA-V100-GPU-with-NVLink2.patch [bz#1710662] +- kvm-qemu-kvm.spec-bump-libseccomp-2.4.0.patch [bz#1719578] +- Resolves: bz#1709970 + ([Intel 8.1 Bug] [KVM][CLX] CPUID_7_0_EDX_ARCH_CAPABILITIES is not enabled in VM - qemu-kvm) +- Resolves: bz#1710662 + ([IBM 8.1 FEAT] POWER9 - Virt: qemu: NVLink2 passthru to guest - Nvidia Volta (GPU) (kvm)) +- Resolves: bz#1719578 + (VM failed to start with error "failed to install seccomp syscall filter in the kernel") + +* Tue Jun 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-76.el8 +- kvm-Introduce-new-no_guest_reset-parameter-for-usb-host-.patch [bz#1713677] +- kvm-usb-call-reset-handler-before-updating-state.patch [bz#1713677] +- kvm-usb-host-skip-reset-for-untouched-devices.patch [bz#1713677] +- kvm-usb-host-avoid-libusb_set_configuration-calls.patch [bz#1713677] +- kvm-virtio-scsi-Move-BlockBackend-back-to-the-main-AioCo.patch [bz#1673396 bz#1673401] +- kvm-scsi-disk-Acquire-the-AioContext-in-scsi_-_realize.patch [bz#1673396 bz#1673401] +- kvm-virtio-scsi-Forbid-devices-with-different-iothreads-.patch [bz#1673396 bz#1673401] +- kvm-Disable-VXHS-support.patch [bz#1714933] +- Resolves: bz#1673396 + (qemu-kvm core dumped after hotplug the deleted disk with iothread parameter) +- Resolves: bz#1673401 + (Qemu core dump when start guest with two disks using same drive) +- Resolves: bz#1713677 + (Detached device when trying to upgrade USB device firmware when in doing USB Passthrough via QEMU) +- Resolves: bz#1714933 + (Disable VXHS in qemu-kvm) + +* Fri May 24 2019 Danilo Cesar Lemes de Paula - 2.12.0-75.el8 +- kvm-s390x-cpumodel-enum-type-S390FeatGroup-now-gets-gene.patch [bz#1660912] +- kvm-linux-headers-update-against-Linux-5.2-rc1.patch [bz#1660912] +- kvm-s390x-cpumodel-ignore-csske-for-expansion.patch [bz#1660912] +- kvm-s390x-cpumodel-Miscellaneous-Instruction-Extensions-.patch [bz#1660912] +- kvm-s390x-cpumodel-msa9-facility.patch [bz#1660912] +- kvm-s390x-cpumodel-vector-enhancements.patch [bz#1660912] +- kvm-s390x-cpumodel-enhanced-sort-facility.patch [bz#1660912] +- kvm-s390x-cpumodel-add-Deflate-conversion-facility.patch [bz#1660912] +- kvm-s390x-cpumodel-add-gen15-defintions.patch [bz#1660912] +- kvm-s390x-cpumodel-wire-up-8561-and-8562-as-gen15-machin.patch [bz#1660912] +- kvm-spice-set-device-address-and-device-display-ID-in-QX.patch [bz#1712946] +- kvm-hw-pci-Add-missing-include.patch [bz#1712946] +- Resolves: bz#1660912 + ([IBM 8.1 FEAT] KVM s390x: Add hardware CPU Model - qemu part) +- Resolves: bz#1712946 + (qemu-kvm build is broken due to spice_qxl_set_max_monitors being deprecated) + +* Mon May 20 2019 Danilo Cesar Lemes de Paula - 2.12.0-74.el8 +- kvm-x86-cpu-Enable-CLDEMOTE-Demote-Cache-Line-cpu-featur.patch [bz#1696436] +- kvm-memory-Fix-the-memory-region-type-assignment-order.patch [bz#1667249] +- kvm-target-i386-sev-Do-not-pin-the-ram-device-memory-reg.patch [bz#1667249] +- kvm-block-Fix-invalidate_cache-error-path-for-parent-act.patch [bz#1673010] +- kvm-target-i386-define-md-clear-bit.patch [bz#1703302 bz#1703308] +- Resolves: bz#1667249 + (Fail to launch AMD SEV VM with assigned PCI device) +- Resolves: bz#1673010 + (Local VM and migrated VM on the same host can run with same RAW file as visual disk source while without shareable configured or lock manager enabled) +- Resolves: bz#1696436 + ([Intel 8.0 Feat] KVM Enabling SnowRidge new NIs - qemu-kvm) +- Resolves: bz#1703302 + (CVE-2018-12130 virt:rhel/qemu-kvm: hardware: Microarchitectural Fill Buffer Data Sampling (MFBDS) [rhel-8]) +- Resolves: bz#1703308 + (CVE-2018-12127 virt:rhel/qemu-kvm: hardware: Micro-architectural Load Port Data Sampling - Information Leak (MLPDS) [rhel-8]) + +* Tue May 14 2019 Danilo Cesar Lemes de Paula - 2.12.0-73.el8 +- kvm-i386-remove-the-INTEL_PT-CPUID-bit-from-named-CPU-mo.patch [bz#1561761] +- kvm-i386-Disable-OSPKE-on-CPU-model-definitions.patch [bz#1561761] +- Resolves: bz#1561761 + ([Intel 8.1 Feat] qemu-kvm Introduce Icelake cpu model) + +* Tue May 14 2019 Danilo Cesar Lemes de Paula - 2.12.0-72.el8 +- kvm-Use-KVM_GET_MSR_INDEX_LIST-for-MSR_IA32_ARCH_CAP.patch [bz#1707706] +- kvm-i386-kvm-Disable-arch_capabilities-if-MSR-can-t-be-s.patch [bz#1707706] +- Resolves: bz#1707706 + (/builddir/build/BUILD/qemu-2.12.0/target/i386/kvm.c:2031: kvm_put_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed.) + +* Wed May 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-71.el8 +- kvm-s390-bios-Skip-bootmap-signature-entries.patch [bz#1683275] +- Resolves: bz#1683275 + ([IBM 8.1 FEAT] KVM: Secure Linux Boot Toleration (qemu)) + +* Tue May 07 2019 Danilo Cesar Lemes de Paula - 2.12.0-70.el8 +- kvm-i386-Add-new-MSR-indices-for-IA32_PRED_CMD-and-IA32_.patch [bz#1561761] +- kvm-i386-Add-CPUID-bit-and-feature-words-for-IA32_ARCH_C.patch [bz#1561761] +- kvm-i386-Add-CPUID-bit-for-PCONFIG.patch [bz#1561761] +- kvm-i386-Add-CPUID-bit-for-WBNOINVD.patch [bz#1561761] +- kvm-i386-Add-new-CPU-model-Icelake-Server-Client.patch [bz#1561761] +- kvm-Add-support-to-KVM_GET_MSR_FEATURE_INDEX_LIST-an.patch [bz#1561761] +- kvm-x86-Data-structure-changes-to-support-MSR-based-feat.patch [bz#1561761] +- kvm-x86-define-a-new-MSR-based-feature-word-FEATURE_WORD.patch [bz#1561761] +- kvm-i386-remove-the-new-CPUID-PCONFIG-from-Icelake-Serve.patch [bz#1561761] +- kvm-Revert-i386-Add-CPUID-bit-for-PCONFIG.patch [bz#1561761] +- Resolves: bz#1561761 + ([Intel 8.1 Feat] qemu-kvm Introduce Icelake cpu model) + +* Fri May 03 2019 Danilo Cesar Lemes de Paula - 2.12.0-69.el8 +- kvm-tests-crypto-Use-the-IEC-binary-prefix-definitions.patch [bz#1680231] +- kvm-crypto-expand-algorithm-coverage-for-cipher-benchmar.patch [bz#1680231] +- kvm-crypto-remove-code-duplication-in-tweak-encrypt-decr.patch [bz#1680231] +- kvm-crypto-introduce-a-xts_uint128-data-type.patch [bz#1680231] +- kvm-crypto-convert-xts_tweak_encdec-to-use-xts_uint128-t.patch [bz#1680231] +- kvm-crypto-convert-xts_mult_x-to-use-xts_uint128-type.patch [bz#1680231] +- kvm-crypto-annotate-xts_tweak_encdec-as-inlineable.patch [bz#1680231] +- kvm-crypto-refactor-XTS-cipher-mode-test-suite.patch [bz#1680231] +- kvm-crypto-add-testing-for-unaligned-buffers-with-XTS-ci.patch [bz#1680231] +- Resolves: bz#1680231 + (severe performance impact using luks format) + +* Mon Apr 29 2019 Danilo Cesar Lemes de Paula - 2.12.0-68.el8 +- kvm-s390x-ipl-Try-to-detect-Linux-vs-non-Linux-for-initi.patch [bz#1699070] +- kvm-loader-Check-access-size-when-calling-rom_ptr-to-avo.patch [bz#1699070] +- kvm-hw-s390x-Use-the-IEC-binary-prefix-definitions.patch [bz#1699070] +- kvm-s390x-storage-attributes-fix-CMMA_BLOCK_SIZE-usage.patch [bz#1699070] +- kvm-s390x-cpumodel-fix-segmentation-fault-when-baselinin.patch [bz#1699070] +- kvm-hw-s390x-s390-pci-bus-Convert-sysbus-init-function-t.patch [bz#1699070] +- kvm-s390x-pci-properly-fail-if-the-zPCI-device-cannot-be.patch [bz#1699070] +- kvm-s390x-pci-rename-hotplug-handler-callbacks.patch [bz#1699070] +- kvm-s390-avoid-potential-null-dereference-in-s390_pcihos.patch [bz#1699070] +- kvm-s390x-pci-Send-correct-event-on-hotplug.patch [bz#1699070] +- kvm-s390x-pci-Set-the-iommu-region-size-mpcifc-request.patch [bz#1699070] +- kvm-s390x-pci-Always-delete-and-free-the-release_timer.patch [bz#1699070] +- kvm-s390x-pci-Ignore-the-unplug-call-if-we-already-have-.patch [bz#1699070] +- kvm-s390x-pci-Use-hotplug_dev-instead-of-looking-up-the-.patch [bz#1699070] +- kvm-s390x-pci-Move-some-hotplug-checks-to-the-pre_plug-h.patch [bz#1699070] +- kvm-s390x-pci-Introduce-unplug-requests-and-split-unplug.patch [bz#1699070] +- kvm-s390x-pci-Drop-release-timer-and-replace-it-with-a-f.patch [bz#1699070] +- kvm-s390x-pci-mark-zpci-devices-as-unmigratable.patch [bz#1699070] +- kvm-s390x-pci-Fix-primary-bus-number-for-PCI-bridges.patch [bz#1699070] +- kvm-s390x-pci-Fix-hotplugging-of-PCI-bridges.patch [bz#1699070] +- kvm-s390x-pci-Warn-when-adding-PCI-devices-without-the-z.patch [bz#1699070] +- kvm-s390x-pci-Unplug-remaining-requested-devices-on-pcih.patch [bz#1699070] +- kvm-s390x-refactor-reset-reipl-handling.patch [bz#1699070] +- kvm-s390-ipl-fix-ipl-with-no-reboot.patch [bz#1699070] +- Resolves: bz#1699070 + (Backport s390x-related fixes for qemu-kvm) + +* Tue Apr 23 2019 Danilo Cesar Lemes de Paula - 2.12.0-67.el8 +- kvm-device_tree-Fix-integer-overflowing-in-load_device_t.patch [bz#1693116] +- Resolves: bz#1693116 + (CVE-2018-20815 qemu-kvm: QEMU: device_tree: heap buffer overflow while loading device tree blob [rhel-8.0]) + +* Mon Apr 15 2019 Danilo Cesar Lemes de Paula - 2.12.0-66.el8 +- kvm-iotests-153-Fix-dead-code.patch [bz#1694148] +- kvm-file-posix-Include-filename-in-locking-error-message.patch [bz#1694148] +- kvm-file-posix-Skip-effectiveless-OFD-lock-operations.patch [bz#1694148] +- kvm-file-posix-Drop-s-lock_fd.patch [bz#1694148] +- kvm-tests-Add-unit-tests-for-image-locking.patch [bz#1694148] +- kvm-file-posix-Fix-shared-locks-on-reopen-commit.patch [bz#1694148] +- kvm-iotests-Test-file-posix-locking-and-reopen.patch [bz#1694148] +- kvm-block-file-posix-do-not-fail-on-unlock-bytes.patch [bz#1694148] +- kvm-hostmem-file-remove-object-id-from-pmem-error-messag.patch [bz#1687596] +- kvm-redhat-setting-target-release-to-rhel-8.1.0.patch [] +- kvm-redhat-removing-iotest-182.patch [] +- Resolves: bz#1687596 + ([Intel 8.1 BUG][KVM][Crystal Ridge]object_get_canonical_path_component: assertion failed: (obj->parent != NULL)) +- Resolves: bz#1694148 + (QEMU image locking needn't double open fd number, and it should not fail when attempting to release locks) + +* Tue Apr 09 2019 Danilo Cesar Lemes de Paula - 2.12.0-65.el8 +- kvm-s390x-cpumodel-mepochptff-warn-when-no-mepoch-and-re.patch [bz#1664371] +- kvm-s390x-cpumodel-add-z14-GA2-model.patch [bz#1664371] +- kvm-redhat-s390x-cpumodel-enable-mepoch-by-default-for-z.patch [bz#1664371] +- kvm-intel_iommu-fix-operator-in-vtd_switch_address_space.patch [bz#1662272] +- kvm-intel_iommu-reset-intr_enabled-when-system-reset.patch [bz#1662272] +- kvm-pci-msi-export-msi_is_masked.patch [bz#1662272] +- kvm-i386-kvm-ignore-masked-irqs-when-update-msi-routes.patch [bz#1662272] +- Resolves: bz#1662272 + (Boot guest with device assignment+vIOMMU, qemu prompts "vtd_interrupt_remap_msi: MSI address low 32 bit invalid: 0x0" when first rebooting guest) +- Resolves: bz#1664371 + ([IBM 8.1 FEAT] Update hardware CPU Model z14 (kvm) - qemu part) + +* Mon Apr 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-64.el8 +- kvm-doc-fix-the-configuration-path.patch [bz#1645411] +- kvm-Increase-number-of-iotests-being-run-as-a-part-of-RH.patch [bz#1664463] +- kvm-Load-kvm-module-during-boot.patch [bz#1676907 bz#1685995] +- kvm-qemu-kvm.spec.template-Update-pyton-path-to-system-i.patch [] +- Resolves: bz#1645411 + (the "fsfreeze-hook" script path shown by command "qemu-ga --help" or "man qemu-ga" is wrong) +- Resolves: bz#1664463 + (Modify iotest behavior to include luks and nbd and fail build if iotests fail) +- Resolves: bz#1676907 + (/dev/kvm device exists but kernel module is not loaded on boot up causing VM start to fail in libvirt) +- Resolves: bz#1685995 + (/dev/kvm device exists but kernel module is not loaded on boot up causing VM start to fail in libvirt) + +* Tue Feb 26 2019 Danilo Cesar Lemes de Paula - 2.12.0-63.el8 +- kvm-scsi-generic-avoid-possible-out-of-bounds-access-to-.patch [bz#1668162] +- Resolves: bz#1668162 + (CVE-2019-6501 qemu-kvm: QEMU: scsi-generic: possible OOB access while handling inquiry request [rhel-8]) + +* Mon Feb 25 2019 Danilo Cesar Lemes de Paula - 2.12.0-62.el8 +- kvm-slirp-check-data-length-while-emulating-ident-functi.patch [bz#1669069] +- Resolves: bz#1669069 + (CVE-2019-6778 qemu-kvm: QEMU: slirp: heap buffer overflow in tcp_emu() [rhel-8.0]) + +* Mon Feb 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-61.el8 +- kvm-qemu-ga-make-get-fsinfo-work-over-pci-bridges.patch [bz#1666952] +- kvm-qga-fix-driver-leak-in-guest-get-fsinfo.patch [bz#1666952] +- Resolves: bz#1666952 + (qemu-guest-agent does not parse PCI bridge links in "build_guest_fsinfo_for_real_device" (q35)) + +* Mon Jan 28 2019 Danilo Cesar Lemes de Paula - 2.12.0-60.el8 +- kvm-ne2000-fix-possible-out-of-bound-access-in-ne2000_re.patch [bz#1636784] +- kvm-rtl8139-fix-possible-out-of-bound-access.patch [bz#1636784] +- kvm-pcnet-fix-possible-buffer-overflow.patch [bz#1636784] +- kvm-net-ignore-packet-size-greater-than-INT_MAX.patch [bz#1636784] +- kvm-net-drop-too-large-packet-early.patch [bz#1636784] +- kvm-net-hub-suppress-warnings-of-no-host-network-for-qte.patch [bz#1636784] +- kvm-virtio-net-test-accept-variable-length-argument-in-p.patch [bz#1636784] +- kvm-virtio-net-test-remove-unused-macro.patch [bz#1636784] +- kvm-virtio-net-test-add-large-tx-buffer-test.patch [bz#1636784] +- kvm-s390x-Return-specification-exception-for-unimplement.patch [bz#1668261] +- kvm-cpus-ignore-ESRCH-in-qemu_cpu_kick_thread.patch [bz#1665844] +- Resolves: bz#1636784 + (CVE-2018-17963 qemu-kvm: Qemu: net: ignore packets with large size [rhel-8]) +- Resolves: bz#1665844 + (Guest quit with error when hotunplug cpu) +- Resolves: bz#1668261 + ([RHEL8] Backport diag308 stable exception fix (qemu-kvm)) + +* Thu Jan 24 2019 Danilo Cesar Lemes de Paula - 2.12.0-59.el8 +- kvm-hw-scsi-cleanups-before-VPD-BL-emulation.patch [bz#1639957] +- kvm-hw-scsi-centralize-SG_IO-calls-into-single-function.patch [bz#1639957] +- kvm-hw-scsi-add-VPD-Block-Limits-emulation.patch [bz#1639957] +- kvm-scsi-disk-Block-Device-Characteristics-emulation-fix.patch [bz#1639957] +- kvm-scsi-generic-keep-VPD-page-list-sorted.patch [bz#1639957] +- kvm-scsi-generic-avoid-out-of-bounds-access-to-VPD-page-.patch [bz#1639957] +- kvm-scsi-generic-avoid-invalid-access-to-struct-when-emu.patch [bz#1639957] +- kvm-scsi-generic-do-not-do-VPD-emulation-for-sense-other.patch [bz#1639957] +- Resolves: bz#1639957 + ([RHEL.8] scsi host device passthrough limits IO writes - slow train) + +* Mon Jan 21 2019 Danilo Cesar Lemes de Paula - 2.12.0-58.el8 +- kvm-block-Update-flags-in-bdrv_set_read_only.patch [bz#1644996] +- kvm-block-Add-auto-read-only-option.patch [bz#1644996] +- kvm-rbd-Close-image-in-qemu_rbd_open-error-path.patch [bz#1644996] +- kvm-block-Require-auto-read-only-for-existing-fallbacks.patch [bz#1644996] +- kvm-nbd-Support-auto-read-only-option.patch [bz#1644996] +- kvm-file-posix-Support-auto-read-only-option.patch [bz#1644996] +- kvm-curl-Support-auto-read-only-option.patch [bz#1644996] +- kvm-gluster-Support-auto-read-only-option.patch [bz#1644996] +- kvm-iscsi-Support-auto-read-only-option.patch [bz#1644996] +- kvm-block-Make-auto-read-only-on-default-for-drive.patch [bz#1644996] +- kvm-qemu-iotests-Test-auto-read-only-with-drive-and-bloc.patch [bz#1644996] +- kvm-block-Fix-update-of-BDRV_O_AUTO_RDONLY-in-update_fla.patch [bz#1644996] +- kvm-qemu-img-Add-C-option-for-convert-with-copy-offloadi.patch [bz#1623082] +- kvm-iotests-Add-test-for-qemu-img-convert-C-compatibilit.patch [bz#1623082] +- Resolves: bz#1623082 + ([rhel.8.0]Target files for 'qemu-img convert' do not support thin_provisoning with iscsi/nfs backend) +- Resolves: bz#1644996 + (block-commit can't be used with -blockdev) + +* Fri Jan 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-57.el8 +- kvm-qemu-kvm.spec.template-Update-files-for-tests-rpm-to.patch [bz#1601107] + +* Fri Jan 11 2019 Danilo Cesar Lemes de Paula - 2.12.0-56.el8 +- kvm-Run-iotests-as-part-of-the-build-process.patch [bz#1661026] +- kvm-Introduce-the-qemu-kvm-tests-rpm.patch [bz#1601107] +- Resolves: bz#1601107 + (qemu-kvm packaging: make running qemu-iotests more robust) +- Resolves: bz#1661026 + (Run iotests as part of build process) + +* Tue Jan 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-55.el8 +- kvm-block-Don-t-inactivate-children-before-parents.patch [bz#1659395] +- kvm-iotests-Test-migration-with-blockdev.patch [bz#1659395] +- Resolves: bz#1659395 + (src qemu core dump when do migration ( block device node-name changed after change cdrom) - Slow Train) + +* Tue Jan 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-54.el8 +- kvm-s390x-tcg-avoid-overflows-in-time2tod-tod2time.patch [bz#1653569] +- kvm-s390x-kvm-pass-values-instead-of-pointers-to-kvm_s39.patch [bz#1653569] +- kvm-s390x-tod-factor-out-TOD-into-separate-device.patch [bz#1653569] +- kvm-s390x-tcg-drop-tod_basetime.patch [bz#1653569] +- kvm-s390x-tcg-properly-implement-the-TOD.patch [bz#1653569] +- kvm-s390x-tcg-SET-CLOCK-COMPARATOR-can-clear-CKC-interru.patch [bz#1653569] +- kvm-s390x-tcg-implement-SET-CLOCK.patch [bz#1653569] +- kvm-s390x-tcg-rearm-the-CKC-timer-during-migration.patch [bz#1653569] +- kvm-s390x-tcg-fix-locking-problem-with-tcg_s390_tod_upda.patch [bz#1653569] +- kvm-hw-s390x-Include-the-tod-qemu-also-for-builds-with-d.patch [bz#1653569] +- kvm-s390x-tod-Properly-stop-the-KVM-TOD-while-the-guest-.patch [bz#1653569] +- kvm-hw-s390x-Fix-bad-mask-in-time2tod.patch [bz#1653569] +- kvm-migration-discard-non-migratable-RAMBlocks.patch [bz#1539285] +- kvm-vfio-pci-do-not-set-the-PCIDevice-has_rom-attribute.patch [bz#1539285] +- kvm-memory-exec-Expose-all-memory-block-related-flags.patch [bz#1539285] +- kvm-memory-exec-switch-file-ram-allocation-functions-to-.patch [bz#1539285] +- kvm-configure-add-libpmem-support.patch [bz#1539285] +- kvm-hostmem-file-add-the-pmem-option.patch [bz#1539285] +- kvm-mem-nvdimm-ensure-write-persistence-to-PMEM-in-label.patch [bz#1539285] +- kvm-migration-ram-Add-check-and-info-message-to-nvdimm-p.patch [bz#1539285] +- kvm-migration-ram-ensure-write-persistence-on-loading-al.patch [bz#1539285] +- Resolves: bz#1539285 + ([Intel 8.0 Bug] [KVM][Crystal Ridge] Lack of data persistence guarantee of QEMU writes to host PMEM) +- Resolves: bz#1653569 + (Stress guest and stop it, then do live migration, guest hit call trace on destination end) + +* Tue Jan 08 2019 Danilo Cesar Lemes de Paula - 2.12.0-53.el8 +- kvm-ui-add-qapi-parser-for-display.patch [bz#1652871] +- kvm-ui-switch-trivial-displays-to-qapi-parser.patch [bz#1652871] +- kvm-qapi-Add-rendernode-display-option-for-egl-headless.patch [bz#1652871] +- kvm-ui-Allow-specifying-rendernode-display-option-for-eg.patch [bz#1652871] +- kvm-qapi-add-query-display-options-command.patch [bz#1652871] +- Resolves: bz#1652871 + (QEMU doesn't expose rendernode option for egl-headless display type) + +* Fri Jan 04 2019 Danilo Cesar Lemes de Paula - 2.12.0-52.el8 +- kvm-Add-edk2-Requires-to-qemu-kvm.patch [bz#1654276] +- Resolves: bz#1654276 + (qemu-kvm: Should depend on the architecture-appropriate guest firmware) + +* Mon Dec 24 2018 Danilo Cesar Lemes de Paula - 2.12.0-51.el8 +- kvm-x86-host-phys-bits-limit-option.patch [bz#1598284] +- kvm-rhel-Set-host-phys-bits-limit-48-on-rhel-machine-typ.patch [bz#1598284] +- kvm-i386-do-not-migrate-MSR_SMI_COUNT-on-machine-types-2.patch [bz#1659565] +- kvm-pc-x-migrate-smi-count-to-PC_RHEL_COMPAT.patch [bz#1659565] +- kvm-slow-train-kvm-clear-out-KVM_ASYNC_PF_DELIVERY_AS_PF.patch [bz#1656829] +- Resolves: bz#1598284 + ([Intel 8.0 Alpha] physical bits should < 48 when host with 5level paging &EPT5 and qemu command with "-cpu qemu64" parameters.) +- Resolves: bz#1656829 + (8->7 migration failed: qemu-kvm: error: failed to set MSR 0x4b564d02 to 0x27fc13285) +- Resolves: bz#1659565 + (machine type: required compat flag x-migrate-smi-count=off) + +* Tue Dec 18 2018 Danilo Cesar Lemes de Paula - 2.12.0-51 +- kvm-Add-edk2-Requires-to-qemu-kvm.patch [bz#1654276] +- Resolves: bz#1654276 + (qemu-kvm: Should depend on the architecture-appropriate guest firmware) + +* Mon Dec 17 2018 Danilo Cesar Lemes de Paula - +- kvm-redhat-enable-tpmdev-passthrough.patch [bz#1654486] +- Resolves: bz#1654486 + ([RFE] enable TPM passthrough at compile time (qemu-kvm)) + +* Fri Dec 14 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-48 +- kvm-redhat-use-autopatch-instead-of-PATCHAPPLY.patch [bz#1613128] +- kvm-redhat-Removing-some-unused-build-flags-in-the-spec-.patch [bz#1613128] +- kvm-redhat-Fixing-rhev-ma-conflicts.patch [bz#1613126] +- kvm-redhat-Remove-_smp_mflags-cleanup-workaround-for-s39.patch [bz#1613128] +- kvm-redhat-Removing-dead-code-from-the-spec-file.patch [bz#1613128] +- kvm-i386-Add-stibp-flag-name.patch [bz#1639446] +- kvm-Add-functional-acceptance-tests-infrastructure.patch [bz#1655807] +- kvm-scripts-qemu.py-allow-adding-to-the-list-of-extra-ar.patch [bz#1655807] +- kvm-Acceptance-tests-add-quick-VNC-tests.patch [bz#1655807] +- kvm-scripts-qemu.py-introduce-set_console-method.patch [bz#1655807] +- kvm-Acceptance-tests-add-Linux-kernel-boot-and-console-c.patch [bz#1655807] +- kvm-Bootstrap-Python-venv-for-tests.patch [bz#1655807] +- kvm-Acceptance-tests-add-make-rule-for-running-them.patch [bz#1655807] +- Resolves: bz#1613126 + (Check and fix qemu-kvm-rhev and qemu-kvm-ma conflicts in qemu-kvm for rhel-8) +- Resolves: bz#1613128 + (Spec file clean up) +- Resolves: bz#1639446 + (Cross migration from RHEL7.5 to RHEL8 shouldn't fail with cpu flag stibp [qemu-kvm]) +- Resolves: bz#1655807 + (Backport avocado-qemu tests for QEMU 2.12) + +* Tue Dec 11 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-47 +- kvm-Disable-CONFIG_IPMI-and-CONFIG_I2C-for-ppc64.patch [bz#1640044] +- kvm-Disable-CONFIG_CAN_BUS-and-CONFIG_CAN_SJA1000.patch [bz#1640042] +- Resolves: bz#1640042 + (Disable CONFIG_CAN_BUS and CONFIG_CAN_SJA1000 config switches) +- Resolves: bz#1640044 + (Disable CONFIG_I2C and CONFIG_IPMI in default-configs/ppc64-softmmu.mak) + +* Tue Dec 11 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-46 +- kvm-qcow2-Give-the-refcount-cache-the-minimum-possible-s.patch [bz#1656507] +- kvm-docs-Document-the-new-default-sizes-of-the-qcow2-cac.patch [bz#1656507] +- kvm-qcow2-Fix-Coverity-warning-when-calculating-the-refc.patch [bz#1656507] +- kvm-include-Add-IEC-binary-prefixes-in-qemu-units.h.patch [bz#1656507] +- kvm-qcow2-Options-documentation-fixes.patch [bz#1656507] +- kvm-include-Add-a-lookup-table-of-sizes.patch [bz#1656507] +- kvm-qcow2-Make-sizes-more-humanly-readable.patch [bz#1656507] +- kvm-qcow2-Avoid-duplication-in-setting-the-refcount-cach.patch [bz#1656507] +- kvm-qcow2-Assign-the-L2-cache-relatively-to-the-image-si.patch [bz#1656507] +- kvm-qcow2-Increase-the-default-upper-limit-on-the-L2-cac.patch [bz#1656507] +- kvm-qcow2-Resize-the-cache-upon-image-resizing.patch [bz#1656507] +- kvm-qcow2-Set-the-default-cache-clean-interval-to-10-min.patch [bz#1656507] +- kvm-qcow2-Explicit-number-replaced-by-a-constant.patch [bz#1656507] +- kvm-block-backend-Set-werror-rerror-defaults-in-blk_new.patch [bz#1657637] +- kvm-qcow2-Fix-cache-clean-interval-documentation.patch [bz#1656507] +- Resolves: bz#1656507 + ([RHEL.8] qcow2 cache is too small) +- Resolves: bz#1657637 + (Wrong werror default for -device drive=) + +* Thu Dec 06 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-45 +- kvm-target-ppc-add-basic-support-for-PTCR-on-POWER9.patch [bz#1639069] +- kvm-linux-headers-Update-for-nested-KVM-HV-downstream-on.patch [bz#1639069] +- kvm-target-ppc-Add-one-reg-id-for-ptcr.patch [bz#1639069] +- kvm-ppc-spapr_caps-Add-SPAPR_CAP_NESTED_KVM_HV.patch [bz#1639069] +- kvm-Re-enable-CONFIG_HYPERV_TESTDEV.patch [bz#1651195] +- kvm-qxl-use-guest_monitor_config-for-local-renderer.patch [bz#1610163] +- kvm-Declare-cirrus-vga-as-deprecated.patch [bz#1651994] +- kvm-Do-not-build-bluetooth-support.patch [bz#1654651] +- kvm-vfio-helpers-Fix-qemu_vfio_open_pci-crash.patch [bz#1645840] +- kvm-balloon-Allow-multiple-inhibit-users.patch [bz#1650272] +- kvm-Use-inhibit-to-prevent-ballooning-without-synchr.patch [bz#1650272] +- kvm-vfio-Inhibit-ballooning-based-on-group-attachment-to.patch [bz#1650272] +- kvm-vfio-ccw-pci-Allow-devices-to-opt-in-for-ballooning.patch [bz#1650272] +- kvm-vfio-pci-Handle-subsystem-realpath-returning-NULL.patch [bz#1650272] +- kvm-vfio-pci-Fix-failure-to-close-file-descriptor-on-err.patch [bz#1650272] +- kvm-postcopy-Synchronize-usage-of-the-balloon-inhibitor.patch [bz#1650272] +- Resolves: bz#1610163 + (guest shows border blurred screen with some resolutions when qemu boot with -device qxl-vga ,and guest on rhel7.6 has no such question) +- Resolves: bz#1639069 + ([IBM 8.0 FEAT] POWER9 - Nested virtualization in RHEL8.0 KVM for ppc64le - qemu-kvm side) +- Resolves: bz#1645840 + (Qemu core dump when hotplug nvme:// drive via -blockdev) +- Resolves: bz#1650272 + (Ballooning is incompatible with vfio assigned devices, but not prevented) +- Resolves: bz#1651195 + (Re-enable hyperv-testdev device) +- Resolves: bz#1651994 + (Declare the "Cirrus VGA" device emulation of QEMU as deprecated in RHEL8) +- Resolves: bz#1654651 + (Qemu: hw: bt: keep bt/* objects from building [rhel-8.0]) + +* Tue Nov 27 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-44 +- kvm-block-Make-more-block-drivers-compile-time-configura.patch [bz#1598842 bz#1598842] +- kvm-RHEL8-Add-disable-configure-options-to-qemu-spec-fil.patch [bz#1598842] +- Resolves: bz#1598842 + (Compile out unused block drivers) + +* Mon Nov 26 2018 Danilo Cesar Lemes de Paula - qemu-kvm-2.12.0-43 +- kvm-configure-add-test-for-libudev.patch [bz#1636185] +- kvm-qga-linux-report-disk-serial-number.patch [bz#1636185] +- kvm-qga-linux-return-disk-device-in-guest-get-fsinfo.patch [bz#1636185] +- kvm-qemu-error-introduce-error-warn-_report_once.patch [bz#1625173] +- kvm-intel-iommu-start-to-use-error_report_once.patch [bz#1625173] +- kvm-intel-iommu-replace-more-vtd_err_-traces.patch [bz#1625173] +- kvm-intel_iommu-introduce-vtd_reset_caches.patch [bz#1625173] +- kvm-intel_iommu-better-handling-of-dmar-state-switch.patch [bz#1625173] +- kvm-intel_iommu-move-ce-fetching-out-when-sync-shadow.patch [bz#1625173 bz#1629616] +- kvm-intel_iommu-handle-invalid-ce-for-shadow-sync.patch [bz#1625173 bz#1629616] +- kvm-block-remove-bdrv_dirty_bitmap_make_anon.patch [bz#1518989] +- kvm-block-simplify-code-around-releasing-bitmaps.patch [bz#1518989] +- kvm-hbitmap-Add-advance-param-to-hbitmap_iter_next.patch [bz#1518989] +- kvm-test-hbitmap-Add-non-advancing-iter_next-tests.patch [bz#1518989] +- kvm-block-dirty-bitmap-Add-bdrv_dirty_iter_next_area.patch [bz#1518989] +- kvm-blockdev-backup-add-bitmap-argument.patch [bz#1518989] +- kvm-dirty-bitmap-switch-assert-fails-to-errors-in-bdrv_m.patch [bz#1518989] +- kvm-dirty-bitmap-rename-bdrv_undo_clear_dirty_bitmap.patch [bz#1518989] +- kvm-dirty-bitmap-make-it-possible-to-restore-bitmap-afte.patch [bz#1518989] +- kvm-blockdev-rename-block-dirty-bitmap-clear-transaction.patch [bz#1518989] +- kvm-qapi-add-transaction-support-for-x-block-dirty-bitma.patch [bz#1518989] +- kvm-block-dirty-bitmaps-add-user_locked-status-checker.patch [bz#1518989] +- kvm-block-dirty-bitmaps-fix-merge-permissions.patch [bz#1518989] +- kvm-block-dirty-bitmaps-allow-clear-on-disabled-bitmaps.patch [bz#1518989] +- kvm-block-dirty-bitmaps-prohibit-enable-disable-on-locke.patch [bz#1518989] +- kvm-block-backup-prohibit-backup-from-using-in-use-bitma.patch [bz#1518989] +- kvm-nbd-forbid-use-of-frozen-bitmaps.patch [bz#1518989] +- kvm-bitmap-Update-count-after-a-merge.patch [bz#1518989] +- kvm-iotests-169-drop-deprecated-autoload-parameter.patch [bz#1518989] +- kvm-block-qcow2-improve-error-message-in-qcow2_inactivat.patch [bz#1518989] +- kvm-bloc-qcow2-drop-dirty_bitmaps_loaded-state-variable.patch [bz#1518989] +- kvm-dirty-bitmaps-clean-up-bitmaps-loading-and-migration.patch [bz#1518989] +- kvm-iotests-improve-169.patch [bz#1518989] +- kvm-iotests-169-add-cases-for-source-vm-resuming.patch [bz#1518989] +- kvm-pc-dimm-turn-alignment-assert-into-check.patch [bz#1630116] +- Resolves: bz#1518989 + (RFE: QEMU Incremental live backup) +- Resolves: bz#1625173 + ([NVMe Device Assignment] Guest could not boot up with q35+iommu) +- Resolves: bz#1629616 + (boot guest with q35+vIOMMU+ device assignment, qemu terminal shows "qemu-kvm: VFIO_UNMAP_DMA: -22" when return assigned network devices from vfio driver to ixgbe in guest) +- Resolves: bz#1630116 + (pc_dimm_get_free_addr: assertion failed: (QEMU_ALIGN_UP(address_space_start, align) == address_space_start)) +- Resolves: bz#1636185 + ([RFE] Report disk device name and serial number (qemu-guest-agent on Linux)) + +* Mon Nov 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-42.el8 +- kvm-luks-Allow-share-rw-on.patch [bz#1629701] +- kvm-redhat-reenable-gluster-support.patch [bz#1599340] +- kvm-redhat-bump-libusb-requirement.patch [bz#1627970] +- Resolves: bz#1599340 + (Reenable glusterfs in qemu-kvm once BZ#1567292 gets fixed) +- Resolves: bz#1627970 + (symbol lookup error: /usr/libexec/qemu-kvm: undefined symbol: libusb_set_option) +- Resolves: bz#1629701 + ("share-rw=on" does not work for luks format image - Fast Train) + +* Tue Oct 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-41.el8 +- kvm-block-rbd-pull-out-qemu_rbd_convert_options.patch [bz#1635585] +- kvm-block-rbd-Attempt-to-parse-legacy-filenames.patch [bz#1635585] +- kvm-block-rbd-add-deprecation-documentation-for-filename.patch [bz#1635585] +- kvm-block-rbd-add-iotest-for-rbd-legacy-keyvalue-filenam.patch [bz#1635585] +- Resolves: bz#1635585 + (rbd json format of 7.6 is incompatible with 7.5) + +* Tue Oct 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-40.el8 +- kvm-vnc-call-sasl_server_init-only-when-required.patch [bz#1609327] +- kvm-nbd-server-fix-NBD_CMD_CACHE.patch [bz#1636142] +- kvm-nbd-fix-NBD_FLAG_SEND_CACHE-value.patch [bz#1636142] +- kvm-test-bdrv-drain-bdrv_drain-works-with-cross-AioConte.patch [bz#1637976] +- kvm-block-Use-bdrv_do_drain_begin-end-in-bdrv_drain_all.patch [bz#1637976] +- kvm-block-Remove-recursive-parameter-from-bdrv_drain_inv.patch [bz#1637976] +- kvm-block-Don-t-manually-poll-in-bdrv_drain_all.patch [bz#1637976] +- kvm-tests-test-bdrv-drain-bdrv_drain_all-works-in-corout.patch [bz#1637976] +- kvm-block-Avoid-unnecessary-aio_poll-in-AIO_WAIT_WHILE.patch [bz#1637976] +- kvm-block-Really-pause-block-jobs-on-drain.patch [bz#1637976] +- kvm-block-Remove-bdrv_drain_recurse.patch [bz#1637976] +- kvm-test-bdrv-drain-Add-test-for-node-deletion.patch [bz#1637976] +- kvm-block-Drain-recursively-with-a-single-BDRV_POLL_WHIL.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-node-deletion-in-subtree-recurs.patch [bz#1637976] +- kvm-block-Don-t-poll-in-parent-drain-callbacks.patch [bz#1637976] +- kvm-test-bdrv-drain-Graph-change-through-parent-callback.patch [bz#1637976] +- kvm-block-Defer-.bdrv_drain_begin-callback-to-polling-ph.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-that-bdrv_drain_invoke-doesn-t-.patch [bz#1637976] +- kvm-block-Allow-AIO_WAIT_WHILE-with-NULL-ctx.patch [bz#1637976] +- kvm-block-Move-bdrv_drain_all_begin-out-of-coroutine-con.patch [bz#1637976] +- kvm-block-ignore_bds_parents-parameter-for-drain-functio.patch [bz#1637976] +- kvm-block-Allow-graph-changes-in-bdrv_drain_all_begin-en.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-graph-changes-in-drain_all-sect.patch [bz#1637976] +- kvm-block-Poll-after-drain-on-attaching-a-node.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-bdrv_append-to-drained-node.patch [bz#1637976] +- kvm-block-linux-aio-acquire-AioContext-before-qemu_laio_.patch [bz#1637976] +- kvm-util-async-use-qemu_aio_coroutine_enter-in-co_schedu.patch [bz#1637976] +- kvm-job-Fix-nested-aio_poll-hanging-in-job_txn_apply.patch [bz#1637976] +- kvm-job-Fix-missing-locking-due-to-mismerge.patch [bz#1637976] +- kvm-blockjob-Wake-up-BDS-when-job-becomes-idle.patch [bz#1637976] +- kvm-aio-wait-Increase-num_waiters-even-in-home-thread.patch [bz#1637976] +- kvm-test-bdrv-drain-Drain-with-block-jobs-in-an-I-O-thre.patch [bz#1637976] +- kvm-test-blockjob-Acquire-AioContext-around-job_cancel_s.patch [bz#1637976] +- kvm-job-Use-AIO_WAIT_WHILE-in-job_finish_sync.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-AIO_WAIT_WHILE-in-completion-ca.patch [bz#1637976] +- kvm-block-Add-missing-locking-in-bdrv_co_drain_bh_cb.patch [bz#1637976] +- kvm-block-backend-Add-.drained_poll-callback.patch [bz#1637976] +- kvm-block-backend-Fix-potential-double-blk_delete.patch [bz#1637976] +- kvm-block-backend-Decrease-in_flight-only-after-callback.patch [bz#1637976] +- kvm-blockjob-Lie-better-in-child_job_drained_poll.patch [bz#1637976] +- kvm-block-Remove-aio_poll-in-bdrv_drain_poll-variants.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-nested-poll-in-bdrv_drain_poll_.patch [bz#1637976] +- kvm-job-Avoid-deadlocks-in-job_completed_txn_abort.patch [bz#1637976] +- kvm-test-bdrv-drain-AIO_WAIT_WHILE-in-job-.commit-.abort.patch [bz#1637976] +- kvm-test-bdrv-drain-Fix-outdated-comments.patch [bz#1637976] +- kvm-block-Use-a-single-global-AioWait.patch [bz#1637976] +- kvm-test-bdrv-drain-Test-draining-job-source-child-and-p.patch [bz#1637976] +- kvm-qemu-img-Fix-assert-when-mapping-unaligned-raw-file.patch [bz#1639374] +- kvm-iotests-Add-test-221-to-catch-qemu-img-map-regressio.patch [bz#1639374] +- Resolves: bz#1609327 + (qemu-kvm[37046]: Could not find keytab file: /etc/qemu/krb5.tab: Unknown error 49408) +- Resolves: bz#1636142 + (qemu NBD_CMD_CACHE flaws impacting non-qemu NBD clients) +- Resolves: bz#1637976 + (Crashes and hangs with iothreads vs. block jobs) +- Resolves: bz#1639374 + (qemu-img map 'Aborted (core dumped)' when specifying a plain file) + +* Tue Oct 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-39.el8 +- kvm-linux-headers-update.patch [bz#1508142] +- kvm-s390x-cpumodel-Set-up-CPU-model-for-AP-device-suppor.patch [bz#1508142] +- kvm-s390x-kvm-enable-AP-instruction-interpretation-for-g.patch [bz#1508142] +- kvm-s390x-ap-base-Adjunct-Processor-AP-object-model.patch [bz#1508142] +- kvm-s390x-vfio-ap-Introduce-VFIO-AP-device.patch [bz#1508142] +- kvm-s390-doc-detailed-specifications-for-AP-virtualizati.patch [bz#1508142] +- Resolves: bz#1508142 + ([IBM 8.0 FEAT] KVM: Guest-dedicated Crypto Adapters - qemu part) + +* Mon Oct 15 2018 Danilo Cesar Lemes de Paula - 2.12.0-38.el8 +- kvm-Revert-hw-acpi-build-build-SRAT-memory-affinity-stru.patch [bz#1609235] +- kvm-add-udev-kvm-check.patch [bz#1552663] +- kvm-aio-posix-Don-t-count-ctx-notifier-as-progress-when-.patch [bz#1623085] +- kvm-aio-Do-aio_notify_accept-only-during-blocking-aio_po.patch [bz#1623085] +- kvm-aio-posix-fix-concurrent-access-to-poll_disable_cnt.patch [bz#1632622] +- kvm-aio-posix-compute-timeout-before-polling.patch [bz#1632622] +- kvm-aio-posix-do-skip-system-call-if-ctx-notifier-pollin.patch [bz#1632622] +- kvm-intel-iommu-send-PSI-always-even-if-across-PDEs.patch [bz#1450712] +- kvm-intel-iommu-remove-IntelIOMMUNotifierNode.patch [bz#1450712] +- kvm-intel-iommu-add-iommu-lock.patch [bz#1450712] +- kvm-intel-iommu-only-do-page-walk-for-MAP-notifiers.patch [bz#1450712] +- kvm-intel-iommu-introduce-vtd_page_walk_info.patch [bz#1450712] +- kvm-intel-iommu-pass-in-address-space-when-page-walk.patch [bz#1450712] +- kvm-intel-iommu-trace-domain-id-during-page-walk.patch [bz#1450712] +- kvm-util-implement-simple-iova-tree.patch [bz#1450712] +- kvm-intel-iommu-rework-the-page-walk-logic.patch [bz#1450712] +- kvm-i386-define-the-ssbd-CPUID-feature-bit-CVE-2018-3639.patch [bz#1633928] +- Resolves: bz#1450712 + (Booting nested guest with vIOMMU, the assigned network devices can not receive packets (qemu)) +- Resolves: bz#1552663 + (81-kvm-rhel.rules is no longer part of initscripts) +- Resolves: bz#1609235 + (Win2016 guest can't recognize pc-dimm hotplugged to node 0) +- Resolves: bz#1623085 + (VM doesn't boot from HD) +- Resolves: bz#1632622 + (~40% virtio_blk disk performance drop for win2012r2 guest when comparing qemu-kvm-rhev-2.12.0-9 with qemu-kvm-rhev-2.12.0-12) +- Resolves: bz#1633928 + (CVE-2018-3639 qemu-kvm: hw: cpu: speculative store bypass [rhel-8.0]) + +* Fri Oct 12 2018 Danilo Cesar Lemes de Paula - 2.12.0-37.el8 +- kvm-block-for-jobs-do-not-clear-user_paused-until-after-.patch [bz#1635583] +- kvm-iotests-Add-failure-matching-to-common.qemu.patch [bz#1635583] +- kvm-block-iotest-to-catch-abort-on-forced-blockjob-cance.patch [bz#1635583] +- Resolves: bz#1635583 + (Quitting VM causes qemu core dump once the block mirror job paused for no enough target space) + +* Fri Oct 12 2018 Danilo Cesar Lemes de Paula - 2.12.0-36.el8 +- kvm-check-Only-test-ivshm-when-it-is-compiled-in.patch [bz#1621817] +- kvm-Disable-ivshmem.patch [bz#1621817] +- kvm-mirror-Fail-gracefully-for-source-target.patch [bz#1637963] +- kvm-commit-Add-top-node-base-node-options.patch [bz#1637970] +- kvm-qemu-iotests-Test-commit-with-top-node-base-node.patch [bz#1637970] +- Resolves: bz#1621817 + (Disable IVSHMEM in RHEL 8) +- Resolves: bz#1637963 + (Segfault on 'blockdev-mirror' with same node as source and target) +- Resolves: bz#1637970 + (allow using node-names with block-commit) + +* Thu Oct 11 2018 Danilo Cesar Lemes de Paula - 2.12.0-35.el8 +- kvm-redhat-make-the-plugins-executable.patch [bz#1638304] +- Resolves: bz#1638304 + (the driver packages lack all the library Requires) + +* Thu Oct 11 2018 Danilo Cesar Lemes de Paula - 2.12.0-34.el8 +- kvm-seccomp-allow-sched_setscheduler-with-SCHED_IDLE-pol.patch [bz#1618356] +- kvm-seccomp-use-SIGSYS-signal-instead-of-killing-the-thr.patch [bz#1618356] +- kvm-seccomp-prefer-SCMP_ACT_KILL_PROCESS-if-available.patch [bz#1618356] +- kvm-configure-require-libseccomp-2.2.0.patch [bz#1618356] +- kvm-seccomp-set-the-seccomp-filter-to-all-threads.patch [bz#1618356] +- kvm-memory-cleanup-side-effects-of-memory_region_init_fo.patch [bz#1600365] +- Resolves: bz#1600365 + (QEMU core dumped when hotplug memory exceeding host hugepages and with discard-data=yes) +- Resolves: bz#1618356 + (qemu-kvm: Qemu: seccomp: blacklist is not applied to all threads [rhel-8]) + +* Fri Oct 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-33.el8 +- kvm-migration-postcopy-Clear-have_listen_thread.patch [bz#1608765] +- kvm-migration-cleanup-in-error-paths-in-loadvm.patch [bz#1608765] +- kvm-jobs-change-start-callback-to-run-callback.patch [bz#1632939] +- kvm-jobs-canonize-Error-object.patch [bz#1632939] +- kvm-jobs-add-exit-shim.patch [bz#1632939] +- kvm-block-commit-utilize-job_exit-shim.patch [bz#1632939] +- kvm-block-mirror-utilize-job_exit-shim.patch [bz#1632939] +- kvm-jobs-utilize-job_exit-shim.patch [bz#1632939] +- kvm-block-backup-make-function-variables-consistently-na.patch [bz#1632939] +- kvm-jobs-remove-ret-argument-to-job_completed-privatize-.patch [bz#1632939] +- kvm-jobs-remove-job_defer_to_main_loop.patch [bz#1632939] +- kvm-block-commit-add-block-job-creation-flags.patch [bz#1632939] +- kvm-block-mirror-add-block-job-creation-flags.patch [bz#1632939] +- kvm-block-stream-add-block-job-creation-flags.patch [bz#1632939] +- kvm-block-commit-refactor-commit-to-use-job-callbacks.patch [bz#1632939] +- kvm-block-mirror-don-t-install-backing-chain-on-abort.patch [bz#1632939] +- kvm-block-mirror-conservative-mirror_exit-refactor.patch [bz#1632939] +- kvm-block-stream-refactor-stream-to-use-job-callbacks.patch [bz#1632939] +- kvm-tests-blockjob-replace-Blockjob-with-Job.patch [bz#1632939] +- kvm-tests-test-blockjob-remove-exit-callback.patch [bz#1632939] +- kvm-tests-test-blockjob-txn-move-.exit-to-.clean.patch [bz#1632939] +- kvm-jobs-remove-.exit-callback.patch [bz#1632939] +- kvm-qapi-block-commit-expose-new-job-properties.patch [bz#1632939] +- kvm-qapi-block-mirror-expose-new-job-properties.patch [bz#1632939] +- kvm-qapi-block-stream-expose-new-job-properties.patch [bz#1632939] +- kvm-block-backup-qapi-documentation-fixup.patch [bz#1632939] +- kvm-blockdev-document-transactional-shortcomings.patch [bz#1632939] +- Resolves: bz#1608765 + (After postcopy migration, do savevm and loadvm, guest hang and call trace) +- Resolves: bz#1632939 + (qemu blockjobs other than backup do not support job-finalize or job-dismiss) + +* Fri Sep 28 2018 Danilo Cesar Lemes de Paula - 2.12.0-32.el8 +- kvm-Re-enable-disabled-Hyper-V-enlightenments.patch [bz#1625185] +- kvm-Fix-annocheck-issues.patch [bz#1624164] +- kvm-exec-check-that-alignment-is-a-power-of-two.patch [bz#1630746] +- kvm-curl-Make-sslverify-off-disable-host-as-well-as-peer.patch [bz#1575925] +- Resolves: bz#1575925 + ("SSL: no alternative certificate subject name matches target host name" error even though sslverify = off) +- Resolves: bz#1624164 + (Review annocheck distro flag failures in qemu-kvm) +- Resolves: bz#1625185 + (Re-enable disabled Hyper-V enlightenments) +- Resolves: bz#1630746 + (qemu_ram_mmap: Assertion `is_power_of_2(align)' failed) + +* Tue Sep 11 2018 Danilo Cesar Lemes de Paula - 2.12.0-31.el8 +- kvm-i386-Disable-TOPOEXT-by-default-on-cpu-host.patch [bz#1619804] +- kvm-redhat-enable-opengl-add-build-and-runtime-deps.patch [bz#1618412] +- Resolves: bz#1618412 + (Enable opengl (for intel vgpu display)) +- Resolves: bz#1619804 + (kernel panic in init_amd_cacheinfo) + +* Wed Sep 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-30.el8 +- kvm-redhat-Disable-vhost-crypto.patch [bz#1625668] +- Resolves: bz#1625668 + (Decide if we should disable 'vhost-crypto' or not) + +* Wed Sep 05 2018 Danilo Cesar Lemes de Paula - 2.12.0-29.el8 +- kvm-target-i386-sev-fix-memory-leaks.patch [bz#1615717] +- kvm-i386-Fix-arch_query_cpu_model_expansion-leak.patch [bz#1615717] +- kvm-redhat-Update-build-configuration.patch [bz#1573156] +- Resolves: bz#1573156 + (Update build configure for QEMU 2.12.0) +- Resolves: bz#1615717 + (Memory leaks) + +* Tue Sep 04 2018 Danilo Cesar Lemes de Paula - 2.12.0-28.el8 +- kvm-e1000e-Do-not-auto-clear-ICR-bits-which-aren-t-set-i.patch [bz#1596024] +- kvm-e1000e-Prevent-MSI-MSI-X-storms.patch [bz#1596024] +- kvm-Drop-build_configure.sh-and-Makefile.local-files.patch [] +- kvm-Fix-subject-line-in-.gitpublish.patch [] +- Resolves: bz#1596024 + (The network link can't be detected on guest when the guest uses e1000e model type) + +* Wed Aug 29 2018 Danilo Cesar Lemes de Paula - 2.12.0-27.el8 +- kvm-Fix-libusb-1.0.22-deprecated-libusb_set_debug-with-l.patch [bz#1622656] +- Resolves: bz#1622656 + (qemu-kvm fails to build due to libusb_set_debug being deprecated) + +* Fri Aug 17 2018 Danilo Cesar Lemes de Paula - 2.12.0-26.el8 +- kvm-redhat-remove-extra-in-rhel_rhev_conflicts-macro.patch [bz#1618752] +- Resolves: bz#1618752 + (qemu-kvm can't be installed in RHEL-8 as it Conflicts with itself.) + +* Thu Aug 16 2018 Danilo Cesar Lemes de Paula - 2.12.0-25.el8 +- kvm-Migration-TLS-Fix-crash-due-to-double-cleanup.patch [bz#1594384] +- Resolves: bz#1594384 + (2.12 migration fixes) + +* Tue Aug 14 2018 Danilo Cesar Lemes de Paula - 2.12.0-24.el8 +- kvm-Add-qemu-keymap-to-qemu-kvm-common.patch [bz#1593117] +- Resolves: bz#1593117 + (add qemu-keymap utility) + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-23.el8 +- Fixing an issue with some old command in the spec file + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-22.el8 +- Fix an issue with the build_configure script. +- Resolves: bz#1425820 + (Improve QEMU packaging layout with modularization of the block layer) + + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-20.el8 +- kvm-migration-stop-compressing-page-in-migration-thread.patch [bz#1594384] +- kvm-migration-stop-compression-to-allocate-and-free-memo.patch [bz#1594384] +- kvm-migration-stop-decompression-to-allocate-and-free-me.patch [bz#1594384] +- kvm-migration-detect-compression-and-decompression-error.patch [bz#1594384] +- kvm-migration-introduce-control_save_page.patch [bz#1594384] +- kvm-migration-move-some-code-to-ram_save_host_page.patch [bz#1594384] +- kvm-migration-move-calling-control_save_page-to-the-comm.patch [bz#1594384] +- kvm-migration-move-calling-save_zero_page-to-the-common-.patch [bz#1594384] +- kvm-migration-introduce-save_normal_page.patch [bz#1594384] +- kvm-migration-remove-ram_save_compressed_page.patch [bz#1594384] +- kvm-migration-block-dirty-bitmap-fix-memory-leak-in-dirt.patch [bz#1594384] +- kvm-migration-fix-saving-normal-page-even-if-it-s-been-c.patch [bz#1594384] +- kvm-migration-update-index-field-when-delete-or-qsort-RD.patch [bz#1594384] +- kvm-migration-introduce-decompress-error-check.patch [bz#1594384] +- kvm-migration-Don-t-activate-block-devices-if-using-S.patch [bz#1594384] +- kvm-migration-not-wait-RDMA_CM_EVENT_DISCONNECTED-event-.patch [bz#1594384] +- kvm-migration-block-dirty-bitmap-fix-dirty_bitmap_load.patch [bz#1594384] +- kvm-s390x-add-RHEL-7.6-machine-type-for-ccw.patch [bz#1595718] +- kvm-s390x-cpumodel-default-enable-bpb-and-ppa15-for-z196.patch [bz#1595718] +- kvm-linux-headers-asm-s390-kvm.h-header-sync.patch [bz#1612938] +- kvm-s390x-kvm-add-etoken-facility.patch [bz#1612938] +- Resolves: bz#1594384 + (2.12 migration fixes) +- Resolves: bz#1595718 + (Add ppa15/bpb to the default cpu model for z196 and higher in the 7.6 s390-ccw-virtio machine) +- Resolves: bz#1612938 + (Add etoken support to qemu-kvm for s390x KVM guests) + +* Fri Aug 10 2018 Danilo Cesar Lemes de Paula - 2.12.0-18.el8 + Mass import from RHEL 7.6 qemu-kvm-rhev, including fixes to the following BZs: + +- kvm-AArch64-Add-virt-rhel7.6-machine-type.patch [bz#1558723] +- kvm-cpus-Fix-event-order-on-resume-of-stopped-guest.patch [bz#1566153] +- kvm-qemu-img-Check-post-truncation-size.patch [bz#1523065] +- kvm-vga-catch-depth-0.patch [bz#1575541] +- kvm-Fix-x-hv-max-vps-compat-value-for-7.4-machine-type.patch [bz#1583959] +- kvm-ccid-card-passthru-fix-regression-in-realize.patch [bz#1584984] +- kvm-Use-4-MB-vram-for-cirrus.patch [bz#1542080] +- kvm-spapr_pci-Remove-unhelpful-pagesize-warning.patch [bz#1505664] +- kvm-rpm-Add-nvme-VFIO-driver-to-rw-whitelist.patch [bz#1416180] +- kvm-qobject-Use-qobject_to-instead-of-type-cast.patch [bz#1557995] +- kvm-qobject-Ensure-base-is-at-offset-0.patch [bz#1557995] +- kvm-qobject-use-a-QObjectBase_-struct.patch [bz#1557995] +- kvm-qobject-Replace-qobject_incref-QINCREF-qobject_decre.patch [bz#1557995] +- kvm-qobject-Modify-qobject_ref-to-return-obj.patch [bz#1557995] +- kvm-rbd-Drop-deprecated-drive-parameter-filename.patch [bz#1557995] +- kvm-iscsi-Drop-deprecated-drive-parameter-filename.patch [bz#1557995] +- kvm-block-Add-block-specific-QDict-header.patch [bz#1557995] +- kvm-qobject-Move-block-specific-qdict-code-to-block-qdic.patch [bz#1557995] +- kvm-block-Fix-blockdev-for-certain-non-string-scalars.patch [bz#1557995] +- kvm-block-Fix-drive-for-certain-non-string-scalars.patch [bz#1557995] +- kvm-block-Clean-up-a-misuse-of-qobject_to-in-.bdrv_co_cr.patch [bz#1557995] +- kvm-block-Factor-out-qobject_input_visitor_new_flat_conf.patch [bz#1557995] +- kvm-block-Make-remaining-uses-of-qobject-input-visitor-m.patch [bz#1557995] +- kvm-block-qdict-Simplify-qdict_flatten_qdict.patch [bz#1557995] +- kvm-block-qdict-Tweak-qdict_flatten_qdict-qdict_flatten_.patch [bz#1557995] +- kvm-block-qdict-Clean-up-qdict_crumple-a-bit.patch [bz#1557995] +- kvm-block-qdict-Simplify-qdict_is_list-some.patch [bz#1557995] +- kvm-check-block-qdict-Rename-qdict_flatten-s-variables-f.patch [bz#1557995] +- kvm-check-block-qdict-Cover-flattening-of-empty-lists-an.patch [bz#1557995] +- kvm-block-Fix-blockdev-blockdev-add-for-empty-objects-an.patch [bz#1557995] +- kvm-rbd-New-parameter-auth-client-required.patch [bz#1557995] +- kvm-rbd-New-parameter-key-secret.patch [bz#1557995] +- kvm-block-mirror-honor-ratelimit-again.patch [bz#1572856] +- kvm-block-mirror-Make-cancel-always-cancel-pre-READY.patch [bz#1572856] +- kvm-iotests-Add-test-for-cancelling-a-mirror-job.patch [bz#1572856] +- kvm-iotests-Split-214-off-of-122.patch [bz#1518738] +- kvm-block-Add-COR-filter-driver.patch [bz#1518738] +- kvm-block-BLK_PERM_WRITE-includes-._UNCHANGED.patch [bz#1518738] +- kvm-block-Add-BDRV_REQ_WRITE_UNCHANGED-flag.patch [bz#1518738] +- kvm-block-Set-BDRV_REQ_WRITE_UNCHANGED-for-COR-writes.patch [bz#1518738] +- kvm-block-quorum-Support-BDRV_REQ_WRITE_UNCHANGED.patch [bz#1518738] +- kvm-block-Support-BDRV_REQ_WRITE_UNCHANGED-in-filters.patch [bz#1518738] +- kvm-iotests-Clean-up-wrap-image-in-197.patch [bz#1518738] +- kvm-iotests-Copy-197-for-COR-filter-driver.patch [bz#1518738] +- kvm-iotests-Add-test-for-COR-across-nodes.patch [bz#1518738] +- kvm-qemu-io-Use-purely-string-blockdev-options.patch [bz#1576598] +- kvm-qemu-img-Use-only-string-options-in-img_open_opts.patch [bz#1576598] +- kvm-iotests-Add-test-for-U-force-share-conflicts.patch [bz#1576598] +- kvm-qemu-io-Drop-command-functions-return-values.patch [bz#1519617] +- kvm-qemu-io-Let-command-functions-return-error-code.patch [bz#1519617] +- kvm-qemu-io-Exit-with-error-when-a-command-failed.patch [bz#1519617] +- kvm-iotests.py-Add-qemu_io_silent.patch [bz#1519617] +- kvm-iotests-Let-216-make-use-of-qemu-io-s-exit-code.patch [bz#1519617] +- kvm-qcow2-Repair-OFLAG_COPIED-when-fixing-leaks.patch [bz#1527085] +- kvm-iotests-Repairing-error-during-snapshot-deletion.patch [bz#1527085] +- kvm-block-Make-bdrv_is_writable-public.patch [bz#1588039] +- kvm-qcow2-Do-not-mark-inactive-images-corrupt.patch [bz#1588039] +- kvm-iotests-Add-case-for-a-corrupted-inactive-image.patch [bz#1588039] +- kvm-main-loop-drop-spin_counter.patch [bz#1168213] +- kvm-target-ppc-Factor-out-the-parsing-in-kvmppc_get_cpu_.patch [bz#1560847] +- kvm-target-ppc-Don-t-require-private-l1d-cache-on-POWER8.patch [bz#1560847] +- kvm-ppc-spapr_caps-Don-t-disable-cap_cfpc-on-POWER8-by-d.patch [bz#1560847] +- kvm-qxl-fix-local-renderer-crash.patch [bz#1567733] +- kvm-qemu-img-Amendment-support-implies-create_opts.patch [bz#1537956] +- kvm-block-Add-Error-parameter-to-bdrv_amend_options.patch [bz#1537956] +- kvm-qemu-option-Pull-out-Supported-options-print.patch [bz#1537956] +- kvm-qemu-img-Add-print_amend_option_help.patch [bz#1537956] +- kvm-qemu-img-Recognize-no-creation-support-in-o-help.patch [bz#1537956] +- kvm-iotests-Test-help-option-for-unsupporting-formats.patch [bz#1537956] +- kvm-iotests-Rework-113.patch [bz#1537956] +- kvm-qemu-img-Resolve-relative-backing-paths-in-rebase.patch [bz#1569835] +- kvm-iotests-Add-test-for-rebasing-with-relative-paths.patch [bz#1569835] +- kvm-qemu-img-Special-post-backing-convert-handling.patch [bz#1527898] +- kvm-iotests-Test-post-backing-convert-target-behavior.patch [bz#1527898] +- kvm-migration-calculate-expected_downtime-with-ram_bytes.patch [bz#1564576] +- kvm-sheepdog-Fix-sd_co_create_opts-memory-leaks.patch [bz#1513543] +- kvm-qemu-iotests-reduce-chance-of-races-in-185.patch [bz#1513543] +- kvm-blockjob-do-not-cancel-timer-in-resume.patch [bz#1513543] +- kvm-nfs-Fix-error-path-in-nfs_options_qdict_to_qapi.patch [bz#1513543] +- kvm-nfs-Remove-processed-options-from-QDict.patch [bz#1513543] +- kvm-blockjob-drop-block_job_pause-resume_all.patch [bz#1513543] +- kvm-blockjob-expose-error-string-via-query.patch [bz#1513543] +- kvm-blockjob-Fix-assertion-in-block_job_finalize.patch [bz#1513543] +- kvm-blockjob-Wrappers-for-progress-counter-access.patch [bz#1513543] +- kvm-blockjob-Move-RateLimit-to-BlockJob.patch [bz#1513543] +- kvm-blockjob-Implement-block_job_set_speed-centrally.patch [bz#1513543] +- kvm-blockjob-Introduce-block_job_ratelimit_get_delay.patch [bz#1513543] +- kvm-blockjob-Add-block_job_driver.patch [bz#1513543] +- kvm-blockjob-Update-block-job-pause-resume-documentation.patch [bz#1513543] +- kvm-blockjob-Improve-BlockJobInfo.offset-len-documentati.patch [bz#1513543] +- kvm-job-Create-Job-JobDriver-and-job_create.patch [bz#1513543] +- kvm-job-Rename-BlockJobType-into-JobType.patch [bz#1513543] +- kvm-job-Add-JobDriver.job_type.patch [bz#1513543] +- kvm-job-Add-job_delete.patch [bz#1513543] +- kvm-job-Maintain-a-list-of-all-jobs.patch [bz#1513543] +- kvm-job-Move-state-transitions-to-Job.patch [bz#1513543] +- kvm-job-Add-reference-counting.patch [bz#1513543] +- kvm-job-Move-cancelled-to-Job.patch [bz#1513543] +- kvm-job-Add-Job.aio_context.patch [bz#1513543] +- kvm-job-Move-defer_to_main_loop-to-Job.patch [bz#1513543] +- kvm-job-Move-coroutine-and-related-code-to-Job.patch [bz#1513543] +- kvm-job-Add-job_sleep_ns.patch [bz#1513543] +- kvm-job-Move-pause-resume-functions-to-Job.patch [bz#1513543] +- kvm-job-Replace-BlockJob.completed-with-job_is_completed.patch [bz#1513543] +- kvm-job-Move-BlockJobCreateFlags-to-Job.patch [bz#1513543] +- kvm-blockjob-Split-block_job_event_pending.patch [bz#1513543] +- kvm-job-Add-job_event_.patch [bz#1513543] +- kvm-job-Move-single-job-finalisation-to-Job.patch [bz#1513543] +- kvm-job-Convert-block_job_cancel_async-to-Job.patch [bz#1513543] +- kvm-job-Add-job_drain.patch [bz#1513543] +- kvm-job-Move-.complete-callback-to-Job.patch [bz#1513543] +- kvm-job-Move-job_finish_sync-to-Job.patch [bz#1513543] +- kvm-job-Switch-transactions-to-JobTxn.patch [bz#1513543] +- kvm-job-Move-transactions-to-Job.patch [bz#1513543] +- kvm-job-Move-completion-and-cancellation-to-Job.patch [bz#1513543] +- kvm-block-Cancel-job-in-bdrv_close_all-callers.patch [bz#1513543] +- kvm-job-Add-job_yield.patch [bz#1513543] +- kvm-job-Add-job_dismiss.patch [bz#1513543] +- kvm-job-Add-job_is_ready.patch [bz#1513543] +- kvm-job-Add-job_transition_to_ready.patch [bz#1513543] +- kvm-job-Move-progress-fields-to-Job.patch [bz#1513543] +- kvm-job-Introduce-qapi-job.json.patch [bz#1513543] +- kvm-job-Add-JOB_STATUS_CHANGE-QMP-event.patch [bz#1513543] +- kvm-job-Add-lifecycle-QMP-commands.patch [bz#1513543] +- kvm-job-Add-query-jobs-QMP-command.patch [bz#1513543] +- kvm-blockjob-Remove-BlockJob.driver.patch [bz#1513543] +- kvm-iotests-Move-qmp_to_opts-to-VM.patch [bz#1513543] +- kvm-qemu-iotests-Test-job-with-block-jobs.patch [bz#1513543] +- kvm-vdi-Fix-vdi_co_do_create-return-value.patch [bz#1513543] +- kvm-vhdx-Fix-vhdx_co_create-return-value.patch [bz#1513543] +- kvm-job-Add-error-message-for-failing-jobs.patch [bz#1513543] +- kvm-block-create-Make-x-blockdev-create-a-job.patch [bz#1513543] +- kvm-qemu-iotests-Add-VM.get_qmp_events_filtered.patch [bz#1513543] +- kvm-qemu-iotests-Add-VM.qmp_log.patch [bz#1513543] +- kvm-qemu-iotests-Add-iotests.img_info_log.patch [bz#1513543] +- kvm-qemu-iotests-Add-VM.run_job.patch [bz#1513543] +- kvm-qemu-iotests-iotests.py-helper-for-non-file-protocol.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-206-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-207-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-210-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-211-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-212-for-blockdev-create-job.patch [bz#1513543] +- kvm-qemu-iotests-Rewrite-213-for-blockdev-create-job.patch [bz#1513543] +- kvm-block-create-Mark-blockdev-create-stable.patch [bz#1513543] +- kvm-jobs-fix-stale-wording.patch [bz#1513543] +- kvm-jobs-fix-verb-references-in-docs.patch [bz#1513543] +- kvm-iotests-Fix-219-s-timing.patch [bz#1513543] +- kvm-iotests-improve-pause_job.patch [bz#1513543] +- kvm-rpm-Whitelist-copy-on-read-block-driver.patch [bz#1518738] +- kvm-rpm-add-throttle-driver-to-rw-whitelist.patch [bz#1591076] +- kvm-usb-host-skip-open-on-pending-postload-bh.patch [bz#1572851] +- kvm-i386-Define-the-Virt-SSBD-MSR-and-handling-of-it-CVE.patch [bz#1574216] +- kvm-i386-define-the-AMD-virt-ssbd-CPUID-feature-bit-CVE-.patch [bz#1574216] +- kvm-block-file-posix-Pass-FD-to-locking-helpers.patch [bz#1519144] +- kvm-block-file-posix-File-locking-during-creation.patch [bz#1519144] +- kvm-iotests-Add-creation-test-to-153.patch [bz#1519144] +- kvm-vhost-user-add-Net-prefix-to-internal-state-structur.patch [bz#1526645] +- kvm-virtio-support-setting-memory-region-based-host-noti.patch [bz#1526645] +- kvm-vhost-user-support-receiving-file-descriptors-in-sla.patch [bz#1526645] +- kvm-osdep-add-wait.h-compat-macros.patch [bz#1526645] +- kvm-vhost-user-bridge-support-host-notifier.patch [bz#1526645] +- kvm-vhost-allow-backends-to-filter-memory-sections.patch [bz#1526645] +- kvm-vhost-user-allow-slave-to-send-fds-via-slave-channel.patch [bz#1526645] +- kvm-vhost-user-introduce-shared-vhost-user-state.patch [bz#1526645] +- kvm-vhost-user-support-registering-external-host-notifie.patch [bz#1526645] +- kvm-libvhost-user-support-host-notifier.patch [bz#1526645] +- kvm-block-Introduce-API-for-copy-offloading.patch [bz#1482537] +- kvm-raw-Check-byte-range-uniformly.patch [bz#1482537] +- kvm-raw-Implement-copy-offloading.patch [bz#1482537] +- kvm-qcow2-Implement-copy-offloading.patch [bz#1482537] +- kvm-file-posix-Implement-bdrv_co_copy_range.patch [bz#1482537] +- kvm-iscsi-Query-and-save-device-designator-when-opening.patch [bz#1482537] +- kvm-iscsi-Create-and-use-iscsi_co_wait_for_task.patch [bz#1482537] +- kvm-iscsi-Implement-copy-offloading.patch [bz#1482537] +- kvm-block-backend-Add-blk_co_copy_range.patch [bz#1482537] +- kvm-qemu-img-Convert-with-copy-offloading.patch [bz#1482537] +- kvm-qcow2-Fix-src_offset-in-copy-offloading.patch [bz#1482537] +- kvm-iscsi-Don-t-blindly-use-designator-length-in-respons.patch [bz#1482537] +- kvm-file-posix-Fix-EINTR-handling.patch [bz#1482537] +- kvm-usb-storage-Add-rerror-werror-properties.patch [bz#1595180] +- kvm-numa-clarify-error-message-when-node-index-is-out-of.patch [bz#1578381] +- kvm-qemu-iotests-Update-026.out.nocache-reference-output.patch [bz#1528541] +- kvm-qcow2-Free-allocated-clusters-on-write-error.patch [bz#1528541] +- kvm-qemu-iotests-Test-qcow2-not-leaking-clusters-on-writ.patch [bz#1528541] +- kvm-qemu-options-Add-missing-newline-to-accel-help-text.patch [bz#1586313] +- kvm-xhci-fix-guest-triggerable-assert.patch [bz#1594135] +- kvm-virtio-gpu-tweak-scanout-disable.patch [bz#1589634] +- kvm-virtio-gpu-update-old-resource-too.patch [bz#1589634] +- kvm-virtio-gpu-disable-scanout-when-backing-resource-is-.patch [bz#1589634] +- kvm-block-Don-t-silently-truncate-node-names.patch [bz#1549654] +- kvm-pr-helper-fix-socket-path-default-in-help.patch [bz#1533158] +- kvm-pr-helper-fix-assertion-failure-on-failed-multipath-.patch [bz#1533158] +- kvm-pr-manager-helper-avoid-SIGSEGV-when-writing-to-the-.patch [bz#1533158] +- kvm-pr-manager-put-stubs-in-.c-file.patch [bz#1533158] +- kvm-pr-manager-add-query-pr-managers-QMP-command.patch [bz#1533158] +- kvm-pr-manager-helper-report-event-on-connection-disconn.patch [bz#1533158] +- kvm-pr-helper-avoid-error-on-PR-IN-command-with-zero-req.patch [bz#1533158] +- kvm-pr-helper-Rework-socket-path-handling.patch [bz#1533158] +- kvm-pr-manager-helper-fix-memory-leak-on-event.patch [bz#1533158] +- kvm-object-fix-OBJ_PROP_LINK_UNREF_ON_RELEASE-ambivalenc.patch [bz#1556678] +- kvm-usb-hcd-xhci-test-add-a-test-for-ccid-hotplug.patch [bz#1556678] +- kvm-Revert-usb-release-the-created-buses.patch [bz#1556678] +- kvm-file-posix-Fix-creation-locking.patch [bz#1599335] +- kvm-file-posix-Unlock-FD-after-creation.patch [bz#1599335] +- kvm-ahci-trim-signatures-on-raise-lower.patch [bz#1584914] +- kvm-ahci-fix-PxCI-register-race.patch [bz#1584914] +- kvm-ahci-don-t-schedule-unnecessary-BH.patch [bz#1584914] +- kvm-qcow2-Fix-qcow2_truncate-error-return-value.patch [bz#1595173] +- kvm-block-Convert-.bdrv_truncate-callback-to-coroutine_f.patch [bz#1595173] +- kvm-qcow2-Remove-coroutine-trampoline-for-preallocate_co.patch [bz#1595173] +- kvm-block-Move-bdrv_truncate-implementation-to-io.c.patch [bz#1595173] +- kvm-block-Use-tracked-request-for-truncate.patch [bz#1595173] +- kvm-file-posix-Make-.bdrv_co_truncate-asynchronous.patch [bz#1595173] +- kvm-block-Fix-copy-on-read-crash-with-partial-final-clus.patch [bz#1590640] +- kvm-block-fix-QEMU-crash-with-scsi-hd-and-drive_del.patch [bz#1599515] +- kvm-virtio-rng-process-pending-requests-on-DRIVER_OK.patch [bz#1576743] +- kvm-file-posix-specify-expected-filetypes.patch [bz#1525829] +- kvm-iotests-add-test-226-for-file-driver-types.patch [bz#1525829] +- kvm-block-dirty-bitmap-add-lock-to-bdrv_enable-disable_d.patch [bz#1207657] +- kvm-qapi-add-x-block-dirty-bitmap-enable-disable.patch [bz#1207657] +- kvm-qmp-transaction-support-for-x-block-dirty-bitmap-ena.patch [bz#1207657] +- kvm-qapi-add-x-block-dirty-bitmap-merge.patch [bz#1207657] +- kvm-qapi-add-disabled-parameter-to-block-dirty-bitmap-ad.patch [bz#1207657] +- kvm-block-dirty-bitmap-add-bdrv_enable_dirty_bitmap_lock.patch [bz#1207657] +- kvm-dirty-bitmap-fix-double-lock-on-bitmap-enabling.patch [bz#1207657] +- kvm-block-qcow2-bitmap-fix-free_bitmap_clusters.patch [bz#1207657] +- kvm-qcow2-add-overlap-check-for-bitmap-directory.patch [bz#1207657] +- kvm-blockdev-enable-non-root-nodes-for-backup-source.patch [bz#1207657] +- kvm-iotests-add-222-to-test-basic-fleecing.patch [bz#1207657] +- kvm-qcow2-Remove-dead-check-on-ret.patch [bz#1207657] +- kvm-block-Move-request-tracking-to-children-in-copy-offl.patch [bz#1207657] +- kvm-block-Fix-parameter-checking-in-bdrv_co_copy_range_i.patch [bz#1207657] +- kvm-block-Honour-BDRV_REQ_NO_SERIALISING-in-copy-range.patch [bz#1207657] +- kvm-backup-Use-copy-offloading.patch [bz#1207657] +- kvm-block-backup-disable-copy-offloading-for-backup.patch [bz#1207657] +- kvm-iotests-222-Don-t-run-with-luks.patch [bz#1207657] +- kvm-block-io-fix-copy_range.patch [bz#1207657] +- kvm-block-split-flags-in-copy_range.patch [bz#1207657] +- kvm-block-add-BDRV_REQ_SERIALISING-flag.patch [bz#1207657] +- kvm-block-backup-fix-fleecing-scheme-use-serialized-writ.patch [bz#1207657] +- kvm-nbd-server-Reject-0-length-block-status-request.patch [bz#1207657] +- kvm-nbd-server-fix-trace.patch [bz#1207657] +- kvm-nbd-server-refactor-NBDExportMetaContexts.patch [bz#1207657] +- kvm-nbd-server-add-nbd_meta_empty_or_pattern-helper.patch [bz#1207657] +- kvm-nbd-server-implement-dirty-bitmap-export.patch [bz#1207657] +- kvm-qapi-new-qmp-command-nbd-server-add-bitmap.patch [bz#1207657] +- kvm-docs-interop-add-nbd.txt.patch [bz#1207657] +- kvm-nbd-server-introduce-NBD_CMD_CACHE.patch [bz#1207657] +- kvm-nbd-server-Silence-gcc-false-positive.patch [bz#1207657] +- kvm-nbd-server-Fix-dirty-bitmap-logic-regression.patch [bz#1207657] +- kvm-nbd-server-fix-nbd_co_send_block_status.patch [bz#1207657] +- kvm-nbd-client-Add-x-dirty-bitmap-to-query-bitmap-from-s.patch [bz#1207657] +- kvm-iotests-New-test-223-for-exporting-dirty-bitmap-over.patch [bz#1207657] +- kvm-hw-char-serial-Only-retry-if-qemu_chr_fe_write-retur.patch [bz#1592817] +- kvm-hw-char-serial-retry-write-if-EAGAIN.patch [bz#1592817] +- kvm-throttle-groups-fix-hang-when-group-member-leaves.patch [bz#1535914] +- kvm-Disable-aarch64-devices-reappeared-after-2.12-rebase.patch [bz#1586357] +- kvm-Disable-split-irq-device.patch [bz#1586357] +- kvm-Disable-AT24Cx-i2c-eeprom.patch [bz#1586357] +- kvm-Disable-CAN-bus-devices.patch [bz#1586357] +- kvm-Disable-new-superio-devices.patch [bz#1586357] +- kvm-Disable-new-pvrdma-device.patch [bz#1586357] +- kvm-qdev-add-HotplugHandler-post_plug-callback.patch [bz#1607891] +- kvm-virtio-scsi-fix-hotplug-reset-vs-event-race.patch [bz#1607891] +- kvm-e1000-Fix-tso_props-compat-for-82540em.patch [bz#1608778] +- kvm-slirp-correct-size-computation-while-concatenating-m.patch [bz#1586255] +- kvm-s390x-sclp-fix-maxram-calculation.patch [bz#1595740] +- kvm-redhat-Make-gitpublish-profile-the-default-one.patch [bz#1425820] +- Resolves: bz#1168213 + (main-loop: WARNING: I/O thread spun for 1000 iterations while doing stream block device.) +- Resolves: bz#1207657 + (RFE: QEMU Incremental live backup - push and pull modes) +- Resolves: bz#1416180 + (QEMU VFIO based block driver for NVMe devices) +- Resolves: bz#1425820 + (Improve QEMU packaging layout with modularization of the block layer) +- Resolves: bz#1482537 + ([RFE] qemu-img copy-offloading (convert command)) +- Resolves: bz#1505664 + ("qemu-kvm: System page size 0x1000000 is not enabled in page_size_mask (0x11000). Performance may be slow" show up while using hugepage as guest's memory) +- Resolves: bz#1513543 + ([RFE] Add block job to create format on a storage device) +- Resolves: bz#1518738 + (Add 'copy-on-read' filter driver for use with blockdev-add) +- Resolves: bz#1519144 + (qemu-img: image locking doesn't cover image creation) +- Resolves: bz#1519617 + (The exit code should be non-zero when qemu-io reports an error) +- Resolves: bz#1523065 + ("qemu-img resize" should fail to decrease the size of logical partition/lvm/iSCSI image with raw format) +- Resolves: bz#1525829 + (can not boot up a scsi-block passthrough disk via -blockdev with error "cannot get SG_IO version number: Operation not supported. Is this a SCSI device?") +- Resolves: bz#1526645 + ([Intel 7.6 FEAT] vHost Data Plane Acceleration (vDPA) - vhost user client - qemu-kvm-rhev) +- Resolves: bz#1527085 + (The copied flag should be updated during '-r leaks') +- Resolves: bz#1527898 + ([RFE] qemu-img should leave cluster unallocated if it's read as zero throughout the backing chain) +- Resolves: bz#1528541 + (qemu-img check reports tons of leaked clusters after re-start nfs service to resume writing data in guest) +- Resolves: bz#1533158 + (QEMU support for libvirtd restarting qemu-pr-helper) +- Resolves: bz#1535914 + (Disable io throttling for one member disk of a group during io will induce the other one hang with io) +- Resolves: bz#1537956 + (RFE: qemu-img amend should list the true supported options) +- Resolves: bz#1542080 + (Qemu core dump at cirrus_invalidate_region) +- Resolves: bz#1549654 + (Reject node-names which would be truncated by the block layer commands) +- Resolves: bz#1556678 + (Hot plug usb-ccid for the 2nd time with the same ID as the 1st time failed) +- Resolves: bz#1557995 + (QAPI schema for RBD storage misses the 'password-secret' option) +- Resolves: bz#1558723 + (Create RHEL-7.6 QEMU machine type for AArch64) +- Resolves: bz#1560847 + ([Power8][FW b0320a_1812.861][rhel7.5rc2 3.10.0-861.el7.ppc64le][qemu-kvm-{ma,rhev}-2.10.0-21.el7_5.1.ppc64le] KVM guest does not default to ori type flush even with pseries-rhel7.5.0-sxxm) +- Resolves: bz#1564576 + (Pegas 1.1 - Require to backport qemu-kvm patch that fixes expected_downtime calculation during migration) +- Resolves: bz#1566153 + (IOERROR pause code lost after resuming a VM while I/O error is still present) +- Resolves: bz#1567733 + (qemu abort when migrate during guest reboot) +- Resolves: bz#1569835 + (qemu-img get wrong backing file path after rebasing image with relative path) +- Resolves: bz#1572851 + (Core dumped after migration when with usb-host) +- Resolves: bz#1572856 + ('block-job-cancel' can not cancel a "drive-mirror" job) +- Resolves: bz#1574216 + (CVE-2018-3639 qemu-kvm-rhev: hw: cpu: speculative store bypass [rhel-7.6]) +- Resolves: bz#1575541 + (qemu core dump while installing win10 guest) +- Resolves: bz#1576598 + (Segfault in qemu-io and qemu-img with -U --image-opts force-share=off) +- Resolves: bz#1576743 + (virtio-rng hangs when running on recent (2.x) QEMU versions) +- Resolves: bz#1578381 + (Error message need update when specify numa distance with node index >=128) +- Resolves: bz#1583959 + (Incorrect vcpu count limit for 7.4 machine types for windows guests) +- Resolves: bz#1584914 + (SATA emulator lags and hangs) +- Resolves: bz#1584984 + (Vm starts failed with 'passthrough' smartcard) +- Resolves: bz#1586255 + (CVE-2018-11806 qemu-kvm-rhev: QEMU: slirp: heap buffer overflow while reassembling fragmented datagrams [rhel-7.6]) +- Resolves: bz#1586313 + (-smp option is not easily found in the output of qemu help) +- Resolves: bz#1586357 + (Disable new devices in 2.12) +- Resolves: bz#1588039 + (Possible assertion failure in qemu when a corrupted image is used during an incoming migration) +- Resolves: bz#1589634 + (Migration failed when rebooting guest with multiple virtio videos) +- Resolves: bz#1590640 + (qemu-kvm: block/io.c:1098: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed.) +- Resolves: bz#1591076 + (The driver of 'throttle' is not whitelisted) +- Resolves: bz#1592817 + (Retrying on serial_xmit if the pipe is broken may compromise the Guest) +- Resolves: bz#1594135 + (system_reset many times linux guests cause qemu process Aborted) +- Resolves: bz#1595173 + (blockdev-create is blocking) +- Resolves: bz#1595180 + (Can't set rerror/werror with usb-storage) +- Resolves: bz#1595740 + (RHEL-Alt-7.6 - qemu has error during migration of larger guests) +- Resolves: bz#1599335 + (Image creation locking is too tight and is not properly released) +- Resolves: bz#1599515 + (qemu core-dump with aio_read via hmp (util/qemu-thread-posix.c:64: qemu_mutex_lock_impl: Assertion `mutex->initialized' failed)) +- Resolves: bz#1607891 + (Hotplug events are sometimes lost with virtio-scsi + iothread) +- Resolves: bz#1608778 + (qemu/migration: migrate failed from RHEL.7.6 to RHEL.7.5 with e1000-82540em) + +* Mon Aug 06 2018 Danilo Cesar Lemes de Paula - 2.12.0-17.el8 +- kvm-linux-headers-Update-to-include-KVM_CAP_S390_HPAGE_1.patch [bz#1610906] +- kvm-s390x-Enable-KVM-huge-page-backing-support.patch [bz#1610906] +- kvm-redhat-s390x-add-hpage-1-to-kvm.conf.patch [bz#1610906] +- Resolves: bz#1610906 + ([IBM 8.0 FEAT] KVM: Huge Pages - libhugetlbfs Enablement - qemu-kvm part) + +* Tue Jul 31 2018 Danilo Cesar Lemes de Paula - 2.12.0-16.el8 +- kvm-spapr-Correct-inverted-test-in-spapr_pc_dimm_node.patch [bz#1601671] +- kvm-osdep-powerpc64-align-memory-to-allow-2MB-radix-THP-.patch [bz#1601317] +- kvm-RHEL-8.0-Add-pseries-rhel7.6.0-sxxm-machine-type.patch [bz#1595501] +- kvm-i386-Helpers-to-encode-cache-information-consistentl.patch [bz#1597739] +- kvm-i386-Add-cache-information-in-X86CPUDefinition.patch [bz#1597739] +- kvm-i386-Initialize-cache-information-for-EPYC-family-pr.patch [bz#1597739] +- kvm-i386-Add-new-property-to-control-cache-info.patch [bz#1597739] +- kvm-i386-Clean-up-cache-CPUID-code.patch [bz#1597739] +- kvm-i386-Populate-AMD-Processor-Cache-Information-for-cp.patch [bz#1597739] +- kvm-i386-Add-support-for-CPUID_8000_001E-for-AMD.patch [bz#1597739] +- kvm-i386-Fix-up-the-Node-id-for-CPUID_8000_001E.patch [bz#1597739] +- kvm-i386-Enable-TOPOEXT-feature-on-AMD-EPYC-CPU.patch [bz#1597739] +- kvm-i386-Remove-generic-SMT-thread-check.patch [bz#1597739] +- kvm-i386-Allow-TOPOEXT-to-be-enabled-on-older-kernels.patch [bz#1597739] +- Resolves: bz#1595501 + (Create pseries-rhel7.6.0-sxxm machine type) +- Resolves: bz#1597739 + (AMD EPYC/Zen SMT support for KVM / QEMU guest (qemu-kvm)) +- Resolves: bz#1601317 + (RHEL8.0 - qemu patch to align memory to allow 2MB THP) +- Resolves: bz#1601671 + (After rebooting guest,all the hot plug memory will be assigned to the 1st numa node.) + +* Tue Jul 24 2018 Danilo Cesar Lemes de Paula - 2.12.0-15.el8 +- kvm-spapr-Add-ibm-max-associativity-domains-property.patch [bz#1599593] +- kvm-Revert-spapr-Don-t-allow-memory-hotplug-to-memory-le.patch [bz#1599593] +- kvm-simpletrace-Convert-name-from-mapping-record-to-str.patch [bz#1594969] +- kvm-tests-fix-TLS-handshake-failure-with-TLS-1.3.patch [bz#1602403] +- Resolves: bz#1594969 + (simpletrace.py fails when running with Python 3) +- Resolves: bz#1599593 + (User can't hotplug memory to less memory numa node on rhel8) +- Resolves: bz#1602403 + (test-crypto-tlssession unit test fails with assertions) + +* Mon Jul 09 2018 Danilo Cesar Lemes de Paula - 2.12.0-14.el8 +- kvm-vfio-pci-Default-display-option-to-off.patch [bz#1590511] +- kvm-python-futurize-f-libfuturize.fixes.fix_print_with_i.patch [bz#1571533] +- kvm-python-futurize-f-lib2to3.fixes.fix_except.patch [bz#1571533] +- kvm-Revert-Defining-a-shebang-for-python-scripts.patch [bz#1571533] +- kvm-spec-Fix-ambiguous-python-interpreter-name.patch [bz#1571533] +- kvm-qemu-ga-blacklisting-guest-exec-and-guest-exec-statu.patch [bz#1518132] +- kvm-redhat-rewrap-build_configure.sh-cmdline-for-the-rh-.patch +- kvm-redhat-remove-the-VTD-LIVE_BLOCK_OPS-and-RHV-options.patch +- kvm-redhat-fix-the-rh-env-prep-target-s-dependency-on-th.patch +- kvm-redhat-remove-dead-code-related-to-s390-not-s390x.patch +- kvm-redhat-sync-compiler-flags-from-the-spec-file-to-rh-.patch +- kvm-redhat-sync-guest-agent-enablement-and-tcmalloc-usag.patch +- kvm-redhat-fix-up-Python-3-dependency-for-building-QEMU.patch +- kvm-redhat-fix-up-Python-dependency-for-SRPM-generation.patch +- kvm-redhat-disable-glusterfs-dependency-support-temporar.patch +- Resolves: bz#1518132 + (Ensure file access RPCs are disabled by default) +- Resolves: bz#1571533 + (Convert qemu-kvm python scripts to python3) +- Resolves: bz#1590511 + (Fails to start guest with Intel vGPU device) + +* Thu Jun 21 2018 Danilo C. L. de Paula - 2.12.0-13.el8 +- Resolves: bz#1508137 + ([IBM 8.0 FEAT] KVM: Interactive Bootloader (qemu)) +- Resolves: bz#1513558 + (Remove RHEL6 machine types) +- Resolves: bz#1568600 + (pc-i440fx-rhel7.6.0 and pc-q35-rhel7.6.0 machine types (x86)) +- Resolves: bz#1570029 + ([IBM 8.0 FEAT] KVM: 3270 Connectivity - qemu part) +- Resolves: bz#1578855 + (Enable Native Ceph support on non x86_64 CPUs) +- Resolves: bz#1585651 + (RHEL 7.6 new pseries machine type (ppc64le)) +- Resolves: bz#1592337 + ([IBM 8.0 FEAT] KVM: CPU Model z14 ZR1 (qemu-kvm)) + +* Tue May 15 2018 Danilo C. L. de Paula - 2.12.0-11.el8.1 +- Resolves: bz#1576468 + (Enable vhost_user in qemu-kvm 2.12) + +* Wed May 09 2018 Danilo de Paula - 2.12.0-11.el8 +- Resolves: bz#1574406 + ([RHEL 8][qemu-kvm] Failed to find romfile "efi-virtio.rom") +- Resolves: bz#1569675 + (Backwards compatibility of pc-*-rhel7.5.0 and older machine-types) +- Resolves: bz#1576045 + (Fix build issue by using python3) +- Resolves: bz#1571145 + (qemu-kvm segfaults on RHEL 8 when run guestfsd under TCG) + +* Fri Apr 20 2018 Danilo de Paula - 2.12.0-10.el +- Fixing some issues with packaging. +- Rebasing to 2.12.0-rc4 + +* Fri Apr 13 2018 Danilo de Paula - 2.11.0-7.el8 +- Bumping epoch for RHEL8 and dropping self-obsoleting + +* Thu Apr 12 2018 Danilo de Paula - 2.11.0-6.el8 +- Rebuilding + +* Mon Mar 05 2018 Danilo de Paula - 2.11.0-5.el8 +- Prepare building on RHEL-8.0 diff --git a/qemu-pr-helper.service b/qemu-pr-helper.service new file mode 100755 index 0000000000000000000000000000000000000000..a1d27b022170119af939b22cf55d61783230610a --- /dev/null +++ b/qemu-pr-helper.service @@ -0,0 +1,15 @@ +[Unit] +Description=Persistent Reservation Daemon for QEMU + +[Service] +WorkingDirectory=/tmp +Type=simple +ExecStart=/usr/bin/qemu-pr-helper +PrivateTmp=yes +ProtectSystem=strict +ReadWritePaths=/var/run +RestrictAddressFamilies=AF_UNIX +Restart=always +RestartSec=0 + +[Install] diff --git a/qemu-pr-helper.socket b/qemu-pr-helper.socket new file mode 100755 index 0000000000000000000000000000000000000000..9d7c3e5e2c95330f29135680f6353cbd9f27436a --- /dev/null +++ b/qemu-pr-helper.socket @@ -0,0 +1,9 @@ +[Unit] +Description=Persistent Reservation Daemon for QEMU + +[Socket] +ListenStream=/run/qemu-pr-helper.sock +SocketMode=0600 + +[Install] +WantedBy=multi-user.target diff --git a/udev-kvm-check.c b/udev-kvm-check.c new file mode 100755 index 0000000000000000000000000000000000000000..928b9dec2c8ec1b5d0a4366e6a9f8e930534e78e --- /dev/null +++ b/udev-kvm-check.c @@ -0,0 +1,155 @@ +/* + * udev-kvm-check.c + * + * Copyright 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include + +#define DEFAULT 0 +#define FACILITY "kvm" +#define SYSCONFIG_KVM "/etc/sysconfig/kvm" + +#define COUNT_MSG \ + "%d %s now active" + +int get_threshold_from_file(FILE *fp) +{ + static const char key[] = "THRESHOLD="; + int pos = 0; + int thres; + int ch; + +start: + /* State START - at beginning of line, search for beginning of "THRESHOLD=" + * string. + */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (isspace(ch)) { + goto start; + } + if (ch == 'T') { + pos = 1; + goto key; + } + goto eol; + +eol: + /* State EOL - loop until end of line */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (ch == '\n') { + goto start; + } + goto eol; + +key: + /* State KEY - match "THRESHOLD=" string, go to THRESHOLD if found */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (ch == key[pos]) { + pos++; + if (key[pos] == 0) { + goto threshold; + } else { + goto key; + } + } + goto eol; + +threshold: + /* State THRESHOLD - parse number using fscanf, expect comment or space + * or EOL. + */ + ch = getc(fp); + if (ch == EOF) { + return DEFAULT; + } + if (!isdigit(ch)) { + goto eol; + } + ungetc(ch, fp); + if (fscanf(fp, "%d", &thres) != 1) { + return DEFAULT; + } + ch = getc(fp); + if (ch == '#' || ch == EOF || ch == '\n' || isspace(ch)) { + return thres; + } + goto eol; +} + +int get_threshold() +{ + FILE *fp = fopen(SYSCONFIG_KVM, "r"); + int val; + + if (!fp) { + return DEFAULT; + } + + val = get_threshold_from_file(fp); + fclose (fp); + return val; +} + +const char *guest(int count) +{ + return (count == 1 ? "guest" : "guests"); +} + +void emit_count_message(int count) +{ + openlog(FACILITY, LOG_CONS, LOG_USER); + syslog(LOG_INFO, COUNT_MSG, count, guest(count)); + closelog(); +} + +int main(int argc, char **argv) +{ + int count, threshold; + + if (argc < 3) + exit(1); + + count = atoi(argv[1]); + threshold = get_threshold(); + + if (!strcmp(argv[2], "create")) { + if (threshold == 0 || count > threshold) { + emit_count_message(count); + } + } else { + if (count >= threshold) { + emit_count_message(count); + } + } + + return 0; +} diff --git a/vhost.conf b/vhost.conf new file mode 100755 index 0000000000000000000000000000000000000000..68d6d7f9b42fcab3dbe0342c42a73977db433cd9 --- /dev/null +++ b/vhost.conf @@ -0,0 +1,3 @@ +# Increase default vhost memory map limit to match +# KVM's memory slot limit +options vhost max_mem_regions=509