From 05a2e4a93ad596e1017e18c1ec80039b1f890e7a Mon Sep 17 00:00:00 2001
From: Cristian Marussi
Date: Thu, 6 Mar 2025 18:54:47 +0000
Subject: [PATCH 1/6] firmware: arm_scmi: Balance device refcount when
 destroying devices

stable inclusion
from stable-6.15
commit 9ca67840c0ddf3f39407339624cef824a4f27599
category: bugfix
issue: #ICF1E7
CVE: CVE-2025-37905

Signed-off-by: Li Nan

---------------------------------------

Using device_find_child() to look up the proper SCMI device to destroy
causes an imbalance in the device refcount, since device_find_child()
takes an implicit get_device(): this, in turn, inhibits the call of the
provided release methods upon device destruction.

As a consequence, one of the structures that is not freed properly upon
destruction is the internal struct device_private dev->p populated by
the driver core subsystem.

Kmemleak detects this situation, since loading/unloading some SCMI
driver causes the related devices to be created/destroyed without
calling any device_release method.

unreferenced object 0xffff00000f583800 (size 512):
  comm "insmod", pid 227, jiffies 4294912190
  hex dump (first 32 bytes):
    00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00  .....N..........
    ff ff ff ff ff ff ff ff 60 36 1d 8a 00 80 ff ff  ........`6......
  backtrace (crc 114e2eed):
    kmemleak_alloc+0xbc/0xd8
    __kmalloc_cache_noprof+0x2dc/0x398
    device_add+0x954/0x12d0
    device_register+0x28/0x40
    __scmi_device_create.part.0+0x1bc/0x380
    scmi_device_create+0x2d0/0x390
    scmi_create_protocol_devices+0x74/0xf8
    scmi_device_request_notifier+0x1f8/0x2a8
    notifier_call_chain+0x110/0x3b0
    blocking_notifier_call_chain+0x70/0xb0
    scmi_driver_register+0x350/0x7f0
    0xffff80000a3b3038
    do_one_initcall+0x12c/0x730
    do_init_module+0x1dc/0x640
    load_module+0x4b20/0x5b70
    init_module_from_file+0xec/0x158

$ ./scripts/faddr2line ./vmlinux device_add+0x954/0x12d0
device_add+0x954/0x12d0:
kmalloc_noprof at include/linux/slab.h:901
(inlined by) kzalloc_noprof at include/linux/slab.h:1037
(inlined by) device_private_init at drivers/base/core.c:3510
(inlined by) device_add at drivers/base/core.c:3561

Balance the device refcount by issuing a put_device() on devices found
via device_find_child().
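The pattern at issue generalizes beyond SCMI. The following minimal
sketch is illustrative only (the match callback and function names are
hypothetical): any successful device_find_child() lookup returns its
match with a reference held and must be balanced by a put_device().

	/* Hypothetical match callback: compare by device name. */
	static int match_by_name(struct device *dev, void *name)
	{
		return sysfs_streq(dev_name(dev), name);
	}

	static struct device *lookup_child(struct device *parent, char *name)
	{
		struct device *dev = device_find_child(parent, name, match_by_name);

		if (!dev)
			return NULL;

		/*
		 * Drop the reference taken implicitly by
		 * device_find_child(). Returning the pointer afterwards
		 * mirrors scmi_child_dev_find() and is only safe when the
		 * caller otherwise guarantees the device stays alive.
		 */
		put_device(dev);
		return dev;
	}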
Reported-by: Alice Ryhl
Closes: https://lore.kernel.org/linux-arm-kernel/Z8nK3uFkspy61yjP@arm.com/T/#mc1f73a0ea5e41014fa145147b7b839fc988ada8f
CC: Sudeep Holla
CC: Catalin Marinas
Fixes: d4f9dddd21f3 ("firmware: arm_scmi: Add dynamic scmi devices creation")
Signed-off-by: Cristian Marussi
Tested-by: Alice Ryhl
Message-Id: <20250306185447.2039336-1-cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla
Signed-off-by: Li Nan
---
 drivers/firmware/arm_scmi/bus.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/firmware/arm_scmi/bus.c b/drivers/firmware/arm_scmi/bus.c
index dcf774d3edfe..51eeaf14367d 100644
--- a/drivers/firmware/arm_scmi/bus.c
+++ b/drivers/firmware/arm_scmi/bus.c
@@ -240,6 +240,9 @@ static struct scmi_device *scmi_child_dev_find(struct device *parent,
 	if (!dev)
 		return NULL;
 
+	/* Drop the refcnt bumped implicitly by device_find_child */
+	put_device(dev);
+
 	return to_scmi_dev(dev);
 }
 
--
Gitee

From e93ff34ec87c81d1e5bcec222e456385d03db506 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?=
Date: Sat, 19 Apr 2025 13:14:00 +0200
Subject: [PATCH 2/6] riscv: uprobes: Add missing fence.i after building the
 XOL buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

stable inclusion
from stable-6.15
commit 7d1d19a11cfbfd8bae1d89cc010b2cc397cd0c48
category: bugfix
issue: #ICF1E7
CVE: CVE-2025-37822

Signed-off-by: Li Nan

---------------------------------------

The XOL (execute out-of-line) buffer is used to single-step the
replaced instruction(s) for uprobes. The RISC-V port was missing a
proper fence.i (i$ flushing) after constructing the XOL buffer, which
can result in incorrect execution of stale/broken instructions.

This was found running the BPF selftests "test_progs:
uprobe_autoattach, attach_probe" on the Spacemit K1/X60, where the
uprobes tests randomly blew up.

Reviewed-by: Guo Ren
Fixes: 74784081aac8 ("riscv: Add uprobes supported")
Signed-off-by: Björn Töpel
Link: https://lore.kernel.org/r/20250419111402.1660267-2-bjorn@kernel.org
Signed-off-by: Palmer Dabbelt
Signed-off-by: Li Nan
---
 arch/riscv/kernel/probes/uprobes.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c
index 4b3dc8beaf77..cc15f7ca6cc1 100644
--- a/arch/riscv/kernel/probes/uprobes.c
+++ b/arch/riscv/kernel/probes/uprobes.c
@@ -167,6 +167,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 	/* Initialize the slot */
 	void *kaddr = kmap_atomic(page);
 	void *dst = kaddr + (vaddr & ~PAGE_MASK);
+	unsigned long start = (unsigned long)dst;
 
 	memcpy(dst, src, len);
 
@@ -176,13 +177,6 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 		*(uprobe_opcode_t *)dst = __BUG_INSN_32;
 	}
 
+	flush_icache_range(start, start + len);
 	kunmap_atomic(kaddr);
-
-	/*
-	 * We probably need flush_icache_user_page() but it needs vma.
-	 * This should work on most of architectures by default. If
-	 * architecture needs to do something different it can define
-	 * its own version of the function.
-	 */
-	flush_dcache_page(page);
 }
--
Gitee

From aaa1ef63e396cc330cc8fc9fe72055aff75e6e3b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 14 Mar 2025 12:31:45 +0800
Subject: [PATCH 3/6] hrtimers: Force migrate away hrtimers queued after
 CPUHP_AP_HRTIMERS_DYING

stable inclusion
from stable-6.14
commit 53dac345395c0d2493cbc2f4c85fe38aef5b63f5
category: bugfix
issue: #ICF1E7
CVE: CVE-2025-21816

Signed-off-by: Li Nan

---------------------------------------

hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress. However,
wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those wakeups can again result in bandwidth
timers being armed. Depending on several considerations (crystal ball
power management based election, earliest timer already enqueued, timer
migration enabled or not), the target may eventually be the current
CPU even if offline. If that happens, the timer is eventually ignored.

The most notable example is RCU, which had to deal with each and every
one of those wake-ups by deferring them to an online CPU, along with
related workarounds:

_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)

The problem isn't confined to RCU though, as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wake up that
eventually arms the deadline server timer:

   WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
   CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
   Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
   RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
   Call Trace:
    start_dl_timer
    enqueue_dl_entity
    dl_server_start
    enqueue_task_fair
    enqueue_task
    ttwu_do_activate
    try_to_wake_up
    complete
    cpu_stopper_thread

Instead of providing yet another bandaid to work around the situation,
fix it in the hrtimers infrastructure instead: always migrate away a
timer to an online target whenever it is enqueued from an offline CPU.

This will also allow reverting all the above RCU disgraceful hacks.
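For background, the kick added by this patch rests on the kernel's
call-single-data (csd) IPI machinery. The sketch below is illustrative
only and uses hypothetical names; the patch itself embeds the csd in
struct hrtimer_cpu_base and points it at retrigger_next_event(), so the
kicked CPU re-evaluates its own clockevent programming.

	#include <linux/smp.h>
	#include <linux/percpu-defs.h>

	/* Hypothetical callback: runs on the kicked CPU in IPI context. */
	static void reprogram_cb(void *arg)
	{
		/* re-evaluate this CPU's next timer expiry here */
	}

	static DEFINE_PER_CPU(call_single_data_t, kick_csd) =
		CSD_INIT(reprogram_cb, NULL);

	/* Called from the offline CPU after enqueueing on target_cpu. */
	static void kick_remote(int target_cpu)
	{
		/* async: returns immediately, no waiting for the IPI */
		smp_call_function_single_async(target_cpu,
					       &per_cpu(kick_csd, target_cpu));
	}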
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Vlad Poenaru
Reported-by: Usama Arif
Signed-off-by: Frederic Weisbecker
Signed-off-by: Paul E. McKenney
Signed-off-by: Thomas Gleixner
Cc: stable@vger.kernel.org
Tested-by: Paul E. McKenney
Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org
Closes: 20241213203739.1519801-1-usamaarif642@gmail.com
Conflicts:
	include/linux/hrtimer_defs.h
	include/linux/hrtimer.h
[wxf: fix conflicts because 'struct hrtimer_cpu_base' is moved to
another file.]
Signed-off-by: Xiongfeng Wang
Signed-off-by: Li Nan
---
 include/linux/hrtimer.h |   1 +
 kernel/time/hrtimer.c   | 103 ++++++++++++++++++++++++++++++++--------
 2 files changed, 83 insertions(+), 21 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 8f77bb0f4ae0..05f8b7d7d1e9 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -237,6 +237,7 @@ struct hrtimer_cpu_base {
 	ktime_t				softirq_expires_next;
 	struct hrtimer			*softirq_next_timer;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
+	call_single_data_t		csd;
 } ____cacheline_aligned;
 
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 877535b06e73..6d9da768604d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
 #define HRTIMER_ACTIVE_SOFT	(HRTIMER_ACTIVE_HARD << MASK_SHIFT)
 #define HRTIMER_ACTIVE_ALL	(HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
 
+static void retrigger_next_event(void *arg);
+
 /*
  * The timer bases:
  *
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
 		},
-	}
+	},
+	.csd = CSD_INIT(retrigger_next_event, NULL)
 };
 
 static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
 	[CLOCK_TAI]		= HRTIMER_BASE_TAI,
 };
 
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return true;
+	else
+		return likely(base->online);
+}
+
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
@@ -178,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 }
 
 /*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
  *
  * Called with cpu_base->lock of target cpu held.
  */
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+				    struct hrtimer_cpu_base *new_cpu_base,
+				    struct hrtimer_cpu_base *this_cpu_base)
 {
 	ktime_t expires;
 
+	/*
+	 * The local CPU clockevent can be reprogrammed. Also get_target_base()
+	 * guarantees it is online.
+	 */
+	if (new_cpu_base == this_cpu_base)
+		return true;
+
+	/*
+	 * The offline local CPU can't be the default target if the
+	 * next remote target event is after this timer. Keep the
+	 * elected new base. An IPI will be issued to reprogram
+	 * it as a last resort.
+	 */
+	if (!hrtimer_base_is_online(this_cpu_base))
+		return true;
+
 	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
-	return expires < new_base->cpu_base->expires_next;
+
+	return expires >= new_base->cpu_base->expires_next;
 }
 
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
-					 int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
 {
+	if (!hrtimer_base_is_online(base)) {
+		int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+		return &per_cpu(hrtimer_bases, cpu);
+	}
+
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 	if (static_branch_likely(&timers_migration_enabled) && !pinned)
 		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -249,8 +287,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
-		if (new_cpu_base != this_cpu_base &&
-		    hrtimer_check_target(timer, new_base)) {
+		if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+					     this_cpu_base)) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
 			new_cpu_base = this_cpu_base;
@@ -259,8 +297,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		}
 		WRITE_ONCE(timer->base, new_base);
 	} else {
-		if (new_cpu_base != this_cpu_base &&
-		    hrtimer_check_target(timer, new_base)) {
+		if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
 			new_cpu_base = this_cpu_base;
 			goto again;
 		}
@@ -720,8 +757,6 @@ static inline int hrtimer_is_hres_enabled(void)
 	return hrtimer_hres_enabled;
 }
 
-static void retrigger_next_event(void *arg);
-
 /*
  * Switch to high resolution mode
  */
@@ -1208,6 +1243,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 				    u64 delta_ns, const enum hrtimer_mode mode,
 				    struct hrtimer_clock_base *base)
 {
+	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
 	struct hrtimer_clock_base *new_base;
 	bool force_local, first;
 
@@ -1219,9 +1255,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	 * and enforce reprogramming after it is queued no matter whether
 	 * it is the new first expiring timer again or not.
 	 */
-	force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+	force_local = base->cpu_base == this_cpu_base;
 	force_local &= base->cpu_base->next_timer == timer;
 
+	/*
+	 * Don't force local queuing if this enqueue happens on an unplugged
+	 * CPU after hrtimer_cpu_dying() has been invoked.
+	 */
+	force_local &= this_cpu_base->online;
+
 	/*
 	 * Remove an active timer from the queue. In case it is not queued
 	 * on the current CPU, make sure that remove_hrtimer() updates the
@@ -1251,8 +1293,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	}
 
 	first = enqueue_hrtimer(timer, new_base, mode);
-	if (!force_local)
-		return first;
+	if (!force_local) {
+		/*
+		 * If the current CPU base is online, then the timer is
+		 * never queued on a remote CPU if it would be the first
+		 * expiring timer there.
+		 */
+		if (hrtimer_base_is_online(this_cpu_base))
+			return first;
+
+		/*
+		 * Timer was enqueued remote because the current base is
+		 * already offline. If the timer is the first to expire,
+		 * kick the remote CPU to reprogram the clock event.
+		 */
+		if (first) {
+			struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+			smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+		}
+		return 0;
+	}
 
 	/*
 	 * Timer was forced to stay on the current CPU to avoid
--
Gitee

From 3e2e5e12bd09495872cc7c08648bc998127d6c69 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Mon, 12 May 2025 03:42:09 +0000
Subject: [PATCH 4/6] bpf: Fix kmemleak warning for percpu hashmap

stable inclusion
from stable-6.15
commit 11ba7ce076e5903e7bdc1fd1498979c331b3c286
category: bugfix
issue: #ICF1E7
CVE: CVE-2025-37807

Signed-off-by: Li Nan

---------------------------------------

Vlad Poenaru reported the following kmemleak issue:

unreferenced object 0x606fd7c44ac8 (size 32):
  backtrace (crc 0):
    pcpu_alloc_noprof+0x730/0xeb0
    bpf_map_alloc_percpu+0x69/0xc0
    prealloc_init+0x9d/0x1b0
    htab_map_alloc+0x363/0x510
    map_create+0x215/0x3a0
    __sys_bpf+0x16b/0x3e0
    __x64_sys_bpf+0x18/0x20
    do_syscall_64+0x7b/0x150
    entry_SYSCALL_64_after_hwframe+0x4b/0x53

Further investigation shows the cause is a store of the percpu pointer
in htab_elem_set_ptr() that is not 8-byte aligned:

  *(void __percpu **)(l->key + key_size) = pptr;

Note that the whole htab_elem alignment is 8 (for x86_64). If the
key_size is 4, that means pptr is stored in a location which is 4-byte
aligned but not 8-byte aligned. In mm/kmemleak.c, scan_block() scans
the memory with an 8-byte stride, so it won't detect the above pptr,
hence the reported memory leak.

In htab_map_alloc(), we already have

  htab->elem_size = sizeof(struct htab_elem) +
                    round_up(htab->map.key_size, 8);
  if (percpu)
          htab->elem_size += sizeof(void *);
  else
          htab->elem_size += round_up(htab->map.value_size, 8);

So storing pptr with 8-byte alignment won't cause any problem and
fixes the kmemleak report too.

The issue can be reproduced with the bpf selftests as well:

 1. Enable the CONFIG_DEBUG_KMEMLEAK config
 2. Add a getchar() before skel destroy in test_hash_map() in
    prog_tests/for_each.c. The purpose is to keep the map available
    so the kmemleak can be detected.
 3. Run './test_progs -t for_each/hash_map &' and a kmemleak should
    be reported.
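The offset arithmetic can be checked with a small stand-alone userspace
sketch (plain C, not kernel code); the roundup() macro below mirrors
the kernel's generic definition:

	#include <stdio.h>

	/* Same arithmetic as the kernel's roundup() for these operands. */
	#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

	int main(void)
	{
		unsigned int key_size = 4;	/* e.g. a u32 map key */

		/* offset of the percpu pointer slot, relative to l->key */
		printf("old offset: %u\n", key_size);			/* 4: off the 8-byte scan grid */
		printf("new offset: %u\n", roundup(key_size, 8));	/* 8: on the scan grid */
		return 0;
	}

Because elem_size already reserves round_up(key_size, 8) bytes for the
key area, moving the slot from offset 4 to offset 8 needs no extra
space.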
Reported-by: Vlad Poenaru
Signed-off-by: Yonghong Song
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/r/20250224175514.2207227-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov
Conflicts:
	kernel/bpf/hashtab.c
[The conflicts were due to not merging commit 0b56e637f7058]
Signed-off-by: Pu Lehui
Signed-off-by: Li Nan
---
 kernel/bpf/hashtab.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fc34f72702cc..fdc74aeaa164 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -197,12 +197,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
-	*(void __percpu **)(l->key + key_size) = pptr;
+	*(void __percpu **)(l->key + roundup(key_size, 8)) = pptr;
 }
 
 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
 {
-	return *(void __percpu **)(l->key + key_size);
+	return *(void __percpu **)(l->key + roundup(key_size, 8));
 }
 
 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
--
Gitee

From 1f952067f4b17b6b494f059cb47902be42db33ab Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Sat, 19 Apr 2025 18:49:07 +0800
Subject: [PATCH 5/6] ext4: define ext4_journal_destroy wrapper

stable inclusion
from stable-6.15
commit 5a02a6204ca37e7c22fbb55a789c503f05e8e89a
category: bugfix
issue: #ICF1E7
CVE: CVE-2025-22113

Signed-off-by: Li Nan

---------------------------------------

Define an ext4 wrapper over jbd2_journal_destroy to make sure we have
consistent behavior during journal destruction. This will also come in
useful in the next patch, where we add some ext4-specific logic to the
destroy path.

Reviewed-by: Jan Kara
Reviewed-by: Baokun Li
Signed-off-by: Ojaswin Mujoo
Link: https://patch.msgid.link/c3ba78c5c419757e6d5f2d8ebb4a8ce9d21da86a.1742279837.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
Conflicts:
	fs/ext4/super.c
[22650a99821d not applied]
Signed-off-by: Yongjian Sun
Signed-off-by: Li Nan
---
 fs/ext4/ext4_jbd2.h | 14 ++++++++++++++
 fs/ext4/super.c     | 16 ++++++----------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0c77697d5e90..930778e507cc 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -513,4 +513,18 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
 	return 1;
 }
 
+/*
+ * Pass journal explicitly as it may not be cached in the sbi->s_journal in
+ * some cases
+ */
+static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal)
+{
+	int err = 0;
+
+	err = jbd2_journal_destroy(journal);
+	sbi->s_journal = NULL;
+
+	return err;
+}
+
 #endif	/* _EXT4_JBD2_H */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 270f9d291643..85b4946ab5df 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1321,8 +1321,7 @@ static void ext4_put_super(struct super_block *sb)
 
 	if (sbi->s_journal) {
 		aborted = is_journal_aborted(sbi->s_journal);
-		err = jbd2_journal_destroy(sbi->s_journal);
-		sbi->s_journal = NULL;
+		err = ext4_journal_destroy(sbi, sbi->s_journal);
 		if ((err < 0) && !aborted) {
 			ext4_abort(sb, -err,
 				   "Couldn't clean up the journal");
 		}
@@ -4994,8 +4993,7 @@ static int ext4_load_and_init_journal(struct super_block *sb,
 out:
 	/* flush s_sb_upd_work before destroying the journal. */
 	flush_work(&sbi->s_sb_upd_work);
-	jbd2_journal_destroy(sbi->s_journal);
-	sbi->s_journal = NULL;
+	ext4_journal_destroy(sbi, sbi->s_journal);
 	return -EINVAL;
 }
 
@@ -5666,8 +5664,7 @@ failed_mount8: __maybe_unused
 	if (sbi->s_journal) {
 		/* flush s_sb_upd_work before journal destroy. */
 		flush_work(&sbi->s_sb_upd_work);
-		jbd2_journal_destroy(sbi->s_journal);
-		sbi->s_journal = NULL;
+		ext4_journal_destroy(sbi, sbi->s_journal);
 	}
 failed_mount3a:
 	ext4_es_unregister_shrinker(sbi);
@@ -5975,7 +5972,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
 	return journal;
 
 out_journal:
-	jbd2_journal_destroy(journal);
+	ext4_journal_destroy(EXT4_SB(sb), journal);
 out_bdev:
 	blkdev_put(journal_bdev, sb);
 	return ERR_PTR(errno);
@@ -6092,8 +6089,7 @@ static int ext4_load_journal(struct super_block *sb,
 	EXT4_SB(sb)->s_journal = journal;
 	err = ext4_clear_journal_err(sb, es);
 	if (err) {
-		EXT4_SB(sb)->s_journal = NULL;
-		jbd2_journal_destroy(journal);
+		ext4_journal_destroy(EXT4_SB(sb), journal);
 		return err;
 	}
 
@@ -6111,7 +6107,7 @@ static int ext4_load_journal(struct super_block *sb,
 	return 0;
 
 err_out:
-	jbd2_journal_destroy(journal);
+	ext4_journal_destroy(EXT4_SB(sb), journal);
 	return err;
 }
--
Gitee

From 74950b2e836726922a02d8dfecd0bc403e5d7268 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Sat, 19 Apr 2025 18:49:08 +0800
Subject: [PATCH 6/6] ext4: avoid journaling sb update on error if journal is
 destroying

stable inclusion
from stable-6.15
commit ce2f26e73783b4a7c46a86e3af5b5c8de0971790
category: bugfix
issue: #ICF1E7
CVE: CVE-2025-22113

Signed-off-by: Li Nan

---------------------------------------

Presently we always BUG_ON if trying to start a transaction on a
journal marked with JBD2_UNMOUNT, since this should never happen.
However, while running LTP stress tests, it was observed that in case
of some error handling paths, it is possible for update_super_work to
start a transaction after the journal is destroyed, e.g.:

(umount)
ext4_kill_sb
  kill_block_super
    generic_shutdown_super
      sync_filesystem /* commits all txns */
      evict_inodes
        /* might start a new txn */
      ext4_put_super
	flush_work(&sbi->s_sb_upd_work) /* flush the workqueue */
        jbd2_journal_destroy
          journal_kill_thread
            journal->j_flags |= JBD2_UNMOUNT;
          jbd2_journal_commit_transaction
            jbd2_journal_get_descriptor_buffer
              jbd2_journal_bmap
                ext4_journal_bmap
                  ext4_map_blocks
                    ...
                    ext4_inode_error
                      ext4_handle_error
                        schedule_work(&sbi->s_sb_upd_work)

                                               /* work queue kicks in */
                                               update_super_work
                                                 jbd2_journal_start
                                                   start_this_handle
                                                     BUG_ON(journal->j_flags
                                                            & JBD2_UNMOUNT)

Hence, introduce a new mount flag to indicate the journal is destroying
and only do a journaled (and deferred) update of sb if this flag is not
set. Otherwise, just fall back to an un-journaled commit.

Further, in the journal destroy path, we have the following sequence:

 1. Set mount flag indicating journal is destroying
 2. force a commit and wait for it
 3. flush pending sb updates

This sequence is important as it ensures that, after this point, there
is no sb update that might be journaled, so it is safe to update the sb
outside the journal. (To avoid the race discussed in 2d01ddc86606.)

Also, we don't need a similar check in ext4_grp_locked_error since it
is only called from mballoc and AFAICT it would be always valid to
schedule work here.
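Abstracted away from ext4, the destroy-path ordering above is a
reusable shutdown pattern. A minimal sketch follows; every name in it
is hypothetical rather than ext4's, and the comments map each step back
to the sequence above (ext4_force_commit() plays the drain role,
ext4_commit_super() the direct-commit role):

	#include <linux/bitops.h>
	#include <linux/workqueue.h>

	static unsigned long state;
	#define SHUTTING_DOWN	0

	static struct work_struct deferred_sb_update;

	static void commit_sb_directly(void);	/* stand-in for ext4_commit_super() */
	static void drain_producers(void);	/* stand-in for ext4_force_commit() */

	static void on_error(void)
	{
		if (!test_bit(SHUTTING_DOWN, &state))
			schedule_work(&deferred_sb_update);	/* journaled, deferred */
		else
			commit_sb_directly();			/* un-journaled fallback */
	}

	static void destroy_path(void)
	{
		set_bit(SHUTTING_DOWN, &state);		/* 1. publish the flag */
		drain_producers();			/* 2. wait out in-flight error handlers */
		flush_work(&deferred_sb_update);	/* 3. flush pending sb updates */
		/* only now is it safe to tear down the journal */
	}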
Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available")
Reported-by: Mahesh Kumar
Signed-off-by: Ojaswin Mujoo
Reviewed-by: Jan Kara
Link: https://patch.msgid.link/9613c465d6ff00cd315602f99283d5f24018c3f7.1742279837.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
Conflicts:
	fs/ext4/ext4.h
[95257987a638 not applied]
Signed-off-by: Yongjian Sun
Signed-off-by: Li Nan
---
 fs/ext4/ext4.h      |  3 ++-
 fs/ext4/ext4_jbd2.h | 15 +++++++++++++++
 fs/ext4/super.c     | 16 ++++++++--------
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 60455c84a937..1f7a36de69af 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1814,7 +1814,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
  */
 enum {
 	EXT4_MF_MNTDIR_SAMPLED,
-	EXT4_MF_FC_INELIGIBLE	/* Fast commit ineligible */
+	EXT4_MF_FC_INELIGIBLE,	/* Fast commit ineligible */
+	EXT4_MF_JOURNAL_DESTROY	/* Journal is in process of destroying */
 };
 
 static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 930778e507cc..ada46189b086 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -521,6 +521,21 @@ static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *jour
 {
 	int err = 0;
 
+	/*
+	 * At this point only two things can be operating on the journal.
+	 * JBD2 thread performing transaction commit and s_sb_upd_work
+	 * issuing sb update through the journal. Once we set
+	 * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not
+	 * queue s_sb_upd_work and ext4_force_commit() makes sure any
+	 * ext4_handle_error() calls from the running transaction commit are
+	 * finished. Hence no new s_sb_upd_work can be queued after we
+	 * flush it here.
+	 */
+	ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
+
+	ext4_force_commit(sbi->s_sb);
+	flush_work(&sbi->s_sb_upd_work);
+
 	err = jbd2_journal_destroy(journal);
 	sbi->s_journal = NULL;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 85b4946ab5df..92d2182f472b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -728,9 +728,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
 	 * In case the fs should keep running, we need to writeout
 	 * superblock through the journal. Due to lock ordering
 	 * constraints, it may not be safe to do it right here so we
-	 * defer superblock flushing to a workqueue.
+	 * defer superblock flushing to a workqueue. We just need to be
+	 * careful when the journal is already shutting down. If we get
+	 * here in that case, just update the sb directly as the last
+	 * transaction won't commit anyway.
 	 */
-	if (continue_fs && journal)
+	if (continue_fs && journal &&
+	    !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY))
 		schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
 	else
 		ext4_commit_super(sb);
@@ -1315,7 +1319,6 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_unregister_li_request(sb);
 	ext4_quotas_off(sb, EXT4_MAXQUOTAS);
 
-	flush_work(&sbi->s_sb_upd_work);
 	destroy_workqueue(sbi->rsv_conversion_wq);
 	ext4_release_orphan_info(sb);
 
@@ -1325,7 +1328,8 @@ static void ext4_put_super(struct super_block *sb)
 		if ((err < 0) && !aborted) {
 			ext4_abort(sb, -err,
 				   "Couldn't clean up the journal");
 		}
-	}
+	} else
+		flush_work(&sbi->s_sb_upd_work);
 
 	ext4_es_unregister_shrinker(sbi);
 	timer_shutdown_sync(&sbi->s_err_report);
@@ -4991,8 +4995,6 @@ static int ext4_load_and_init_journal(struct super_block *sb,
 	return 0;
 
 out:
-	/* flush s_sb_upd_work before destroying the journal. */
-	flush_work(&sbi->s_sb_upd_work);
 	ext4_journal_destroy(sbi, sbi->s_journal);
 	return -EINVAL;
 }
@@ -5662,8 +5664,6 @@ failed_mount8: __maybe_unused
 	sbi->s_ea_block_cache = NULL;
 
 	if (sbi->s_journal) {
-		/* flush s_sb_upd_work before journal destroy. */
-		flush_work(&sbi->s_sb_upd_work);
 		ext4_journal_destroy(sbi, sbi->s_journal);
 	}
 failed_mount3a:
--
Gitee