diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
index b90dafcc8237cc8be9c3fa51aa2e12fd47cb56b7..16f3ac0a601df80f49d569cc56dd650964768f28 100644
--- a/Documentation/admin-guide/cputopology.rst
+++ b/Documentation/admin-guide/cputopology.rst
@@ -18,6 +18,11 @@ die_id:
 	identifier (rather than the kernel's). The actual value is
 	architecture and platform dependent.
 
+cluster_id:
+
+	the cluster ID of cpuX. Typically it is the hardware platform's
+	identifier (rather than the kernel's). The actual value is
+	architecture and platform dependent.
+
 core_id:
 
 	the CPU core ID of cpuX. Typically it is the hardware platform's
@@ -36,6 +41,15 @@ drawer_id:
 	identifier (rather than the kernel's). The actual value is
 	architecture and platform dependent.
 
+cluster_cpus:
+
+	internal kernel map of CPUs within the same cluster.
+
+cluster_cpus_list:
+
+	human-readable list of CPUs within the same cluster.
+	The format is like 0-3, 8-11, 14,17.
+
 core_cpus:
 
 	internal kernel map of CPUs within the same core.
@@ -88,6 +102,9 @@ Architecture-neutral, drivers/base/topology.c, exports these attributes.
 However, the book and drawer related sysfs files will only be created if
 CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
 
+The cluster hierarchy related sysfs files will only be created if an
+architecture provides the related macros as described below.
+
 CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
 where they reflect the cpu and cache hierarchy.
 
@@ -96,12 +113,14 @@ these macros in include/asm-XXX/topology.h::
 
 	#define topology_physical_package_id(cpu)
 	#define topology_die_id(cpu)
+	#define topology_cluster_id(cpu)
 	#define topology_core_id(cpu)
 	#define topology_book_id(cpu)
 	#define topology_drawer_id(cpu)
 	#define topology_sibling_cpumask(cpu)
 	#define topology_core_cpumask(cpu)
 	#define topology_die_cpumask(cpu)
+	#define topology_cluster_cpumask(cpu)
 	#define topology_book_cpumask(cpu)
 	#define topology_drawer_cpumask(cpu)
 
@@ -116,10 +135,12 @@ not defined by include/asm-XXX/topology.h:
 
 1) topology_physical_package_id: -1
 2) topology_die_id: -1
-3) topology_core_id: 0
-4) topology_sibling_cpumask: just the given CPU
-5) topology_core_cpumask: just the given CPU
-6) topology_die_cpumask: just the given CPU
+3) topology_cluster_id: -1
+4) topology_core_id: 0
+5) topology_sibling_cpumask: just the given CPU
+6) topology_core_cpumask: just the given CPU
+7) topology_cluster_cpumask: just the given CPU
+8) topology_die_cpumask: just the given CPU
 
 For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
 default definitions for topology_book_id() and topology_book_cpumask().
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index e7e9c9979fe380d390042966e8ac8a4690b9a0e7..810b9f5cbf9b7ac85818a7073edaddc35704b452 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4461,6 +4461,10 @@ qspinlock.numa_spinlock_threshold_ns=	[NUMA, PV_OPS]
 	sched_debug	[KNL] Enables verbose scheduler debug messages.
 
+	sched_cluster=	Enable or disable cluster scheduling.
+			0 -- disable.
+			1 -- enable.
+
 	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
 			Allowed values are enable and disable.
 			This feature incurs a small amount of overhead in the scheduler
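
As a quick sanity check of the documentation above, the new attributes can be read straight from sysfs on a kernel built with the cluster macros. The sketch below is not part of the patch; the paths come from the cputopology.rst text above, and cluster_id simply reads -1 when the platform describes no cluster level (the documented default).

	/* Sketch only: dump the new cluster topology attributes for CPU 0. */
	#include <stdio.h>

	static void dump(const char *path)
	{
		char buf[256];
		FILE *f = fopen(path, "r");

		if (!f) {
			/* Attribute absent: arch does not define the cluster macros. */
			printf("%s: not present\n", path);
			return;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", path, buf);
		fclose(f);
	}

	int main(void)
	{
		dump("/sys/devices/system/cpu/cpu0/topology/cluster_id");
		dump("/sys/devices/system/cpu/cpu0/topology/cluster_cpus_list");
		return 0;
	}
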
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 098c8bbbd4e7116f3034f85b8210cb2696c2021d..8bcd353e333e6b1c7a420a5fe27cb510d2714202 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -994,6 +994,15 @@ config SCHED_MC
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
+config SCHED_CLUSTER
+	bool "Cluster scheduler support"
+	help
+	  Cluster scheduler support improves the CPU scheduler's decision
+	  making when dealing with machines that have clusters of CPUs.
+	  Cluster usually means a couple of CPUs which are placed closely
+	  by sharing mid-level caches, last-level cache tags or internal
+	  busses.
+
 config SCHED_SMT
 	bool "SMT scheduler support"
 	help
diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config
index dcfd3e97e9a488233856ddaca7ac609ae803f3d9..9cfd12ce0e6f2b90d05488840b9a32a344798f64 100644
--- a/arch/arm64/configs/tencent.config
+++ b/arch/arm64/configs/tencent.config
@@ -57,6 +57,7 @@ CONFIG_ARM64_VA_BITS_48=y
 CONFIG_SCHED_MC=y
 CONFIG_SCHED_SMT=y
 CONFIG_NR_CPUS=1024
+CONFIG_SCHED_CLUSTER=y
 CONFIG_NUMA=y
 CONFIG_NODES_SHIFT=8
 CONFIG_HZ_1000=y
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 3d3a673c6704140a39aa899c7d65e99c87a92f92..4ecf361cb562acfaa2d527cf488db563d943f43d 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -62,6 +62,8 @@ int __init parse_acpi_topology(void)
 			cpu_topology[cpu].thread_id = -1;
 			cpu_topology[cpu].core_id = topology_id;
 		}
+		topology_id = find_acpi_cpu_topology_cluster(cpu);
+		cpu_topology[cpu].cluster_id = topology_id;
 		topology_id = find_acpi_cpu_topology_package(cpu);
 		cpu_topology[cpu].package_id = topology_id;
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 1699d18bd1548470389529c02c5b74a78c660120..bf1144286ca7c28e28282d1914597fbc94a0cfc3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -55,6 +55,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -119,6 +120,13 @@ int arch_update_cpu_topology(void)
 	return retval;
 }
 
+void arch_rebuild_cpu_topology(void)
+{
+	x86_topology_update = true;
+	rebuild_sched_domains();
+	x86_topology_update = false;
+}
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index f31544d3656e3dfb400ce86d51ea46679eafde3b..e906dfc17f7763c7a0402eca502e895175656267 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -735,6 +735,73 @@ int find_acpi_cpu_topology_package(unsigned int cpu)
 					  ACPI_PPTT_PHYSICAL_PACKAGE);
 }
 
+/**
+ * find_acpi_cpu_topology_cluster() - Determine a unique CPU cluster value
+ * @cpu: Kernel logical CPU number
+ *
+ * Determine a topology unique cluster ID for the given CPU/thread.
+ * This ID can then be used to group peers, which will have matching ids.
+ *
+ * The cluster, if present, is the level of topology above CPUs. In a
+ * multi-thread CPU, it will be the level above the CPU, not the thread.
+ * It may not exist in single CPU systems. In simple multi-CPU systems,
+ * it may be equal to the package topology level.
+ *
+ * Return: -ENOENT if the PPTT doesn't exist, the CPU cannot be found
+ * or there is no topology level above the CPU.
+ * Otherwise returns a value which represents the CPU cluster for this CPU.
+ */
+
+int find_acpi_cpu_topology_cluster(unsigned int cpu)
+{
+	struct acpi_table_header *table;
+	acpi_status status;
+	struct acpi_pptt_processor *cpu_node, *cluster_node;
+	u32 acpi_cpu_id;
+	int retval;
+	int is_thread;
+
+	status = acpi_get_table(ACPI_SIG_PPTT, 0, &table);
+	if (ACPI_FAILURE(status)) {
+		acpi_pptt_warn_missing();
+		return -ENOENT;
+	}
+
+	acpi_cpu_id = get_acpi_id_for_cpu(cpu);
+	cpu_node = acpi_find_processor_node(table, acpi_cpu_id);
+	if (cpu_node == NULL || !cpu_node->parent) {
+		retval = -ENOENT;
+		goto put_table;
+	}
+
+	is_thread = cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_IS_THREAD;
+	cluster_node = fetch_pptt_node(table, cpu_node->parent);
+	if (cluster_node == NULL) {
+		retval = -ENOENT;
+		goto put_table;
+	}
+	if (is_thread) {
+		if (!cluster_node->parent) {
+			retval = -ENOENT;
+			goto put_table;
+		}
+		cluster_node = fetch_pptt_node(table, cluster_node->parent);
+		if (cluster_node == NULL) {
+			retval = -ENOENT;
+			goto put_table;
+		}
+	}
+	if (cluster_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID)
+		retval = cluster_node->acpi_processor_id;
+	else
+		retval = ACPI_PTR_DIFF(cluster_node, table);
+
+put_table:
+	acpi_put_table(table);
+
+	return retval;
+}
+
 /**
  * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag
  * @cpu: Kernel logical CPU number
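
For context, the only caller added by this series is the arm64 parse_acpi_topology() hunk earlier in the patch. A stripped-down sketch of that consumption pattern follows; it is illustrative only — the real hunk stores the return value unconditionally, while this sketch adds an explicit guard so the -ENOENT case is visible.

	/* Illustration: how an ACPI topology parser consumes the new helper. */
	static void example_fill_cluster_id(unsigned int cpu)
	{
		int cluster_id = find_acpi_cpu_topology_cluster(cpu);

		/* -ENOENT just means the PPTT has no level above this CPU;
		 * cpu_topology[cpu].cluster_id then keeps its reset value of -1.
		 */
		if (cluster_id >= 0)
			cpu_topology[cpu].cluster_id = cluster_id;
	}
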
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 0dfe20d18b8daa70a54224a948183fd2193ee50e..90c46f53caa414a7f027d80416af662db038f9f1 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -75,6 +75,7 @@
 }
 subsys_initcall(register_cpu_capacity_sysctl);
 
+static u32 capacity_scale;
 static int update_topology;
 
 int topology_update_cpu_topology(void)
@@ -94,7 +95,15 @@ static void update_topology_flags_workfn(struct work_struct *work)
 	update_topology = 0;
 }
 
-static u32 capacity_scale;
+void __weak arch_rebuild_cpu_topology(void)
+{
+	update_topology = 1;
+	rebuild_sched_domains();
+	pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
+	update_topology = 0;
+}
+
+
 static u32 *raw_capacity;
 
 static int free_raw_capacity(void)
@@ -448,6 +457,18 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	return core_mask;
 }
 
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+	/*
+	 * Forbid cpu_clustergroup_mask() to span more or the same CPUs as
+	 * cpu_coregroup_mask().
+	 */
+	if (cpumask_subset(cpu_coregroup_mask(cpu),
+			   &cpu_topology[cpu].cluster_sibling))
+		return topology_sibling_cpumask(cpu);
+	return &cpu_topology[cpu].cluster_sibling;
+}
+
 void update_siblings_masks(unsigned int cpuid)
 {
 	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -465,6 +486,12 @@ void update_siblings_masks(unsigned int cpuid)
 		if (cpuid_topo->package_id != cpu_topo->package_id)
 			continue;
 
+		if (cpuid_topo->cluster_id == cpu_topo->cluster_id &&
+		    cpuid_topo->cluster_id != -1) {
+			cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling);
+			cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling);
+		}
+
 		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
 		cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
 
@@ -483,6 +510,9 @@ static void clear_cpu_topology(int cpu)
 	cpumask_clear(&cpu_topo->llc_sibling);
 	cpumask_set_cpu(cpu, &cpu_topo->llc_sibling);
 
+	cpumask_clear(&cpu_topo->cluster_sibling);
+	cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling);
+
 	cpumask_clear(&cpu_topo->core_sibling);
 	cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
 	cpumask_clear(&cpu_topo->thread_sibling);
@@ -498,6 +528,7 @@ void __init reset_cpu_topology(void)
 
 		cpu_topo->thread_id = -1;
 		cpu_topo->core_id = -1;
+		cpu_topo->cluster_id = -1;
 		cpu_topo->package_id = -1;
 		cpu_topo->llc_id = -1;
 
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 4e033d4cc0dcffcaf3f201e68100afa0f7abcfbb..ea863bcc726941a43f94f0b89f429f692f5292fa 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -46,6 +46,11 @@ static DEVICE_ATTR_RO(physical_package_id);
 define_id_show_func(die_id);
 static DEVICE_ATTR_RO(die_id);
 
+#ifdef TOPOLOGY_CLUSTER_SYSFS
+define_id_show_func(cluster_id);
+static DEVICE_ATTR_RO(cluster_id);
+#endif
+
 define_id_show_func(core_id);
 static DEVICE_ATTR_RO(core_id);
 
@@ -61,6 +66,12 @@ define_siblings_show_func(core_siblings, core_cpumask);
 static DEVICE_ATTR_RO(core_siblings);
 static DEVICE_ATTR_RO(core_siblings_list);
 
+#ifdef TOPOLOGY_CLUSTER_SYSFS
+define_siblings_show_func(cluster_cpus, cluster_cpumask);
+static DEVICE_ATTR_RO(cluster_cpus);
+static DEVICE_ATTR_RO(cluster_cpus_list);
+#endif
+
 define_siblings_show_func(die_cpus, die_cpumask);
 static DEVICE_ATTR_RO(die_cpus);
 static DEVICE_ATTR_RO(die_cpus_list);
@@ -88,6 +99,9 @@ static DEVICE_ATTR_RO(drawer_siblings_list);
 static struct attribute *default_attrs[] = {
 	&dev_attr_physical_package_id.attr,
 	&dev_attr_die_id.attr,
+#ifdef TOPOLOGY_CLUSTER_SYSFS
+	&dev_attr_cluster_id.attr,
+#endif
 	&dev_attr_core_id.attr,
 	&dev_attr_thread_siblings.attr,
 	&dev_attr_thread_siblings_list.attr,
@@ -95,6 +109,10 @@ static struct attribute *default_attrs[] = {
 	&dev_attr_core_cpus_list.attr,
 	&dev_attr_core_siblings.attr,
 	&dev_attr_core_siblings_list.attr,
+#ifdef TOPOLOGY_CLUSTER_SYSFS
+	&dev_attr_cluster_cpus.attr,
+	&dev_attr_cluster_cpus_list.attr,
+#endif
 	&dev_attr_die_cpus.attr,
 	&dev_attr_die_cpus_list.attr,
 	&dev_attr_package_cpus.attr,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index ed68f45c09f739f3c2f7c98f30b7cc66b6fd9bc8..dfc4aa531c7722aab7ba793e4a78d2f4780439ad 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1297,6 +1297,7 @@ static inline int lpit_read_residency_count_address(u64 *address)
 #ifdef CONFIG_ACPI_PPTT
 int acpi_pptt_cpu_is_thread(unsigned int cpu);
 int find_acpi_cpu_topology(unsigned int cpu, int level);
+int find_acpi_cpu_topology_cluster(unsigned int cpu);
 int find_acpi_cpu_topology_package(unsigned int cpu);
 int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
 int find_acpi_cpu_cache_topology(unsigned int cpu, int level);
@@ -1309,6 +1310,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level)
 {
 	return -EINVAL;
 }
+static inline int find_acpi_cpu_topology_cluster(unsigned int cpu)
+{
+	return -EINVAL;
+}
 static inline int find_acpi_cpu_topology_package(unsigned int cpu)
 {
 	return -EINVAL;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 42f2b512609499bebc2f71e75ccca74d420b0a83..9d7a80debe064a006f55e33cac3282728eb3d0d2 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -36,10 +36,12 @@ unsigned long topology_get_freq_scale(int cpu)
 struct cpu_topology {
 	int thread_id;
 	int core_id;
+	int cluster_id;
 	int package_id;
 	int llc_id;
 	cpumask_t thread_sibling;
 	cpumask_t core_sibling;
+	cpumask_t cluster_sibling;
 	cpumask_t llc_sibling;
 };
 
@@ -47,13 +49,16 @@ struct cpu_topology {
 extern struct cpu_topology cpu_topology[NR_CPUS];
 
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].package_id)
+#define topology_cluster_id(cpu)	(cpu_topology[cpu].cluster_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
 #define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)
 #define topology_llc_cpumask(cpu)	(&cpu_topology[cpu].llc_sibling)
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
+const struct cpumask *cpu_clustergroup_mask(int cpu);
 void update_siblings_masks(unsigned int cpu);
 void remove_cpu_topology(unsigned int cpuid);
 void reset_cpu_topology(void);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 6a67c16b436b6aab6b389eea3107c4fa58d21236..96b072b2a74c436a738bd1051bb801904a000074 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -128,4 +128,10 @@ extern int sched_energy_aware_handler(struct ctl_table *table, int write,
 		loff_t *ppos);
 #endif
 
+#ifdef CONFIG_SCHED_CLUSTER
+extern unsigned int sysctl_sched_cluster;
+int sched_cluster_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 4188de2c163163d325b1192fba5b5011d213f3a2..502d6fb4ab9c2a1348c2659aa9e7986729aeb6a2 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -19,13 +19,14 @@
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
 #define SD_ASYM_CPUCAPACITY	0x0040	/* Domain members have different CPU capacities */
 #define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share CPU capacity */
-#define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share CPU pkg resources */
-#define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-#define SD_ASYM_PACKING		0x0800	/* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
-#define SD_NUMA			0x4000	/* cross-node balancing */
+#define SD_CLUSTER		0x0100	/* Domain members share CPU cluster */
+#define SD_SHARE_POWERDOMAIN	0x0200	/* Domain members share power domain */
+#define SD_SHARE_PKG_RESOURCES	0x0400	/* Domain members share CPU pkg resources */
+#define SD_SERIALIZE		0x0800	/* Only a single load balancing instance */
+#define SD_ASYM_PACKING		0x1000	/* Place busy groups earlier in the domain */
+#define SD_PREFER_SIBLING	0x2000	/* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP		0x4000	/* sched_domains of this level overlap */
+#define SD_NUMA			0x8000	/* cross-node balancing */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
@@ -34,6 +35,13 @@ static inline int cpu_smt_flags(void)
 }
 #endif
 
+#ifdef CONFIG_SCHED_CLUSTER
+static inline int cpu_cluster_flags(void)
+{
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
 #ifdef CONFIG_SCHED_MC
 static inline int cpu_core_flags(void)
 {
@@ -66,6 +74,9 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+#ifndef __GENKSYMS__
+	int		nr_idle_scan;
+#endif
 #ifdef CONFIG_ARM64
 } ____cacheline_aligned;
 #else
@@ -173,11 +184,13 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_lowest_cache(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
 
 #define SDTL_OVERLAP	0x01
+#define SDTL_SKIP	0x02
 
 struct sd_data {
 	struct sched_domain *__percpu *sd;
@@ -226,6 +239,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif /* !CONFIG_SMP */
 
 #ifndef arch_scale_cpu_capacity
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 64e7ee0abe71cd12f4aab15e82d33f97eb903755..677a3d1faade4166f98483891afd0277a2684a8a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -44,6 +44,7 @@ if (nr_cpus_node(node))
 
 int arch_update_cpu_topology(void);
+void arch_rebuild_cpu_topology(void);
 
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE	10
@@ -197,12 +198,19 @@ static inline int cpu_to_mem(int cpu)
 
 #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */
 
+#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
+#define TOPOLOGY_CLUSTER_SYSFS
+#endif
+
 #ifndef topology_physical_package_id
 #define topology_physical_package_id(cpu)	((void)(cpu), -1)
 #endif
 #ifndef topology_die_id
 #define topology_die_id(cpu)			((void)(cpu), -1)
 #endif
+#ifndef topology_cluster_id
+#define topology_cluster_id(cpu)		((void)(cpu), -1)
+#endif
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
@@ -212,6 +220,9 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_core_cpumask
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
 #endif
+#ifndef topology_cluster_cpumask
+#define topology_cluster_cpumask(cpu)		cpumask_of(cpu)
+#endif
 #ifndef topology_die_cpumask
 #define topology_die_cpumask(cpu)		cpumask_of(cpu)
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f7eda976364718c9e11299b8a2b7388a0658fdd..dc5f92ac8111a3349ec69a5b176adef5a7577790 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2925,6 +2925,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
+
+/*
+ * Whether the CPUs share the lowest cache, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_lowest_cache(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_lowest_cache_id, this_cpu) ==
+	       per_cpu(sd_lowest_cache_id, that_cpu);
+}
+
 #endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -7759,6 +7773,7 @@ int sched_cpu_dying(unsigned int cpu)
 void __init sched_init_smp(void)
 {
 	sched_init_numa();
+	set_sched_cluster();
 
 	/*
	 * There's no userspace yet to cause hotplug operations; hence all the
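
To illustrate the distinction the new predicate is meant to capture, here is a hypothetical topology walk-through (not from the patch): an 8-CPU LLC split into two 4-CPU clusters, CPUs 0-3 and 4-7.

	/*
	 * Hypothetical example:
	 *
	 *   cpus_share_cache(0, 5)        -> true  (same LLC)
	 *   cpus_share_lowest_cache(0, 5) -> false (different clusters)
	 *   cpus_share_lowest_cache(0, 3) -> true  (same cluster)
	 *
	 * On a machine without a cluster level, sd_lowest_cache_id falls
	 * back to the LLC id, so both helpers give the same answer.
	 */
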
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db15e02baa55042a52f6fd0717c418a2e6605ffe..c9a259fde8a62369c6bf9f47a1b23ab16a984dd4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6117,6 +6117,30 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_domain *sdc =
+			rcu_dereference(per_cpu(sd_cluster, target));
+
+		if (sdc) {
+			for_each_cpu_wrap(core, sched_domain_span(sdc), target) {
+				bool idle = true;
+
+				if (!cpumask_test_cpu(core, cpus))
+					continue;
+
+				for_each_cpu(cpu, cpu_smt_mask(core)) {
+					cpumask_clear_cpu(cpu, cpus);
+					if (!available_idle_cpu(cpu))
+						idle = false;
+				}
+
+				if (idle)
+					return core;
+			}
+			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
+		}
+	}
+
 	for_each_cpu_wrap(core, cpus, target) {
 		bool idle = true;
 
@@ -6190,6 +6214,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	s64 delta;
 	int this = smp_processor_id();
 	int cpu, nr = INT_MAX, si_cpu = -1;
+	struct sched_domain_shared *sd_share;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
@@ -6217,8 +6242,39 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
+	if (sched_feat(SIS_UTIL)) {
+		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+		if (sd_share) {
+			/* because !--nr is the condition to stop scan */
+			nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+			/* overloaded LLC is unlikely to have idle cpu/core */
+			if (nr == 1)
+				return -1;
+		}
+	}
+
+
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_domain *sdc =
+			rcu_dereference(per_cpu(sd_cluster, target));
+
+		if (sdc) {
+			for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) {
+				if (!cpumask_test_cpu(cpu, cpus))
+					continue;
+				if (--nr <= 0)
+					return -1;
+				if (available_idle_cpu(cpu) ||
+				    sched_idle_cpu(cpu))
+					return cpu;
+			}
+			cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
+		}
+	}
+
+
 	for_each_cpu_wrap(cpu, cpus, target) {
-		if (!--nr)
+		if (--nr <= 0)
 			return si_cpu;
 		if (available_idle_cpu(cpu))
 			break;
@@ -6253,7 +6309,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/*
	 * If the previous CPU is cache affine and idle, don't be stupid:
	 */
-	if (prev != target && cpus_share_cache(prev, target) &&
+	if (prev != target && cpus_share_lowest_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
 		return prev;
 
@@ -6261,7 +6317,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	recent_used_cpu = p->recent_used_cpu;
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
+	    cpus_share_lowest_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
 		/*
@@ -8638,6 +8694,77 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static void update_idle_cpu_scan(struct lb_env *env,
+				 unsigned long sum_util)
+{
+	struct sched_domain_shared *sd_share;
+	int llc_weight, pct;
+	u64 x, y, tmp;
+	/*
+	 * Update the number of CPUs to scan in LLC domain, which could
+	 * be used as a hint in select_idle_cpu(). The update of sd_share
+	 * could be expensive because it is within a shared cache line.
+	 * So the write of this hint only occurs during periodic load
+	 * balancing, rather than CPU_NEWLY_IDLE, because the latter
+	 * can fire way more frequently than the former.
+	 */
+	if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+		return;
+
+	llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+	if (env->sd->span_weight != llc_weight)
+		return;
+
+	sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+	if (!sd_share)
+		return;
+
+	/*
+	 * The number of CPUs to search drops as sum_util increases, when
+	 * sum_util hits 85% or above, the scan stops.
+	 * The reason to choose 85% as the threshold is because this is the
+	 * imbalance_pct(117) when a LLC sched group is overloaded.
+	 *
+	 * let y = SCHED_CAPACITY_SCALE - p * x^2                       [1]
+	 * and y'= y / SCHED_CAPACITY_SCALE
+	 *
+	 * x is the ratio of sum_util compared to the CPU capacity:
+	 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+	 * y' is the ratio of CPUs to be scanned in the LLC domain,
+	 * and the number of CPUs to scan is calculated by:
+	 *
+	 * nr_scan = llc_weight * y'                                    [2]
+	 *
+	 * When x hits the threshold of overloaded, AKA, when
+	 * x = 100 / pct, y drops to 0. According to [1],
+	 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+	 *
+	 * Scale x by SCHED_CAPACITY_SCALE:
+	 * x' = sum_util / llc_weight;                                  [3]
+	 *
+	 * and finally [1] becomes:
+	 * y = SCHED_CAPACITY_SCALE -
+	 *     x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE)            [4]
+	 *
+	 */
+	/* equation [3] */
+	x = sum_util;
+	do_div(x, llc_weight);
+
+	/* equation [4] */
+	pct = env->sd->imbalance_pct;
+	tmp = x * x * pct * pct;
+	do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+	tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+	y = SCHED_CAPACITY_SCALE - tmp;
+
+	/* equation [2] */
+	y *= llc_weight;
+	do_div(y, SCHED_CAPACITY_SCALE);
+	if ((int)y != sd_share->nr_idle_scan)
+		WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
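
As a worked example of equations [1]-[4] above, the sketch below re-implements the same integer arithmetic in plain userspace C for a hypothetical 16-CPU LLC with the imbalance_pct of 117 mentioned in the comment. It is not part of the patch; the sample sum_util values and the results in the comments are only meant to show how the scan depth shrinks with load.

	#include <stdio.h>

	#define SCHED_CAPACITY_SCALE 1024

	/* Mirrors update_idle_cpu_scan()'s math for given sum_util/llc_weight/pct. */
	static int nr_idle_scan(unsigned long long sum_util, int llc_weight, int pct)
	{
		unsigned long long x, y, tmp;

		x = sum_util / llc_weight;                      /* equation [3] */
		tmp = x * x * pct * pct;
		tmp /= 10000ULL * SCHED_CAPACITY_SCALE;         /* equation [4] */
		if (tmp > SCHED_CAPACITY_SCALE)
			tmp = SCHED_CAPACITY_SCALE;
		y = SCHED_CAPACITY_SCALE - tmp;

		return (int)(y * llc_weight / SCHED_CAPACITY_SCALE); /* equation [2] */
	}

	int main(void)
	{
		/* sum_util of a 16-CPU LLC at roughly 0%, 50% and 85% utilization */
		printf("%d\n", nr_idle_scan(0, 16, 117));       /* 16: scan the whole LLC */
		printf("%d\n", nr_idle_scan(8192, 16, 117));    /* 10: scan depth shrinks  */
		printf("%d\n", nr_idle_scan(13926, 16, 117));   /* 0: scan effectively off */
		return 0;
	}

In select_idle_cpu() the hint is consumed as nr = nr_idle_scan + 1, so a stored value of 0 makes the SIS_UTIL path bail out immediately, matching the "overloaded LLC" comment in the earlier hunk.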
@@ -8651,6 +8778,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sg_lb_stats tmp_sgs;
 	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 	int sg_status = 0;
+	unsigned long sum_util = 0;
 
 #ifdef CONFIG_NO_HZ_COMMON
 	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
@@ -8704,6 +8832,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
+		sum_util += sgs->group_util;
 
 		sg = sg->next;
 	} while (sg != env->sd->groups);
@@ -8734,6 +8863,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
 		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
 	}
+	update_idle_cpu_scan(env, sum_util);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 66c74aa4753e79c04d4c52d96a37525b514983ef..59e83f5a5719d5449360e86b0b863dcf6a716210 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
 */
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_UTIL, false)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 764a0000df94e0fab30a4d0bb4be547a1c64284b..e9d702e0a94d4260b993cd78448229eda275be7d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1821,6 +1821,13 @@ this_rq_lock_irq(struct rq_flags *rf)
 	return rq;
 }
 
+#ifdef CONFIG_SCHED_CLUSTER
+extern void set_sched_cluster(void);
+#else
+static inline void set_sched_cluster(void) { }
+#endif
+
+
 #ifdef CONFIG_NUMA
 enum numa_topology_type {
 	NUMA_DIRECT,
@@ -1937,11 +1944,14 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_lowest_cache_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
 
 struct sched_group_capacity {
 	atomic_t		ref;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ea53dd8bbf7f9d118458fd6d2b4c71aee6ab9e0b..b25044a6e64be2492f4c6028a06cd2bc63692999 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -618,11 +618,14 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_lowest_cache_id);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -643,6 +646,18 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	sd = lowest_flag_domain(cpu, SD_CLUSTER);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+	rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd);
+
+	/*
+	 * This assignment should be placed after the sd_llc_id because
+	 * we want this id to equal the cluster id on cluster machines
+	 * but the LLC id on non-cluster machines.
+	 */
+	per_cpu(sd_lowest_cache_id, cpu) = id;
+
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
@@ -1306,6 +1321,7 @@ int __read_mostly		node_reclaim_distance = RECLAIM_DISTANCE;
  */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_CLUSTER		|	\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA		|	\
	 SD_ASYM_PACKING	|	\
@@ -1439,6 +1455,11 @@ static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 #endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+
 #ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
@@ -1449,8 +1470,99 @@ static struct sched_domain_topology_level default_topology[] = {
 static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;
 
+#ifdef CONFIG_SCHED_CLUSTER
+void set_sched_cluster(void)
+{
+	struct sched_domain_topology_level *tl;
+
+	for (tl = sched_domain_topology; tl->mask; tl++) {
+		if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) {
+			if (!sysctl_sched_cluster)
+				tl->flags |= SDTL_SKIP;
+			else
+				tl->flags &= ~SDTL_SKIP;
+			break;
+		}
+	}
+}
+
+/* set via /proc/sys/kernel/sched_cluster */
+unsigned int __read_mostly sysctl_sched_cluster;
+
+static DEFINE_MUTEX(sched_cluster_mutex);
+int sched_cluster_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	unsigned int oldval;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&sched_cluster_mutex);
+	oldval = sysctl_sched_cluster;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (!ret && write) {
+		if (oldval != sysctl_sched_cluster) {
+			set_sched_cluster();
+			arch_rebuild_cpu_topology();
+		}
+	}
+	mutex_unlock(&sched_cluster_mutex);
+
+	return ret;
+}
+
+static int zero;
+static int one = 1;
+
+static struct ctl_table sched_cluster_sysctls[] = {
+	{
+		.procname	= "sched_cluster",
+		.data		= &sysctl_sched_cluster,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_cluster_handler,
+		.extra1		= (void *)&zero,
+		.extra2		= (void *)&one,
+	},
+	{}
+};
+
+static int __init sched_cluster_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_cluster_sysctls);
+	return 0;
+}
+late_initcall(sched_cluster_sysctl_init);
+
+static int __init sched_cluster_option(char *str)
+{
+	int enable;
+
+	if (get_option(&str, &enable)) {
+		if (enable != 0 && enable != 1)
+			return -EINVAL;
+
+		sysctl_sched_cluster = enable;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+early_param("sched_cluster", sched_cluster_option);
+#endif
+
+static struct sched_domain_topology_level *
+next_tl(struct sched_domain_topology_level *tl)
+{
+	while (tl->mask && tl->flags & SDTL_SKIP)
+		++tl;
+	return tl;
+}
+
 #define for_each_sd_topology(tl)			\
-	for (tl = sched_domain_topology; tl->mask; tl++)
+	for (tl = next_tl(sched_domain_topology); tl->mask; tl = next_tl(++tl))
 
 void set_sched_topology(struct sched_domain_topology_level *tl)
 {
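
Runtime behaviour implied by the block above: flipping the sysctl re-marks the CLS topology level with SDTL_SKIP via set_sched_cluster() and then rebuilds the sched domains through arch_rebuild_cpu_topology(). A small userspace sketch of toggling it (illustrative only; the sysctl path comes from the comment above):

	/* Illustration: enable cluster scheduling at runtime via the sysctl. */
	#include <stdio.h>

	int main(void)
	{
		const char *path = "/proc/sys/kernel/sched_cluster";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);	/* kernel built without CONFIG_SCHED_CLUSTER? */
			return 1;
		}
		/* 1 enables the CLS sched-domain level, 0 marks it SDTL_SKIP. */
		fputs("1\n", f);
		fclose(f);
		return 0;
	}

The same default can be chosen at boot with sched_cluster=0 or sched_cluster=1, which is parsed by sched_cluster_option() above.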
@@ -1991,6 +2103,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
+	bool has_cluster = false;
 
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
@@ -2018,8 +2131,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 				goto error;
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
-
-			if (tl == sched_domain_topology)
+			has_cluster |= sd->flags & SD_CLUSTER;
+			if (tl == next_tl(sched_domain_topology))
 				*per_cpu_ptr(d.sd, i) = sd;
 			if (tl->flags & SDTL_OVERLAP)
 				sd->flags |= SD_OVERLAP;
@@ -2069,6 +2182,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+	if (has_cluster)
+		static_branch_inc_cpuslocked(&sched_cluster_active);
 
 	if (rq && sched_debug_enabled) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
@@ -2168,6 +2283,8 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
 
 	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
 		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+	if (rcu_access_pointer(per_cpu(sd_cluster, cpu)))
+		static_branch_dec_cpuslocked(&sched_cluster_active);
 
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map