From a3fd4df815400eadbfd5125c8393aa5b103d3498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=81=B0=E6=B5=B7=E5=AE=BD=E6=9D=BE?= <2351290287@qq.com> Date: Sat, 17 Jun 2023 15:31:21 +0000 Subject: [PATCH 01/14] add articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 灰海宽松 <2351290287@qq.com> --- ...ation-content-for-str-and-mem-functions.md | 526 ++++++++++++++++++ 1 file changed, 526 insertions(+) create mode 100644 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md new file mode 100644 index 0000000..2a4bac8 --- /dev/null +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -0,0 +1,526 @@ +> Author: Jingqing 2351290287@qq.com +> Date: 2023/6/17 +> Revisor: +> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux) +> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O) +> Sponsor: PLCT Lab, ISCAS + +# 近半年riscv内核库中str和mem函数的优化内容总结 + +## 简介 + +本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况,主要涉及 Memory, String 操作两大部分。 + +## Memory + +### riscv: optimized mem* functions + +[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) + +对各种mem相关操作函数的优化。 + +#### memcpy + +主要是由“直接逐字节复制”转变为“先对齐再按字复制”。 + +1. 如果仍未启用高效对齐访问CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS,则先在不改变dest和src相对距离的情况下将desc对齐在字边界上。 +2. 如果distance==0说明src和dest两者已经对齐,直接进行(32 or 64 bits)字长复制。 +3. 
如果!=0说明未对齐,按照差值逐字复制。 + +```c ++void *__memcpy(void *dest, const void *src, size_t count) ++{ ++ union const_types s = { .as_u8 = src }; ++ union types d = { .as_u8 = dest }; ++ int distance = 0; ++ ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ if (count < MIN_THRESHOLD) ++ goto copy_remainder; ++ ++ /* Copy a byte at time until destination is aligned. */ ++ for (; d.as_uptr & WORD_MASK; count--) ++ *d.as_u8++ = *s.as_u8++; ++ ++ distance = s.as_uptr & WORD_MASK; ++ } ++ ++ if (distance) { ++ unsigned long last, next; ++ ++ /* ++ * s is distance bytes ahead of d, and d just reached ++ * the alignment boundary. Move s backward to word align it ++ * and shift data to compensate for distance, in order to do ++ * word-by-word copy. ++ */ ++ s.as_u8 -= distance; ++ ++ next = s.as_ulong[0]; ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) { ++ last = next; ++ next = s.as_ulong[1]; ++ ++ d.as_ulong[0] = last >> (distance * 8) | ++ next << ((BYTES_LONG - distance) * 8); ++ ++ d.as_ulong++; ++ s.as_ulong++; ++ } ++ ++ /* Restore s with the original offset. */ ++ s.as_u8 += distance; ++ } else { ++ /* ++ * If the source and dest lower bits are the same, do a simple ++ * 32/64 bit wide copy. 
++ */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *d.as_ulong++ = *s.as_ulong++; ++ } ++ ++copy_remainder: ++ while (count--) ++ *d.as_u8++ = *s.as_u8++; ++ ++ return dest; ++} ++EXPORT_SYMBOL(__memcpy); ++ ++void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy); ++EXPORT_SYMBOL(memcpy); +``` + +#### memmove + +如果dest和src不重叠或者dest src) { ++ const char *s = src + count; ++ char *tmp = dest + count; ++ ++ while (count--) ++ *--tmp = *--s; ++ } ++ return dest; ++} ++EXPORT_SYMBOL(__memmove); ++ ++void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove); ++EXPORT_SYMBOL(memmove); +``` + +#### memset + +旧memset:永远一次一个字节地填充。安全但是效率低。 + +修改后:也是采用对齐机制,先按字节填充,等到和最大填充单位的倍数对齐时按最大填充单位填入。 + +```c ++void *__memset(void *s, int c, size_t count) ++{ ++ union types dest = { .as_u8 = s }; ++ ++ if (count >= MIN_THRESHOLD) { ++ unsigned long cu = (unsigned long)c; ++ ++ /* Compose an ulong with 'c' repeated 4/8 times */ ++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER ++ cu *= 0x0101010101010101UL; ++#else ++ cu |= cu << 8; ++ cu |= cu << 16; ++ /* Suppress warning on 32 bit machines */ ++ cu |= (cu << 16) << 16; ++#endif ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ /* ++ * Fill the buffer one byte at time until ++ * the destination is word aligned. 
++ */ ++ for (; count && dest.as_uptr & WORD_MASK; count--) ++ *dest.as_u8++ = c; ++ } ++ ++ /* Copy using the largest size allowed */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *dest.as_ulong++ = cu; ++ } ++ ++ /* copy the remainder */ ++ while (count--) ++ *dest.as_u8++ = c; ++ ++ return s; ++} ++EXPORT_SYMBOL(__memset); ++ ++void *memset(void *s, int c, size_t count) __weak __alias(__memset); ++EXPORT_SYMBOL(memset); +``` + +### riscv: lib: optimize memcmp with ld insn + +[riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/) + +这笔优化发到了 v3, 但是 Maintainer 反馈了一些编译问题,没有看到作者提交新的版本。 + +这笔优化的核心代码和解读如下: + +旧代码: + +``` +sb a1, 0(t0) +addi t0, t0, 1 +bltu t0, a3, 5b +``` + +新代码: + +``` +/* fill head and tail with minimal branching */ +sb a1, 0(t0) +sb a1, -1(a3) +li a4, 2 +bgeu a4, a2, 6f + +sb a1, 1(t0) +sb a1, 2(t0) +sb a1, -2(a3) +sb a1, -3(a3) +li a4, 6 +bgeu a4, a2, 6f + +/* + * Adding additional detection to avoid + * redundant stores can lead + * to better performance + */ +sb a1, 3(t0) +sb a1, -4(a3) +li a4, 8 +bgeu a4, a2, 6f + +sb a1, 4(t0) +sb a1, -5(a3) +li a4, 10 +bgeu a4, a2, 6f + +sb a1, 5(t0) +sb a1, 6(t0) +sb a1, -6(a3) +sb a1, -7(a3) +li a4, 14 +bgeu a4, a2, 6f + +/* store the last byte */ +sb a1, 7(t0) +``` + +主要的改动如下: + +1. 将旧代码中的一行 `addi t0, t0, 1` 替换为一系列新的存储指令,用于填充头部和尾部。新代码中的存储指令是以一定的间隔连续存储数据。 +2. 添加了额外的条件检测和分支,以避免重复存储,这可能会提高性能。 +3. 添加了一行 `li a4, 2` 来设置一个常数,用于条件比较。 +4. 添加了 `6f` 标签,用于跳转到代码的结尾。 + +它的核心优化思路是用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 + +### RISC-V: Apply Zicboz to clear_page and memset + +[RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/) + +引入Zicboz扩展后,Zicboz块大小的内存自然对齐。因此要对接收任意内存块地址和大小的memset()来清空内存的方法进行优化。 + +分析发现当输入的地址未对齐或者太小时,Zicboz中的memset会显得效率低一些(多了几十条指令)。 + +1. 
首先检查是否启用了CONFIG_RISCV_ISA_ZICBOZ来判断是否使用Zicboz扩展。如果不使用Zicboz扩展或者传入的参数不适合使用Zicboz扩展,则代码会跳转到.Ldo_memset标签处执行内存清零的逻辑。 +2. 如果使用Zicboz扩展进行内存清零,代码会将地址和长度进行对齐,并使用Zicboz扩展的指令进行内存清零操作。 +3. 在进行Zicboz扩展内存清零时,如果还有一些字节无法使用Zicboz扩展一次性清零,则会使用Duff's设备来处理剩余的字节。 + +### RISC-V: Optimize memset for data sizes less than 16 bytes + +[RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) ... + +在上述memset优化的基础上继续进行。 + +大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据,memset 使用字节存储,效率相对低。 改进方案决定用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 + +```c ++void *__memset(void *s, int c, size_t count) ++{ ++ union types dest = { .as_u8 = s }; ++ ++ if (count >= MIN_THRESHOLD) { ++ unsigned long cu = (unsigned long)c; ++ ++ /* Compose an ulong with 'c' repeated 4/8 times */ ++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER ++ cu *= 0x0101010101010101UL; ++#else ++ cu |= cu << 8; ++ cu |= cu << 16; ++ /* Suppress warning on 32 bit machines */ ++ cu |= (cu << 16) << 16;//8bits的c复制4次来构造unsigned long的cu ++#endif ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ /* ++ * Fill the buffer one byte at time until ++ * the destination is word aligned. 
++ */ ++ for (; count && dest.as_uptr & WORD_MASK; count--) ++ *dest.as_u8++ = c;//逐字节填充对应地址中的值=c ++ } ++ ++ /* Copy using the largest size allowed */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu ++ } ++ ++ /* copy the remainder */ ++ while (count--) ++ *dest.as_u8++ = c;//剩余值全部设置为c ++ ++ return s; ++} ++EXPORT_SYMBOL(__memset); ++ ++void *memset(void *s, int c, size_t count) __weak __alias(__memset); ++EXPORT_SYMBOL(memset); +``` + +## String +### Zbb string optimizations + +[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) + +主要是为zbb提供了通用的一些字符串支持,后续特定用法优化拓展需要单独实现。 + +- 为Zbb系统添加了允许未对齐访问的strcmp,strncmp,strlen以及生成相应makefile文件。 + +- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义,简化。 + + ```c + -#define CPUFEATURE_SVPBMT 0 + -#define CPUFEATURE_ZICBOM 1 + -#define CPUFEATURE_ZBB 2 + +#define CPUFEATURE_SVPBMT (1 << 0) + +#define CPUFEATURE_ZICBOM (1 << 1) + +#define CPUFEATURE_ZBB (1 << 2) + ``` + +### Zbb+ fast-unaligned string optimization + +[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ... + +添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体,在无法生效的情况下退回到通用情况。 + +#### strcmp_zbb + +检查两个字符串是否对齐到SZREG的边界。如果是,则以SZREG为单位比较两个字符串中的内容。如果不是,则按字节读取。 + +```c ++/* ++ * Variant of strcmp using the ZBB extension if available ++ */ ++#ifdef CONFIG_RISCV_ISA_ZBB ++strcmp_zbb: ++ ++.option push ++.option arch,+zbb ++ ++ /* ++ * Returns ++ * a0 - comparison result, value like strcmp ++ * ++ * Parameters ++ * a0 - string1 ++ * a1 - string2 ++ * ++ * Clobbers ++ * t0, t1, t2, t3, t4, t5 ++ */ ++ ++ or t2, a0, a1 ++ li t4, -1 ++ and t2, t2, SZREG-1 ++ bnez t2, 3f ++ ++ /* Main loop for aligned string. */ ++ .p2align 3 ++1: ++ REG_L t0, 0(a0) ++ REG_L t1, 0(a1) ++ orc.b t3, t0 ++ bne t3, t4, 2f ++ addi a0, a0, SZREG ++ addi a1, a1, SZREG ++ beq t0, t1, 1b ++ ++ /* ++ * Words don't match, and no null byte in the first ++ * word. 
Get bytes in big-endian order and compare. ++ */ ++#ifndef CONFIG_CPU_BIG_ENDIAN ++ rev8 t0, t0 ++ rev8 t1, t1 ++#endif ++ ++ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ ++ sltu a0, t0, t1 ++ neg a0, a0 ++ ori a0, a0, 1 ++ ret ++ ++2: ++ /* ++ * Found a null byte. ++ * If words don't match, fall back to simple loop. ++ */ ++ bne t0, t1, 3f ++ ++ /* Otherwise, strings are equal. */ ++ li a0, 0 ++ ret ++ ++ /* Simple loop for misaligned strings. */ ++ .p2align 3 ++3: ++ lbu t0, 0(a0) ++ lbu t1, 0(a1) ++ addi a0, a0, 1 ++ addi a1, a1, 1 ++ bne t0, t1, 4f ++ bnez t0, 3b ++ ++4: ++ sub a0, t0, t1 ++ ret ++ ++.option pop ++#endif +``` + +#### strlen_zbb + +启用CONFIG_RISCV_ISA_ZBB的前提下,移位对齐字符后从头开始以SZREG为单位读取,并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。 + +```c ++#ifdef CONFIG_RISCV_ISA_ZBB ++strlen_zbb: ++ ++#ifdef CONFIG_CPU_BIG_ENDIAN ++# define CZ clz ++# define SHIFT sll ++#else ++# define CZ ctz ++# define SHIFT srl ++#endif ++ ++.option push ++.option arch,+zbb ++ ++ /* ++ * Returns ++ * a0 - string length ++ * ++ * Parameters ++ * a0 - String to measure ++ * ++ * Clobbers ++ * t0, t1, t2, t3 ++ */ ++ ++ /* Number of irrelevant bytes in the first word. */ ++ andi t2, a0, SZREG-1 ++ ++ /* Align pointer. */ ++ andi t0, a0, -SZREG ++ ++ li t3, SZREG ++ sub t3, t3, t2 ++ slli t2, t2, 3 ++ ++ /* Get the first word. */ ++ REG_L t1, 0(t0) ++ ++ /* ++ * Shift away the partial data we loaded to remove the irrelevant bytes ++ * preceding the string with the effect of adding NUL bytes at the ++ * end of the string's first word. ++ */ ++ SHIFT t1, t1, t2 ++ ++ /* Convert non-NUL into 0xff and NUL into 0x00. */ ++ orc.b t1, t1 ++ ++ /* Convert non-NUL into 0x00 and NUL into 0xff. */ ++ not t1, t1 ++ ++ /* ++ * Search for the first set bit (corresponding to a NUL byte in the ++ * original chunk). ++ */ ++ CZ t1, t1 ++ ++ /* ++ * The first chunk is special: compare against the number ++ * of valid bytes in this chunk. 
++ */ ++ srli a0, t1, 3 ++ bgtu t3, a0, 3f ++ ++ /* Prepare for the word comparison loop. */ ++ addi t2, t0, SZREG ++ li t3, -1 ++ ++ /* ++ * Our critical loop is 4 instructions and processes data in ++ * 4 byte or 8 byte chunks. ++ */ ++ .p2align 3 ++1: ++ REG_L t1, SZREG(t0) ++ addi t0, t0, SZREG ++ orc.b t1, t1 ++ beq t1, t3, 1b ++2: ++ not t1, t1 ++ CZ t1, t1 ++ ++ /* Get number of processed words. */ ++ sub t2, t0, t2 ++ ++ /* Add number of characters in the first word. */ ++ add a0, a0, t2 ++ srli t1, t1, 3 ++ ++ /* Add number of characters in the last word. */ ++ add a0, a0, t1 ++3: ++ ret ++ ++.option pop ++#endif +``` + +## 总结 + +以上梳理了memory和strcmp相关优化代码,可以发现: + +memory相关优化方法主要有两点:通过连续存储减少条件分支及其跳转次数,减少判断上的时间;以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分,批量操作一定程度上提高效率。 + +string对于zbb支持部分的函数优化,主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架,之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数,以此优化部分情况下的zbb中str相关函数的使用效率。 + +接下来将按照 Memory, String, 数据运算,其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读,敬请期待。 \ No newline at end of file -- Gitee From 2afb67d4cba266bfbd5e659dc8d204a5e39e936b Mon Sep 17 00:00:00 2001 From: Jingqing3948 <2351290287@qq.com> Date: Sun, 18 Jun 2023 13:13:15 +0800 Subject: [PATCH 02/14] 20230618-after using tinycorrect --- ...ation-content-for-str-and-mem-functions.md | 1051 +++++++++-------- 1 file changed, 526 insertions(+), 525 deletions(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index 2a4bac8..a0ee3e5 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -1,526 +1,527 @@ -> Author: Jingqing 2351290287@qq.com -> Date: 2023/6/17 -> Revisor: -> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux) -> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 
泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O) -> Sponsor: PLCT Lab, ISCAS - -# 近半年riscv内核库中str和mem函数的优化内容总结 - -## 简介 - -本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况,主要涉及 Memory, String 操作两大部分。 - -## Memory - -### riscv: optimized mem* functions - -[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) - -对各种mem相关操作函数的优化。 - -#### memcpy - -主要是由“直接逐字节复制”转变为“先对齐再按字复制”。 - -1. 如果仍未启用高效对齐访问CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS,则先在不改变dest和src相对距离的情况下将desc对齐在字边界上。 -2. 如果distance==0说明src和dest两者已经对齐,直接进行(32 or 64 bits)字长复制。 -3. 如果!=0说明未对齐,按照差值逐字复制。 - -```c -+void *__memcpy(void *dest, const void *src, size_t count) -+{ -+ union const_types s = { .as_u8 = src }; -+ union types d = { .as_u8 = dest }; -+ int distance = 0; -+ -+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { -+ if (count < MIN_THRESHOLD) -+ goto copy_remainder; -+ -+ /* Copy a byte at time until destination is aligned. */ -+ for (; d.as_uptr & WORD_MASK; count--) -+ *d.as_u8++ = *s.as_u8++; -+ -+ distance = s.as_uptr & WORD_MASK; -+ } -+ -+ if (distance) { -+ unsigned long last, next; -+ -+ /* -+ * s is distance bytes ahead of d, and d just reached -+ * the alignment boundary. Move s backward to word align it -+ * and shift data to compensate for distance, in order to do -+ * word-by-word copy. -+ */ -+ s.as_u8 -= distance; -+ -+ next = s.as_ulong[0]; -+ for (; count >= BYTES_LONG; count -= BYTES_LONG) { -+ last = next; -+ next = s.as_ulong[1]; -+ -+ d.as_ulong[0] = last >> (distance * 8) | -+ next << ((BYTES_LONG - distance) * 8); -+ -+ d.as_ulong++; -+ s.as_ulong++; -+ } -+ -+ /* Restore s with the original offset. */ -+ s.as_u8 += distance; -+ } else { -+ /* -+ * If the source and dest lower bits are the same, do a simple -+ * 32/64 bit wide copy. 
-+ */ -+ for (; count >= BYTES_LONG; count -= BYTES_LONG) -+ *d.as_ulong++ = *s.as_ulong++; -+ } -+ -+copy_remainder: -+ while (count--) -+ *d.as_u8++ = *s.as_u8++; -+ -+ return dest; -+} -+EXPORT_SYMBOL(__memcpy); -+ -+void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy); -+EXPORT_SYMBOL(memcpy); -``` - -#### memmove - -如果dest和src不重叠或者dest src) { -+ const char *s = src + count; -+ char *tmp = dest + count; -+ -+ while (count--) -+ *--tmp = *--s; -+ } -+ return dest; -+} -+EXPORT_SYMBOL(__memmove); -+ -+void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove); -+EXPORT_SYMBOL(memmove); -``` - -#### memset - -旧memset:永远一次一个字节地填充。安全但是效率低。 - -修改后:也是采用对齐机制,先按字节填充,等到和最大填充单位的倍数对齐时按最大填充单位填入。 - -```c -+void *__memset(void *s, int c, size_t count) -+{ -+ union types dest = { .as_u8 = s }; -+ -+ if (count >= MIN_THRESHOLD) { -+ unsigned long cu = (unsigned long)c; -+ -+ /* Compose an ulong with 'c' repeated 4/8 times */ -+#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER -+ cu *= 0x0101010101010101UL; -+#else -+ cu |= cu << 8; -+ cu |= cu << 16; -+ /* Suppress warning on 32 bit machines */ -+ cu |= (cu << 16) << 16; -+#endif -+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { -+ /* -+ * Fill the buffer one byte at time until -+ * the destination is word aligned. 
-+ */ -+ for (; count && dest.as_uptr & WORD_MASK; count--) -+ *dest.as_u8++ = c; -+ } -+ -+ /* Copy using the largest size allowed */ -+ for (; count >= BYTES_LONG; count -= BYTES_LONG) -+ *dest.as_ulong++ = cu; -+ } -+ -+ /* copy the remainder */ -+ while (count--) -+ *dest.as_u8++ = c; -+ -+ return s; -+} -+EXPORT_SYMBOL(__memset); -+ -+void *memset(void *s, int c, size_t count) __weak __alias(__memset); -+EXPORT_SYMBOL(memset); -``` - -### riscv: lib: optimize memcmp with ld insn - -[riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/) - -这笔优化发到了 v3, 但是 Maintainer 反馈了一些编译问题,没有看到作者提交新的版本。 - -这笔优化的核心代码和解读如下: - -旧代码: - -``` -sb a1, 0(t0) -addi t0, t0, 1 -bltu t0, a3, 5b -``` - -新代码: - -``` -/* fill head and tail with minimal branching */ -sb a1, 0(t0) -sb a1, -1(a3) -li a4, 2 -bgeu a4, a2, 6f - -sb a1, 1(t0) -sb a1, 2(t0) -sb a1, -2(a3) -sb a1, -3(a3) -li a4, 6 -bgeu a4, a2, 6f - -/* - * Adding additional detection to avoid - * redundant stores can lead - * to better performance - */ -sb a1, 3(t0) -sb a1, -4(a3) -li a4, 8 -bgeu a4, a2, 6f - -sb a1, 4(t0) -sb a1, -5(a3) -li a4, 10 -bgeu a4, a2, 6f - -sb a1, 5(t0) -sb a1, 6(t0) -sb a1, -6(a3) -sb a1, -7(a3) -li a4, 14 -bgeu a4, a2, 6f - -/* store the last byte */ -sb a1, 7(t0) -``` - -主要的改动如下: - -1. 将旧代码中的一行 `addi t0, t0, 1` 替换为一系列新的存储指令,用于填充头部和尾部。新代码中的存储指令是以一定的间隔连续存储数据。 -2. 添加了额外的条件检测和分支,以避免重复存储,这可能会提高性能。 -3. 添加了一行 `li a4, 2` 来设置一个常数,用于条件比较。 -4. 添加了 `6f` 标签,用于跳转到代码的结尾。 - -它的核心优化思路是用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 - -### RISC-V: Apply Zicboz to clear_page and memset - -[RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/) - -引入Zicboz扩展后,Zicboz块大小的内存自然对齐。因此要对接收任意内存块地址和大小的memset()来清空内存的方法进行优化。 - -分析发现当输入的地址未对齐或者太小时,Zicboz中的memset会显得效率低一些(多了几十条指令)。 - -1. 
首先检查是否启用了CONFIG_RISCV_ISA_ZICBOZ来判断是否使用Zicboz扩展。如果不使用Zicboz扩展或者传入的参数不适合使用Zicboz扩展,则代码会跳转到.Ldo_memset标签处执行内存清零的逻辑。 -2. 如果使用Zicboz扩展进行内存清零,代码会将地址和长度进行对齐,并使用Zicboz扩展的指令进行内存清零操作。 -3. 在进行Zicboz扩展内存清零时,如果还有一些字节无法使用Zicboz扩展一次性清零,则会使用Duff's设备来处理剩余的字节。 - -### RISC-V: Optimize memset for data sizes less than 16 bytes - -[RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) ... - -在上述memset优化的基础上继续进行。 - -大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据,memset 使用字节存储,效率相对低。 改进方案决定用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 - -```c -+void *__memset(void *s, int c, size_t count) -+{ -+ union types dest = { .as_u8 = s }; -+ -+ if (count >= MIN_THRESHOLD) { -+ unsigned long cu = (unsigned long)c; -+ -+ /* Compose an ulong with 'c' repeated 4/8 times */ -+#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER -+ cu *= 0x0101010101010101UL; -+#else -+ cu |= cu << 8; -+ cu |= cu << 16; -+ /* Suppress warning on 32 bit machines */ -+ cu |= (cu << 16) << 16;//8bits的c复制4次来构造unsigned long的cu -+#endif -+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { -+ /* -+ * Fill the buffer one byte at time until -+ * the destination is word aligned. 
-+ */ -+ for (; count && dest.as_uptr & WORD_MASK; count--) -+ *dest.as_u8++ = c;//逐字节填充对应地址中的值=c -+ } -+ -+ /* Copy using the largest size allowed */ -+ for (; count >= BYTES_LONG; count -= BYTES_LONG) -+ *dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu -+ } -+ -+ /* copy the remainder */ -+ while (count--) -+ *dest.as_u8++ = c;//剩余值全部设置为c -+ -+ return s; -+} -+EXPORT_SYMBOL(__memset); -+ -+void *memset(void *s, int c, size_t count) __weak __alias(__memset); -+EXPORT_SYMBOL(memset); -``` - -## String -### Zbb string optimizations - -[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) - -主要是为zbb提供了通用的一些字符串支持,后续特定用法优化拓展需要单独实现。 - -- 为Zbb系统添加了允许未对齐访问的strcmp,strncmp,strlen以及生成相应makefile文件。 - -- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义,简化。 - - ```c - -#define CPUFEATURE_SVPBMT 0 - -#define CPUFEATURE_ZICBOM 1 - -#define CPUFEATURE_ZBB 2 - +#define CPUFEATURE_SVPBMT (1 << 0) - +#define CPUFEATURE_ZICBOM (1 << 1) - +#define CPUFEATURE_ZBB (1 << 2) - ``` - -### Zbb+ fast-unaligned string optimization - -[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ... - -添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体,在无法生效的情况下退回到通用情况。 - -#### strcmp_zbb - -检查两个字符串是否对齐到SZREG的边界。如果是,则以SZREG为单位比较两个字符串中的内容。如果不是,则按字节读取。 - -```c -+/* -+ * Variant of strcmp using the ZBB extension if available -+ */ -+#ifdef CONFIG_RISCV_ISA_ZBB -+strcmp_zbb: -+ -+.option push -+.option arch,+zbb -+ -+ /* -+ * Returns -+ * a0 - comparison result, value like strcmp -+ * -+ * Parameters -+ * a0 - string1 -+ * a1 - string2 -+ * -+ * Clobbers -+ * t0, t1, t2, t3, t4, t5 -+ */ -+ -+ or t2, a0, a1 -+ li t4, -1 -+ and t2, t2, SZREG-1 -+ bnez t2, 3f -+ -+ /* Main loop for aligned string. */ -+ .p2align 3 -+1: -+ REG_L t0, 0(a0) -+ REG_L t1, 0(a1) -+ orc.b t3, t0 -+ bne t3, t4, 2f -+ addi a0, a0, SZREG -+ addi a1, a1, SZREG -+ beq t0, t1, 1b -+ -+ /* -+ * Words don't match, and no null byte in the first -+ * word. 
Get bytes in big-endian order and compare. -+ */ -+#ifndef CONFIG_CPU_BIG_ENDIAN -+ rev8 t0, t0 -+ rev8 t1, t1 -+#endif -+ -+ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ -+ sltu a0, t0, t1 -+ neg a0, a0 -+ ori a0, a0, 1 -+ ret -+ -+2: -+ /* -+ * Found a null byte. -+ * If words don't match, fall back to simple loop. -+ */ -+ bne t0, t1, 3f -+ -+ /* Otherwise, strings are equal. */ -+ li a0, 0 -+ ret -+ -+ /* Simple loop for misaligned strings. */ -+ .p2align 3 -+3: -+ lbu t0, 0(a0) -+ lbu t1, 0(a1) -+ addi a0, a0, 1 -+ addi a1, a1, 1 -+ bne t0, t1, 4f -+ bnez t0, 3b -+ -+4: -+ sub a0, t0, t1 -+ ret -+ -+.option pop -+#endif -``` - -#### strlen_zbb - -启用CONFIG_RISCV_ISA_ZBB的前提下,移位对齐字符后从头开始以SZREG为单位读取,并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。 - -```c -+#ifdef CONFIG_RISCV_ISA_ZBB -+strlen_zbb: -+ -+#ifdef CONFIG_CPU_BIG_ENDIAN -+# define CZ clz -+# define SHIFT sll -+#else -+# define CZ ctz -+# define SHIFT srl -+#endif -+ -+.option push -+.option arch,+zbb -+ -+ /* -+ * Returns -+ * a0 - string length -+ * -+ * Parameters -+ * a0 - String to measure -+ * -+ * Clobbers -+ * t0, t1, t2, t3 -+ */ -+ -+ /* Number of irrelevant bytes in the first word. */ -+ andi t2, a0, SZREG-1 -+ -+ /* Align pointer. */ -+ andi t0, a0, -SZREG -+ -+ li t3, SZREG -+ sub t3, t3, t2 -+ slli t2, t2, 3 -+ -+ /* Get the first word. */ -+ REG_L t1, 0(t0) -+ -+ /* -+ * Shift away the partial data we loaded to remove the irrelevant bytes -+ * preceding the string with the effect of adding NUL bytes at the -+ * end of the string's first word. -+ */ -+ SHIFT t1, t1, t2 -+ -+ /* Convert non-NUL into 0xff and NUL into 0x00. */ -+ orc.b t1, t1 -+ -+ /* Convert non-NUL into 0x00 and NUL into 0xff. */ -+ not t1, t1 -+ -+ /* -+ * Search for the first set bit (corresponding to a NUL byte in the -+ * original chunk). -+ */ -+ CZ t1, t1 -+ -+ /* -+ * The first chunk is special: compare against the number -+ * of valid bytes in this chunk. 
-+ */ -+ srli a0, t1, 3 -+ bgtu t3, a0, 3f -+ -+ /* Prepare for the word comparison loop. */ -+ addi t2, t0, SZREG -+ li t3, -1 -+ -+ /* -+ * Our critical loop is 4 instructions and processes data in -+ * 4 byte or 8 byte chunks. -+ */ -+ .p2align 3 -+1: -+ REG_L t1, SZREG(t0) -+ addi t0, t0, SZREG -+ orc.b t1, t1 -+ beq t1, t3, 1b -+2: -+ not t1, t1 -+ CZ t1, t1 -+ -+ /* Get number of processed words. */ -+ sub t2, t0, t2 -+ -+ /* Add number of characters in the first word. */ -+ add a0, a0, t2 -+ srli t1, t1, 3 -+ -+ /* Add number of characters in the last word. */ -+ add a0, a0, t1 -+3: -+ ret -+ -+.option pop -+#endif -``` - -## 总结 - -以上梳理了memory和strcmp相关优化代码,可以发现: - -memory相关优化方法主要有两点:通过连续存储减少条件分支及其跳转次数,减少判断上的时间;以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分,批量操作一定程度上提高效率。 - -string对于zbb支持部分的函数优化,主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架,之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数,以此优化部分情况下的zbb中str相关函数的使用效率。 - +> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [tounix]
+> Author: Jingqing 2351290287@qq.com +> Date: 2023/6/17 +> Revisor: +> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux) +> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O) +> Sponsor: PLCT Lab, ISCAS + +# 近半年riscv内核库中str和mem函数的优化内容总结 + +## 简介 + +本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况,主要涉及 Memory, String 操作两大部分。 + +## Memory + +### riscv: optimized mem* functions + +[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) + +对各种mem相关操作函数的优化。 + +#### memcpy + +主要是由“直接逐字节复制”转变为“先对齐再按字复制”。 + +1. 如果未启用高效非对齐访问CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS,则先逐字节复制,在不改变dest和src相对距离的情况下将dest对齐在字边界上。 +2. 如果distance==0说明src和dest两者已经对齐,直接进行(32 or 64 bits)字长复制。 +3. 如果distance!=0说明src未对齐,则将src回退到字边界,按字读取后根据distance移位拼接,再逐字写入dest。 + +```c ++void *__memcpy(void *dest, const void *src, size_t count) ++{ ++ union const_types s = { .as_u8 = src }; ++ union types d = { .as_u8 = dest }; ++ int distance = 0; ++ ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ if (count < MIN_THRESHOLD) ++ goto copy_remainder; ++ ++ /* Copy a byte at time until destination is aligned. */ ++ for (; d.as_uptr & WORD_MASK; count--) ++ *d.as_u8++ = *s.as_u8++; ++ ++ distance = s.as_uptr & WORD_MASK; ++ } ++ ++ if (distance) { ++ unsigned long last, next; ++ ++ /* ++ * s is distance bytes ahead of d, and d just reached ++ * the alignment boundary. Move s backward to word align it ++ * and shift data to compensate for distance, in order to do ++ * word-by-word copy. ++ */ ++ s.as_u8 -= distance; ++ ++ next = s.as_ulong[0]; ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) { ++ last = next; ++ next = s.as_ulong[1]; ++ ++ d.as_ulong[0] = last >> (distance * 8) | ++ next << ((BYTES_LONG - distance) * 8); ++ ++ d.as_ulong++; ++ s.as_ulong++; ++ } ++ ++ /* Restore s with the original offset. 
*/ ++ s.as_u8 += distance; ++ } else { ++ /* ++ * If the source and dest lower bits are the same, do a simple ++ * 32/64 bit wide copy. ++ */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *d.as_ulong++ = *s.as_ulong++; ++ } ++ ++copy_remainder: ++ while (count--) ++ *d.as_u8++ = *s.as_u8++; ++ ++ return dest; ++} ++EXPORT_SYMBOL(__memcpy); ++ ++void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy); ++EXPORT_SYMBOL(memcpy); +``` + +#### memmove + +如果dest和src不重叠或者dest < src,可以直接调用memcpy从前向后复制;否则(dest > src且两者重叠)需要从尾部开始由后向前逐字节复制,避免尚未复制的源数据被覆盖。 + +```c ++void *__memmove(void *dest, const void *src, size_t count) ++{ ++ if (dest < src || src + count <= dest) ++ return __memcpy(dest, src, count); ++ ++ if (dest > src) { ++ const char *s = src + count; ++ char *tmp = dest + count; ++ ++ while (count--) ++ *--tmp = *--s; ++ } ++ return dest; ++} ++EXPORT_SYMBOL(__memmove); ++ ++void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove); ++EXPORT_SYMBOL(memmove); +``` + +#### memset + +旧memset:永远一次一个字节地填充。安全但是效率低。 + +修改后:也是采用对齐机制,先按字节填充,等到和最大填充单位的倍数对齐时按最大填充单位填入。 + +```c ++void *__memset(void *s, int c, size_t count) ++{ ++ union types dest = { .as_u8 = s }; ++ ++ if (count >= MIN_THRESHOLD) { ++ unsigned long cu = (unsigned long)c; ++ ++ /* Compose an ulong with 'c' repeated 4/8 times */ ++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER ++ cu *= 0x0101010101010101UL; ++#else ++ cu |= cu << 8; ++ cu |= cu << 16; ++ /* Suppress warning on 32 bit machines */ ++ cu |= (cu << 16) << 16; ++#endif ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ /* ++ * Fill the buffer one byte at time until ++ * the destination is word aligned. 
++ */ ++ for (; count && dest.as_uptr & WORD_MASK; count--) ++ *dest.as_u8++ = c; ++ } ++ ++ /* Copy using the largest size allowed */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *dest.as_ulong++ = cu; ++ } ++ ++ /* copy the remainder */ ++ while (count--) ++ *dest.as_u8++ = c; ++ ++ return s; ++} ++EXPORT_SYMBOL(__memset); ++ ++void *memset(void *s, int c, size_t count) __weak __alias(__memset); ++EXPORT_SYMBOL(memset); +``` + +### riscv: lib: optimize memcmp with ld insn + +[riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/) + +这笔优化发到了 v3, 但是 Maintainer 反馈了一些编译问题,没有看到作者提交新的版本。 + +这笔优化的核心代码和解读如下: + +旧代码: + +``` +sb a1, 0(t0) +addi t0, t0, 1 +bltu t0, a3, 5b +``` + +新代码: + +``` +/* fill head and tail with minimal branching */ +sb a1, 0(t0) +sb a1, -1(a3) +li a4, 2 +bgeu a4, a2, 6f + +sb a1, 1(t0) +sb a1, 2(t0) +sb a1, -2(a3) +sb a1, -3(a3) +li a4, 6 +bgeu a4, a2, 6f + +/* + * Adding additional detection to avoid + * redundant stores can lead + * to better performance + */ +sb a1, 3(t0) +sb a1, -4(a3) +li a4, 8 +bgeu a4, a2, 6f + +sb a1, 4(t0) +sb a1, -5(a3) +li a4, 10 +bgeu a4, a2, 6f + +sb a1, 5(t0) +sb a1, 6(t0) +sb a1, -6(a3) +sb a1, -7(a3) +li a4, 14 +bgeu a4, a2, 6f + +/* store the last byte */ +sb a1, 7(t0) +``` + +主要的改动如下: + +1. 将旧代码中的一行 `addi t0, t0, 1` 替换为一系列新的存储指令,用于填充头部和尾部。新代码中的存储指令是以一定的间隔连续存储数据。 +2. 添加了额外的条件检测和分支,以避免重复存储,这可能会提高性能。 +3. 添加了一行 `li a4, 2` 来设置一个常数,用于条件比较。 +4. 添加了 `6f` 标签,用于跳转到代码的结尾。 + +它的核心优化思路是用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 + +### RISC-V: Apply Zicboz to clear_page and memset + +[RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/) + +引入Zicboz扩展后,Zicboz块大小的内存自然对齐。因此要对接收任意内存块地址和大小的memset()来清空内存的方法进行优化。 + +分析发现当输入的地址未对齐或者太小时,Zicboz中的memset会显得效率低一些(多了几十条指令)。 + +1. 
首先检查是否启用了CONFIG_RISCV_ISA_ZICBOZ来判断是否使用Zicboz扩展。如果不使用Zicboz扩展或者传入的参数不适合使用Zicboz扩展,则代码会跳转到.Ldo_memset标签处执行内存清零的逻辑。 +2. 如果使用Zicboz扩展进行内存清零,代码会将地址和长度进行对齐,并使用Zicboz扩展的指令进行内存清零操作。 +3. 在进行Zicboz扩展内存清零时,如果还有一些字节无法使用Zicboz扩展一次性清零,则会使用Duff's设备来处理剩余的字节。 + +### RISC-V: Optimize memset for data sizes less than 16 bytes + +[RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) ... + +在上述memset优化的基础上继续进行。 + +大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据,memset 使用字节存储,效率相对低。 改进方案决定用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 + +```c ++void *__memset(void *s, int c, size_t count) ++{ ++ union types dest = { .as_u8 = s }; ++ ++ if (count >= MIN_THRESHOLD) { ++ unsigned long cu = (unsigned long)c; ++ ++ /* Compose an ulong with 'c' repeated 4/8 times */ ++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER ++ cu *= 0x0101010101010101UL; ++#else ++ cu |= cu << 8; ++ cu |= cu << 16; ++ /* Suppress warning on 32 bit machines */ ++ cu |= (cu << 16) << 16;//8bits的c复制4次来构造unsigned long的cu ++#endif ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ /* ++ * Fill the buffer one byte at time until ++ * the destination is word aligned. 
++ */ ++ for (; count && dest.as_uptr & WORD_MASK; count--) ++ *dest.as_u8++ = c;//逐字节填充对应地址中的值=c ++ } ++ ++ /* Copy using the largest size allowed */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu ++ } ++ ++ /* copy the remainder */ ++ while (count--) ++ *dest.as_u8++ = c;//剩余值全部设置为c ++ ++ return s; ++} ++EXPORT_SYMBOL(__memset); ++ ++void *memset(void *s, int c, size_t count) __weak __alias(__memset); ++EXPORT_SYMBOL(memset); +``` + +## String +### Zbb string optimizations + +[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) + +主要是为zbb提供了通用的一些字符串支持,后续特定用法优化拓展需要单独实现。 + +- 为Zbb系统添加了允许未对齐访问的strcmp,strncmp,strlen以及生成相应makefile文件。 + +- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义,简化。 + + ```c + -#define CPUFEATURE_SVPBMT 0 + -#define CPUFEATURE_ZICBOM 1 + -#define CPUFEATURE_ZBB 2 + +#define CPUFEATURE_SVPBMT (1 << 0) + +#define CPUFEATURE_ZICBOM (1 << 1) + +#define CPUFEATURE_ZBB (1 << 2) + ``` + +### Zbb+ fast-unaligned string optimization + +[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ... + +添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体,在无法生效的情况下退回到通用情况。 + +#### strcmp_zbb + +检查两个字符串是否对齐到SZREG的边界。如果是,则以SZREG为单位比较两个字符串中的内容。如果不是,则按字节读取。 + +```c ++/* ++ * Variant of strcmp using the ZBB extension if available ++ */ ++#ifdef CONFIG_RISCV_ISA_ZBB ++strcmp_zbb: ++ ++.option push ++.option arch,+zbb ++ ++ /* ++ * Returns ++ * a0 - comparison result, value like strcmp ++ * ++ * Parameters ++ * a0 - string1 ++ * a1 - string2 ++ * ++ * Clobbers ++ * t0, t1, t2, t3, t4, t5 ++ */ ++ ++ or t2, a0, a1 ++ li t4, -1 ++ and t2, t2, SZREG-1 ++ bnez t2, 3f ++ ++ /* Main loop for aligned string. */ ++ .p2align 3 ++1: ++ REG_L t0, 0(a0) ++ REG_L t1, 0(a1) ++ orc.b t3, t0 ++ bne t3, t4, 2f ++ addi a0, a0, SZREG ++ addi a1, a1, SZREG ++ beq t0, t1, 1b ++ ++ /* ++ * Words don't match, and no null byte in the first ++ * word. 
Get bytes in big-endian order and compare. ++ */ ++#ifndef CONFIG_CPU_BIG_ENDIAN ++ rev8 t0, t0 ++ rev8 t1, t1 ++#endif ++ ++ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ ++ sltu a0, t0, t1 ++ neg a0, a0 ++ ori a0, a0, 1 ++ ret ++ ++2: ++ /* ++ * Found a null byte. ++ * If words don't match, fall back to simple loop. ++ */ ++ bne t0, t1, 3f ++ ++ /* Otherwise, strings are equal. */ ++ li a0, 0 ++ ret ++ ++ /* Simple loop for misaligned strings. */ ++ .p2align 3 ++3: ++ lbu t0, 0(a0) ++ lbu t1, 0(a1) ++ addi a0, a0, 1 ++ addi a1, a1, 1 ++ bne t0, t1, 4f ++ bnez t0, 3b ++ ++4: ++ sub a0, t0, t1 ++ ret ++ ++.option pop ++#endif +``` + +#### strlen_zbb + +启用CONFIG_RISCV_ISA_ZBB的前提下,移位对齐字符后从头开始以SZREG为单位读取,并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。 + +```c ++#ifdef CONFIG_RISCV_ISA_ZBB ++strlen_zbb: ++ ++#ifdef CONFIG_CPU_BIG_ENDIAN ++# define CZ clz ++# define SHIFT sll ++#else ++# define CZ ctz ++# define SHIFT srl ++#endif ++ ++.option push ++.option arch,+zbb ++ ++ /* ++ * Returns ++ * a0 - string length ++ * ++ * Parameters ++ * a0 - String to measure ++ * ++ * Clobbers ++ * t0, t1, t2, t3 ++ */ ++ ++ /* Number of irrelevant bytes in the first word. */ ++ andi t2, a0, SZREG-1 ++ ++ /* Align pointer. */ ++ andi t0, a0, -SZREG ++ ++ li t3, SZREG ++ sub t3, t3, t2 ++ slli t2, t2, 3 ++ ++ /* Get the first word. */ ++ REG_L t1, 0(t0) ++ ++ /* ++ * Shift away the partial data we loaded to remove the irrelevant bytes ++ * preceding the string with the effect of adding NUL bytes at the ++ * end of the string's first word. ++ */ ++ SHIFT t1, t1, t2 ++ ++ /* Convert non-NUL into 0xff and NUL into 0x00. */ ++ orc.b t1, t1 ++ ++ /* Convert non-NUL into 0x00 and NUL into 0xff. */ ++ not t1, t1 ++ ++ /* ++ * Search for the first set bit (corresponding to a NUL byte in the ++ * original chunk). ++ */ ++ CZ t1, t1 ++ ++ /* ++ * The first chunk is special: compare against the number ++ * of valid bytes in this chunk. 
++ */ ++ srli a0, t1, 3 ++ bgtu t3, a0, 3f ++ ++ /* Prepare for the word comparison loop. */ ++ addi t2, t0, SZREG ++ li t3, -1 ++ ++ /* ++ * Our critical loop is 4 instructions and processes data in ++ * 4 byte or 8 byte chunks. ++ */ ++ .p2align 3 ++1: ++ REG_L t1, SZREG(t0) ++ addi t0, t0, SZREG ++ orc.b t1, t1 ++ beq t1, t3, 1b ++2: ++ not t1, t1 ++ CZ t1, t1 ++ ++ /* Get number of processed words. */ ++ sub t2, t0, t2 ++ ++ /* Add number of characters in the first word. */ ++ add a0, a0, t2 ++ srli t1, t1, 3 ++ ++ /* Add number of characters in the last word. */ ++ add a0, a0, t1 ++3: ++ ret ++ ++.option pop ++#endif +``` + +## 总结 + +以上梳理了memory和strcmp相关优化代码,可以发现: + +memory相关优化方法主要有两点:通过连续存储减少条件分支及其跳转次数,减少判断上的时间;以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分,批量操作一定程度上提高效率。 + +string对于zbb支持部分的函数优化,主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架,之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数,以此优化部分情况下的zbb中str相关函数的使用效率。 + 接下来将按照 Memory, String, 数据运算,其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读,敬请期待。 \ No newline at end of file -- Gitee From 4057a2c1d19e021edea01955d81a7d29795fb483 Mon Sep 17 00:00:00 2001 From: Jingqing3948 <2351290287@qq.com> Date: Sun, 18 Jun 2023 13:20:33 +0800 Subject: [PATCH 03/14] tinycorrect2 --- ...ation-content-for-str-and-mem-functions.md | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index a0ee3e5..60a6a33 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -1,10 +1,10 @@ -> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [tounix]
-> Author: Jingqing 2351290287@qq.com -> Date: 2023/6/17 -> Revisor: -> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux) -> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O) -> Sponsor: PLCT Lab, ISCAS +> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [spaces header toc]
+> Author: Jingqing 2351290287@qq.com
+> Date: 2023/6/17
+> Revisor: Falcon
+> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux)
+> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)
+> Sponsor: PLCT Lab, ISCAS # 近半年riscv内核库中str和mem函数的优化内容总结 @@ -203,9 +203,9 @@ sb a1, -3(a3) li a4, 6 bgeu a4, a2, 6f -/* - * Adding additional detection to avoid - * redundant stores can lead +/* + * Adding additional detection to avoid + * redundant stores can lead * to better performance */ sb a1, 3(t0) @@ -256,7 +256,7 @@ sb a1, 7(t0) 在上述memset优化的基础上继续进行。 -大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据,memset 使用字节存储,效率相对低。 改进方案决定用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 +大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据,memset 使用字节存储,效率相对低。改进方案决定用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 ```c +void *__memset(void *s, int c, size_t count) @@ -302,6 +302,7 @@ sb a1, 7(t0) ``` ## String + ### Zbb string optimizations [Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) @@ -524,4 +525,4 @@ memory相关优化方法主要有两点:通过连续存储减少条件分支 string对于zbb支持部分的函数优化,主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架,之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数,以此优化部分情况下的zbb中str相关函数的使用效率。 -接下来将按照 Memory, String, 数据运算,其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读,敬请期待。 \ No newline at end of file +接下来将按照 Memory, String, 数据运算,其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读,敬请期待。 -- Gitee From e9cf47f9dbec7751d5700af228f896ac5aedb25c Mon Sep 17 00:00:00 2001 From: Jingqing3948 <2351290287@qq.com> Date: Sun, 18 Jun 2023 13:23:09 +0800 Subject: [PATCH 04/14] using tinycorrect --- ...ation-content-for-str-and-mem-functions.md | 73 ++++++++++--------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index 60a6a33..92c367b 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -1,4 +1,4 @@ -> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [spaces header toc]
+> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [urls pangu autocorrect]
> Author: Jingqing 2351290287@qq.com
> Date: 2023/6/17
> Revisor: Falcon
@@ -6,7 +6,7 @@ > Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)
> Sponsor: PLCT Lab, ISCAS -# 近半年riscv内核库中str和mem函数的优化内容总结 +# 近半年 RISC-V 内核库中 str 和 mem 函数的优化内容总结 ## 简介 @@ -16,17 +16,17 @@ ### riscv: optimized mem* functions -[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) +[riscv: optimized mem* functions][002] -对各种mem相关操作函数的优化。 +对各种 mem 相关操作函数的优化。 #### memcpy 主要是由“直接逐字节复制”转变为“先对齐再按字复制”。 -1. 如果仍未启用高效对齐访问CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS,则先在不改变dest和src相对距离的情况下将desc对齐在字边界上。 -2. 如果distance==0说明src和dest两者已经对齐,直接进行(32 or 64 bits)字长复制。 -3. 如果!=0说明未对齐,按照差值逐字复制。 +1. 如果仍未启用高效对齐访问 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS,则先在不改变 dest 和 src 相对距离的情况下将 desc 对齐在字边界上。 +2. 如果 distance==0 说明 src 和 dest 两者已经对齐,直接进行(32 or 64 bits)字长复制。 +3. 如果!=0 说明未对齐,按照差值逐字复制。 ```c +void *__memcpy(void *dest, const void *src, size_t count) @@ -94,7 +94,7 @@ #### memmove -如果dest和src不重叠或者dest= BYTES_LONG; count -= BYTES_LONG) -+ *dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu ++ *dest.as_ulong++ = cu;//BYTES_LONG 的整数倍部分复制为 cu + } + + /* copy the remainder */ + while (count--) -+ *dest.as_u8++ = c;//剩余值全部设置为c ++ *dest.as_u8++ = c;//剩余值全部设置为 c + + return s; +} @@ -305,13 +305,13 @@ sb a1, 7(t0) ### Zbb string optimizations -[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) +[Zbb string optimizations][001] -主要是为zbb提供了通用的一些字符串支持,后续特定用法优化拓展需要单独实现。 +主要是为 zbb 提供了通用的一些字符串支持,后续特定用法优化拓展需要单独实现。 -- 为Zbb系统添加了允许未对齐访问的strcmp,strncmp,strlen以及生成相应makefile文件。 +- 为 Zbb 系统添加了允许未对齐访问的 strcmp,strncmp,strlen 以及生成相应 makefile 文件。 -- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义,简化。 +- 用位域而不是数字代替 CPU 的补丁拓展 errata-id 的宏定义,简化。 ```c -#define CPUFEATURE_SVPBMT 0 @@ -324,13 +324,13 @@ sb a1, 7(t0) ### Zbb+ fast-unaligned string optimization -[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ... +[Zbb + fast-unaligned string optimization][005] ... 
-添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体,在无法生效的情况下退回到通用情况。 +添加多个 strcmp 变体用于快速比较非对齐访问。优先使用效率高的优化变体,在无法生效的情况下退回到通用情况。 #### strcmp_zbb -检查两个字符串是否对齐到SZREG的边界。如果是,则以SZREG为单位比较两个字符串中的内容。如果不是,则按字节读取。 +检查两个字符串是否对齐到 SZREG 的边界。如果是,则以 SZREG 为单位比较两个字符串中的内容。如果不是,则按字节读取。 ```c +/* @@ -416,7 +416,7 @@ sb a1, 7(t0) #### strlen_zbb -启用CONFIG_RISCV_ISA_ZBB的前提下,移位对齐字符后从头开始以SZREG为单位读取,并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。 +启用 CONFIG_RISCV_ISA_ZBB 的前提下,移位对齐字符后从头开始以 SZREG 为单位读取,并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。 ```c +#ifdef CONFIG_RISCV_ISA_ZBB @@ -519,10 +519,17 @@ sb a1, 7(t0) ## 总结 -以上梳理了memory和strcmp相关优化代码,可以发现: +以上梳理了 memory 和 strcmp 相关优化代码,可以发现: -memory相关优化方法主要有两点:通过连续存储减少条件分支及其跳转次数,减少判断上的时间;以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分,批量操作一定程度上提高效率。 +memory 相关优化方法主要有两点:通过连续存储减少条件分支及其跳转次数,减少判断上的时间;以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分,批量操作一定程度上提高效率。 -string对于zbb支持部分的函数优化,主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架,之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数,以此优化部分情况下的zbb中str相关函数的使用效率。 +string 对于 zbb 支持部分的函数优化,主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架,之后又提出了优化对齐方式下按 SZREG 块单位执行函数的优化方案。当优化方案不适用时再使用通用函数,以此优化部分情况下的 zbb 中 str 相关函数的使用效率。 接下来将按照 Memory, String, 数据运算,其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读,敬请期待。 + +[001]: https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/ +[002]: https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/ +[003]: https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/ +[004]: https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/ +[005]: https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/ +[006]: https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/ -- Gitee From e920c45f40e10e47eda932accf956a05388b2d4c Mon Sep 17 00:00:00 2001 From: falcon Date: Tue, 20 Jun 2023 13:22:35 +0000 Subject: [PATCH 05/14] Update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md --- 
...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index 92c367b..e571a45 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -10,7 +10,7 @@ ## 简介 -本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况,主要涉及 Memory, String 操作两大部分。 +本文结合 <https://lore.kernel.org/linux-riscv/> 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况,主要涉及 Memory, String 操作两大部分。 ## Memory -- Gitee From f3a545eb345e3a1c92db3a2c682a661ac736c372 Mon Sep 17 00:00:00 2001 From: falcon Date: Tue, 20 Jun 2023 13:22:46 +0000 Subject: [PATCH 06/14] Update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md --- ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index e571a45..e8ffd5d 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -18,7 +18,7 @@ [riscv: optimized mem* functions][002] -对各种 mem 相关操作函数的优化。 +该组 patchset 对各种 mem 相关操作函数进行了优化,以下逐个分析。 #### memcpy -- Gitee From b612cf1a8d7a1d928956736e3948ec4f80de4320 Mon Sep 17 00:00:00 2001 From: falcon Date: Tue, 20 Jun 2023 13:22:55 +0000 Subject: [PATCH 07/14] Update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md --- ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index e8ffd5d..2126d39 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -26,7 +26,7 @@ 1. 如果仍未启用高效对齐访问 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS,则先在不改变 dest 和 src 相对距离的情况下将 desc 对齐在字边界上。 2. 如果 distance==0 说明 src 和 dest 两者已经对齐,直接进行(32 or 64 bits)字长复制。 -3. 如果!=0 说明未对齐,按照差值逐字复制。 +3. 如果 `distance !=0` 说明未对齐,按照差值逐字复制。 ```c +void *__memcpy(void *dest, const void *src, size_t count) -- Gitee From 31f9f6aaf501b00954da8376b4f15d53781efc3f Mon Sep 17 00:00:00 2001 From: falcon Date: Tue, 20 Jun 2023 13:23:04 +0000 Subject: [PATCH 08/14] Update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md --- ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index 2126d39..a4b7ddd 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -94,7 +94,7 @@ #### memmove -如果 dest 和 src 不重叠或者 dest Date: Tue, 20 Jun 2023 13:23:13 +0000 Subject: [PATCH 09/14] Update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md --- ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index a4b7ddd..874dd40 100644 --- 
a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -25,7 +25,7 @@ 主要是由“直接逐字节复制”转变为“先对齐再按字复制”。 1. 如果仍未启用高效对齐访问 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS,则先在不改变 dest 和 src 相对距离的情况下将 desc 对齐在字边界上。 -2. 如果 distance==0 说明 src 和 dest 两者已经对齐,直接进行(32 or 64 bits)字长复制。 +2. 如果 `distance==0` 说明 src 和 dest 两者已经对齐,直接进行(32 or 64 bits)字长复制。 3. 如果 `distance !=0` 说明未对齐,按照差值逐字复制。 ```c -- Gitee From 7818b4078bfa17054d0251a6bc2bc470e07a61c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=81=B0=E6=B5=B7=E5=AE=BD=E6=9D=BE?= <2351290287@qq.com> Date: Tue, 20 Jun 2023 18:05:44 +0000 Subject: [PATCH 10/14] update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md. add example code to 004 and 005 examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 灰海宽松 <2351290287@qq.com> --- ...ation-content-for-str-and-mem-functions.md | 88 +++++++++++++++++-- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index 874dd40..854cf5c 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -250,6 +250,60 @@ sb a1, 7(t0) 2. 如果使用 Zicboz 扩展进行内存清零,代码会将地址和长度进行对齐,并使用 Zicboz 扩展的指令进行内存清零操作。 3. 在进行 Zicboz 扩展内存清零时,如果还有一些字节无法使用 Zicboz 扩展一次性清零,则会使用 Duff's 设备来处理剩余的字节。 +```c ++#ifdef CONFIG_RISCV_ISA_ZICBOZ ++ ALT_ZICBOZ("j .Ldo_memset", "nop") ++ /* ++ * t1 will be the Zicboz block size. 
++ * Zero means we're not using Zicboz, and we don't when a1 != 0 ++ */ ++ li t1, 0 ++ bnez a1, .Ldo_memset ++ la a3, riscv_cboz_block_size ++ lw t1, 0(a3) ++ ++ /* ++ * Round to nearest Zicboz block-aligned address ++ * greater than or equal to the start address. ++ */ ++ addi a3, t1, -1 ++ not t2, a3 /* t2 is Zicboz block size mask */ ++ add a3, t0, a3 ++ and t3, a3, t2 /* t3 is Zicboz block aligned start */ ++ ++ /* Did we go too far or not have at least one block? */ ++ add a3, a0, a2 ++ and a3, a3, t2 ++ bgtu a3, t3, .Ldo_zero ++ li t1, 0 ++ j .Ldo_memset ++ ++.Ldo_zero: ++ /* Use Duff for initial bytes if there are any */ ++ bne t3, t0, .Ldo_memset ++ ++.Ldo_zero2: ++ /* Calculate end address */ ++ and a3, a2, t2 ++ add a3, t0, a3 ++ sub a4, a3, t0 ++ ++.Lzero_loop: ++ CBO_ZERO(t0) ++ add t0, t0, t1 ++ bltu t0, a3, .Lzero_loop ++ li t1, 0 /* We're done with Zicboz */ ++ ++ sub a2, a2, a4 /* Update count */ ++ sltiu a3, a2, 16 ++ bnez a3, .Lfinish ++ ++ /* t0 is Zicboz block size aligned, so it must be SZREG aligned */ ++ j .Ldo_duff3 ++#endif ++ +``` + ### RISC-V: Optimize memset for data sizes less than 16 bytes [RISC-V: Optimize memset for data sizes less than 16 bytes][006] ... 
@@ -328,6 +382,26 @@ sb a1, 7(t0) 添加多个 strcmp 变体用于快速比较非对齐访问。优先使用效率高的优化变体,在无法生效的情况下退回到通用情况。 +```c ++static bool __init_or_module cpufeature_probe_fast_unaligned(unsigned int stage) ++{ ++ int cpu; ++ ++ if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) ++ return false; ++ ++ for_each_possible_cpu(cpu) { ++ long perf = per_cpu(misaligned_access_speed, cpu); ++ ++ if (perf != RISCV_HWPROBE_MISALIGNED_FAST) ++ return false; ++ } ++ ++ return true; ++} ++ +``` + #### strcmp_zbb 检查两个字符串是否对齐到 SZREG 的边界。如果是,则以 SZREG 为单位比较两个字符串中的内容。如果不是,则按字节读取。 @@ -527,9 +601,11 @@ string 对于 zbb 支持部分的函数优化,主要是先提供通用支持 接下来将按照 Memory, String, 数据运算,其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读,敬请期待。 -[001]: https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/ -[002]: https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/ -[003]: https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/ -[004]: https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/ -[005]: https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/ -[006]: https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/ +## 参考资料 + +- [001]: [https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) +- [002]: [https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) +- [003]: [https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/) +- [004]: [https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/) +- [005]: 
[https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) +- [006]: [https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) -- Gitee From e48b8e421e3e2978181928cfdf767c26bd8bd4ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=81=B0=E6=B5=B7=E5=AE=BD=E6=9D=BE?= <2351290287@qq.com> Date: Tue, 20 Jun 2023 18:14:00 +0000 Subject: [PATCH 11/14] update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md. modify the form of reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 灰海宽松 <2351290287@qq.com> --- ...optimization-content-for-str-and-mem-functions.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index 854cf5c..14020ee 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -603,9 +603,9 @@ string 对于 zbb 支持部分的函数优化,主要是先提供通用支持 ## 参考资料 -- [001]: [https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) -- [002]: [https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) -- [003]: [https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/) -- [004]: 
[https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/) -- [005]: [https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) -- [006]: [https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) +- 001: [Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) +- 002: [riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) +- 003: [riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/) +- 004: [RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/) +- 005: [Zbb+ fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) +- 006: [RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) -- Gitee From 2029cf6932d92a386eb27250fd58f41416e29552 Mon Sep 17 00:00:00 2001 From: falcon Date: Wed, 21 Jun 2023 02:54:35 +0000 Subject: [PATCH 12/14] Update articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md --- ...ation-content-for-str-and-mem-functions.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md index 14020ee..340704d 100644 --- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md +++ 
b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md @@ -603,9 +603,16 @@ string 对于 zbb 支持部分的函数优化,主要是先提供通用支持 ## 参考资料 -- 001: [Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/) -- 002: [riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/) -- 003: [riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/) -- 004: [RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/) -- 005: [Zbb+ fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) -- 006: [RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) +- [Zbb string optimizations][001] +- [riscv: optimized mem* functions][002] +- [riscv: lib: optimize memcmp with ld insn][003] +- [RISC-V: Apply Zicboz to clear_page and memset][004] +- [Zbb+ fast-unaligned string optimization][005] +- [RISC-V: Optimize memset for data sizes less than 16 bytes][006] + +[001]: https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/ +[002]: https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/ +[003]: https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/ +[004]: https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/ +[005]: https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/ +[006]: https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/ -- Gitee From f4011249834a410b4985be1daa5ded2577797574 Mon Sep 17 00:00:00 2001 From: Jingqing3948 <2351290287@qq.com> Date: Wed, 21 Jun 2023 15:20:11 +0800 Subject: [PATCH 13/14] change file name --- ...mem-functions.md => 
20230617-riscv-kernel-libc-opt-summary.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename articles/{20230617-summary-of-optimization-content-for-str-and-mem-functions.md => 20230617-riscv-kernel-libc-opt-summary.md} (100%) diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-riscv-kernel-libc-opt-summary.md similarity index 100% rename from articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md rename to articles/20230617-riscv-kernel-libc-opt-summary.md -- Gitee From 08ef519efeb235f5f487033070f9560a0b0e018b Mon Sep 17 00:00:00 2001 From: Jingqing3948 <2351290287@qq.com> Date: Wed, 21 Jun 2023 15:25:40 +0800 Subject: [PATCH 14/14] change filename --- ...el-libc-opt-summary.md => 20230617-riscv-klibc-opt-summary.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename articles/{20230617-riscv-kernel-libc-opt-summary.md => 20230617-riscv-klibc-opt-summary.md} (100%) diff --git a/articles/20230617-riscv-kernel-libc-opt-summary.md b/articles/20230617-riscv-klibc-opt-summary.md similarity index 100% rename from articles/20230617-riscv-kernel-libc-opt-summary.md rename to articles/20230617-riscv-klibc-opt-summary.md -- Gitee