From a3fd4df815400eadbfd5125c8393aa5b103d3498 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=81=B0=E6=B5=B7=E5=AE=BD=E6=9D=BE?= <2351290287@qq.com>
Date: Sat, 17 Jun 2023 15:31:21 +0000
Subject: [PATCH 01/14] add
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 灰海宽松 <2351290287@qq.com>
---
 ...ation-content-for-str-and-mem-functions.md | 526 ++++++++++++++++++
 1 file changed, 526 insertions(+)
 create mode 100644 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
new file mode 100644
index 0000000..2a4bac8
--- /dev/null
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -0,0 +1,526 @@
+> Author: Jingqing 2351290287@qq.com
+> Date: 2023/6/17
+> Revisor: 
+> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux)
+> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)
+> Sponsor: PLCT Lab, ISCAS
+
+# 近半年riscv内核库中str和mem函数的优化内容总结
+
+## 简介
+
+本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况，主要涉及 Memory, String 操作两大部分。
+
+## Memory
+
+### riscv: optimized mem* functions
+
+[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
+
+对各种mem相关操作函数的优化。
+
+#### memcpy
+
+主要是由“直接逐字节复制”转变为“先对齐再按字复制”。
+
+1. 如果仍未启用高效对齐访问CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS，则先在不改变dest和src相对距离的情况下将desc对齐在字边界上。
+2. 如果distance==0说明src和dest两者已经对齐，直接进行（32 or 64 bits）字长复制。
+3. 如果!=0说明未对齐，按照差值逐字复制。
+
+```c
++void *__memcpy(void *dest, const void *src, size_t count)
++{
++	union const_types s = { .as_u8 = src };
++	union types d = { .as_u8 = dest };
++	int distance = 0;
++
++	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
++		if (count < MIN_THRESHOLD)
++			goto copy_remainder;
++
++		/* Copy a byte at time until destination is aligned. */
++		for (; d.as_uptr & WORD_MASK; count--)
++			*d.as_u8++ = *s.as_u8++;
++
++		distance = s.as_uptr & WORD_MASK;
++	}
++
++	if (distance) {
++		unsigned long last, next;
++
++		/*
++		 * s is distance bytes ahead of d, and d just reached
++		 * the alignment boundary. Move s backward to word align it
++		 * and shift data to compensate for distance, in order to do
++		 * word-by-word copy.
++		 */
++		s.as_u8 -= distance;
++
++		next = s.as_ulong[0];
++		for (; count >= BYTES_LONG; count -= BYTES_LONG) {
++			last = next;
++			next = s.as_ulong[1];
++
++			d.as_ulong[0] = last >> (distance * 8) |
++					next << ((BYTES_LONG - distance) * 8);
++
++			d.as_ulong++;
++			s.as_ulong++;
++		}
++
++		/* Restore s with the original offset. */
++		s.as_u8 += distance;
++	} else {
++		/*
++		 * If the source and dest lower bits are the same, do a simple
++		 * 32/64 bit wide copy.
++		 */
++		for (; count >= BYTES_LONG; count -= BYTES_LONG)
++			*d.as_ulong++ = *s.as_ulong++;
++	}
++
++copy_remainder:
++	while (count--)
++		*d.as_u8++ = *s.as_u8++;
++
++	return dest;
++}
++EXPORT_SYMBOL(__memcpy);
++
++void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
++EXPORT_SYMBOL(memcpy);
+```
+
+#### memmove
+
+如果dest和src不重叠或者dest<src，可以直接用memcpy（dest<src我的理解是从低地址往高地址复制，哪怕两者重叠也不会受干扰）。
+
+```c
++
++/*
++ * Simply check if the buffer overlaps an call memcpy() in case,
++ * otherwise do a simple one byte at time backward copy.
++ */
++void *__memmove(void *dest, const void *src, size_t count)
++{
++	if (dest < src || src + count <= dest)
++		return __memcpy(dest, src, count);
++
++	if (dest > src) {
++		const char *s = src + count;
++		char *tmp = dest + count;
++
++		while (count--)
++			*--tmp = *--s;
++	}
++	return dest;
++}
++EXPORT_SYMBOL(__memmove);
++
++void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove);
++EXPORT_SYMBOL(memmove);
+```
+
+#### memset
+
+旧memset：永远一次一个字节地填充。安全但是效率低。
+
+修改后：也是采用对齐机制，先按字节填充，等到和最大填充单位的倍数对齐时按最大填充单位填入。
+
+```c
++void *__memset(void *s, int c, size_t count)
++{
++	union types dest = { .as_u8 = s };
++
++	if (count >= MIN_THRESHOLD) {
++		unsigned long cu = (unsigned long)c;
++
++		/* Compose an ulong with 'c' repeated 4/8 times */
++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
++		cu *= 0x0101010101010101UL;
++#else
++		cu |= cu << 8;
++		cu |= cu << 16;
++		/* Suppress warning on 32 bit machines */
++		cu |= (cu << 16) << 16;
++#endif
++		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
++			/*
++			 * Fill the buffer one byte at time until
++			 * the destination is word aligned.
++			 */
++			for (; count && dest.as_uptr & WORD_MASK; count--)
++				*dest.as_u8++ = c;
++		}
++
++		/* Copy using the largest size allowed */
++		for (; count >= BYTES_LONG; count -= BYTES_LONG)
++			*dest.as_ulong++ = cu;
++	}
++
++	/* copy the remainder */
++	while (count--)
++		*dest.as_u8++ = c;
++
++	return s;
++}
++EXPORT_SYMBOL(__memset);
++
++void *memset(void *s, int c, size_t count) __weak __alias(__memset);
++EXPORT_SYMBOL(memset);
+```
+
+### riscv: lib: optimize memcmp with ld insn
+
+[riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
+
+这笔优化发到了 v3, 但是 Maintainer 反馈了一些编译问题，没有看到作者提交新的版本。
+
+这笔优化的核心代码和解读如下：
+
+旧代码：
+
+```
+sb  a1, 0(t0)
+addi  t0, t0, 1
+bltu  t0, a3, 5b
+```
+
+新代码：
+
+```
+/* fill head and tail with minimal branching */
+sb      a1,  0(t0)
+sb      a1, -1(a3)
+li  a4, 2
+bgeu  a4, a2, 6f
+
+sb  a1,  1(t0)
+sb  a1,  2(t0)
+sb  a1, -2(a3)
+sb  a1, -3(a3)
+li  a4, 6
+bgeu  a4, a2, 6f
+
+/* 
+ * Adding additional detection to avoid 
+ * redundant stores can lead 
+ * to better performance
+ */
+sb  a1,  3(t0)
+sb  a1, -4(a3)
+li  a4, 8
+bgeu  a4, a2, 6f
+
+sb  a1,  4(t0)
+sb  a1, -5(a3)
+li  a4, 10
+bgeu  a4, a2, 6f
+
+sb  a1,  5(t0)
+sb  a1,  6(t0)
+sb  a1, -6(a3)
+sb  a1, -7(a3)
+li  a4, 14
+bgeu  a4, a2, 6f
+
+/* store the last byte */
+sb  a1,  7(t0)
+```
+
+主要的改动如下：
+
+1. 将旧代码中的一行 `addi t0, t0, 1` 替换为一系列新的存储指令，用于填充头部和尾部。新代码中的存储指令是以一定的间隔连续存储数据。
+2. 添加了额外的条件检测和分支，以避免重复存储，这可能会提高性能。
+3. 添加了一行 `li a4, 2` 来设置一个常数，用于条件比较。
+4. 添加了 `6f` 标签，用于跳转到代码的结尾。
+
+它的核心优化思路是用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
+
+### RISC-V: Apply Zicboz to clear_page and memset
+
+[RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
+
+引入Zicboz扩展后，Zicboz块大小的内存自然对齐。因此要对接收任意内存块地址和大小的memset()来清空内存的方法进行优化。
+
+分析发现当输入的地址未对齐或者太小时，Zicboz中的memset会显得效率低一些（多了几十条指令）。
+
+1. 首先检查是否启用了CONFIG_RISCV_ISA_ZICBOZ来判断是否使用Zicboz扩展。如果不使用Zicboz扩展或者传入的参数不适合使用Zicboz扩展，则代码会跳转到.Ldo_memset标签处执行内存清零的逻辑。
+2. 如果使用Zicboz扩展进行内存清零，代码会将地址和长度进行对齐，并使用Zicboz扩展的指令进行内存清零操作。
+3. 在进行Zicboz扩展内存清零时，如果还有一些字节无法使用Zicboz扩展一次性清零，则会使用Duff's设备来处理剩余的字节。
+
+### RISC-V: Optimize memset for data sizes less than 16 bytes
+
+[RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) ...
+
+在上述memset优化的基础上继续进行。
+
+大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据，memset 使用字节存储，效率相对低。 改进方案决定用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
+
+```c
++void *__memset(void *s, int c, size_t count)
++{
++	union types dest = { .as_u8 = s };
++
++	if (count >= MIN_THRESHOLD) {
++		unsigned long cu = (unsigned long)c;
++
++		/* Compose an ulong with 'c' repeated 4/8 times */
++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
++		cu *= 0x0101010101010101UL;
++#else
++		cu |= cu << 8;
++		cu |= cu << 16;
++		/* Suppress warning on 32 bit machines */
++		cu |= (cu << 16) << 16;//8bits的c复制4次来构造unsigned long的cu
++#endif
++		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
++			/*
++			 * Fill the buffer one byte at time until
++			 * the destination is word aligned.
++			 */
++			for (; count && dest.as_uptr & WORD_MASK; count--)
++				*dest.as_u8++ = c;//逐字节填充对应地址中的值=c
++		}
++
++		/* Copy using the largest size allowed */
++		for (; count >= BYTES_LONG; count -= BYTES_LONG)
++			*dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu
++	}
++
++	/* copy the remainder */
++	while (count--)
++		*dest.as_u8++ = c;//剩余值全部设置为c
++
++	return s;
++}
++EXPORT_SYMBOL(__memset);
++
++void *memset(void *s, int c, size_t count) __weak __alias(__memset);
++EXPORT_SYMBOL(memset);
+```
+
+## String
+### Zbb string optimizations
+
+[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
+
+主要是为zbb提供了通用的一些字符串支持，后续特定用法优化拓展需要单独实现。
+
+- 为Zbb系统添加了允许未对齐访问的strcmp，strncmp，strlen以及生成相应makefile文件。
+
+- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义，简化。
+
+  ```c
+  -#define	CPUFEATURE_SVPBMT 0
+  -#define	CPUFEATURE_ZICBOM 1
+  -#define	CPUFEATURE_ZBB 2
+  +#define	CPUFEATURE_SVPBMT 	(1 << 0)
+  +#define	CPUFEATURE_ZICBOM	(1 << 1)
+  +#define	CPUFEATURE_ZBB		(1 << 2)
+  ```
+
+### Zbb+ fast-unaligned string optimization
+
+[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ...
+
+添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体，在无法生效的情况下退回到通用情况。
+
+#### strcmp_zbb
+
+检查两个字符串是否对齐到SZREG的边界。如果是，则以SZREG为单位比较两个字符串中的内容。如果不是，则按字节读取。
+
+```c
++/*
++ * Variant of strcmp using the ZBB extension if available
++ */
++#ifdef CONFIG_RISCV_ISA_ZBB
++strcmp_zbb:
++
++.option push
++.option arch,+zbb
++
++	/*
++	 * Returns
++	 *   a0 - comparison result, value like strcmp
++	 *
++	 * Parameters
++	 *   a0 - string1
++	 *   a1 - string2
++	 *
++	 * Clobbers
++	 *   t0, t1, t2, t3, t4, t5
++	 */
++
++	or	t2, a0, a1
++	li	t4, -1
++	and	t2, t2, SZREG-1
++	bnez	t2, 3f
++
++	/* Main loop for aligned string.  */
++	.p2align 3
++1:
++	REG_L	t0, 0(a0)
++	REG_L	t1, 0(a1)
++	orc.b	t3, t0
++	bne	t3, t4, 2f
++	addi	a0, a0, SZREG
++	addi	a1, a1, SZREG
++	beq	t0, t1, 1b
++
++	/*
++	 * Words don't match, and no null byte in the first
++	 * word. Get bytes in big-endian order and compare.
++	 */
++#ifndef CONFIG_CPU_BIG_ENDIAN
++	rev8	t0, t0
++	rev8	t1, t1
++#endif
++
++	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
++	sltu	a0, t0, t1
++	neg	a0, a0
++	ori	a0, a0, 1
++	ret
++
++2:
++	/*
++	 * Found a null byte.
++	 * If words don't match, fall back to simple loop.
++	 */
++	bne	t0, t1, 3f
++
++	/* Otherwise, strings are equal. */
++	li	a0, 0
++	ret
++
++	/* Simple loop for misaligned strings. */
++	.p2align 3
++3:
++	lbu	t0, 0(a0)
++	lbu	t1, 0(a1)
++	addi	a0, a0, 1
++	addi	a1, a1, 1
++	bne	t0, t1, 4f
++	bnez	t0, 3b
++
++4:
++	sub	a0, t0, t1
++	ret
++
++.option pop
++#endif
+```
+
+#### strlen_zbb
+
+启用CONFIG_RISCV_ISA_ZBB的前提下，移位对齐字符后从头开始以SZREG为单位读取，并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。
+
+```c
++#ifdef CONFIG_RISCV_ISA_ZBB
++strlen_zbb:
++
++#ifdef CONFIG_CPU_BIG_ENDIAN
++# define CZ	clz
++# define SHIFT	sll
++#else
++# define CZ	ctz
++# define SHIFT	srl
++#endif
++
++.option push
++.option arch,+zbb
++
++	/*
++	 * Returns
++	 *   a0 - string length
++	 *
++	 * Parameters
++	 *   a0 - String to measure
++	 *
++	 * Clobbers
++	 *   t0, t1, t2, t3
++	 */
++
++	/* Number of irrelevant bytes in the first word. */
++	andi	t2, a0, SZREG-1
++
++	/* Align pointer. */
++	andi	t0, a0, -SZREG
++
++	li	t3, SZREG
++	sub	t3, t3, t2
++	slli	t2, t2, 3
++
++	/* Get the first word.  */
++	REG_L	t1, 0(t0)
++
++	/*
++	 * Shift away the partial data we loaded to remove the irrelevant bytes
++	 * preceding the string with the effect of adding NUL bytes at the
++	 * end of the string's first word.
++	 */
++	SHIFT	t1, t1, t2
++
++	/* Convert non-NUL into 0xff and NUL into 0x00. */
++	orc.b	t1, t1
++
++	/* Convert non-NUL into 0x00 and NUL into 0xff. */
++	not	t1, t1
++
++	/*
++	 * Search for the first set bit (corresponding to a NUL byte in the
++	 * original chunk).
++	 */
++	CZ	t1, t1
++
++	/*
++	 * The first chunk is special: compare against the number
++	 * of valid bytes in this chunk.
++	 */
++	srli	a0, t1, 3
++	bgtu	t3, a0, 3f
++
++	/* Prepare for the word comparison loop. */
++	addi	t2, t0, SZREG
++	li	t3, -1
++
++	/*
++	 * Our critical loop is 4 instructions and processes data in
++	 * 4 byte or 8 byte chunks.
++	 */
++	.p2align 3
++1:
++	REG_L	t1, SZREG(t0)
++	addi	t0, t0, SZREG
++	orc.b	t1, t1
++	beq	t1, t3, 1b
++2:
++	not	t1, t1
++	CZ	t1, t1
++
++	/* Get number of processed words.  */
++	sub	t2, t0, t2
++
++	/* Add number of characters in the first word.  */
++	add	a0, a0, t2
++	srli	t1, t1, 3
++
++	/* Add number of characters in the last word.  */
++	add	a0, a0, t1
++3:
++	ret
++
++.option pop
++#endif
+```
+
+## 总结
+
+以上梳理了memory和strcmp相关优化代码，可以发现：
+
+memory相关优化方法主要有两点：通过连续存储减少条件分支及其跳转次数，减少判断上的时间；以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分，批量操作一定程度上提高效率。
+
+string对于zbb支持部分的函数优化，主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架，之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数，以此优化部分情况下的zbb中str相关函数的使用效率。
+
+接下来将按照 Memory, String, 数据运算，其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读，敬请期待。
\ No newline at end of file
-- 
Gitee


From 2afb67d4cba266bfbd5e659dc8d204a5e39e936b Mon Sep 17 00:00:00 2001
From: Jingqing3948 <2351290287@qq.com>
Date: Sun, 18 Jun 2023 13:13:15 +0800
Subject: [PATCH 02/14] 20230618-after using tinycorrect

---
 ...ation-content-for-str-and-mem-functions.md | 1051 +++++++++--------
 1 file changed, 526 insertions(+), 525 deletions(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index 2a4bac8..a0ee3e5 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -1,526 +1,527 @@
-> Author: Jingqing 2351290287@qq.com
-> Date: 2023/6/17
-> Revisor: 
-> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux)
-> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)
-> Sponsor: PLCT Lab, ISCAS
-
-# 近半年riscv内核库中str和mem函数的优化内容总结
-
-## 简介
-
-本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况，主要涉及 Memory, String 操作两大部分。
-
-## Memory
-
-### riscv: optimized mem* functions
-
-[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
-
-对各种mem相关操作函数的优化。
-
-#### memcpy
-
-主要是由“直接逐字节复制”转变为“先对齐再按字复制”。
-
-1. 如果仍未启用高效对齐访问CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS，则先在不改变dest和src相对距离的情况下将desc对齐在字边界上。
-2. 如果distance==0说明src和dest两者已经对齐，直接进行（32 or 64 bits）字长复制。
-3. 如果!=0说明未对齐，按照差值逐字复制。
-
-```c
-+void *__memcpy(void *dest, const void *src, size_t count)
-+{
-+	union const_types s = { .as_u8 = src };
-+	union types d = { .as_u8 = dest };
-+	int distance = 0;
-+
-+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
-+		if (count < MIN_THRESHOLD)
-+			goto copy_remainder;
-+
-+		/* Copy a byte at time until destination is aligned. */
-+		for (; d.as_uptr & WORD_MASK; count--)
-+			*d.as_u8++ = *s.as_u8++;
-+
-+		distance = s.as_uptr & WORD_MASK;
-+	}
-+
-+	if (distance) {
-+		unsigned long last, next;
-+
-+		/*
-+		 * s is distance bytes ahead of d, and d just reached
-+		 * the alignment boundary. Move s backward to word align it
-+		 * and shift data to compensate for distance, in order to do
-+		 * word-by-word copy.
-+		 */
-+		s.as_u8 -= distance;
-+
-+		next = s.as_ulong[0];
-+		for (; count >= BYTES_LONG; count -= BYTES_LONG) {
-+			last = next;
-+			next = s.as_ulong[1];
-+
-+			d.as_ulong[0] = last >> (distance * 8) |
-+					next << ((BYTES_LONG - distance) * 8);
-+
-+			d.as_ulong++;
-+			s.as_ulong++;
-+		}
-+
-+		/* Restore s with the original offset. */
-+		s.as_u8 += distance;
-+	} else {
-+		/*
-+		 * If the source and dest lower bits are the same, do a simple
-+		 * 32/64 bit wide copy.
-+		 */
-+		for (; count >= BYTES_LONG; count -= BYTES_LONG)
-+			*d.as_ulong++ = *s.as_ulong++;
-+	}
-+
-+copy_remainder:
-+	while (count--)
-+		*d.as_u8++ = *s.as_u8++;
-+
-+	return dest;
-+}
-+EXPORT_SYMBOL(__memcpy);
-+
-+void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
-+EXPORT_SYMBOL(memcpy);
-```
-
-#### memmove
-
-如果dest和src不重叠或者dest<src，可以直接用memcpy（dest<src我的理解是从低地址往高地址复制，哪怕两者重叠也不会受干扰）。
-
-```c
-+
-+/*
-+ * Simply check if the buffer overlaps an call memcpy() in case,
-+ * otherwise do a simple one byte at time backward copy.
-+ */
-+void *__memmove(void *dest, const void *src, size_t count)
-+{
-+	if (dest < src || src + count <= dest)
-+		return __memcpy(dest, src, count);
-+
-+	if (dest > src) {
-+		const char *s = src + count;
-+		char *tmp = dest + count;
-+
-+		while (count--)
-+			*--tmp = *--s;
-+	}
-+	return dest;
-+}
-+EXPORT_SYMBOL(__memmove);
-+
-+void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove);
-+EXPORT_SYMBOL(memmove);
-```
-
-#### memset
-
-旧memset：永远一次一个字节地填充。安全但是效率低。
-
-修改后：也是采用对齐机制，先按字节填充，等到和最大填充单位的倍数对齐时按最大填充单位填入。
-
-```c
-+void *__memset(void *s, int c, size_t count)
-+{
-+	union types dest = { .as_u8 = s };
-+
-+	if (count >= MIN_THRESHOLD) {
-+		unsigned long cu = (unsigned long)c;
-+
-+		/* Compose an ulong with 'c' repeated 4/8 times */
-+#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
-+		cu *= 0x0101010101010101UL;
-+#else
-+		cu |= cu << 8;
-+		cu |= cu << 16;
-+		/* Suppress warning on 32 bit machines */
-+		cu |= (cu << 16) << 16;
-+#endif
-+		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
-+			/*
-+			 * Fill the buffer one byte at time until
-+			 * the destination is word aligned.
-+			 */
-+			for (; count && dest.as_uptr & WORD_MASK; count--)
-+				*dest.as_u8++ = c;
-+		}
-+
-+		/* Copy using the largest size allowed */
-+		for (; count >= BYTES_LONG; count -= BYTES_LONG)
-+			*dest.as_ulong++ = cu;
-+	}
-+
-+	/* copy the remainder */
-+	while (count--)
-+		*dest.as_u8++ = c;
-+
-+	return s;
-+}
-+EXPORT_SYMBOL(__memset);
-+
-+void *memset(void *s, int c, size_t count) __weak __alias(__memset);
-+EXPORT_SYMBOL(memset);
-```
-
-### riscv: lib: optimize memcmp with ld insn
-
-[riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
-
-这笔优化发到了 v3, 但是 Maintainer 反馈了一些编译问题，没有看到作者提交新的版本。
-
-这笔优化的核心代码和解读如下：
-
-旧代码：
-
-```
-sb  a1, 0(t0)
-addi  t0, t0, 1
-bltu  t0, a3, 5b
-```
-
-新代码：
-
-```
-/* fill head and tail with minimal branching */
-sb      a1,  0(t0)
-sb      a1, -1(a3)
-li  a4, 2
-bgeu  a4, a2, 6f
-
-sb  a1,  1(t0)
-sb  a1,  2(t0)
-sb  a1, -2(a3)
-sb  a1, -3(a3)
-li  a4, 6
-bgeu  a4, a2, 6f
-
-/* 
- * Adding additional detection to avoid 
- * redundant stores can lead 
- * to better performance
- */
-sb  a1,  3(t0)
-sb  a1, -4(a3)
-li  a4, 8
-bgeu  a4, a2, 6f
-
-sb  a1,  4(t0)
-sb  a1, -5(a3)
-li  a4, 10
-bgeu  a4, a2, 6f
-
-sb  a1,  5(t0)
-sb  a1,  6(t0)
-sb  a1, -6(a3)
-sb  a1, -7(a3)
-li  a4, 14
-bgeu  a4, a2, 6f
-
-/* store the last byte */
-sb  a1,  7(t0)
-```
-
-主要的改动如下：
-
-1. 将旧代码中的一行 `addi t0, t0, 1` 替换为一系列新的存储指令，用于填充头部和尾部。新代码中的存储指令是以一定的间隔连续存储数据。
-2. 添加了额外的条件检测和分支，以避免重复存储，这可能会提高性能。
-3. 添加了一行 `li a4, 2` 来设置一个常数，用于条件比较。
-4. 添加了 `6f` 标签，用于跳转到代码的结尾。
-
-它的核心优化思路是用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
-
-### RISC-V: Apply Zicboz to clear_page and memset
-
-[RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
-
-引入Zicboz扩展后，Zicboz块大小的内存自然对齐。因此要对接收任意内存块地址和大小的memset()来清空内存的方法进行优化。
-
-分析发现当输入的地址未对齐或者太小时，Zicboz中的memset会显得效率低一些（多了几十条指令）。
-
-1. 首先检查是否启用了CONFIG_RISCV_ISA_ZICBOZ来判断是否使用Zicboz扩展。如果不使用Zicboz扩展或者传入的参数不适合使用Zicboz扩展，则代码会跳转到.Ldo_memset标签处执行内存清零的逻辑。
-2. 如果使用Zicboz扩展进行内存清零，代码会将地址和长度进行对齐，并使用Zicboz扩展的指令进行内存清零操作。
-3. 在进行Zicboz扩展内存清零时，如果还有一些字节无法使用Zicboz扩展一次性清零，则会使用Duff's设备来处理剩余的字节。
-
-### RISC-V: Optimize memset for data sizes less than 16 bytes
-
-[RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) ...
-
-在上述memset优化的基础上继续进行。
-
-大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据，memset 使用字节存储，效率相对低。 改进方案决定用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
-
-```c
-+void *__memset(void *s, int c, size_t count)
-+{
-+	union types dest = { .as_u8 = s };
-+
-+	if (count >= MIN_THRESHOLD) {
-+		unsigned long cu = (unsigned long)c;
-+
-+		/* Compose an ulong with 'c' repeated 4/8 times */
-+#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
-+		cu *= 0x0101010101010101UL;
-+#else
-+		cu |= cu << 8;
-+		cu |= cu << 16;
-+		/* Suppress warning on 32 bit machines */
-+		cu |= (cu << 16) << 16;//8bits的c复制4次来构造unsigned long的cu
-+#endif
-+		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
-+			/*
-+			 * Fill the buffer one byte at time until
-+			 * the destination is word aligned.
-+			 */
-+			for (; count && dest.as_uptr & WORD_MASK; count--)
-+				*dest.as_u8++ = c;//逐字节填充对应地址中的值=c
-+		}
-+
-+		/* Copy using the largest size allowed */
-+		for (; count >= BYTES_LONG; count -= BYTES_LONG)
-+			*dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu
-+	}
-+
-+	/* copy the remainder */
-+	while (count--)
-+		*dest.as_u8++ = c;//剩余值全部设置为c
-+
-+	return s;
-+}
-+EXPORT_SYMBOL(__memset);
-+
-+void *memset(void *s, int c, size_t count) __weak __alias(__memset);
-+EXPORT_SYMBOL(memset);
-```
-
-## String
-### Zbb string optimizations
-
-[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
-
-主要是为zbb提供了通用的一些字符串支持，后续特定用法优化拓展需要单独实现。
-
-- 为Zbb系统添加了允许未对齐访问的strcmp，strncmp，strlen以及生成相应makefile文件。
-
-- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义，简化。
-
-  ```c
-  -#define	CPUFEATURE_SVPBMT 0
-  -#define	CPUFEATURE_ZICBOM 1
-  -#define	CPUFEATURE_ZBB 2
-  +#define	CPUFEATURE_SVPBMT 	(1 << 0)
-  +#define	CPUFEATURE_ZICBOM	(1 << 1)
-  +#define	CPUFEATURE_ZBB		(1 << 2)
-  ```
-
-### Zbb+ fast-unaligned string optimization
-
-[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ...
-
-添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体，在无法生效的情况下退回到通用情况。
-
-#### strcmp_zbb
-
-检查两个字符串是否对齐到SZREG的边界。如果是，则以SZREG为单位比较两个字符串中的内容。如果不是，则按字节读取。
-
-```c
-+/*
-+ * Variant of strcmp using the ZBB extension if available
-+ */
-+#ifdef CONFIG_RISCV_ISA_ZBB
-+strcmp_zbb:
-+
-+.option push
-+.option arch,+zbb
-+
-+	/*
-+	 * Returns
-+	 *   a0 - comparison result, value like strcmp
-+	 *
-+	 * Parameters
-+	 *   a0 - string1
-+	 *   a1 - string2
-+	 *
-+	 * Clobbers
-+	 *   t0, t1, t2, t3, t4, t5
-+	 */
-+
-+	or	t2, a0, a1
-+	li	t4, -1
-+	and	t2, t2, SZREG-1
-+	bnez	t2, 3f
-+
-+	/* Main loop for aligned string.  */
-+	.p2align 3
-+1:
-+	REG_L	t0, 0(a0)
-+	REG_L	t1, 0(a1)
-+	orc.b	t3, t0
-+	bne	t3, t4, 2f
-+	addi	a0, a0, SZREG
-+	addi	a1, a1, SZREG
-+	beq	t0, t1, 1b
-+
-+	/*
-+	 * Words don't match, and no null byte in the first
-+	 * word. Get bytes in big-endian order and compare.
-+	 */
-+#ifndef CONFIG_CPU_BIG_ENDIAN
-+	rev8	t0, t0
-+	rev8	t1, t1
-+#endif
-+
-+	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
-+	sltu	a0, t0, t1
-+	neg	a0, a0
-+	ori	a0, a0, 1
-+	ret
-+
-+2:
-+	/*
-+	 * Found a null byte.
-+	 * If words don't match, fall back to simple loop.
-+	 */
-+	bne	t0, t1, 3f
-+
-+	/* Otherwise, strings are equal. */
-+	li	a0, 0
-+	ret
-+
-+	/* Simple loop for misaligned strings. */
-+	.p2align 3
-+3:
-+	lbu	t0, 0(a0)
-+	lbu	t1, 0(a1)
-+	addi	a0, a0, 1
-+	addi	a1, a1, 1
-+	bne	t0, t1, 4f
-+	bnez	t0, 3b
-+
-+4:
-+	sub	a0, t0, t1
-+	ret
-+
-+.option pop
-+#endif
-```
-
-#### strlen_zbb
-
-启用CONFIG_RISCV_ISA_ZBB的前提下，移位对齐字符后从头开始以SZREG为单位读取，并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。
-
-```c
-+#ifdef CONFIG_RISCV_ISA_ZBB
-+strlen_zbb:
-+
-+#ifdef CONFIG_CPU_BIG_ENDIAN
-+# define CZ	clz
-+# define SHIFT	sll
-+#else
-+# define CZ	ctz
-+# define SHIFT	srl
-+#endif
-+
-+.option push
-+.option arch,+zbb
-+
-+	/*
-+	 * Returns
-+	 *   a0 - string length
-+	 *
-+	 * Parameters
-+	 *   a0 - String to measure
-+	 *
-+	 * Clobbers
-+	 *   t0, t1, t2, t3
-+	 */
-+
-+	/* Number of irrelevant bytes in the first word. */
-+	andi	t2, a0, SZREG-1
-+
-+	/* Align pointer. */
-+	andi	t0, a0, -SZREG
-+
-+	li	t3, SZREG
-+	sub	t3, t3, t2
-+	slli	t2, t2, 3
-+
-+	/* Get the first word.  */
-+	REG_L	t1, 0(t0)
-+
-+	/*
-+	 * Shift away the partial data we loaded to remove the irrelevant bytes
-+	 * preceding the string with the effect of adding NUL bytes at the
-+	 * end of the string's first word.
-+	 */
-+	SHIFT	t1, t1, t2
-+
-+	/* Convert non-NUL into 0xff and NUL into 0x00. */
-+	orc.b	t1, t1
-+
-+	/* Convert non-NUL into 0x00 and NUL into 0xff. */
-+	not	t1, t1
-+
-+	/*
-+	 * Search for the first set bit (corresponding to a NUL byte in the
-+	 * original chunk).
-+	 */
-+	CZ	t1, t1
-+
-+	/*
-+	 * The first chunk is special: compare against the number
-+	 * of valid bytes in this chunk.
-+	 */
-+	srli	a0, t1, 3
-+	bgtu	t3, a0, 3f
-+
-+	/* Prepare for the word comparison loop. */
-+	addi	t2, t0, SZREG
-+	li	t3, -1
-+
-+	/*
-+	 * Our critical loop is 4 instructions and processes data in
-+	 * 4 byte or 8 byte chunks.
-+	 */
-+	.p2align 3
-+1:
-+	REG_L	t1, SZREG(t0)
-+	addi	t0, t0, SZREG
-+	orc.b	t1, t1
-+	beq	t1, t3, 1b
-+2:
-+	not	t1, t1
-+	CZ	t1, t1
-+
-+	/* Get number of processed words.  */
-+	sub	t2, t0, t2
-+
-+	/* Add number of characters in the first word.  */
-+	add	a0, a0, t2
-+	srli	t1, t1, 3
-+
-+	/* Add number of characters in the last word.  */
-+	add	a0, a0, t1
-+3:
-+	ret
-+
-+.option pop
-+#endif
-```
-
-## 总结
-
-以上梳理了memory和strcmp相关优化代码，可以发现：
-
-memory相关优化方法主要有两点：通过连续存储减少条件分支及其跳转次数，减少判断上的时间；以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分，批量操作一定程度上提高效率。
-
-string对于zbb支持部分的函数优化，主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架，之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数，以此优化部分情况下的zbb中str相关函数的使用效率。
-
+> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [tounix]<br/>
+> Author: Jingqing 2351290287@qq.com
+> Date: 2023/6/17
+> Revisor: 
+> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux)
+> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)
+> Sponsor: PLCT Lab, ISCAS
+
+# 近半年riscv内核库中str和mem函数的优化内容总结
+
+## 简介
+
+本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况，主要涉及 Memory, String 操作两大部分。
+
+## Memory
+
+### riscv: optimized mem* functions
+
+[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
+
+对各种mem相关操作函数的优化。
+
+#### memcpy
+
+主要是由“直接逐字节复制”转变为“先对齐再按字复制”。
+
+1. 如果未启用高效非对齐访问（CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS），且 count 不小于 MIN_THRESHOLD，则先逐字节复制，在不改变 dest 和 src 相对距离的情况下将 dest 对齐到字边界上。
+2. 如果distance==0说明src和dest两者已经对齐，直接进行（32 or 64 bits）字长复制。
+3. 如果 distance != 0 说明 src 相对 dest 未对齐，则将 src 回退到字边界按字读取，再根据 distance 移位拼接后逐字写入 dest。
+
+```c
++void *__memcpy(void *dest, const void *src, size_t count)
++{
++	union const_types s = { .as_u8 = src };
++	union types d = { .as_u8 = dest };
++	int distance = 0;
++
++	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
++		if (count < MIN_THRESHOLD)
++			goto copy_remainder;
++
++		/* Copy a byte at time until destination is aligned. */
++		for (; d.as_uptr & WORD_MASK; count--)
++			*d.as_u8++ = *s.as_u8++;
++
++		distance = s.as_uptr & WORD_MASK;
++	}
++
++	if (distance) {
++		unsigned long last, next;
++
++		/*
++		 * s is distance bytes ahead of d, and d just reached
++		 * the alignment boundary. Move s backward to word align it
++		 * and shift data to compensate for distance, in order to do
++		 * word-by-word copy.
++		 */
++		s.as_u8 -= distance;
++
++		next = s.as_ulong[0];
++		for (; count >= BYTES_LONG; count -= BYTES_LONG) {
++			last = next;
++			next = s.as_ulong[1];
++
++			d.as_ulong[0] = last >> (distance * 8) |
++					next << ((BYTES_LONG - distance) * 8);
++
++			d.as_ulong++;
++			s.as_ulong++;
++		}
++
++		/* Restore s with the original offset. */
++		s.as_u8 += distance;
++	} else {
++		/*
++		 * If the source and dest lower bits are the same, do a simple
++		 * 32/64 bit wide copy.
++		 */
++		for (; count >= BYTES_LONG; count -= BYTES_LONG)
++			*d.as_ulong++ = *s.as_ulong++;
++	}
++
++copy_remainder:
++	while (count--)
++		*d.as_u8++ = *s.as_u8++;
++
++	return dest;
++}
++EXPORT_SYMBOL(__memcpy);
++
++void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
++EXPORT_SYMBOL(memcpy);
+```
+
+#### memmove
+
+如果 dest 和 src 不重叠或者 dest < src，可以直接用 memcpy（我的理解是：dest < src 时从低地址往高地址复制，哪怕两者重叠也不会受干扰）；否则从高地址向低地址逐字节反向复制。
+
+```c
++
++/*
++ * Simply check if the buffer overlaps an call memcpy() in case,
++ * otherwise do a simple one byte at time backward copy.
++ */
++void *__memmove(void *dest, const void *src, size_t count)
++{
++	if (dest < src || src + count <= dest)
++		return __memcpy(dest, src, count);
++
++	if (dest > src) {
++		const char *s = src + count;
++		char *tmp = dest + count;
++
++		while (count--)
++			*--tmp = *--s;
++	}
++	return dest;
++}
++EXPORT_SYMBOL(__memmove);
++
++void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove);
++EXPORT_SYMBOL(memmove);
+```
+
+#### memset
+
+旧memset：永远一次一个字节地填充。安全但是效率低。
+
+修改后：同样采用对齐机制，先按字节填充直到目标地址按字对齐，再按字（unsigned long）批量填充，最后逐字节填充剩余部分。
+
+```c
++void *__memset(void *s, int c, size_t count)
++{
++	union types dest = { .as_u8 = s };
++
++	if (count >= MIN_THRESHOLD) {
++		unsigned long cu = (unsigned long)c;
++
++		/* Compose an ulong with 'c' repeated 4/8 times */
++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
++		cu *= 0x0101010101010101UL;
++#else
++		cu |= cu << 8;
++		cu |= cu << 16;
++		/* Suppress warning on 32 bit machines */
++		cu |= (cu << 16) << 16;
++#endif
++		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
++			/*
++			 * Fill the buffer one byte at time until
++			 * the destination is word aligned.
++			 */
++			for (; count && dest.as_uptr & WORD_MASK; count--)
++				*dest.as_u8++ = c;
++		}
++
++		/* Copy using the largest size allowed */
++		for (; count >= BYTES_LONG; count -= BYTES_LONG)
++			*dest.as_ulong++ = cu;
++	}
++
++	/* copy the remainder */
++	while (count--)
++		*dest.as_u8++ = c;
++
++	return s;
++}
++EXPORT_SYMBOL(__memset);
++
++void *memset(void *s, int c, size_t count) __weak __alias(__memset);
++EXPORT_SYMBOL(memset);
+```
+
+### riscv: lib: optimize memcmp with ld insn
+
+[riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
+
+这笔优化发到了 v3, 但是 Maintainer 反馈了一些编译问题，没有看到作者提交新的版本。
+
+这笔优化的核心代码和解读如下：
+
+旧代码：
+
+```
+sb  a1, 0(t0)
+addi  t0, t0, 1
+bltu  t0, a3, 5b
+```
+
+新代码：
+
+```
+/* fill head and tail with minimal branching */
+sb      a1,  0(t0)
+sb      a1, -1(a3)
+li  a4, 2
+bgeu  a4, a2, 6f
+
+sb  a1,  1(t0)
+sb  a1,  2(t0)
+sb  a1, -2(a3)
+sb  a1, -3(a3)
+li  a4, 6
+bgeu  a4, a2, 6f
+
+/* 
+ * Adding additional detection to avoid 
+ * redundant stores can lead 
+ * to better performance
+ */
+sb  a1,  3(t0)
+sb  a1, -4(a3)
+li  a4, 8
+bgeu  a4, a2, 6f
+
+sb  a1,  4(t0)
+sb  a1, -5(a3)
+li  a4, 10
+bgeu  a4, a2, 6f
+
+sb  a1,  5(t0)
+sb  a1,  6(t0)
+sb  a1, -6(a3)
+sb  a1, -7(a3)
+li  a4, 14
+bgeu  a4, a2, 6f
+
+/* store the last byte */
+sb  a1,  7(t0)
+```
+
+主要的改动如下：
+
+1. 将旧代码中逐字节存储的循环（`sb` / `addi t0, t0, 1` / `bltu` 三条指令）替换为一系列展开的 `sb` 存储指令，分别以 `t0`（头部）和 `a3`（尾部）为基址，从两端向中间填充。
+2. 添加了额外的条件检测和分支，以避免重复存储，这可能会提高性能。
+3. 添加了一行 `li a4, 2` 来设置一个常数，用于条件比较。
+4. 各个分支的目标 `6f` 表示向前（forward）引用其后最近的局部标签 `6`，在 `bgeu` 的条件成立时直接跳转到代码的结尾。
+
+它的核心优化思路是用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
+
+### RISC-V: Apply Zicboz to clear_page and memset
+
+[RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
+
+引入Zicboz扩展后，Zicboz块大小的内存自然对齐。因此要对接收任意内存块地址和大小的memset()来清空内存的方法进行优化。
+
+分析发现当输入的地址未对齐或者太小时，Zicboz中的memset会显得效率低一些（多了几十条指令）。
+
+1. 首先检查是否启用了CONFIG_RISCV_ISA_ZICBOZ来判断是否使用Zicboz扩展。如果不使用Zicboz扩展或者传入的参数不适合使用Zicboz扩展，则代码会跳转到.Ldo_memset标签处执行内存清零的逻辑。
+2. 如果使用Zicboz扩展进行内存清零，代码会将地址和长度进行对齐，并使用Zicboz扩展的指令进行内存清零操作。
+3. 在进行Zicboz扩展内存清零时，如果还有一些字节无法使用Zicboz扩展一次性清零，则会使用Duff's设备来处理剩余的字节。
+
+### RISC-V: Optimize memset for data sizes less than 16 bytes
+
+[RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) ...
+
+在上述memset优化的基础上继续进行。
+
+大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据，memset 使用字节存储，效率相对低。 改进方案决定用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
+
+```c
++void *__memset(void *s, int c, size_t count)
++{
++	union types dest = { .as_u8 = s };
++
++	if (count >= MIN_THRESHOLD) {
++		unsigned long cu = (unsigned long)c;
++
++		/* Compose an ulong with 'c' repeated 4/8 times */
++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
++		cu *= 0x0101010101010101UL;
++#else
++		cu |= cu << 8;
++		cu |= cu << 16;
++		/* Suppress warning on 32 bit machines */
++		cu |= (cu << 16) << 16;//8bits的c复制4次来构造unsigned long的cu
++#endif
++		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
++			/*
++			 * Fill the buffer one byte at time until
++			 * the destination is word aligned.
++			 */
++			for (; count && dest.as_uptr & WORD_MASK; count--)
++				*dest.as_u8++ = c;//逐字节填充对应地址中的值=c
++		}
++
++		/* Copy using the largest size allowed */
++		for (; count >= BYTES_LONG; count -= BYTES_LONG)
++			*dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu
++	}
++
++	/* copy the remainder */
++	while (count--)
++		*dest.as_u8++ = c;//剩余值全部设置为c
++
++	return s;
++}
++EXPORT_SYMBOL(__memset);
++
++void *memset(void *s, int c, size_t count) __weak __alias(__memset);
++EXPORT_SYMBOL(memset);
+```
+
+## String
+### Zbb string optimizations
+
+[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
+
+主要是为zbb提供了通用的一些字符串支持，后续特定用法优化拓展需要单独实现。
+
+- 为Zbb系统添加了允许未对齐访问的strcmp，strncmp，strlen以及生成相应makefile文件。
+
+- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义，简化。
+
+  ```c
+  -#define	CPUFEATURE_SVPBMT 0
+  -#define	CPUFEATURE_ZICBOM 1
+  -#define	CPUFEATURE_ZBB 2
+  +#define	CPUFEATURE_SVPBMT 	(1 << 0)
+  +#define	CPUFEATURE_ZICBOM	(1 << 1)
+  +#define	CPUFEATURE_ZBB		(1 << 2)
+  ```
+
+### Zbb+ fast-unaligned string optimization
+
+[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ...
+
+添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体，在无法生效的情况下退回到通用情况。
+
+#### strcmp_zbb
+
+检查两个字符串是否对齐到SZREG的边界。如果是，则以SZREG为单位比较两个字符串中的内容。如果不是，则按字节读取。
+
+```c
++/*
++ * Variant of strcmp using the ZBB extension if available
++ */
++#ifdef CONFIG_RISCV_ISA_ZBB
++strcmp_zbb:
++
++.option push
++.option arch,+zbb
++
++	/*
++	 * Returns
++	 *   a0 - comparison result, value like strcmp
++	 *
++	 * Parameters
++	 *   a0 - string1
++	 *   a1 - string2
++	 *
++	 * Clobbers
++	 *   t0, t1, t2, t3, t4, t5
++	 */
++
++	or	t2, a0, a1
++	li	t4, -1
++	and	t2, t2, SZREG-1
++	bnez	t2, 3f
++
++	/* Main loop for aligned string.  */
++	.p2align 3
++1:
++	REG_L	t0, 0(a0)
++	REG_L	t1, 0(a1)
++	orc.b	t3, t0
++	bne	t3, t4, 2f
++	addi	a0, a0, SZREG
++	addi	a1, a1, SZREG
++	beq	t0, t1, 1b
++
++	/*
++	 * Words don't match, and no null byte in the first
++	 * word. Get bytes in big-endian order and compare.
++	 */
++#ifndef CONFIG_CPU_BIG_ENDIAN
++	rev8	t0, t0
++	rev8	t1, t1
++#endif
++
++	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
++	sltu	a0, t0, t1
++	neg	a0, a0
++	ori	a0, a0, 1
++	ret
++
++2:
++	/*
++	 * Found a null byte.
++	 * If words don't match, fall back to simple loop.
++	 */
++	bne	t0, t1, 3f
++
++	/* Otherwise, strings are equal. */
++	li	a0, 0
++	ret
++
++	/* Simple loop for misaligned strings. */
++	.p2align 3
++3:
++	lbu	t0, 0(a0)
++	lbu	t1, 0(a1)
++	addi	a0, a0, 1
++	addi	a1, a1, 1
++	bne	t0, t1, 4f
++	bnez	t0, 3b
++
++4:
++	sub	a0, t0, t1
++	ret
++
++.option pop
++#endif
+```
+
+#### strlen_zbb
+
+启用CONFIG_RISCV_ISA_ZBB的前提下，移位对齐字符后从头开始以SZREG为单位读取，并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。
+
+```c
++#ifdef CONFIG_RISCV_ISA_ZBB
++strlen_zbb:
++
++#ifdef CONFIG_CPU_BIG_ENDIAN
++# define CZ	clz
++# define SHIFT	sll
++#else
++# define CZ	ctz
++# define SHIFT	srl
++#endif
++
++.option push
++.option arch,+zbb
++
++	/*
++	 * Returns
++	 *   a0 - string length
++	 *
++	 * Parameters
++	 *   a0 - String to measure
++	 *
++	 * Clobbers
++	 *   t0, t1, t2, t3
++	 */
++
++	/* Number of irrelevant bytes in the first word. */
++	andi	t2, a0, SZREG-1
++
++	/* Align pointer. */
++	andi	t0, a0, -SZREG
++
++	li	t3, SZREG
++	sub	t3, t3, t2
++	slli	t2, t2, 3
++
++	/* Get the first word.  */
++	REG_L	t1, 0(t0)
++
++	/*
++	 * Shift away the partial data we loaded to remove the irrelevant bytes
++	 * preceding the string with the effect of adding NUL bytes at the
++	 * end of the string's first word.
++	 */
++	SHIFT	t1, t1, t2
++
++	/* Convert non-NUL into 0xff and NUL into 0x00. */
++	orc.b	t1, t1
++
++	/* Convert non-NUL into 0x00 and NUL into 0xff. */
++	not	t1, t1
++
++	/*
++	 * Search for the first set bit (corresponding to a NUL byte in the
++	 * original chunk).
++	 */
++	CZ	t1, t1
++
++	/*
++	 * The first chunk is special: compare against the number
++	 * of valid bytes in this chunk.
++	 */
++	srli	a0, t1, 3
++	bgtu	t3, a0, 3f
++
++	/* Prepare for the word comparison loop. */
++	addi	t2, t0, SZREG
++	li	t3, -1
++
++	/*
++	 * Our critical loop is 4 instructions and processes data in
++	 * 4 byte or 8 byte chunks.
++	 */
++	.p2align 3
++1:
++	REG_L	t1, SZREG(t0)
++	addi	t0, t0, SZREG
++	orc.b	t1, t1
++	beq	t1, t3, 1b
++2:
++	not	t1, t1
++	CZ	t1, t1
++
++	/* Get number of processed words.  */
++	sub	t2, t0, t2
++
++	/* Add number of characters in the first word.  */
++	add	a0, a0, t2
++	srli	t1, t1, 3
++
++	/* Add number of characters in the last word.  */
++	add	a0, a0, t1
++3:
++	ret
++
++.option pop
++#endif
+```
+
+## 总结
+
+以上梳理了memory和strcmp相关优化代码，可以发现：
+
+memory相关优化方法主要有两点：通过连续存储减少条件分支及其跳转次数，减少判断上的时间；以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分，批量操作一定程度上提高效率。
+
+string对于zbb支持部分的函数优化，主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架，之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数，以此优化部分情况下的zbb中str相关函数的使用效率。
+
 接下来将按照 Memory, String, 数据运算，其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读，敬请期待。
\ No newline at end of file
-- 
Gitee


From 4057a2c1d19e021edea01955d81a7d29795fb483 Mon Sep 17 00:00:00 2001
From: Jingqing3948 <2351290287@qq.com>
Date: Sun, 18 Jun 2023 13:20:33 +0800
Subject: [PATCH 03/14] tinycorrect2

---
 ...ation-content-for-str-and-mem-functions.md | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index a0ee3e5..60a6a33 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -1,10 +1,10 @@
-> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [tounix]<br/>
-> Author: Jingqing 2351290287@qq.com
-> Date: 2023/6/17
-> Revisor: 
-> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux)
-> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)
-> Sponsor: PLCT Lab, ISCAS
+> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [spaces header toc]<br/>
+> Author:    Jingqing 2351290287@qq.com<br/>
+> Date:      2023/6/17<br/>
+> Revisor:   Falcon <falcon@tinylab.org><br/>
+> Project:   [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux)<br/>
+> Proposal:  [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)<br/>
+> Sponsor:   PLCT Lab, ISCAS
 
 # 近半年riscv内核库中str和mem函数的优化内容总结
 
@@ -203,9 +203,9 @@ sb  a1, -3(a3)
 li  a4, 6
 bgeu  a4, a2, 6f
 
-/* 
- * Adding additional detection to avoid 
- * redundant stores can lead 
+/*
+ * Adding additional detection to avoid
+ * redundant stores can lead
  * to better performance
  */
 sb  a1,  3(t0)
@@ -256,7 +256,7 @@ sb  a1,  7(t0)
 
 在上述memset优化的基础上继续进行。
 
-大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据，memset 使用字节存储，效率相对低。 改进方案决定用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
+大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据，memset 使用字节存储，效率相对低。改进方案决定用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
 
 ```c
 +void *__memset(void *s, int c, size_t count)
@@ -302,6 +302,7 @@ sb  a1,  7(t0)
 ```
 
 ## String
+
 ### Zbb string optimizations
 
 [Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
@@ -524,4 +525,4 @@ memory相关优化方法主要有两点：通过连续存储减少条件分支
 
 string对于zbb支持部分的函数优化，主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架，之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数，以此优化部分情况下的zbb中str相关函数的使用效率。
 
-接下来将按照 Memory, String, 数据运算，其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读，敬请期待。
\ No newline at end of file
+接下来将按照 Memory, String, 数据运算，其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读，敬请期待。
-- 
Gitee


From e9cf47f9dbec7751d5700af228f896ac5aedb25c Mon Sep 17 00:00:00 2001
From: Jingqing3948 <2351290287@qq.com>
Date: Sun, 18 Jun 2023 13:23:09 +0800
Subject: [PATCH 04/14] using tinycorrect

---
 ...ation-content-for-str-and-mem-functions.md | 73 ++++++++++---------
 1 file changed, 40 insertions(+), 33 deletions(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index 60a6a33..92c367b 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -1,4 +1,4 @@
-> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [spaces header toc]<br/>
+> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [urls pangu autocorrect]<br/>
 > Author:    Jingqing 2351290287@qq.com<br/>
 > Date:      2023/6/17<br/>
 > Revisor:   Falcon <falcon@tinylab.org><br/>
@@ -6,7 +6,7 @@
 > Proposal:  [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)<br/>
 > Sponsor:   PLCT Lab, ISCAS
 
-# 近半年riscv内核库中str和mem函数的优化内容总结
+# 近半年 RISC-V 内核库中 str 和 mem 函数的优化内容总结
 
 ## 简介
 
@@ -16,17 +16,17 @@
 
 ### riscv: optimized mem* functions
 
-[riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
+[riscv: optimized mem* functions][002]
 
-对各种mem相关操作函数的优化。
+对各种 mem 相关操作函数的优化。
 
 #### memcpy
 
 主要是由“直接逐字节复制”转变为“先对齐再按字复制”。
 
-1. 如果仍未启用高效对齐访问CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS，则先在不改变dest和src相对距离的情况下将desc对齐在字边界上。
-2. 如果distance==0说明src和dest两者已经对齐，直接进行（32 or 64 bits）字长复制。
-3. 如果!=0说明未对齐，按照差值逐字复制。
+1. 如果仍未启用高效对齐访问 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS，则先在不改变 dest 和 src 相对距离的情况下将 desc 对齐在字边界上。
+2. 如果 distance==0 说明 src 和 dest 两者已经对齐，直接进行（32 or 64 bits）字长复制。
+3. 如果！=0 说明未对齐，按照差值逐字复制。
 
 ```c
 +void *__memcpy(void *dest, const void *src, size_t count)
@@ -94,7 +94,7 @@
 
 #### memmove
 
-如果dest和src不重叠或者dest<src，可以直接用memcpy（dest<src我的理解是从低地址往高地址复制，哪怕两者重叠也不会受干扰）。
+如果 dest 和 src 不重叠或者 dest<src，可以直接用 memcpy（dest<src 我的理解是从低地址往高地址复制，哪怕两者重叠也不会受干扰）。
 
 ```c
 +
@@ -124,7 +124,7 @@
 
 #### memset
 
-旧memset：永远一次一个字节地填充。安全但是效率低。
+旧 memset：永远一次一个字节地填充。安全但是效率低。
 
 修改后：也是采用对齐机制，先按字节填充，等到和最大填充单位的倍数对齐时按最大填充单位填入。
 
@@ -173,7 +173,7 @@
 
 ### riscv: lib: optimize memcmp with ld insn
 
-[riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
+[riscv: lib: optimize memcmp with ld insn][003]
 
 这笔优化发到了 v3, 但是 Maintainer 反馈了一些编译问题，没有看到作者提交新的版本。
 
@@ -240,23 +240,23 @@ sb  a1,  7(t0)
 
 ### RISC-V: Apply Zicboz to clear_page and memset
 
-[RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
+[RISC-V: Apply Zicboz to clear_page and memset][004]
 
-引入Zicboz扩展后，Zicboz块大小的内存自然对齐。因此要对接收任意内存块地址和大小的memset()来清空内存的方法进行优化。
+Zicboz 扩展可以快速清零按 Zicboz 块大小自然对齐的内存块；而 memset() 接收的是任意的内存地址和大小，因此需要对用 memset() 清空内存的实现进行专门的优化。
 
-分析发现当输入的地址未对齐或者太小时，Zicboz中的memset会显得效率低一些（多了几十条指令）。
+分析发现当输入的地址未对齐或者太小时，Zicboz 中的 memset 会显得效率低一些（多了几十条指令）。
 
-1. 首先检查是否启用了CONFIG_RISCV_ISA_ZICBOZ来判断是否使用Zicboz扩展。如果不使用Zicboz扩展或者传入的参数不适合使用Zicboz扩展，则代码会跳转到.Ldo_memset标签处执行内存清零的逻辑。
-2. 如果使用Zicboz扩展进行内存清零，代码会将地址和长度进行对齐，并使用Zicboz扩展的指令进行内存清零操作。
-3. 在进行Zicboz扩展内存清零时，如果还有一些字节无法使用Zicboz扩展一次性清零，则会使用Duff's设备来处理剩余的字节。
+1. 首先检查是否启用了 CONFIG_RISCV_ISA_ZICBOZ 来判断是否使用 Zicboz 扩展。如果不使用 Zicboz 扩展或者传入的参数不适合使用 Zicboz 扩展（例如填充值 `a1` 不为 0），则代码会跳转到 `.Ldo_memset` 标签处，按常规的 memset 逻辑处理。
+2. 如果使用 Zicboz 扩展进行内存清零，代码会将地址和长度进行对齐，并使用 Zicboz 扩展的指令进行内存清零操作。
+3. 在进行 Zicboz 扩展内存清零时，如果还有一些字节无法使用 Zicboz 扩展一次性清零，则会使用 Duff's 设备来处理剩余的字节。
 
 ### RISC-V: Optimize memset for data sizes less than 16 bytes
 
-[RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/) ...
+[RISC-V: Optimize memset for data sizes less than 16 bytes][006] ...
 
-在上述memset优化的基础上继续进行。
+在上述 memset 优化的基础上继续进行。
 
-大于等于16字节先对齐后按16byte倍数存储。对于尾部数据或小于16字节的数据，memset 使用字节存储，效率相对低。改进方案决定用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
+大于等于 16 字节先对齐后按 16byte 倍数存储。对于尾部数据或小于 16 字节的数据，memset 使用字节存储，效率相对低。改进方案决定用许多分支结构填充头尾，这样虽然可能有一部分存储冗余，但是因为并行存储，减少跳转次数，提高了效率。
 
 ```c
 +void *__memset(void *s, int c, size_t count)
@@ -273,7 +273,7 @@ sb  a1,  7(t0)
 +		cu |= cu << 8;
 +		cu |= cu << 16;
 +		/* Suppress warning on 32 bit machines */
-+		cu |= (cu << 16) << 16;//8bits的c复制4次来构造unsigned long的cu
++		cu |= (cu << 16) << 16;//8bits 的 c 复制 4 次来构造 unsigned long 的 cu
 +#endif
 +		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
 +			/*
@@ -286,12 +286,12 @@ sb  a1,  7(t0)
 +
 +		/* Copy using the largest size allowed */
 +		for (; count >= BYTES_LONG; count -= BYTES_LONG)
-+			*dest.as_ulong++ = cu;//BYTES_LONG的整数倍部分复制为cu
++			*dest.as_ulong++ = cu;//BYTES_LONG 的整数倍部分复制为 cu
 +	}
 +
 +	/* copy the remainder */
 +	while (count--)
-+		*dest.as_u8++ = c;//剩余值全部设置为c
++		*dest.as_u8++ = c;//剩余值全部设置为 c
 +
 +	return s;
 +}
@@ -305,13 +305,13 @@ sb  a1,  7(t0)
 
 ### Zbb string optimizations
 
-[Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
+[Zbb string optimizations][001]
 
-主要是为zbb提供了通用的一些字符串支持，后续特定用法优化拓展需要单独实现。
+主要是为 zbb 提供了通用的一些字符串支持，后续特定用法优化拓展需要单独实现。
 
-- 为Zbb系统添加了允许未对齐访问的strcmp，strncmp，strlen以及生成相应makefile文件。
+- 为 Zbb 系统添加了允许未对齐访问的 strcmp，strncmp，strlen 以及生成相应 makefile 文件。
 
-- 用位域而不是数字代替CPU的补丁拓展errata-id的宏定义，简化。
+- 把 CPU 特性补丁 errata-id 的宏定义由序号改为位掩码（`1 << n`），以简化后续处理。
 
   ```c
   -#define	CPUFEATURE_SVPBMT 0
@@ -324,13 +324,13 @@ sb  a1,  7(t0)
 
 ### Zbb+ fast-unaligned string optimization
 
-[Zbb + fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/) ...
+[Zbb + fast-unaligned string optimization][005] ...
 
-添加多个strcmp变体用于快速比较非对齐访问。优先使用效率高的优化变体，在无法生效的情况下退回到通用情况。
+添加多个 strcmp 变体用于快速比较非对齐访问。优先使用效率高的优化变体，在无法生效的情况下退回到通用情况。
 
 #### strcmp_zbb
 
-检查两个字符串是否对齐到SZREG的边界。如果是，则以SZREG为单位比较两个字符串中的内容。如果不是，则按字节读取。
+检查两个字符串是否对齐到 SZREG 的边界。如果是，则以 SZREG 为单位比较两个字符串中的内容。如果不是，则按字节读取。
 
 ```c
 +/*
@@ -416,7 +416,7 @@ sb  a1,  7(t0)
 
 #### strlen_zbb
 
-启用CONFIG_RISCV_ISA_ZBB的前提下，移位对齐字符后从头开始以SZREG为单位读取，并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。
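+启用 CONFIG_RISCV_ISA_ZBB 的前提下，先把指针向下对齐到 SZREG 边界并读取第一个机器字，通过移位去掉字符串起始位置之前的无关字节；然后用 `orc.b`、`not` 和 `ctz`/`clz` 找出机器字中第一个 NUL 字节的位置。如果 NUL 就在第一个机器字的有效字节内则直接返回，否则以 SZREG 为单位循环读取，直到遇到包含 NUL 的机器字。最终长度等于第一个机器字中的字符数、中间完整机器字的字节数与最后一个机器字中 NUL 之前的字符数之和。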
 
 ```c
 +#ifdef CONFIG_RISCV_ISA_ZBB
@@ -519,10 +519,17 @@ sb  a1,  7(t0)
 
 ## 总结
 
-以上梳理了memory和strcmp相关优化代码，可以发现：
+以上梳理了 memory 和 string 相关优化代码，可以发现：
 
-memory相关优化方法主要有两点：通过连续存储减少条件分支及其跳转次数，减少判断上的时间；以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分，批量操作一定程度上提高效率。
+memory 相关优化方法主要有两点：通过连续存储减少条件分支及其跳转次数，减少判断上的时间；以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分，批量操作一定程度上提高效率。
 
-string对于zbb支持部分的函数优化，主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架，之后又提出了优化对齐方式下按SZREG块单位执行函数的优化方案。当优化方案不适用时再使用通用函数，以此优化部分情况下的zbb中str相关函数的使用效率。
+string 对于 zbb 支持部分的函数优化，主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架，之后又提出了优化对齐方式下按 SZREG 块单位执行函数的优化方案。当优化方案不适用时再使用通用函数，以此优化部分情况下的 zbb 中 str 相关函数的使用效率。
 
 接下来将按照 Memory, String, 数据运算，其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读，敬请期待。
+
+[001]: https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/
+[002]: https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/
+[003]: https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/
+[004]: https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/
+[005]: https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/
+[006]: https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/
-- 
Gitee


From e920c45f40e10e47eda932accf956a05388b2d4c Mon Sep 17 00:00:00 2001
From: falcon <falcon@tinylab.org>
Date: Tue, 20 Jun 2023 13:22:35 +0000
Subject: [PATCH 05/14] Update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md

---
 ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index 92c367b..e571a45 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -10,7 +10,7 @@
 
 ## 简介
 
-本文结合 lore.kernel.org/linux-riscv 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况，主要涉及 Memory, String 操作两大部分。
+本文结合 <https://lore.kernel.org/linux-riscv> 简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况，主要涉及 Memory, String 操作两大部分。
 
 ## Memory
 
-- 
Gitee


From f3a545eb345e3a1c92db3a2c682a661ac736c372 Mon Sep 17 00:00:00 2001
From: falcon <falcon@tinylab.org>
Date: Tue, 20 Jun 2023 13:22:46 +0000
Subject: [PATCH 06/14] Update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md

---
 ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index e571a45..e8ffd5d 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -18,7 +18,7 @@
 
 [riscv: optimized mem* functions][002]
 
-对各种 mem 相关操作函数的优化。
+该组 patchset 对各种 mem 相关操作函数进行了优化，以下逐个分析。
 
 #### memcpy
 
-- 
Gitee


From b612cf1a8d7a1d928956736e3948ec4f80de4320 Mon Sep 17 00:00:00 2001
From: falcon <falcon@tinylab.org>
Date: Tue, 20 Jun 2023 13:22:55 +0000
Subject: [PATCH 07/14] Update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md

---
 ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index e8ffd5d..2126d39 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -26,7 +26,7 @@
 
 1. 如果仍未启用高效对齐访问 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS，则先在不改变 dest 和 src 相对距离的情况下将 desc 对齐在字边界上。
 2. 如果 distance==0 说明 src 和 dest 两者已经对齐，直接进行（32 or 64 bits）字长复制。
-3. 如果！=0 说明未对齐，按照差值逐字复制。
+3. 如果 `distance !=0` 说明未对齐，按照差值逐字复制。
 
 ```c
 +void *__memcpy(void *dest, const void *src, size_t count)
-- 
Gitee


From 31f9f6aaf501b00954da8376b4f15d53781efc3f Mon Sep 17 00:00:00 2001
From: falcon <falcon@tinylab.org>
Date: Tue, 20 Jun 2023 13:23:04 +0000
Subject: [PATCH 08/14] Update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md

---
 ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index 2126d39..a4b7ddd 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -94,7 +94,7 @@
 
 #### memmove
 
-如果 dest 和 src 不重叠或者 dest<src，可以直接用 memcpy（dest<src 我的理解是从低地址往高地址复制，哪怕两者重叠也不会受干扰）。
+如果 dest 和 src 不重叠或者 `dest < src`，可以直接用 memcpy（我的理解是：`dest < src` 时是从低地址往高地址复制，即使两者重叠，源数据也会在被覆盖之前先被读出，因此不会受干扰）。
 
 ```c
 +
-- 
Gitee


From 6d45a768b755eca9898a85f803b70a8bd37c2e2a Mon Sep 17 00:00:00 2001
From: falcon <falcon@tinylab.org>
Date: Tue, 20 Jun 2023 13:23:13 +0000
Subject: [PATCH 09/14] Update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md

---
 ...summary-of-optimization-content-for-str-and-mem-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index a4b7ddd..874dd40 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -25,7 +25,7 @@
 主要是由“直接逐字节复制”转变为“先对齐再按字复制”。
 
 1. 如果仍未启用高效对齐访问 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS，则先在不改变 dest 和 src 相对距离的情况下将 desc 对齐在字边界上。
-2. 如果 distance==0 说明 src 和 dest 两者已经对齐，直接进行（32 or 64 bits）字长复制。
+2. 如果 `distance==0` 说明 src 和 dest 两者已经对齐，直接进行（32 or 64 bits）字长复制。
 3. 如果 `distance !=0` 说明未对齐，按照差值逐字复制。
 
 ```c
-- 
Gitee


From 7818b4078bfa17054d0251a6bc2bc470e07a61c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=81=B0=E6=B5=B7=E5=AE=BD=E6=9D=BE?= <2351290287@qq.com>
Date: Tue, 20 Jun 2023 18:05:44 +0000
Subject: [PATCH 10/14] update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md.
 add example code to 004 and 005 examples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 灰海宽松 <2351290287@qq.com>
---
 ...ation-content-for-str-and-mem-functions.md | 88 +++++++++++++++++--
 1 file changed, 82 insertions(+), 6 deletions(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index 874dd40..854cf5c 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -250,6 +250,60 @@ sb  a1,  7(t0)
 2. 如果使用 Zicboz 扩展进行内存清零，代码会将地址和长度进行对齐，并使用 Zicboz 扩展的指令进行内存清零操作。
 3. 在进行 Zicboz 扩展内存清零时，如果还有一些字节无法使用 Zicboz 扩展一次性清零，则会使用 Duff's 设备来处理剩余的字节。
 
+```c
++#ifdef CONFIG_RISCV_ISA_ZICBOZ
++	ALT_ZICBOZ("j .Ldo_memset", "nop")
++	/*
++	 * t1 will be the Zicboz block size.
++	 * Zero means we're not using Zicboz, and we don't when a1 != 0
++	 */
++	li	t1, 0
++	bnez	a1, .Ldo_memset
++	la	a3, riscv_cboz_block_size
++	lw	t1, 0(a3)
++
++	/*
++	 * Round to nearest Zicboz block-aligned address
++	 * greater than or equal to the start address.
++	 */
++	addi	a3, t1, -1
++	not	t2, a3			/* t2 is Zicboz block size mask */
++	add	a3, t0, a3
++	and	t3, a3, t2		/* t3 is Zicboz block aligned start */
++
++	/* Did we go too far or not have at least one block? */
++	add	a3, a0, a2
++	and	a3, a3, t2
++	bgtu	a3, t3, .Ldo_zero
++	li	t1, 0
++	j	.Ldo_memset
++
++.Ldo_zero:
++	/* Use Duff for initial bytes if there are any */
++	bne	t3, t0, .Ldo_memset
++
++.Ldo_zero2:
++	/* Calculate end address */
++	and	a3, a2, t2
++	add	a3, t0, a3
++	sub	a4, a3, t0
++
++.Lzero_loop:
++	CBO_ZERO(t0)
++	add	t0, t0, t1
++	bltu	t0, a3, .Lzero_loop
++	li	t1, 0			/* We're done with Zicboz */
++
++	sub	a2, a2, a4		/* Update count */
++	sltiu	a3, a2, 16
++	bnez	a3, .Lfinish
++
++	/* t0 is Zicboz block size aligned, so it must be SZREG aligned */
++	j	.Ldo_duff3
++#endif
++
+```
+
 ### RISC-V: Optimize memset for data sizes less than 16 bytes
 
 [RISC-V: Optimize memset for data sizes less than 16 bytes][006] ...
@@ -328,6 +382,26 @@ sb  a1,  7(t0)
 
 添加多个 strcmp 变体用于快速比较非对齐访问。优先使用效率高的优化变体，在无法生效的情况下退回到通用情况。
 
+```c
++static bool __init_or_module cpufeature_probe_fast_unaligned(unsigned int stage)
++{
++	int cpu;
++
++	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
++		return false;
++
++	for_each_possible_cpu(cpu) {
++		long perf = per_cpu(misaligned_access_speed, cpu);
++
++		if (perf != RISCV_HWPROBE_MISALIGNED_FAST)
++			return false;
++	}
++
++	return true;
++}
++
+```
+
 #### strcmp_zbb
 
 检查两个字符串是否对齐到 SZREG 的边界。如果是，则以 SZREG 为单位比较两个字符串中的内容。如果不是，则按字节读取。
@@ -527,9 +601,11 @@ string 对于 zbb 支持部分的函数优化，主要是先提供通用支持
 
 接下来将按照 Memory, String, 数据运算，其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读，敬请期待。
 
-[001]: https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/
-[002]: https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/
-[003]: https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/
-[004]: https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/
-[005]: https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/
-[006]: https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/
+## 参考资料
+
+- [001]: [https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
+- [002]: [https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
+- [003]: [https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
+- [004]: [https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
+- [005]: [https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/)
+- [006]: [https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/)
-- 
Gitee


From e48b8e421e3e2978181928cfdf767c26bd8bd4ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=81=B0=E6=B5=B7=E5=AE=BD=E6=9D=BE?= <2351290287@qq.com>
Date: Tue, 20 Jun 2023 18:14:00 +0000
Subject: [PATCH 11/14] update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md.
 modify the form of reference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 灰海宽松 <2351290287@qq.com>
---
 ...optimization-content-for-str-and-mem-functions.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index 854cf5c..14020ee 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -603,9 +603,9 @@ string 对于 zbb 支持部分的函数优化，主要是先提供通用支持
 
 ## 参考资料
 
-- [001]: [https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
-- [002]: [https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
-- [003]: [https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
-- [004]: [https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
-- [005]: [https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/)
-- [006]: [https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/)
+- 001: [Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
+- 002: [riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
+- 003: [riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
+- 004: [RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
+- 005: [Zbb+ fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/)
+- 006: [RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/)
-- 
Gitee


From 2029cf6932d92a386eb27250fd58f41416e29552 Mon Sep 17 00:00:00 2001
From: falcon <falcon@tinylab.org>
Date: Wed, 21 Jun 2023 02:54:35 +0000
Subject: [PATCH 12/14] Update
 articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md

---
 ...ation-content-for-str-and-mem-functions.md | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
index 14020ee..340704d 100644
--- a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
+++ b/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
@@ -603,9 +603,16 @@ string 对于 zbb 支持部分的函数优化，主要是先提供通用支持
 
 ## 参考资料
 
-- 001: [Zbb string optimizations](https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/)
-- 002: [riscv: optimized mem* functions](https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/)
-- 003: [riscv: lib: optimize memcmp with ld insn](https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/)
-- 004: [RISC-V: Apply Zicboz to clear_page and memset](https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/)
-- 005: [Zbb+ fast-unaligned string optimization](https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/)
-- 006: [RISC-V: Optimize memset for data sizes less than 16 bytes](https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/)
+- [Zbb string optimizations][001]
+- [riscv: optimized mem* functions][002]
+- [riscv: lib: optimize memcmp with ld insn][003]
+- [RISC-V: Apply Zicboz to clear_page and memset][004]
+- [Zbb + fast-unaligned string optimization][005]
+- [RISC-V: Optimize memset for data sizes less than 16 bytes][006]
+
+[001]: https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/
+[002]: https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/
+[003]: https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/
+[004]: https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/
+[005]: https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/
+[006]: https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/
-- 
Gitee


From f4011249834a410b4985be1daa5ded2577797574 Mon Sep 17 00:00:00 2001
From: Jingqing3948 <2351290287@qq.com>
Date: Wed, 21 Jun 2023 15:20:11 +0800
Subject: [PATCH 13/14] change file name

---
 ...mem-functions.md => 20230617-riscv-kernel-libc-opt-summary.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename articles/{20230617-summary-of-optimization-content-for-str-and-mem-functions.md => 20230617-riscv-kernel-libc-opt-summary.md} (100%)

diff --git a/articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md b/articles/20230617-riscv-kernel-libc-opt-summary.md
similarity index 100%
rename from articles/20230617-summary-of-optimization-content-for-str-and-mem-functions.md
rename to articles/20230617-riscv-kernel-libc-opt-summary.md
-- 
Gitee


From 08ef519efeb235f5f487033070f9560a0b0e018b Mon Sep 17 00:00:00 2001
From: Jingqing3948 <2351290287@qq.com>
Date: Wed, 21 Jun 2023 15:25:40 +0800
Subject: [PATCH 14/14] change filename

---
 ...el-libc-opt-summary.md => 20230617-riscv-klibc-opt-summary.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename articles/{20230617-riscv-kernel-libc-opt-summary.md => 20230617-riscv-klibc-opt-summary.md} (100%)

diff --git a/articles/20230617-riscv-kernel-libc-opt-summary.md b/articles/20230617-riscv-klibc-opt-summary.md
similarity index 100%
rename from articles/20230617-riscv-kernel-libc-opt-summary.md
rename to articles/20230617-riscv-klibc-opt-summary.md
-- 
Gitee