arch/riscv/lib/memmove.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq a0, a1, return_from_memmove
	beqz a2, return_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift ammount
	 *   Both Copy Modes: a7 - Inverse Shift ammount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, byte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest(Register t3) rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq  t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a full misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, coaligned_copy
	/* Fall through to misaligned fixup copy */

misaligned_fixup_copy:
	bltu a1, a0, misaligned_fixup_copy_reverse

misaligned_fixup_copy_forward:
	jal  t0, byte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j byte_copy_forward /* Copy any remaining bytes */

misaligned_fixup_copy_reverse:
	jal  t0, byte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
coaligned_copy:
	bltu a1, a0, coaligned_copy_reverse

coaligned_copy_forward:
	jal t0, byte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j byte_copy_forward /* Copy any remaining bytes */

coaligned_copy_reverse:
	jal t0, byte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j byte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment.
 * At which point, a bulk copy method can be used by the
 * calling code.  These work on the same registers as the bulk
 * copy loops.  Therefore, the register values can be picked
 * up from where they were left and we avoid code duplication
 * without any overhead except the call in and return jumps.
 */
byte_copy_until_aligned_forward:
	beq  t3, t5, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

byte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
 */
byte_copy:
	bltu a1, a0, byte_copy_reverse

byte_copy_forward:
	beq  t3, t4, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
	2:
	ret

byte_copy_reverse:
	beq  t4, t3, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t3, 1b
	2:

return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)