1//
2// Copyright (c) 2012 - 2016, Linaro Limited
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//     * Redistributions of source code must retain the above copyright
8//       notice, this list of conditions and the following disclaimer.
9//     * Redistributions in binary form must reproduce the above copyright
10//       notice, this list of conditions and the following disclaimer in the
11//       documentation and/or other materials provided with the distribution.
12//     * Neither the name of the Linaro nor the
13//       names of its contributors may be used to endorse or promote products
14//       derived from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27//
28
29//
30// Copyright (c) 2015 ARM Ltd
31// All rights reserved.
32//
33// Redistribution and use in source and binary forms, with or without
34// modification, are permitted provided that the following conditions
35// are met:
36// 1. Redistributions of source code must retain the above copyright
37//    notice, this list of conditions and the following disclaimer.
38// 2. Redistributions in binary form must reproduce the above copyright
39//    notice, this list of conditions and the following disclaimer in the
40//    documentation and/or other materials provided with the distribution.
41// 3. The name of the company may not be used to endorse or promote
42//    products derived from this software without specific prior written
43//    permission.
44//
45// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
46// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
47// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
48// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
49// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
50// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
51// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
52// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
53// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
54// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55//
56
57// Assumptions:
58//
59// ARMv8-a, AArch64, unaligned accesses.
60//
61//
62
// Register aliases.  AAPCS64: x0-x2 carry the incoming arguments and
// x3-x15 are caller-saved scratch; no callee-saved registers or stack
// space are used anywhere in this file.
//
// Note the deliberate register reuse below: F_l/F_h alias srcend/dst,
// tmp1 aliases B_h (x9) and tmp2 aliases dst (x3).  Each aliased pair
// is never live at the same time.
#define dstin     x0      // destination argument (never written; doubles as return)
#define src       x1      // source argument / ascending source cursor
#define count     x2      // byte count argument / loop counter
#define dst       x3      // 16-byte-aligned destination cursor (large copies)
#define srcend    x4      // src + count (one past the last source byte)
#define dstend    x5      // dstin + count (one past the last destination byte)
#define A_l       x6      // A..F: 16-byte load/store pairs held in flight
#define A_lw      w6      //   (w views serve the 1-, 2- and 4-byte tails)
#define A_h       x7
#define A_hw      w7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       srcend  // alias: only used after srcend is dead (copy96)
#define F_h       dst     // alias: only used after dst is dead (copy96)
#define tmp1      x9      // scratch; aliases B_h
#define tmp2      x3      // scratch; aliases dst

// L(x) expands to the assembler-local label .Lx (not exported).
#define L(l) .L ## l
88
// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes (fully unrolled), and large copies of
// more than 96 bytes, which align the destination and use an unrolled
// loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as for non-overlapping copies.
96
//-----------------------------------------------------------------------
// __memcpy: copy count bytes from src to dstin, ascending.
//
// In:       dstin (x0) = destination, src (x1) = source,
//           count (x2) = length in bytes
// Out:      x0 still holds dstin (it is never written)
// Clobbers: x3-x15 and the condition flags; no stack usage.
//
// The small (0..16) and medium (17..96) paths read all of their data
// before writing any of it, so they tolerate arbitrary overlap; this
// is what lets InternalMemCopyMem tail-branch here for those sizes.
//-----------------------------------------------------------------------
__memcpy:
    prfm    PLDL1KEEP, [src]            // hint: stream the source into L1
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)                   // 0..16 bytes
    cmp     count, 96
    b.hi    L(copy_long)                // more than 96 bytes

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1              // bit 6 of count-1 set <=> count >= 65
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)          // 65..96 bytes handled separately
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f                 // count 17..32: head + tail pair suffice
    ldp     B_l, B_h, [src, 16]         // count 33..64: also copy 16 bytes after
    ldp     C_l, C_h, [srcend, -32]     // the start and 16 before the end
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]                  // 8..16 bytes: two possibly
    ldr     A_h, [srcend, -8]           // overlapping 8-byte moves
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f                // bit 2 set => 4..7 bytes
    ldr     A_lw, [src]                 // two possibly overlapping
    ldr     A_hw, [srcend, -4]          // 4-byte moves
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1              // tmp1 = index of the middle byte
    ldrb    A_lw, [src]                 // first byte
    ldrb    A_hw, [srcend, -1]          // last byte
    ldrb    B_lw, [src, tmp1]           // middle byte
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes.  Copy 64 bytes from the start and
    // 32 bytes from the end.  (A was already loaded above.)
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]     // F aliases srcend/dst -- both dead here
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores.  There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align.  The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15             // tmp1 = destination misalignment
    bic     dst, dstin, 15              // dst = dstin rounded down to 16
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1              // keep src and dst offsets in lockstep
    add     count, count, tmp1          // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]           // head: 16 unaligned bytes
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16      // Test and readjust count.
    b.ls    2f                          // A..D plus the 64-byte tail cover it
1:
    stp     A_l, A_h, [dst, 16]         // store one 64-byte batch while
    ldp     A_l, A_h, [src, 16]         // loading the next one
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes.  The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret
218
219
220//
221// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
222// Larger backwards copies are also handled by memcpy. The only remaining
223// case is forward large copies.  The destination is aligned, and an
224// unrolled loop processes 64 bytes per iteration.
225//
226
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)

//-----------------------------------------------------------------------
// InternalMemCopyMem: memmove-style copy of count bytes from src to dstin.
//
// In:       dstin (x0) = destination, src (x1) = source,
//           count (x2) = length in bytes
// Out:      x0 still holds dstin (it is never written)
// Clobbers: x3-x15 and the condition flags; no stack usage.
//
// Dispatch: tmp2 = dst - src (unsigned).  ccmp's immediate NZCV of 2
// sets C (i.e. "hs") when count <= 96, so __memcpy is taken for any
// small copy; for count > 96 it is taken when tmp2 >= count, i.e. the
// destination starts at/after the source end, or below the source
// (tmp2 wraps to a huge unsigned value).  What falls through is the
// one unsafe-forwards case, copied here back-to-front.
//-----------------------------------------------------------------------
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src            // tmp2 = (unsigned) dst - src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi          // count > 96 ? cmp tmp2,count : C := 1
    b.hs    __memcpy                    // small copy, or no forward overlap

    cbz     tmp2, 3f                    // src == dst: nothing to do
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores.  There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align.  The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    and     tmp2, dstend, 15            // tmp2 = misalignment of the end
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2        // keep srcend/dstend in lockstep
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]     // tail: 16 unaligned bytes
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!    // 64 bytes now in flight in A..D
    sub     dstend, dstend, tmp2        // dstend is now 16-byte aligned
    subs    count, count, 128           // 64 in flight + 64 for the final head
    b.ls    2f                          // A..D plus the 64-byte head cover it
    nop                                 // padding (presumably to align the loop entry)
1:
    stp     A_l, A_h, [dstend, -16]     // store one 64-byte batch while
    ldp     A_l, A_h, [srcend, -16]     // loading the next, moving downwards
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes.  The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret
285