/*
 * NOTE(review): non-contiguous excerpt (the leading decimal on each line is
 * the original file's own line number) of what appears to be a Plan 9-style
 * ARM memmove body: a backward copy path and a forward copy path, each
 * staged as byte / word / 32-byte copies, plus shifted-word loops for a
 * source that is not word-aligned.  The TEXT directive, all branches, and
 * the definitions of TS, TE, FROM, N, TMP1, RSHIFT, LSHIFT, OFFSET and the
 * BRx, BWx, FRx, FWx block registers are outside this view — confirm
 * against the full file.  Code left byte-identical; comments only.
 */
5 TMP = 3 /* N and TMP don't overlap */

/* entry: save dst for the return value, load src, compute the dst end pointer */
12 MOVW R(TS), to+0(FP) /* need to save for return value */
13 MOVW from+4(FP), R(FROM)
16 ADD R(N), R(TS), R(TE) /* to end pointer */

/*
 * backward copy (descending addresses) — TE and FROM walk down from the
 * buffer ends; presumably taken when dst > src so overlap is safe.
 */
22 ADD R(N), R(FROM) /* from end pointer */
23 CMP $4, R(N) /* need at least 4 bytes to copy */

26 _b4align: /* align destination on 4 */
27 AND.S $3, R(TE), R(TMP) /* .S sets Z when TE is already word-aligned */
30 MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
31 MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */

34 _b4aligned: /* is source now aligned? */
35 AND.S $3, R(FROM), R(TMP) /* nonzero: shifted backward loop below handles it (branch not visible here) */

/* 32-byte chunks: two 4-register block moves (R4-R7, 16 bytes each) per pass */
38 ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible */
43 MOVM.DB.W (R(FROM)), [R4-R7] /* decrement-before, writeback: load 16 bytes */
44 MOVM.DB.W [R4-R7], (R(TE))
45 MOVM.DB.W (R(FROM)), [R4-R7] /* second half of the 32-byte chunk */
46 MOVM.DB.W [R4-R7], (R(TE))

49 _b4tail: /* do remaining words if possible */
55 MOVW.W -4(R(FROM)), R(TMP1) /* pre-indexed */
56 MOVW.W R(TMP1), -4(R(TE)) /* pre-indexed */

59 _b1tail: /* remaining bytes */
63 MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
64 MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */

/* forward copy (ascending addresses): same staging, post-indexed modes */
68 CMP $4, R(N) /* need at least 4 bytes to copy */

71 _f4align: /* align destination on 4 */
72 AND.S $3, R(TS), R(TMP)
75 MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
76 MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */

79 _f4aligned: /* is source now aligned? */
80 AND.S $3, R(FROM), R(TMP)

83 SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible */
88 MOVM.IA.W (R(FROM)), [R4-R7] /* increment-after, writeback: load 16 bytes */
89 MOVM.IA.W [R4-R7], (R(TS))
90 MOVM.IA.W (R(FROM)), [R4-R7] /* second half of the 32-byte chunk */
91 MOVM.IA.W [R4-R7], (R(TS))

95 SUB $3, R(TE), R(TMP) /* do remaining words if possible */
100 MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back */
/*
 * NOTE(review): the load above targets R(TMP1) but the store below writes
 * R4 — presumably TMP1 == 4 so they are the same register, but spelling it
 * R(TMP1) would make that explicit; confirm against TMP1's definition
 * (not visible in this excerpt).
 */
101 MOVW.P R4, 4(R(TS)) /* implicit write back */
108 MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
109 MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */

/*
 * shifted backward copy: source not word-aligned.  Shift amounts and the
 * byte offset are chosen from (FROM & 3): 1 -> >>8|<<24, 2 -> >>16|<<16,
 * 3 -> >>24|<<8.  FROM is then rounded down to a word boundary and each
 * output word is assembled from two adjacent aligned source words.
 */
126 CMP $2, R(TMP) /* is R(TMP) < 2 ? */
128 MOVW.LT $8, R(RSHIFT) /* (R(n)<<24)|(R(n-1)>>8) */
129 MOVW.LT $24, R(LSHIFT)
130 MOVW.LT $1, R(OFFSET)
132 MOVW.EQ $16, R(RSHIFT) /* (R(n)<<16)|(R(n-1)>>16) */
133 MOVW.EQ $16, R(LSHIFT)
134 MOVW.EQ $2, R(OFFSET)
136 MOVW.GT $24, R(RSHIFT) /* (R(n)<<8)|(R(n-1)>>24) */
137 MOVW.GT $8, R(LSHIFT)
138 MOVW.GT $3, R(OFFSET)

140 ADD $8, R(TS), R(TMP) /* do 8-byte chunks if possible */
144 BIC $3, R(FROM) /* align source */
145 MOVW (R(FROM)), R(BR0) /* prime first block register */
151 MOVW R(BR0)<<R(LSHIFT), R(BW1) /* high bytes of the upper output word */
152 MOVM.DB.W (R(FROM)), [R(BR0)-R(BR1)] /* fetch the next two aligned source words */
153 ORR R(BR1)>>R(RSHIFT), R(BW1) /* merge in the low bytes */
155 MOVW R(BR1)<<R(LSHIFT), R(BW0)
156 ORR R(BR0)>>R(RSHIFT), R(BW0)
158 MOVM.DB.W [R(BW0)-R(BW1)], (R(TE)) /* store the 8 reassembled bytes */
162 ADD R(OFFSET), R(FROM) /* re-add the alignment bytes dropped by the BIC */

/*
 * shifted forward copy: mirror image of the backward case; note OFFSET is
 * 3/2/1 here versus 1/2/3 above, and the last source word is primed first.
 */
177 MOVW.LT $8, R(RSHIFT) /* (R(n+1)<<24)|(R(n)>>8) */
178 MOVW.LT $24, R(LSHIFT)
179 MOVW.LT $3, R(OFFSET)
181 MOVW.EQ $16, R(RSHIFT) /* (R(n+1)<<16)|(R(n)>>16) */
182 MOVW.EQ $16, R(LSHIFT)
183 MOVW.EQ $2, R(OFFSET)
185 MOVW.GT $24, R(RSHIFT) /* (R(n+1)<<8)|(R(n)>>24) */
186 MOVW.GT $8, R(LSHIFT)
187 MOVW.GT $1, R(OFFSET)

189 SUB $8, R(TE), R(TMP) /* do 8-byte chunks if possible */
193 BIC $3, R(FROM) /* align source */
194 MOVW.P 4(R(FROM)), R(FR1) /* prime last block register, implicit write back */
200 MOVW R(FR1)>>R(RSHIFT), R(FW0) /* low bytes of the lower output word */
201 MOVM.IA.W (R(FROM)), [R(FR0)-R(FR1)] /* fetch the next two aligned source words */
202 ORR R(FR0)<<R(LSHIFT), R(FW0) /* merge in the high bytes */
204 MOVW R(FR0)>>R(RSHIFT), R(FW1)
205 ORR R(FR1)<<R(LSHIFT), R(FW1)
207 MOVM.IA.W [R(FW0)-R(FW1)], (R(TS))
211 SUB R(OFFSET), R(FROM) /* undo the BIC rounding plus the priming load's advance */