1/*#************************************************************************#*/
2/*#-------------------------------------------------------------------------*/
3/*#                                                                         */
4/*# FUNCTION NAME: memset()                                                 */
5/*#                                                                         */
6/*# PARAMETERS:  void* dst;   Destination address.                          */
7/*#              int     c;   Value of byte to write.                       */
8/*#              int   len;   Number of bytes to write.                     */
9/*#                                                                         */
10/*# RETURNS:     dst.                                                       */
11/*#                                                                         */
12/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */
13/*#              Framework taken from memcpy.  This routine is              */
14/*#              very sensitive to compiler changes in register allocation. */
15/*#              Should really be rewritten to avoid this problem.          */
16/*#                                                                         */
17/*#-------------------------------------------------------------------------*/
18/*#                                                                         */
19/*# HISTORY                                                                 */
20/*#                                                                         */
21/*# DATE      NAME            CHANGES                                       */
22/*# ----      ----            -------                                       */
23/*# 990713    HP              Tired of watching this function (or           */
24/*#                           really, the nonoptimized generic              */
25/*#                           implementation) take up 90% of simulator      */
26/*#                           output.  Measurements needed.                 */
27/*#                                                                         */
28/*#-------------------------------------------------------------------------*/
29
30#include <linux/types.h>
31
32/* No, there's no macro saying 12*4, since it is "hard" to get it into
33   the asm in a good way.  Thus better to expose the problem everywhere.
34   */
35
36/* Assuming 1 cycle per dword written or read (ok, not really true), and
37   one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
38   so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
39
/* Size in bytes of one 'movem' store in memset(): 12 registers of
   4 bytes each.  memset() takes its movem path only when at least this
   many bytes remain to be written. */
#define ZERO_BLOCK_SIZE (1*12*4)
41
42void *memset(void *pdst,
43             int c,
44             size_t plen)
45{
46  /* Ok.  Now we want the parameters put in special registers.
47     Make sure the compiler is able to make something useful of this. */
48
49  register char *return_dst __asm__ ("r10") = pdst;
50  register int n __asm__ ("r12") = plen;
51  register int lc __asm__ ("r11") = c;
52
53  /* Most apps use memset sanely.  Only those memsetting about 3..4
54     bytes or less get penalized compared to the generic implementation
55     - and that's not really sane use. */
56
57  /* Ugh.  This is fragile at best.  Check with newer GCC releases, if
58     they compile cascaded "x |= x << 8" sanely! */
59  __asm__("movu.b %0,$r13	\n\
60           lslq 8,$r13		\n\
61	   move.b %0,$r13	\n\
62	   move.d $r13,%0	\n\
63	   lslq 16,$r13		\n\
64	   or.d $r13,%0"
65          : "=r" (lc) : "0" (lc) : "r13");
66
67  {
68    register char *dst __asm__ ("r13") = pdst;
69
70  /* This is NONPORTABLE, but since this whole routine is     */
71  /* grossly nonportable that doesn't matter.                 */
72
73  if (((unsigned long) pdst & 3) != 0
74     /* Oops! n=0 must be a legal call, regardless of alignment. */
75      && n >= 3)
76  {
77    if ((unsigned long)dst & 1)
78    {
79      *dst = (char) lc;
80      n--;
81      dst++;
82    }
83
84    if ((unsigned long)dst & 2)
85    {
86      *(short *)dst = lc;
87      n -= 2;
88      dst += 2;
89    }
90  }
91
92  /* Now the fun part.  For the threshold value of this, check the equation
93     above. */
94  /* Decide which copying method to use. */
95  if (n >= ZERO_BLOCK_SIZE)
96  {
97    /* For large copies we use 'movem' */
98
99  /* It is not optimal to tell the compiler about clobbering any
100     registers; that will move the saving/restoring of those registers
101     to the function prologue/epilogue, and make non-movem sizes
102     suboptimal.
103
104      This method is not foolproof; it assumes that the "asm reg"
105     declarations at the beginning of the function really are used
106     here (beware: they may be moved to temporary registers).
107      This way, we do not have to save/move the registers around into
108     temporaries; we can safely use them straight away.
109
110      If you want to check that the allocation was right; then
111      check the equalities in the first comment.  It should say
112      "r13=r13, r12=r12, r11=r11" */
113    __asm__ volatile ("							\n\
114        ;; Check that the register asm declaration got right.		\n\
115        ;; The GCC manual says it will work, but there *has* been bugs.	\n\
116	.ifnc %0-%1-%4,$r13-$r12-$r11					\n\
117	.err								\n\
118	.endif								\n\
119									\n\
120	;; Save the registers we'll clobber in the movem process	\n\
121	;; on the stack.  Don't mention them to gcc, it will only be	\n\
122	;; upset.							\n\
123	subq 	11*4,$sp						\n\
124        movem   $r10,[$sp]						\n\
125									\n\
126        move.d  $r11,$r0						\n\
127        move.d  $r11,$r1						\n\
128        move.d  $r11,$r2						\n\
129        move.d  $r11,$r3						\n\
130        move.d  $r11,$r4						\n\
131        move.d  $r11,$r5						\n\
132        move.d  $r11,$r6						\n\
133        move.d  $r11,$r7						\n\
134        move.d  $r11,$r8						\n\
135        move.d  $r11,$r9						\n\
136        move.d  $r11,$r10						\n\
137									\n\
138        ;; Now we've got this:						\n\
139	;; r13 - dst							\n\
140	;; r12 - n							\n\
141									\n\
142        ;; Update n for the first loop					\n\
143        subq    12*4,$r12						\n\
1440:									\n\
145        subq   12*4,$r12						\n\
146        bge     0b							\n\
147	movem	$r11,[$r13+]						\n\
148									\n\
149        addq   12*4,$r12  ;; compensate for last loop underflowing n	\n\
150									\n\
151	;; Restore registers from stack					\n\
152        movem [$sp+],$r10"
153
154     /* Outputs */ : "=r" (dst), "=r" (n)
155     /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
156  }
157
158    /* Either we directly starts copying, using dword copying
159       in a loop, or we copy as much as possible with 'movem'
160       and then the last block (<44 bytes) is copied here.
161       This will work since 'movem' will have updated src,dst,n. */
162
163    while ( n >= 16 )
164    {
165      *((long*)dst)++ = lc;
166      *((long*)dst)++ = lc;
167      *((long*)dst)++ = lc;
168      *((long*)dst)++ = lc;
169      n -= 16;
170    }
171
172    /* A switch() is definitely the fastest although it takes a LOT of code.
173     * Particularly if you inline code this.
174     */
175    switch (n)
176    {
177      case 0:
178        break;
179      case 1:
180        *(char*)dst = (char) lc;
181        break;
182      case 2:
183        *(short*)dst = (short) lc;
184        break;
185      case 3:
186        *((short*)dst)++ = (short) lc;
187        *(char*)dst = (char) lc;
188        break;
189      case 4:
190        *((long*)dst)++ = lc;
191        break;
192      case 5:
193        *((long*)dst)++ = lc;
194        *(char*)dst = (char) lc;
195        break;
196      case 6:
197        *((long*)dst)++ = lc;
198        *(short*)dst = (short) lc;
199        break;
200      case 7:
201        *((long*)dst)++ = lc;
202        *((short*)dst)++ = (short) lc;
203        *(char*)dst = (char) lc;
204        break;
205      case 8:
206        *((long*)dst)++ = lc;
207        *((long*)dst)++ = lc;
208        break;
209      case 9:
210        *((long*)dst)++ = lc;
211        *((long*)dst)++ = lc;
212        *(char*)dst = (char) lc;
213        break;
214      case 10:
215        *((long*)dst)++ = lc;
216        *((long*)dst)++ = lc;
217        *(short*)dst = (short) lc;
218        break;
219      case 11:
220        *((long*)dst)++ = lc;
221        *((long*)dst)++ = lc;
222        *((short*)dst)++ = (short) lc;
223        *(char*)dst = (char) lc;
224        break;
225      case 12:
226        *((long*)dst)++ = lc;
227        *((long*)dst)++ = lc;
228        *((long*)dst)++ = lc;
229        break;
230      case 13:
231        *((long*)dst)++ = lc;
232        *((long*)dst)++ = lc;
233        *((long*)dst)++ = lc;
234        *(char*)dst = (char) lc;
235        break;
236      case 14:
237        *((long*)dst)++ = lc;
238        *((long*)dst)++ = lc;
239        *((long*)dst)++ = lc;
240        *(short*)dst = (short) lc;
241        break;
242      case 15:
243        *((long*)dst)++ = lc;
244        *((long*)dst)++ = lc;
245        *((long*)dst)++ = lc;
246        *((short*)dst)++ = (short) lc;
247        *(char*)dst = (char) lc;
248        break;
249    }
250  }
251
252  return return_dst; /* destination pointer. */
253} /* memset() */
254