1/**
2 * \file pcm/pcm_dmix_x86_64.h
3 * \ingroup PCM_Plugins
4 * \brief PCM Direct Stream Mixing (dmix) Plugin Interface - X86-64 assembler code
5 * \author Takashi Iwai <tiwai@suse.de>
6 * \date 2003
7 */
8/*
9 *  PCM - Direct Stream Mixing
10 *  Copyright (c) 2003 by Jaroslav Kysela <perex@perex.cz>
11 *                        Takashi Iwai <tiwai@suse.de>
12 *
13 *
14 *   This library is free software; you can redistribute it and/or modify
15 *   it under the terms of the GNU Lesser General Public License as
16 *   published by the Free Software Foundation; either version 2.1 of
17 *   the License, or (at your option) any later version.
18 *
19 *   This program is distributed in the hope that it will be useful,
20 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
21 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 *   GNU Lesser General Public License for more details.
23 *
24 *   You should have received a copy of the GNU Lesser General Public
25 *   License along with this library; if not, write to the Free Software
26 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
27 *
28 */
29
30/*
31 *  MMX optimized
32 */
33static void MIX_AREAS_16(unsigned int size,
34			 volatile signed short *dst, signed short *src,
35			 volatile signed int *sum, size_t dst_step,
36			 size_t src_step, size_t sum_step)
37{
38	unsigned long long old_rbx;
39
40	/*
41	 *  RSI - src
42	 *  RDI - dst
43	 *  RBX - sum
44	 *  ECX - old sample
45	 *  EAX - sample / temporary
46	 *  EDX - temporary
47	 */
48	__asm__ __volatile__ (
49		"\n"
50
51		"\tmovq %%rbx, %7\n"
52		/*
53		 *  initialization, load RSI, RDI, RBX registers
54		 */
55		"\tmovq %1, %%rdi\n"
56		"\tmovq %2, %%rsi\n"
57		"\tmovq %3, %%rbx\n"
58
59		/*
60		 * while (size-- > 0) {
61		 */
62		"\tcmpl $0, %0\n"
63		"jz 6f\n"
64
65		"\t.p2align 4,,15\n"
66
67		"1:"
68
69		/*
70		 *   sample = *src;
71		 *   sum_sample = *sum;
72		 *   if (cmpxchg(*dst, 0, 1) == 0)
73		 *     sample -= sum_sample;
74		 *   xadd(*sum, sample);
75		 */
76		"\tmovw $0, %%ax\n"
77		"\tmovw $1, %%cx\n"
78		"\tmovl (%%rbx), %%edx\n"
79		"\t" LOCK_PREFIX "cmpxchgw %%cx, (%%rdi)\n"
80		"\tmovswl (%%rsi), %%ecx\n"
81		"\tjnz 2f\n"
82		"\t" XSUB " %%edx, %%ecx\n"
83		"2:"
84		"\t" LOCK_PREFIX XADD " %%ecx, (%%rbx)\n"
85
86		/*
87		 *   do {
88		 *     sample = old_sample = *sum;
89		 *     saturate(v);
90		 *     *dst = sample;
91		 *   } while (v != *sum);
92		 */
93
94		"3:"
95		"\tmovl (%%rbx), %%ecx\n"
96		"\tmovd %%ecx, %%mm0\n"
97		"\tpackssdw %%mm1, %%mm0\n"
98		"\tmovd %%mm0, %%eax\n"
99		"\tmovw %%ax, (%%rdi)\n"
100		"\tcmpl %%ecx, (%%rbx)\n"
101		"\tjnz 3b\n"
102
103		/*
104		 * while (size-- > 0)
105		 */
106		"\tadd %4, %%rdi\n"
107		"\tadd %5, %%rsi\n"
108		"\tadd %6, %%rbx\n"
109		"\tdecl %0\n"
110		"\tjnz 1b\n"
111
112		"6:"
113
114		"\temms\n"
115		"\tmovq %7, %%rbx\n"
116
117		: /* no output regs */
118		: "m" (size), "m" (dst), "m" (src),
119		  "m" (sum), "m" (dst_step), "m" (src_step),
120		  "m" (sum_step), "m" (old_rbx)
121		: "rsi", "rdi", "edx", "ecx", "eax"
122	);
123}
124
125/*
126 *  32-bit version (24-bit resolution)
127 */
128static void MIX_AREAS_32(unsigned int size,
129			 volatile signed int *dst, signed int *src,
130			 volatile signed int *sum, size_t dst_step,
131			 size_t src_step, size_t sum_step)
132{
133	unsigned long long old_rbx;
134
135	/*
136	 *  RSI - src
137	 *  RDI - dst
138	 *  RBX - sum
139	 *  ECX - old sample
140	 *  EAX - sample / temporary
141	 *  EDX - temporary
142	 */
143	__asm__ __volatile__ (
144		"\n"
145
146		"\tmovq %%rbx, %7\n"
147		/*
148		 *  initialization, load ESI, EDI, EBX registers
149		 */
150		"\tmovq %1, %%rdi\n"
151		"\tmovq %2, %%rsi\n"
152		"\tmovq %3, %%rbx\n"
153
154		/*
155		 * while (size-- > 0) {
156		 */
157		"\tcmpl $0, %0\n"
158		"jz 6f\n"
159
160		"\t.p2align 4,,15\n"
161
162		"1:"
163
164		/*
165		 *   sample = *src;
166		 *   sum_sample = *sum;
167		 *   if (cmpxchg(*dst, 0, 1) == 0)
168		 *     sample -= sum_sample;
169		 *   xadd(*sum, sample);
170		 */
171		"\tmovl $0, %%eax\n"
172		"\tmovl $1, %%ecx\n"
173		"\tmovl (%%rbx), %%edx\n"
174		"\t" LOCK_PREFIX "cmpxchgl %%ecx, (%%rdi)\n"
175		"\tjnz 2f\n"
176		"\tmovl (%%rsi), %%ecx\n"
177		/* sample >>= 8 */
178		"\tsarl $8, %%ecx\n"
179		"\t" XSUB " %%edx, %%ecx\n"
180		"\tjmp 21f\n"
181		"2:"
182		"\tmovl (%%rsi), %%ecx\n"
183		/* sample >>= 8 */
184		"\tsarl $8, %%ecx\n"
185		"21:"
186		"\t" LOCK_PREFIX XADD " %%ecx, (%%rbx)\n"
187
188		/*
189		 *   do {
190		 *     sample = old_sample = *sum;
191		 *     saturate(v);
192		 *     *dst = sample;
193		 *   } while (v != *sum);
194		 */
195
196		"3:"
197		"\tmovl (%%rbx), %%ecx\n"
198		/*
199		 *  if (sample > 0x7fff00)
200		 */
201		"\tmovl $0x7fffff, %%eax\n"
202		"\tcmpl %%eax, %%ecx\n"
203		"\tjg 4f\n"
204		/*
205		 *  if (sample < -0x800000)
206		 */
207		"\tmovl $-0x800000, %%eax\n"
208		"\tcmpl %%eax, %%ecx\n"
209		"\tjl 4f\n"
210		"\tmovl %%ecx, %%eax\n"
211		"4:"
212		/*
213		 *  sample <<= 8;
214		 */
215		"\tsall $8, %%eax\n"
216		"\tmovl %%eax, (%%rdi)\n"
217		"\tcmpl %%ecx, (%%rbx)\n"
218		"\tjnz 3b\n"
219
220		/*
221		 * while (size-- > 0)
222		 */
223		"\tadd %4, %%rdi\n"
224		"\tadd %5, %%rsi\n"
225		"\tadd %6, %%rbx\n"
226		"\tdecl %0\n"
227		"\tjnz 1b\n"
228
229		"6:"
230		"\tmovq %7, %%rbx\n"
231
232		: /* no output regs */
233		: "m" (size), "m" (dst), "m" (src),
234		  "m" (sum), "m" (dst_step), "m" (src_step),
235		  "m" (sum_step), "m" (old_rbx)
236		: "rsi", "rdi", "edx", "ecx", "eax"
237	);
238}
239
240/*
241 *  24-bit version
242 */
243static void MIX_AREAS_24(unsigned int size,
244			 volatile unsigned char *dst, unsigned char *src,
245			 volatile signed int *sum, size_t dst_step,
246			 size_t src_step, size_t sum_step)
247{
248	unsigned long long old_rbx;
249
250	/*
251	 *  RSI - src
252	 *  RDI - dst
253	 *  RBX - sum
254	 *  ECX - old sample
255	 *  EAX - sample / temporary
256	 *  EDX - temporary
257	 */
258	__asm__ __volatile__ (
259		"\n"
260
261		"\tmovq %%rbx, %7\n"
262		/*
263		 *  initialization, load ESI, EDI, EBX registers
264		 */
265		"\tmovq %1, %%rdi\n"
266		"\tmovq %2, %%rsi\n"
267		"\tmovq %3, %%rbx\n"
268
269		/*
270		 * while (size-- > 0) {
271		 */
272		"\tcmpl $0, %0\n"
273		"jz 6f\n"
274
275		"\t.p2align 4,,15\n"
276
277		"1:"
278
279		/*
280		 *   sample = *src;
281		 *   sum_sample = *sum;
282		 *   if (test_and_set_bit(0, dst) == 0)
283		 *     sample -= sum_sample;
284		 *   *sum += sample;
285		 */
286		"\tmovsbl 2(%%rsi), %%eax\n"
287		"\tmovzwl (%%rsi), %%ecx\n"
288		"\tmovl (%%rbx), %%edx\n"
289		"\tsall $16, %%eax\n"
290		"\torl %%eax, %%ecx\n"
291		"\t" LOCK_PREFIX "btsw $0, (%%rdi)\n"
292		"\tjc 2f\n"
293		"\t" XSUB " %%edx, %%ecx\n"
294		"2:"
295		"\t" LOCK_PREFIX XADD " %%ecx, (%%rbx)\n"
296
297		/*
298		 *   do {
299		 *     sample = old_sample = *sum;
300		 *     saturate(sample);
301		 *     *dst = sample | 1;
302		 *   } while (old_sample != *sum);
303		 */
304
305		"3:"
306		"\tmovl (%%rbx), %%ecx\n"
307
308		"\tmovl $0x7fffff, %%eax\n"
309		"\tmovl $-0x7fffff, %%edx\n"
310		"\tcmpl %%eax, %%ecx\n"
311		"\tcmovng %%ecx, %%eax\n"
312		"\tcmpl %%edx, %%ecx\n"
313		"\tcmovl %%edx, %%eax\n"
314
315		"\torl $1, %%eax\n"
316		"\tmovw %%ax, (%%rdi)\n"
317		"\tshrl $16, %%eax\n"
318		"\tmovb %%al, 2(%%rdi)\n"
319
320		"\tcmpl %%ecx, (%%rbx)\n"
321		"\tjnz 3b\n"
322
323		/*
324		 * while (size-- > 0)
325		 */
326		"\tadd %4, %%rdi\n"
327		"\tadd %5, %%rsi\n"
328		"\tadd %6, %%rbx\n"
329		"\tdecl %0\n"
330		"\tjnz 1b\n"
331
332		"6:"
333		"\tmovq %7, %%rbx\n"
334
335		: /* no output regs */
336		: "m" (size), "m" (dst), "m" (src),
337		  "m" (sum), "m" (dst_step), "m" (src_step),
338		  "m" (sum_step), "m" (old_rbx)
339		: "rsi", "rdi", "edx", "ecx", "eax"
340	);
341}
342