;  libFLAC - Free Lossless Audio Codec library
;  Copyright (C) 2004,2005,2006,2007  Josh Coalson
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;
;  - Redistributions of source code must retain the above copyright
;  notice, this list of conditions and the following disclaimer.
;
;  - Redistributions in binary form must reproduce the above copyright
;  notice, this list of conditions and the following disclaimer in the
;  documentation and/or other materials provided with the distribution.
;
;  - Neither the name of the Xiph.org Foundation nor the names of its
;  contributors may be used to endorse or promote products derived from
;  this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16

.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]
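;
; for reference, the C-side declaration (in src/libFLAC/include/private/lpc.h)
; is roughly:
;	void FLAC__lpc_restore_signal_asm_ppc_altivec_16(
;		const FLAC__int32 residual[], unsigned data_len,
;		const FLAC__int32 qlp_coeff[], unsigned order,
;		int lp_quantization, FLAC__int32 data[]);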

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/AltiVec assembly version which requires bps<=16 (or actual
; bps<=15 for mid-side coding, since that uses an extra bit)

; this should be fast; the inner loop is unrolled (it takes roughly
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)

; I have not yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop

; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method
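;
; as a reading aid, the scalar loop being implemented is, in rough C
; (after FLAC__lpc_restore_signal() in lpc.c; names are illustrative):
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		data[i] = residual[i] + (sum >> lp_quantization);
;	}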

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xfffffc00)
	ori r31,r31,lo16(0xfffffc00)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L1400

	; load coefficients into v0-v7 and initial history into v8-v15
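	; (each numbered block below pulls in one more quadword of four
	; coefficients and four history samples; the compare against
	; r10 = qlp_coeff+order*4 decides whether any remain, and on the last
	; block the unused coefficient lanes are cleared with the mask in v18
	; and r31 is pointed at the matching entry into the unrolled loop)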
	li r31,0xf
	and r31,r8,r31 ; r31: data%16 (byte offset into the quadword)
	li r11,16
	subf r31,r31,r11 ; r31: 16-(data%16) (bytes to the next quadword boundary)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0 ; v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11 ; v16: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,hi16(L1307)
	ori r31,r31,lo16(L1307)
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,hi16(L1306)
	ori r31,r31,lo16(L1306)
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	addis r31,0,hi16(L1305)
	ori r31,r31,lo16(L1305)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	addis r31,0,hi16(L1304)
	ori r31,r31,lo16(L1304)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	addis r31,0,hi16(L1303)
	ori r31,r31,lo16(L1303)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	addis r31,0,hi16(L1302)
	ori r31,r31,lo16(L1302)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	addis r31,0,hi16(L1301)
	ori r31,r31,lo16(L1301)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	addis r31,0,hi16(L1300)
	ori r31,r31,lo16(L1300)

L1199:
	mtctr r31
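	; r31 now holds the entry point into the unrolled inner loop that
	; matches this order; the unconditional bcctr below jumps there each
	; iteration so that only the multiply-accumulate stages for coefficient
	; vectors that actually exist are executed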

	; set up invariant vectors
	vspltish v16,0 ; v16: zero vector

	li r10,-12
	lvsr v17,r10,r8 ; v17: result shift vector
	lvsl v18,r10,r3 ; v18: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9 ; v19: lp_quantization vector

L1200:
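	; note: vmulosh multiplies the odd (on big-endian, low-order) signed
	; halfword of each word, which is why coefficients and samples must
	; fit in 16 bits (the bps<=16 requirement noted above)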
	vmulosh v20,v0,v8 ; v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4 ; increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16 ; v20[3]: sum
	vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization

	lvewx v21,0,r3 ; v21[n]: *residual
	vperm v21,v21,v21,v18 ; v21[3]: *residual
	vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4 ; increment shift vector

	vperm v21,v20,v20,v17 ; v21[n]: shift for storage
	vsldoi v17,v17,v17,12 ; increment shift vector
	stvewx v21,0,r8

	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr

_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching
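; (only v0-v9 are needed here, which is why the vrsave mask below is
; 0xffc00000 rather than the 0xfffffc00 used for the general version)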

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xffc00000)
	ori r31,r31,lo16(0xffc00000)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L2400

	; load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31 ; r31: data%16 (byte offset into the quadword)
	li r11,16
	subf r31,r31,r11 ; r31: 16-(data%16) (bytes to the next quadword boundary)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0 ; v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11 ; v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	addis r31,0,hi16(L2301)
	ori r31,r31,lo16(L2301)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	addis r31,0,hi16(L2300)
	ori r31,r31,lo16(L2300)

L2199:
	mtctr r31

	; set up invariant vectors
	vspltish v4,0 ; v4: zero vector

	li r10,-12
	lvsr v5,r10,r8 ; v5: result shift vector
	lvsl v6,r10,r3 ; v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9 ; v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2 ; v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4 ; v8[3]: sum
	vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization

	lvewx v9,0,r3 ; v9[n]: *residual
	vperm v9,v9,v9,v6 ; v9[3]: *residual
	vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 ; increment shift vector

	vperm v9,v8,v8,v5 ; v9[n]: shift for storage
	vsldoi v5,v5,v5,12 ; increment shift vector
	stvewx v9,0,r8

	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr
