1;
2; Copyright 2009, Christian Packmann.
3; All rights reserved.
4; Distributed under the terms of the MIT License, see
5; http://www.opensource.org/licenses/mit-license.php
6
7; Assembly code for Painter::_DrawBitmapBilinearCopy32() in Painter.cpp
8; This code implements only the inner x-loop, all other processing
9; is done in the C code.
10
11
12; ******  GENERAL NOTES  *****
13
14; The implemented algorithm looks like this:
15; (pixLT * leftWeight  +  pixRT * rightWeight) * topWeight
16;                            +
17; (pixLB * leftWeight  +  pixRB * rightWeight) * bottomWeight
18;
19; with LT = LeftTop, RT = RightTop, LB = LeftBottom, RB = RightBottom
20;
21; For more detailed information, see the C implementation in
22; Painter.cpp
23;
24; Implementation notes:
25; The calculations are performed with 16-bit arithmetic. All values
26; are held in vars/registers as 8-bit values high-shifted by 8 bits;
27; i.e. 255<<8. This works because PMULHUW is used for MULs, and this
28; algorithm limits the variable values appropriately during all steps.
29; This will not work for all algorithms, so take note of that if you
30; want to recycle some of the code.
31
32; Notes on the code itself:
33; I've tried to keep the code small. That's why I'm using memory accesses
34; via index registers as much as possible. This costs execution time due
35; to the generated µops, but should minimize decode bandwidth pressure
36; due to the many MMX instructions.
37; Temporary variables are always stored to the stack instead of global
38; data space for this reason. So far I haven't exceeded 8-byte offsets,
39; so the instructions only need to encode a BYTE-offset instead of a DWORD.
40
41; Notes on code formatting/comments:
42; - integer and vector instructions are indented differently. I find this
43;   helpful when parsing code, especially when I haven't looked at it for a
44;   longer time.
45; - I've tried to comment the code so that it will be understandable and
46;   maintainable in the future, and also by other persons than myself.
47;   The current comments aren't yet fully standardized, I'm still working
48;   on a coherent system for indicating the variables held within a register
49;   which will help in understanding the data flow. Any suggestions
50;   regarding this are welcome.
51; - Abbreviations for datatypes:
52;   B = Byte		  8 bit
53;   W = Word		 16 bit
54;   DW = Doubleword	 32 bit
55;   QW = Quadword	 64 bit
56;	DQ = Doublequad	128 bit
57;	A "p" in front of one of the datatypes signifies that the
58;	variable/register is encoded in packed form; i.e. pW means
59;	"packed Words"; four Words for a MMX register, 8 for a SSE register.
60;	This should help in understanding the logical meaning of the data
61;	transformations.
;	For better readability, the datatype indicator for a register is
;   bracketed with '#'; an MMX register holding two uint32 of value 255
;   would be written as
;   #pD# 255 255
65
66
67
68; ******  Global exports  *****
69
; Exported symbol. Do NOT put a leading '_' on exported names here: where a
; prefix is needed, it is added at assembly time via YASM's --prefix option.
global bilinear_scale_xloop_mmxsse
73
74
75; ********************
76; ******  DATA  ******
77; ********************
SECTION .data

; Marker labels for the start of the constant data; nothing in the code
; visible in this file references them.
DATA_SECTION:
ALIGN 16
DATA_SSSE3:
; Constants that are identical for MMX and SSE code are meant to be shared:
; a constant may be declared wider than a QW, and MMX code then simply
; reads the first 8 bytes at the label.
; The word constants hold an 8-bit value pre-shifted into the high byte
; (value << 8), which is the fixed-point format used with PMULHUW.
c4x16UW_129_LShift8:	dw 129<<8, 129<<8, 129<<8, 129<<8
c4x16UW_255_LShift8:	dw 255<<8, 255<<8, 255<<8, 255<<8
; Four dwords (16 bytes) are emitted here even though the label says 2x32;
; the MMX loop loads only the first 8 bytes of it.
c2x32UD_ff000000:		dd 0xff000000, 0xff000000, 0xff000000, 0xff000000
89
90; Argument definitions
91
; Argument offsets relative to ebp. They assume the prologue has executed
; "push ebp" / "mov ebp, esp", so [ebp] is the saved ebp, [ebp + 4] the
; return address and the first argument sits at ebp + 8. Each argument
; occupies one DWORD slot, hence every offset is the previous one plus 4.
PAR_srcPtr		EQU 8
PAR_dstPtr		EQU PAR_srcPtr + 4
PAR_xWeightPtr	EQU PAR_dstPtr + 4
PAR_xmin		EQU PAR_xWeightPtr + 4
PAR_xmax		EQU PAR_xmin + 4
PAR_wTop		EQU PAR_xmax + 4
PAR_srcBPR		EQU PAR_wTop + 4
100
; Stack storage definitions: byte offsets from esp of the 8-byte (QW)
; scratch slots, written as slot number * 8.
ST_Q_wTop					EQU 0 * 8
ST_Q_wBottom				EQU 1 * 8
ST_Q_c4x16UW_129_LShift8	EQU 2 * 8
ST_Q_c4x16UW_255_LShift8	EQU 3 * 8
; NOTE(review): the function prologue in this file reserves room for slots
; 0-3 only (32 bytes plus 4 bytes of padding). The slots below are not
; referenced by the code visible here; enlarge the frame before using them.
ST_Q_lftWeight_A			EQU 4 * 8
ST_Q_rgtWeight_A			EQU 5 * 8
ST_Q_lftWeight_B			EQU 6 * 8
ST_Q_rgtWeight_B			EQU 7 * 8
110
111
112; ********************
113; ******  CODE  ******
114; ********************
; Use the standard code section name. ".code" is not a name the assembler
; knows; on ELF-style output an unknown section only gets default
; attributes and is not marked executable.
SECTION .text
116
117
; void bilinear_scale_xloop_mmxsse(void* src, void* dst, void* xWeights,
;				uint32 xmin, uint32 xmax, uint16 wTop, uint32 srcBPR )
;
; Inner x-loop of the bilinear scaler. For every x in [xmin, xmax] (both
; inclusive, compared unsigned) one blended 32-bit pixel is stored to dst;
; the first pixel goes to dst itself and dst advances by 4 bytes per pixel.
;
; In (32-bit stack arguments, read relative to ebp; plain "ret", so the
; caller removes them):
;	src			base address of the upper source row
;	dst			address of the first output pixel
;	xWeights	array of 4-byte entries: WORD index at +0 (added to src as
;				a byte offset), WORD left weight at +2 (the right weight
;				is 255 - left weight; presumably 0..255 - confirm in the
;				C caller)
;	xmin, xmax	first and last x to process; nothing is stored when
;				xmin > xmax. xmax must be below 0xffffffff, otherwise the
;				x counter wraps around.
;	wTop		weight of the upper row; only its low byte survives the
;				"<< 8" below. The lower row gets 255 - wTop.
;	srcBPR		byte distance from the upper to the lower source row
; Out:			nothing
; Preserves:	ebx, esi, edi, ebp
; Clobbers:		eax, ecx, edx, mm0-mm7, flags (EMMS is executed on exit)
; Requires:		MMX plus PSHUFW/PMULHUW (SSE integer extensions)
;
; Loop stats (as noted by the original author):
;		34 instructions (6 moves, 5 integer, 23 vector)
; 		12 memory accesses

; Bytes reserved below the saved registers: 4 x QW scratch slots (32), plus
; 4 bytes so that esp is 8-byte aligned again after the three pushes.
ST_FRAME_SIZE	EQU 4 + 32

ALIGN 16
bilinear_scale_xloop_mmxsse:
	push	ebp
	mov		ebp, esp
	and		esp, 0xfffffff8	; align stack to 8-byte boundary
	push	ebx
	push	edi
	push	esi
	sub		esp, ST_FRAME_SIZE
; xmin > xmax (unsigned)? Then there is nothing to do.
	mov		eax, [ebp + PAR_xmin]
	cmp		eax, [ebp + PAR_xmax]
	ja		.exit
; preparations
	; prepare wTop
	mov		eax, [ebp + PAR_wTop]	; #pB#: 0 0 0 top
	shl		eax, 8					; #pB#: 0 0 top 0
	movd		mm0, eax			; #pW# 0 0 0 top
	pshufw		mm0, mm0, 00000000b	; #pW# top top top top
	movq		[esp + ST_Q_wTop], mm0
	; copy the 255<<8 constant to the stack (short esp-relative encoding)
	movq		mm5, [c4x16UW_255_LShift8]
	movq		[esp + ST_Q_c4x16UW_255_LShift8], mm5
	; prepare wBottom
	movq		mm1, mm5	; #pW# 255 255 255 255
	psubw		mm1, mm0	; 255 - wTop = wBottom
	movq		[esp + ST_Q_wBottom], mm1

; load params; leave ebx, ecx as scratch
	mov		eax, [ebp + PAR_xmin]	; loop counter
	mov		edx, [ebp + PAR_xWeightPtr]	; xWeights array
	mov		esi, [ebp + PAR_srcPtr]		; source bitmap
	mov		edi, [ebp + PAR_dstPtr]		; destination bitmap
	movq	mm6, [c4x16UW_129_LShift8]	; reciprocal used for the final scaling
	movq	mm7, [c2x32UD_ff000000]		; mask that sets the top byte of a pixel

; main loop
; Invariants: eax = current x, edx = xWeights, esi = upper source row,
; edi = next output pixel, mm6/mm7 = constants loaded above.
ALIGN 16
.loop:
	; load Left/Right weights into mm0/mm1
	movzx	ebx, WORD [edx + eax*4 + 2] ; xWeights + x*4 + 2-> FilterInfo[x].weight
	shl			ebx, 8		; #pB# 0 0 leftW 0
	pxor		mm2, mm2	; clear before use
	movd		mm0, ebx	; #pW# 0 0 0 leftW
	movq		mm1, [esp + ST_Q_c4x16UW_255_LShift8]
	pshufw		mm0, mm0, 00000000b	; #pW# lW lW lW lW
	psubw		mm1, mm0			; #pW# rW rW rW rW  (255 - leftW)
	movzx	ecx, WORD [edx + eax*4] ; xWeights + x*4 -> FilterInfo[x].index
	pxor		mm3, mm3	; clear before use
	mov		ebx, ecx
	; process top and bottom pixels, interleave instructions to avoid latencies
	pxor		mm4, mm4	; clear before use
	; unpacking into a zeroed register puts each pixel byte into the high
	; byte of a word (value << 8)
	punpcklbw	mm2, [esi + ecx]	; pixLeftTop
	punpcklbw	mm3, [esi + ecx + 4] ; pixRightTop

	; calc offset of the bottom pixels: index + srcBPR
	add		ebx, [ebp + PAR_srcBPR]
	pmulhuw		mm2, mm0	; pixLT * leftWeight
	pmulhuw		mm3, mm1	; pixRT * rightWeight
	pxor		mm5, mm5	; clear before use
	punpcklbw	mm4, [esi + ebx]	; pixLeftBottom
	punpcklbw	mm5, [esi + ebx + 4] ; pixRightBottom
	pmulhuw		mm4, mm0	; pixLB * leftWeight
	pmulhuw		mm5, mm1	; pixRB * rightWeight

	paddw		mm2, mm3	; pixLT + pixRT
	paddw		mm4, mm5	; pixLB + pixRB
	pmulhuw		mm2, [esp + ST_Q_wTop]	; * weightTop
	pmulhuw		mm4, [esp + ST_Q_wBottom]	; * weightBottom

	; add both temp results
	paddw		mm2, mm4
	; divide by 65025 using integer reciprocal: (*129 >> 7)
	pmulhuw		mm2, mm6
	psrlw		mm2, 7
	; pack & store
	packuswb	mm2, mm2
	por			mm2, mm7	; | 0xff000000
	movd		[edi], mm2	; store pixel as DWord
	add		edi, 4
; loopctr <= xmax?
	inc		eax
	cmp		eax, [ebp + PAR_xmax]
	jbe		.loop	; unsigned compare, matching the "ja" entry check
.exit:
	emms	; Don't EVER forget to call EMMS!
	add		esp, ST_FRAME_SIZE	; drop the scratch frame
	pop		esi
	pop		edi
	pop		ebx
	mov		esp, ebp	; undo the 8-byte stack alignment
	pop		ebp
	ret
218