1// $Id: WKdmCompress.intel.s,v 1.1 2010/01/28 22:33:24 cclee Exp cclee $
2//
3// This file contains i386 and x86_64 (no SSE) optimized implementation of WKdm Compressor. The function prototype is
4//
5// unsigned int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, unsigned int num_input_words);
6//
7// The implementation assumes the input buffer is a memory page (4096 bytes or 1024 words), or something less than 4KB.
8//
9// WKdm Compression algorithm is briefly stated as follows:
10//
11// There is a dynamically updated dictionary of 16 words, each initialized with "1".
12//
13// the dictionary is indexed as follows,
14//	0, x = input_word
15//  1, hash_index = (x>>10)&255
16//  2, dict_location = &dictionary[hash_index]
17//  3, dict_word = *dict_location
18//
19// Sequentially for each input word, it is classified/tagged into 4 classes
20//	0 : if the input word is 0
21//  1 : the higher 22 bits of the input word is identically to the higher bits from the dictionary (hash table indexed)
22//  2 : the above condition (partially 22 higher bits matched) is not met, a dictionary miss condition
23//  3 : the input word is exactly matched to the word from the dictionary (hash table index)
24//
25// after each input word is classified, each tag is represented by 2 bits. Furthermore, for each class
26//	0 : no further info is needed
27//  1 : the hash_index is represented by 4-bits (8 packed into a word),
28//		the lower 10-bits is sent to the decompressor (3 packed into a word)
29//  2 : the 32-bit word is sent to the decompressor
30//  3 : the hash_index is represented by 4-bits (8 packed into a word)
31//
32// for classes 1 and 2, the input word is used to update the dictionary after it is classified/tagged
33//
34// the following implementation was started from compiling (gcc -O3) the original C code (WKdmCompress.c)
35// and then subsequentially improved and documented.
36// For i386, it speeds up ~ 1.5 times
37// For x86_64, it speeds up ~ 1.3 times
38//
39// cclee, 1/28/10
40
41#if !(defined __i386__ || defined __x86_64__)
42
43typedef char DummyDefinition;
44
45#else		// i386 or x86_64 architectures
46
47#if defined	__i386__			// 32-bit implementation
48
49	.text
50	.align 4,0x90
51
52.globl _WKdm_compress
53_WKdm_compress:
54
55	pushl	%ebp
56	movl	%esp, %ebp
57
58	pushl	%edi
59	pushl	%esi
60	pushl	%ebx
61
62	// allocate stack memory for local variables
63
64	subl	$6316, %esp
65
66	leal	_hashLookupTable, %ebx			        // hashTable
67
68	movl	8(%ebp), %edx					// %edx = src_buf
69	movl	12(%ebp), %esi					// %esi = dest_buf
70	movl	16(%ebp), %eax					// %eax = num_input_words
71
72	leal	-1112(%ebp), %ecx				// tempTagsArray
73	movl	%ecx, -6272(%ebp)				// a copy of char* next_tag = (char *) tempTagsArray;
74
75	leal	-2136(%ebp), %ecx				// tempQPosArray
76	movl	%ecx, -6264(%ebp)				// char* next_qp = (char *) tempQPosArray;
77	movl	%ecx, -6252(%ebp)
78
79	leal	(%edx,%eax,4), %ecx				// src_buf + num_input_words*4
80	movl	%ecx, -6244(%ebp)				// end_of_input = src_buf + num_input_words;
81
82	// PRELOAD_DICTIONARY;
83	movl	$1, -88(%ebp)
84	movl	$1, -84(%ebp)
85	movl	$1, -80(%ebp)
86	movl	$1, -76(%ebp)
87	movl	$1, -72(%ebp)
88	movl	$1, -68(%ebp)
89	movl	$1, -64(%ebp)
90	movl	$1, -60(%ebp)
91	movl	$1, -56(%ebp)
92	movl	$1, -52(%ebp)
93	movl	$1, -48(%ebp)
94	movl	$1, -44(%ebp)
95	movl	$1, -40(%ebp)
96	movl	$1, -36(%ebp)
97	movl	$1, -32(%ebp)
98	movl	$1, -28(%ebp)
99
100	shrl	$4, %eax						// (num_input_words / 16)
101	leal	16(%esi,%eax,4), %eax			// dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
102	movl	%eax, -6256(%ebp)				// next_full_patt = dest_buf + TAGS_AREA_OFFSET + (num_input_words / 16);
103
104	leal	-6232(%ebp), %eax				// &tempLowBitsArray[0]
105	movl	%eax, -6260(%ebp)				// save a copy of &tempLowBitsArray[0]
106	movl	%eax, -6248(%ebp)				// save a copy of &tempLowBitsArray[0]
107
108	cmpl	%ecx, %edx						// next_input_word (%edx) vs end_of_input (%ecx)
109	jae		L_done_search					// if (next_input_word >= end_of_input) skip the following search loop
110
111	leal	-1111(%ebp), %esi				// &next_tag[1]
112	leal	-88(%ebp), %ebp					// dictionary
113
114	movl	%edx, %edi						// next_input_word
115
116	#define		next_input_word		%edi
117	#define		dictionary			%ebp
118	#define		next_tag			%esi
119
120	jmp		L5
121
122	.align 4,0x90
123L_RECORD_ZERO:
124	movb	$0, -1(next_tag)				// *next_tag = ZERO;
125L8:
126	addl	$4, next_input_word				// next_input_word++;
127	incl	next_tag						// next_tag++
128	cmpl	next_input_word, 84(%esp)		// end_of_input vs next_input_word
129	jbe		L_done_search					// if (next_input_word>=end_of_input), skip to L_done_search
130L5:
131	movl	(next_input_word), %ecx			// input_word = *next_input_word;
132	movl	%ecx, %eax						// a copy of input_word
133	testl	%ecx, %ecx						// input_word
134	je		L_RECORD_ZERO					// if (input_word==0) RECORD_ZERO
135	shrl	$10, %eax						// input_high_bits = HIGH_BITS(input_word);
136	movl	%eax, (%esp)					// save a copy of input_high_bits;
137	andl	$255, %eax						// 8 bits index to Hash Table
138	movsbl	(%ebx,%eax),%edx				// HASH_TO_DICT_BYTE_OFFSET(input_word)
139	addl	dictionary, %edx				// ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
140	movl	(%edx), %eax					// dict_word = *dict_location;
141	cmpl	%eax, %ecx						// cmp input_word vs dict_word
142	je		L_RECORD_EXACT
143	shrl	$10, %eax						// HIGH_BITS(dict_word)
144	cmpl	%eax, (%esp)					// input_high_bits vs HIGH_BITS(dict_word)
145	je		L_RECORD_PARTIAL				// if (input_high_bits == HIGH_BITS(dict_word)) RECORD_PARTIAL
146
147L_RECORD_MISS:
148	movb	$2, -1(next_tag)				// *next_tag = 2 for miss
149	movl	72(%esp), %eax					// next_full_patt
150	movl	%ecx, (%eax)					// *next_full_patt = input_word;
151	addl	$4, %eax						// next_full_patt++;
152	movl	%eax, 72(%esp)					// save next_full_patt
153	movl	%ecx, (%edx)					// *dict_location = input_word
154	jmp		L8
155
156	.align 4,0x90
157L_RECORD_EXACT:
158	movb	$3, -1(next_tag)				// *next_tag = 3 for exact
159	subl	dictionary, %edx				// dict_location - dictionary
160	sarl	$2, %edx						// divide by 4 for word offset
161	movl	76(%esp), %eax					// next_qp
162	movb	%dl, (%eax)						// *next_qp = word offset (4-bit)
163	incl	%eax							// next_qp++
164	movl	%eax, 76(%esp)					// save next_qp
165	jmp		L8
166
167L_done_search:
168
169	// restore %ebp as normal use (was used as dictionary)
170	movl	%esp, %ebp
171	addl	$6328, %ebp
172
173	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
174	movl	-6256(%ebp), %edi				// next_full_patt
175	subl	12(%ebp), %edi					// next_full_patt - dest_buf
176	movl	%edi, %eax						// next_full_patt - dest_buf
177	sarl	$2, %eax						// in 4-byte words
178	movl	%eax, -6240(%ebp)				// save (next_full_patt - dest_buf) in words
179	movl	12(%ebp), %edx					// dest_buf
180	movl	%eax, 4(%edx)					// dest_buf[1] = next_full_patt - dest_buf
181
182	movl	-6272(%ebp), %ecx				// &tempTagsArray[0]
183	decl	next_tag
184	cmpl	next_tag, %ecx					// next_tag vs &tempTagsArray[0]
185	jae		L13								// if &tempTagsArray[0] >= next_tag, skip the following WK_pack_2bits
186
187	movl	%edx, %ebx						// a copy of dest_buf
188
189	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
190
191	.align 4,0x90
192L_WK_pack_2bits:
193	movl	4(%ecx), %eax					// w1
194	sall	$2, %eax						// w1 << 2
195	movl	8(%ecx), %edx					// w2
196	sall	$4, %edx						// w2 << 4
197	orl		%edx, %eax						// (w1<<2) | (w2<<4)
198	orl		(%ecx), %eax					// (w0) | (w1<<2) | (w2<<4)
199	movl	12(%ecx), %edx					// w3
200	sall	$6, %edx						// (w3<<6)
201	orl		%edx, %eax						// (w0) | (w1<<2) | (w2<<4) | (w3<<6)
202	movl	%eax, 16(%ebx)					// save at *(dest_buf + HEADER_SIZE_IN_WORDS)
203	addl	$16, %ecx						// tempTagsArray += 16;
204	addl	$4, %ebx						// dest_buf += 4;
205	cmpl    %ecx, next_tag					// cmp next_tag vs dest_buf
206	ja		L_WK_pack_2bits					// if (next_tag > dest_buf) repeat L_WK_pack_2bits
207
208	/* Pack the queue positions into the area just after the full words. */
209L13:
210	movl	-6252(%ebp), %eax				// next_qp
211	movl	-6264(%ebp), %ecx				// (char *) tempQPosArray
212	movl	%eax, %esi						// next_qp
213	subl	%ecx, %eax						// num_bytes_to_pack = next_qp - (char *) tempQPosArray;
214	addl	$7, %eax						// num_bytes_to_pack + 7
215	andl	$-8, %eax						// clear lower 3 bits, (num_packed_words<<3)
216	addl	%eax, %ecx						// endQPosArray = tempQPosArray + num_source_words;
217	cmpl	%ecx, %esi						// next_qp vs endQPosArray
218	jae		L16
219	.align 4,0x90
220L30:
221	movb	$0, (%esi)						// *next_qp = 0;
222	incl	%esi							// next_qp++
223	cmpl	%ecx, %esi						// next_qp vs endQPosArray
224	jne		L30								//
225
226L16:
227	movl	-6256(%ebp), %ebx				// next_full_patt
228	cmpl	-6264(%ebp), %ecx				// endQPosArray vs tempQPosArray
229	jbe		L20								// if (endQPosArray<=tempQPosArray) skip L_WK_pack_4bits
230	movl	-6264(%ebp), %edx				// tempQPosArray
231
232
233	// boundary_tmp = WK_pack_4bits(tempQPosArray, endQPosArray, next_full_patt);
234
235	.align 4,0x90
236L21:
237	movl	4(%edx), %eax					// src_next[1]
238	sall	$4, %eax						// (src_next[1] << 4)
239	orl		(%edx), %eax					// temp = src_next[0] | (src_next[1] << 4)
240	movl	%eax, (%ebx)					// dest_next[0] = temp;
241	addl	$4, %ebx						// dest_next++;
242	addl	$8, %edx						// src_next += 2;
243	cmpl	%edx, %ecx						// source_end vs src_next
244	ja		L21								// while (src_next < source_end) repeat the loop
245
246	movl	%ebx, %edi						// boundary_tmp
247
248	subl	12(%ebp), %edi					// boundary_tmp - dest_buf
249	movl	%edi, %eax						// boundary_tmp - dest_buf
250	sarl	$2, %eax						// translate into word offset
251
252	movl	%eax, -6240(%ebp)				// save (next_full_patt - dest_buf) in words
253
254L20:
255	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
256	movl	-6240(%ebp), %ecx				// boundary_tmp - dest_buf
257	movl	12(%ebp), %edx					// dest_buf
258	movl	%ecx, 8(%edx)					// dest_buf[2] = boundary_tmp - dest_buf
259
260	movl	-6260(%ebp), %ecx				// tempLowBitsArray
261	movl	-6248(%ebp), %edx				// next_low_bits
262	subl	%ecx, %edx						// next_low_bits - tempLowBitsArray
263	sarl	$2, %edx						// num_tenbits_to_pack
264
265	subl	$3, %edx						// pre-decrement num_tenbits_to_pack by 3
266	jl		1f								// if num_tenbits_to_pack < 3, skip the following loop
267	.align	4,0x90
2680:
269	movl	4(%ecx), %eax					// w1
270	sall	$10, %eax						// w1<<10
271	movl	8(%ecx), %esi					// w2
272	sall	$20, %esi						// w2<<20
273	orl		%esi, %eax						// (w1<<10) | (w2<<20)
274	orl		(%ecx), %eax					// (w0) | (w1<<10) | (w2<<20)
275	movl	%eax, (%ebx)					// pack w0,w1,w2 into 1 dest_buf word
276	addl	$4, %ebx						// dest_buf++
277	addl	$12, %ecx						// next w0/w1/w2 triplet
278	subl	$3, %edx						// num_tenbits_to_pack-=3
279	jge		0b								// if no less than 3 elements, back to loop head
280
2811:	addl	$3, %edx						// post-increment num_tenbits_to_pack by 3
282	je		3f								// if num_tenbits_to_pack is a multiple of 3, skip the following
283	movl	(%ecx), %eax					// w0
284	subl	$1, %edx						// num_tenbits_to_pack --
285	je		2f								//
286	movl    4(%ecx), %esi					// w1
287	sall	$10, %esi						// w1<<10
288	orl		%esi, %eax
2892:
290	movl	%eax, (%ebx)					// write the final dest_buf word
291	addl	$4, %ebx						// dest_buf++
2923:
293	movl	%ebx, %eax						// boundary_tmp
294	subl	12(%ebp), %eax					// boundary_tmp - dest_buf
295	sarl	$2, %eax						// boundary_tmp - dest_buf in terms of words
296	movl	12(%ebp), %esi					// dest_buf
297	movl	%eax, 12(%esi)					// SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp);
298	sall	$2, %eax						// boundary_tmp - dest_buf in terms of bytes
299	addl	$6316, %esp						// pop out stack memory
300	popl	%ebx
301	popl	%esi
302	popl	%edi
303	leave
304	ret
305
306	.align 4,0x90
307
308L_RECORD_PARTIAL:
309	movb	$1, -1(next_tag)						// *next_tag = 1 for partial matched
310	movl	%edx, %eax								// dict_location
311	subl	dictionary, %eax						// %eax = dict_location - dictionary
312	movl	%ecx, (%edx)							// *dict_location = input_word;
313	sarl	$2, %eax								// offset in 32-bit word
314	movl	76(%esp), %edx							// next_qp
315	movb	%al, (%edx)								// update *next_qp
316	incl	%edx									// next_qp++
317	movl	%edx, 76(%esp)							// save next_qp
318	movl	%ecx, %eax								// a copy of input_word
319	andl	$1023, %eax								// lower 10 bits
320	movl	80(%esp), %edx							// next_low_bits
321	movl	%eax, (%edx)							// EMIT_WORD(next_low_bits,(low_bits_pattern))
322	addl	$4, %edx								// next_low_bits++
323	movl	%edx, 80(%esp)							// save next_low_bits
324	jmp		L8
325
326#endif		// i386 architectures
327
328#if defined __x86_64__			// 64-bit implementation
329	.text
330	.align 4,0x90
331
332.globl _WKdm_compress
333_WKdm_compress:
334	pushq	%rbp
335	movq	%rsp, %rbp
336	pushq	%r15
337	pushq	%r14
338	pushq	%r13
339	pushq	%r12
340	pushq	%rbx
341	subq	$6112, %rsp
342
343	#define	tempTagsArray	-6264(%rbp)
344	#define	tempLowBitsArray	-6272(%rbp)
345	#define	next_tag			%r8
346	#define	next_input_word		%rdi
347	#define	end_of_input		%r13
348	#define	next_full_patt		%rbx
349	#define	dict_location		%rcx
350	#define	next_qp				%r10
351	#define	dictionary			%r11
352	#define	dest_buf			%r12
353	#define	hashTable			%r14
354	#define tempQPosArray		%r15
355	#define	next_low_bits		%rsi
356
357	movq	%rsi, %r12						// dest_buf
358
359	leaq	-1136(%rbp), %rax				// &tempTagsArray[0]
360	movq	%rax, tempTagsArray
361	leaq	1(%rax), next_tag				// next_tag always points to the one following the current tag
362
363	leaq	-2160(%rbp), %r15				// &tempQPosArray[0]
364	movq	%r15, next_qp					// next_qp
365
366	mov		%edx, %eax						// num_input_words
367	leaq	(%rdi,%rax,4), end_of_input		// end_of_input = src_buf + num_input_words
368
369	// PRELOAD_DICTIONARY;
370	movl	$1, -112(%rbp)
371	movl	$1, -108(%rbp)
372	movl	$1, -104(%rbp)
373	movl	$1, -100(%rbp)
374	movl	$1, -96(%rbp)
375	movl	$1, -92(%rbp)
376	movl	$1, -88(%rbp)
377	movl	$1, -84(%rbp)
378	movl	$1, -80(%rbp)
379	movl	$1, -76(%rbp)
380	movl	$1, -72(%rbp)
381	movl	$1, -68(%rbp)
382	movl	$1, -64(%rbp)
383	movl	$1, -60(%rbp)
384	movl	$1, -56(%rbp)
385	movl	$1, -52(%rbp)
386
387	shrl	$4, %edx						// (num_input_words / 16)
388	mov		%edx, %edx						// sign extension into quad word
389	leaq	16(%rsi,%rdx,4), %rbx			// dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
390
391	leaq	-6256(%rbp), %rax				// &tempLowBitsArray[0]
392	movq	%rax, tempLowBitsArray			// save for later reference
393	movq	%rax, next_low_bits				// next_low_bits
394
395	cmpq	end_of_input, next_input_word	// next_input_word vs end_of_input
396	jae		L_done_search					// if (next_input_word>=end_of_input) no work to do in search
397	leaq	-112(%rbp), dictionary			// dictionary
398	leaq	_hashLookupTable(%rip), hashTable	// hash look up table
399	jmp	L5
400
401	.align 4,0x90
402L_RECORD_ZERO:
403	movb	$0, -1(next_tag)						// *next_tag = ZERO;
404L8:
405	addq	$4, next_input_word 					// next_input_word++;
406	incq	next_tag								// next_tag++
407	cmpq	next_input_word, end_of_input 			// end_of_input vs next_input_word
408	jbe		L_done_search
409L5:
410	movl	(next_input_word), %edx					// input_word = *next_input_word;
411	movl	%edx, %r9d								// a copy of input_word
412	testl	%edx, %edx								// input_word
413	je		L_RECORD_ZERO							// if (input_word==0) RECORD_ZERO
414	shrl	$10, %r9d								// input_high_bits = HIGH_BITS(input_word);
415	movzbl	%r9b, %eax								// 8-bit index to the Hash Table
416	movsbq	(hashTable,%rax),%rax					// HASH_TO_DICT_BYTE_OFFSET(input_word)
417	leaq	(dictionary, %rax), dict_location		// ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
418	movl	(dict_location), %eax					// dict_word = *dict_location;
419	cmpl	%eax, %edx								// dict_word vs input_word
420	je		L_RECORD_EXACT							// if identical, RECORD_EXACT
421	shrl	$10, %eax								// HIGH_BITS(dict_word)
422	cmpl	%eax, %r9d								// input_high_bits vs HIGH_BITS(dict_word)
423	je		L_RECORD_PARTIAL						// if identical, RECORD_PARTIAL
424
425L_RECORD_MISS:
426	movb	$2, -1(next_tag)						// *next_tag = 2 for miss
427	movl	%edx, (next_full_patt)					// *next_full_patt = input_word;
428	addq	$4, next_full_patt						// next_full_patt++
429	movl	%edx, (dict_location)					// *dict_location = input_word
430	addq	$4, next_input_word						// next_input_word++
431	incq	next_tag								// next_tag++
432	cmpq	next_input_word, end_of_input			// end_of_input vs next_input_word
433	ja		L5										// if (end_of_input>next_input_word) repeat from L5
434
435L_done_search:
436
437	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
438	//movq	next_full_patt, %r11					// next_full_patt
439	movq	next_full_patt, %rax					// next_full_patt
440	subq	dest_buf, %rax							// next_full_patt - dest_buf
441	sarq	$2, %rax								// offset in 4-bytes
442	movl	%eax, %r13d								// r13d = (next_full_patt - dest_buf)
443	movl	%eax, 4(dest_buf)						// dest_buf[1] = next_full_patt - dest_buf
444
445	decq	next_tag
446	cmpq	next_tag, tempTagsArray					// &tempTagsArray[0] vs next_tag
447	jae		L13										// if (&tempTagsArray[0] >= next_tag), skip the following
448
449	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
450
451	movq	dest_buf, %rdi							// dest_buf
452	movq	tempTagsArray, %rcx						// &tempTagsArray[0]
453
454	.align 4,0x90
455L_pack_2bits:
456	movl	4(%rcx), %eax							// w1
457	sall	$2, %eax								// w1 << 2
458	movl	8(%rcx), %edx							// w2
459	sall	$4, %edx								// w2 << 4
460	orl		%edx, %eax								// (w1<<2) | (w2<<4)
461	orl		(%rcx), %eax							// (w0) | (w1<<2) | (w2<<4)
462	movl	12(%rcx), %edx							// w3
463	sall	$6, %edx								// w3 << 6
464	orl		%edx, %eax								// (w0) | (w1<<2) | (w2<<4) | (w3<<6)
465	movl	%eax, 16(%rdi)							// save at *(dest_buf + HEADER_SIZE_IN_WORDS)
466	addq	$16, %rcx								// tempTagsArray += 16;
467	addq	$4, %rdi								// dest_buf += 4;
468	cmpq	%rcx, next_tag							// cmp next_tag vs dest_buf
469	ja		L_pack_2bits							// if (next_tag > dest_buf) repeat L_pack_2bits
470
471	/* Pack the queue positions into the area just after the full words. */
472
473L13:
474	movl	%r10d, %eax								// next_qp
475	subl	%r15d, %eax								// num_bytes_to_pack = next_qp - (char *) tempQPosArray;
476	addl	$7, %eax								// num_bytes_to_pack+7
477	shrl	$3, %eax								// num_packed_words = (num_bytes_to_pack + 7) >> 3
478	addl	%eax, %eax								// num_source_words = num_packed_words * 2;
479	mov		%eax, %eax
480	leaq	(tempQPosArray,%rax,4), %rcx			// endQPosArray = tempQPosArray + num_source_words
481	cmpq	%rcx, %r10								// next_qp vs endQPosArray
482	jae		L16										// if (next_qp >= endQPosArray) skip the following zero paddings
483	.align 4,0x90
484L30:
485	movb	$0, (next_qp)							// *next_qp = 0
486	incq	next_qp									// next_qp++
487	cmpq	%rcx, next_qp							// next_qp vs endQPosArray
488	jne		L30										// repeat while next_qp < endQPosArray
489L16:
490	movq	%rbx, %rdi								// next_full_patt
491	cmpq	tempQPosArray, %rcx						// endQPosArray vs tempQPosArray
492	jbe		L20										// if (endQPosArray <= tempQPosArray) skip the following
493	movq	tempQPosArray, %rdx						// tempQPosArray
494
495	.align 4,0x90
496L_pack_4bits:
497	movl	4(%rdx), %eax							// src_next[1]
498	sall	$4, %eax								// (src_next[1] << 4)
499	orl		(%rdx), %eax							// temp = src_next[0] | (src_next[1] << 4)
500	movl	%eax, (%rdi)							// dest_next[0] = temp;
501	addq	$4, %rdi								// dest_next++;
502	addq	$8, %rdx								// src_next += 2;
503	cmpq	%rdx, %rcx								// source_end vs src_next
504	ja		L_pack_4bits							// while (src_next < source_end) repeat the loop
505
506	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
507	//movq	%rdi, %r11								// boundary_tmp
508	movq	%rdi, %rax								// boundary_tmp
509	subq	dest_buf, %rax							// boundary_tmp - dest_buf
510	movq	%rax, %r13								// boundary_tmp - dest_buf
511	shrq	$2, %r13								// boundary_tmp - dest_buf in words
512L20:
513	movl	%r13d, 8(dest_buf)						// dest_buf[2] = boundary_tmp - dest_buf
514
515	movq	tempLowBitsArray, %rcx					// tempLowBitsArray
516	movq	next_low_bits, %rbx						// next_low_bits
517	subq	%rcx, %rbx								// next_low_bits - tempLowBitsArray (in bytes)
518	sarq	$2, %rbx								// num_tenbits_to_pack (in words)
519
520	#define	size	%ebx
521
522	subl	$3, size								// pre-decrement num_tenbits_to_pack by 3
523	jl		1f										// if num_tenbits_to_pack < 3, skip the following loop
524
525	.align	4,0x90
5260:
527	movl	4(%rcx), %eax							// w1
528	sall	$10, %eax								// w1 << 10
529	movl	8(%rcx), %edx							// w2
530	sall	$20, %edx								// w2 << 20
531	orl		%edx, %eax								// (w1<<10) | (w2<<20)
532	orl		(%rcx), %eax							// (w0) | (w1<<10) | (w2<<20)
533	movl	%eax, (%rdi)							// pack w0,w1,w2 into 1 dest_buf word
534	addq	$4, %rdi								// dest_buf++
535	addq	$12, %rcx								// next w0/w1/w2 triplet
536	subl	$3, size								// num_tenbits_to_pack-=3
537	jge		0b										// if no less than 3 elements, back to loop head
538
5391: 	addl	$3, size								// post-increment num_tenbits_to_pack by 3
540	je		3f										// if num_tenbits_to_pack is a multiple of 3, skip the following
541	movl	(%rcx), %eax							// w0
542	subl	$1, size								// num_tenbits_to_pack--
543	je		2f										//
544	movl	4(%rcx), %edx							// w1
545	sall	$10, %edx								// w1 << 10
546	orl		%edx, %eax								// w0 | (w1<<10)
547
5482:	movl	%eax, (%rdi)							// write the final dest_buf word
549	addq	$4, %rdi								// dest_buf++
550
5513:	movq	%rdi, %rax								// boundary_tmp
552	subq	dest_buf, %rax							// boundary_tmp - dest_buf
553	shrq	$2, %rax								// boundary_tmp - dest_buf in terms of words
554	movl	%eax, 12(dest_buf)						// SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
555	shlq	$2, %rax								// boundary_tmp - dest_buf in terms of bytes
556
557	// restore registers and return
558	addq	$6112, %rsp
559	popq	%rbx
560	popq	%r12
561	popq	%r13
562	popq	%r14
563	popq	%r15
564	leave
565	ret
566
567	.align 4,0x90
568L_RECORD_EXACT:
569	movb	$3, -1(next_tag)					// *next_tag = 3 for exact
570	subq	dictionary, %rcx					// dict_location - dictionary
571	sarq	$2, %rcx							// divide by 4 for word offset
572	movb	%cl, (next_qp)						// *next_qp = word offset (4-bit)
573	incq	next_qp								// next_qp++
574	jmp		L8
575
576	.align 4,0x90
577L_RECORD_PARTIAL:
578	movb	$1, -1(next_tag)					// *next_tag = 1 for partial matched
579	movq	%rcx, %rax							// dict_location
580	subq	dictionary, %rax					// dict_location - dictionary
581	movl	%edx, (%rcx)						// *dict_location = input_word;
582	sarq	$2, %rax							// offset in 32-bit word
583	movb	%al, (next_qp)						// update *next_qp
584	incq	next_qp								// next_qp++
585	andl	$1023, %edx							// lower 10 bits
586	movl	%edx, (next_low_bits)				// save next_low_bits
587	addq	$4, next_low_bits					// next_low_bits++
588	jmp	L8
589
590	// for some reason, keeping the following never executed code yields a better performance
591L41:
592	leaq	-6256(%rbp), %rax
593	movq	%rax, -6272(%rbp)
594	movq	%rax, %rsi
595	jmp		L_done_search
596#endif		// x86_64 architectures
597#endif		// i386 or x86_64 architectures
598