/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/ghash-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
/* Do not modify. This file is auto-generated from ghash-x86_64.pl. */
.text


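/*
 * gcm_gmult_4bit: one GHASH multiplication, Xi <- Xi * H in GF(2^128),
 * using 4-bit table lookups and the .Lrem_4bit reduction table.
 * Registers follow the usual OpenSSL prototype
 * void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]):
 * %rdi = Xi, %rsi = Htable.
 */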
.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,@function
.align	16
gcm_gmult_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
.Lgmult_prologue:

	movzbq	15(%rdi),%r8
	leaq	.Lrem_4bit(%rip),%r11
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al
	movq	$14,%rcx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	movq	%r8,%rdx
	jmp	.Loop1

.align	16
.Loop1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	.Lbreak1

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	.Loop1

.align	16
.Lbreak1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	movq	16(%rsp),%rbx
	leaq	24(%rsp),%rsp
.Lgmult_epilogue:
	.byte	0xf3,0xc3
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
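/*
 * gcm_ghash_4bit: GHASH over a whole buffer, per the usual OpenSSL prototype
 * void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len):
 * %rdi = Xi, %rsi = Htable, %rdx = inp, %rcx = len (a multiple of 16).
 * Htable is rescheduled onto the stack and the .Lrem_8bit table is used
 * for reduction.
 */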
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,@function
.align	16
gcm_ghash_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
.Lghash_prologue:
	movq	%rdx,%r14
	movq	%rcx,%r15
	subq	$-128,%rsi
	leaq	16+128(%rsp),%rbp
	xorl	%edx,%edx
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	addq	$-128,%rsi
	movq	8(%rdi),%r8
	movq	0(%rdi),%r9
	addq	%r14,%r15
	leaq	.Lrem_8bit(%rip),%r11
	jmp	.Louter_loop
.align	16
.Louter_loop:
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	8(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	0(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	cmpq	%r15,%r14
	jb	.Louter_loop
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280(%rsp),%rsi
	movq	0(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lghash_epilogue:
	.byte	0xf3,0xc3
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
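/*
 * gcm_init_clmul: build the PCLMULQDQ key schedule, per the usual OpenSSL
 * prototype void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]):
 * %rdi = Htable, %rsi = H. Stores successive powers of H together with
 * pre-xored halves used by the Karatsuba-style multiplications below.
 */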
.globl	gcm_init_clmul
.type	gcm_init_clmul,@function
.align	16
gcm_init_clmul:
.L_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2


	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2


	pand	.L0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2


	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm2,%xmm0
	pxor	%xmm2,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm2,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm2,%xmm3
	movdqu	%xmm2,0(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,16(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,32(%rdi)
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm5,%xmm3
	movdqu	%xmm5,48(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,64(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,80(%rdi)
	.byte	0xf3,0xc3
.size	gcm_init_clmul,.-gcm_init_clmul
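/*
 * gcm_gmult_clmul: one GHASH multiplication, Xi <- Xi * H, via PCLMULQDQ.
 * %rdi = Xi, %rsi = Htable (as produced by gcm_init_clmul).
 */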
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,@function
.align	16
gcm_gmult_clmul:
.L_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	.Lbswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
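/*
 * gcm_ghash_clmul: PCLMULQDQ-based GHASH over a buffer; %rdi = Xi,
 * %rsi = Htable, %rdx = inp, %rcx = len. Aggregates up to four blocks
 * per reduction (.Lmod4_loop) when the OPENSSL_ia32cap_P check permits,
 * otherwise falls back to the two-block/one-block tail paths.
 */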
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,@function
.align	32
gcm_ghash_clmul:
.L_ghash_clmul:
	movdqa	.Lbswap_mask(%rip),%xmm10

	movdqu	(%rdi),%xmm0
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm7
.byte	102,65,15,56,0,194

	subq	$0x10,%rcx
	jz	.Lodd_tail

	movdqu	16(%rsi),%xmm6
	movl	OPENSSL_ia32cap_P+4(%rip),%eax
	cmpq	$0x30,%rcx
	jb	.Lskip4x

	andl	$71303168,%eax
	cmpl	$4194304,%eax
	je	.Lskip4x

	subq	$0x30,%rcx
	movq	$0xA040608020C0E000,%rax
	movdqu	48(%rsi),%xmm14
	movdqu	64(%rsi),%xmm15




	movdqu	48(%rdx),%xmm3
	movdqu	32(%rdx),%xmm11
.byte	102,65,15,56,0,218
.byte	102,69,15,56,0,218
	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm11,%xmm12
.byte	102,68,15,58,68,222,0
.byte	102,68,15,58,68,238,17
.byte	102,68,15,58,68,231,16
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
	xorps	%xmm12,%xmm4

	movdqu	16(%rdx),%xmm11
	movdqu	0(%rdx),%xmm8
.byte	102,69,15,56,0,218
.byte	102,69,15,56,0,194
	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm8,%xmm0
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8
.byte	102,69,15,58,68,238,17
.byte	102,68,15,58,68,231,0
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
.byte	102,65,15,58,68,199,0
	xorps	%xmm12,%xmm4
	movdqu	48(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,65,15,58,68,207,17
	xorps	%xmm3,%xmm0
	movdqu	32(%rdx),%xmm3
	movdqa	%xmm11,%xmm13
.byte	102,68,15,58,68,199,16
	pshufd	$78,%xmm11,%xmm12
	xorps	%xmm5,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,65,15,56,0,218
	movups	32(%rsi),%xmm7
	xorps	%xmm4,%xmm8
.byte	102,68,15,58,68,218,0
	pshufd	$78,%xmm3,%xmm4

	pxor	%xmm0,%xmm8
	movdqa	%xmm3,%xmm5
	pxor	%xmm1,%xmm8
	pxor	%xmm3,%xmm4
	movdqa	%xmm8,%xmm9
.byte	102,68,15,58,68,234,17
	pslldq	$8,%xmm8
	psrldq	$8,%xmm9
	pxor	%xmm8,%xmm0
	movdqa	.L7_mask(%rip),%xmm8
	pxor	%xmm9,%xmm1
.byte	102,76,15,110,200

	pand	%xmm0,%xmm8
.byte	102,69,15,56,0,200
	pxor	%xmm0,%xmm9
.byte	102,68,15,58,68,231,0
	psllq	$57,%xmm9
	movdqa	%xmm9,%xmm8
	pslldq	$8,%xmm9
.byte	102,15,58,68,222,0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	movdqu	0(%rdx),%xmm8

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,238,17
	xorps	%xmm11,%xmm3
	movdqu	16(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,15,58,68,231,16
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
.byte	102,69,15,56,0,194
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0

	movdqa	%xmm11,%xmm13
	pxor	%xmm12,%xmm4
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
.byte	102,69,15,58,68,238,17
	xorps	%xmm11,%xmm3
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8

.byte	102,68,15,58,68,231,0
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jnc	.Lmod4_loop

.Ltail4x:
.byte	102,65,15,58,68,199,0
.byte	102,65,15,58,68,207,17
.byte	102,68,15,58,68,199,16
	xorps	%xmm12,%xmm4
	xorps	%xmm3,%xmm0
	xorps	%xmm5,%xmm1
	pxor	%xmm0,%xmm1
	pxor	%xmm4,%xmm8

	pxor	%xmm1,%xmm8
	pxor	%xmm0,%xmm1

	movdqa	%xmm8,%xmm9
	psrldq	$8,%xmm8
	pslldq	$8,%xmm9
	pxor	%xmm8,%xmm1
	pxor	%xmm9,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	addq	$0x40,%rcx
	jz	.Ldone
	movdqu	32(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Lodd_tail
.Lskip4x:




	movdqu	(%rdx),%xmm8
	movdqu	16(%rdx),%xmm3
.byte	102,69,15,56,0,194
.byte	102,65,15,56,0,218
	pxor	%xmm8,%xmm0

	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	leaq	32(%rdx),%rdx
	nop
	subq	$0x20,%rcx
	jbe	.Leven_tail
	nop
	jmp	.Lmod_loop

.align	32
.Lmod_loop:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	movdqu	(%rdx),%xmm9
	pxor	%xmm0,%xmm8
.byte	102,69,15,56,0,202
	movdqu	16(%rdx),%xmm3

	pxor	%xmm1,%xmm8
	pxor	%xmm9,%xmm1
	pxor	%xmm8,%xmm4
.byte	102,65,15,56,0,218
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm3,%xmm5

	movdqa	%xmm0,%xmm9
	movdqa	%xmm0,%xmm8
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm8
.byte	102,15,58,68,218,0
	psllq	$1,%xmm0
	pxor	%xmm8,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm8
	pslldq	$8,%xmm0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pshufd	$78,%xmm5,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm5,%xmm4

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,234,17
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0
	pxor	%xmm9,%xmm0
	leaq	32(%rdx),%rdx
	psrlq	$1,%xmm0
.byte	102,15,58,68,231,0
	pxor	%xmm1,%xmm0

	subq	$0x20,%rcx
	ja	.Lmod_loop

.Leven_tail:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm1,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	testq	%rcx,%rcx
	jnz	.Ldone

.Lodd_tail:
	movdqu	(%rdx),%xmm8
.byte	102,69,15,56,0,194
	pxor	%xmm8,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,223,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.Ldone:
.byte	102,65,15,56,0,194
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
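/*
 * gcm_init_avx: AVX variant of the key schedule; %rdi = Htable, %rsi = H.
 * Precomputes a larger run of powers of H (the .Linit_loop_avx loop below)
 * so gcm_ghash_avx can aggregate eight blocks per reduction.
 */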
.globl	gcm_init_avx
.type	gcm_init_avx,@function
.align	32
gcm_init_avx:
	vzeroupper

	vmovdqu	(%rsi),%xmm2
	vpshufd	$78,%xmm2,%xmm2


	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2


	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	.Linit_start_avx
.align	32
.Linit_loop_avx:
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
.Linit_start_avx:
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	.Linit_loop_avx

	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	.byte	0xf3,0xc3
.size	gcm_init_avx,.-gcm_init_avx
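/* gcm_gmult_avx simply reuses the PCLMULQDQ single-block path above. */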
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,@function
.align	32
gcm_gmult_avx:
	jmp	.L_gmult_clmul
.size	gcm_gmult_avx,.-gcm_gmult_avx
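/*
 * gcm_ghash_avx: AVX/PCLMULQDQ GHASH; %rdi = Xi, %rsi = Htable (from
 * gcm_init_avx), %rdx = inp, %rcx = len. Processes 128 bytes (eight blocks)
 * per .Loop8x_avx iteration, with shorter inputs handled by
 * .Lshort_avx/.Ltail_avx.
 */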
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,@function
.align	32
gcm_ghash_avx:
	vzeroupper

	vmovdqu	(%rdi),%xmm10
	leaq	.L0x1c2_polynomial(%rip),%r10
	leaq	64(%rsi),%rsi
	vmovdqu	.Lbswap_mask(%rip),%xmm13
	vpshufb	%xmm13,%xmm10,%xmm10
	cmpq	$0x80,%rcx
	jb	.Lshort_avx
	subq	$0x80,%rcx

	vmovdqu	112(%rdx),%xmm14
	vmovdqu	0-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vmovdqu	32-64(%rsi),%xmm7

	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm14,%xmm9,%xmm9
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	80(%rdx),%xmm14
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	48-64(%rsi),%xmm6
	vpxor	%xmm14,%xmm9,%xmm9
	vmovdqu	64(%rdx),%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7

	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	48(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	32(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	16(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2

	leaq	128(%rdx),%rdx
	cmpq	$0x80,%rcx
	jb	.Ltail_avx

	vpxor	%xmm10,%xmm15,%xmm15
	subq	$0x80,%rcx
	jmp	.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	.Loop8x_avx

	addq	$0x80,%rcx
	jmp	.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	.Ltail_avx

.align	32
.Ltail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	vmovdqu	(%r10),%xmm12

	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	cmpq	$0,%rcx
	jne	.Lshort_avx

	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	.byte	0xf3,0xc3
.size	gcm_ghash_avx,.-gcm_ghash_avx
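/*
 * Constant data: the byte-swap mask, the GHASH reduction polynomial
 * constant (.L0x1c2_polynomial), helper masks for the 4x CLMUL path,
 * and the .Lrem_4bit/.Lrem_8bit remainder tables used by the
 * table-driven routines above.
 */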
.align	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long	7,0,7,0
.L7_mask_poly:
.long	7,0,450,0
.align	64
.type	.Lrem_4bit,@object
.Lrem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
.type	.Lrem_8bit,@object
.Lrem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64