x86cpuid.pl revision 325335
1#!/usr/bin/env perl
2
# Bootstrap for the generator.
# The directory part of $0 (everything up to and including the last '/' or
# '\') is captured into $dir, and "${dir}perlasm" plus the relative
# "perlasm" are appended to @INC so that x86asm.pl can be found.
# NOTE(review): if $0 has no directory component the match fails and $dir
# takes whatever $1 held before this statement - confirm that is intended.
3$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4push(@INC, "${dir}perlasm", "perlasm");
5require "x86asm.pl";
6
# The first command-line argument is passed through to asm_init together
# with the module name "x86cpuid" (presumably it selects the output
# flavour - see x86asm.pl).
7&asm_init($ARGV[0],"x86cpuid");
8
# $sse2 is set when any argument contains -DOPENSSL_IA32_SSE2; it gates the
# XMM-register wiping code emitted for OPENSSL_wipe_cpu further down.
9for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
10
# OPENSSL_ia32_cpuid: probe the processor with CPUID and return adjusted
# feature flags.
#
# Input:  wparam(0) is used as a pointer; the dword at offset 8 from it is
#         first cleared and later receives EBX of CPUID leaf 7 (sub-leaf 0)
#         when that leaf exists, with bit 5 cleared again if the code
#         reaches the "clear_avx" label.
# Output: EAX = adjusted EDX of CPUID leaf 1, EDX = adjusted ECX of CPUID
#         leaf 1 with the AMD XOP flag merged into bit 11.  Both are 0 when
#         the CPUID instruction is not available.
#
# Register roles inside the function:
#   edi - maximum standard CPUID leaf, later reloaded with wparam(0)
#   ebp - 0 on Intel, non-zero otherwise; later the ECX half of the result
#   esi - scratch / core count, later the EDX half of the result
11&function_begin("OPENSSL_ia32_cpuid");
	# --- Is CPUID available?  Try to toggle EFLAGS bit 21 and check
	# --- whether the change sticks.
12	&xor	("edx","edx");
13	&pushf	();
14	&pop	("eax");
15	&mov	("ecx","eax");
16	&xor	("eax",1<<21);
17	&push	("eax");
18	&popf	();
19	&pushf	();
20	&pop	("eax");
21	&xor	("ecx","eax");		# ecx = flag bits that actually changed
22	&xor	("eax","eax");		# eax = 0: value stored below and CPUID leaf 0
23	&mov	("esi",&wparam(0));
24	&mov	(&DWP(8,"esi"),"eax");	# clear extended feature flags
25	&bt	("ecx",21);
26	&jnc	(&label("nocpuid"));	# bit 21 did not toggle: return eax=edx=0
27	&cpuid	();
28	&mov	("edi","eax");		# max value for standard query level
29
	# --- Vendor check against "GenuineIntel"; each setne contributes 0/1
	# --- and the results are OR-ed into ebp.
30	&xor	("eax","eax");
31	&cmp	("ebx",0x756e6547);	# "Genu"
32	&setne	(&LB("eax"));
33	&mov	("ebp","eax");
34	&cmp	("edx",0x49656e69);	# "ineI"
35	&setne	(&LB("eax"));
36	&or	("ebp","eax");
37	&cmp	("ecx",0x6c65746e);	# "ntel"
38	&setne	(&LB("eax"));
39	&or	("ebp","eax");		# 0 indicates Intel CPU
40	&jz	(&label("intel"));	# ZF comes from the or just above
41
	# --- Vendor check against "AuthenticAMD", same scheme, result in esi.
42	&cmp	("ebx",0x68747541);	# "Auth"
43	&setne	(&LB("eax"));
44	&mov	("esi","eax");
45	&cmp	("edx",0x69746E65);	# "enti"
46	&setne	(&LB("eax"));
47	&or	("esi","eax");
48	&cmp	("ecx",0x444D4163);	# "cAMD"
49	&setne	(&LB("eax"));
50	&or	("esi","eax");		# 0 indicates AMD CPU
51	&jnz	(&label("intel"));	# neither vendor: take the common path at "intel"
52
53	# AMD specific
	# Query the maximum extended leaf; fall back to the common path when
	# leaf 0x80000001 is not available.
54	&mov	("eax",0x80000000);
55	&cpuid	();
56	&cmp	("eax",0x80000001);
57	&jb	(&label("intel"));
58	&mov	("esi","eax");		# esi = maximum extended leaf
59	&mov	("eax",0x80000001);
60	&cpuid	();
61	&or	("ebp","ecx");
	# The mask keeps bit 11 (XOP) and bit 0, which is still set from the
	# failed Intel comparison above, so ebp stays non-zero.
62	&and	("ebp",1<<11|1);	# isolate XOP bit
63	&cmp	("esi",0x80000008);
64	&jb	(&label("intel"));
65
66	&mov	("eax",0x80000008);
67	&cpuid	();
68	&movz	("esi",&LB("ecx"));	# number of cores - 1
69	&inc	("esi");		# number of cores
70
	# Leaf 1: if bit 28 of EDX is set and bits 16..23 of EBX do not exceed
	# the core count, the bit is cleared.
71	&mov	("eax",1);
72	&xor	("ecx","ecx");
73	&cpuid	();
74	&bt	("edx",28);
75	&jnc	(&label("generic"));
76	&shr	("ebx",16);
77	&and	("ebx",0xff);
78	&cmp	("ebx","esi");
79	&ja	(&label("generic"));
80	&and	("edx",0xefffffff);	# clear hyper-threading bit
81	&jmp	(&label("generic"));
82
	# --- Common path for Intel, unknown vendors and AMD parts without
	# --- the required extended leaves.
83&set_label("intel");
84	&cmp	("edi",4);
85	&mov	("esi",-1);		# default when leaf 4 is absent; mov keeps the cmp flags
86	&jb	(&label("nocacheinfo"));
87
88	&mov	("eax",4);
89	&mov	("ecx",0);		# query L1D
90	&cpuid	();
91	&mov	("esi","eax");
92	&shr	("esi",14);
93	&and	("esi",0xfff);		# number of cores -1 per L1D
94
95&set_label("nocacheinfo");
96	&mov	("eax",1);
97	&xor	("ecx","ecx");
98	&cpuid	();
99	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
100	&cmp	("ebp",0);
101	&jne	(&label("notintel"));
102	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
103	&and	(&HB("eax"),15);	# family ID
104	&cmp	(&HB("eax"),15);	# P4?
105	&jne	(&label("notintel"));
106	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
107&set_label("notintel");
108	&bt	("edx",28);		# test hyper-threading bit
109	&jnc	(&label("generic"));
	# Bit 28 is cleared first and only restored when esi (cores-1 per L1D,
	# or -1 if unknown) is non-zero.
110	&and	("edx",0xefffffff);
111	&cmp	("esi",0);
112	&je	(&label("generic"));
113
114	&or	("edx",0x10000000);
	# With bit 28 restored, it is cleared again unless the low byte of
	# EBX>>16 is greater than 1.
115	&shr	("ebx",16);
116	&cmp	(&LB("ebx"),1);
117	&ja	(&label("generic"));
118	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
119
	# --- Merge: at this point ecx:edx hold (adjusted) leaf-1 flags and
	# --- ebp bit 11 holds the AMD XOP flag, if any.
120&set_label("generic");
121	&and	("ebp",1<<11);		# isolate AMD XOP flag
122	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
123	&mov	("esi","edx");		# %ebp:%esi is copy of %ecx:%edx
124	&or	("ebp","ecx");		# merge AMD XOP flag
125
	# Leaf 7 is queried only when the maximum standard leaf is >= 7; edi is
	# reloaded with the output pointer between cmp and jb (mov keeps flags).
126	&cmp	("edi",7);
127	&mov	("edi",&wparam(0));
128	&jb	(&label("no_extended_info"));
129	&mov	("eax",7);
130	&xor	("ecx","ecx");
131	&cpuid	();
132	&mov	(&DWP(8,"edi"),"ebx");	# save extended feature flag
133&set_label("no_extended_info");
134
	# --- If OSXSAVE is reported, read the extended control register
	# --- selected by ecx=0 and inspect bits 1 and 2 of the result:
	# ---   both set    -> keep everything ("done")
	# ---   only bit 1  -> drop the AVX-class flags ("clear_avx")
	# ---   otherwise   -> additionally drop the flags cleared at "clear_xmm"
135	&bt	("ebp",27);		# check OSXSAVE bit
136	&jnc	(&label("clear_avx"));
137	&xor	("ecx","ecx");
138	&data_byte(0x0f,0x01,0xd0);	# xgetbv
139	&and	("eax",6);
140	&cmp	("eax",6);
141	&je	(&label("done"));
142	&cmp	("eax",2);
143	&je	(&label("clear_avx"));
144&set_label("clear_xmm");
145	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
146	&and	("esi",0xfeffffff);	# clear FXSR
147&set_label("clear_avx");
148	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
149	&and	(&DWP(8,"edi"),0xffffffdf);	# clear AVX2
150&set_label("done");
	# Return value: eax = EDX-derived flags, edx = ECX-derived flags.
151	&mov	("eax","esi");
152	&mov	("edx","ebp");
	# The no-CPUID case jumps here directly with eax = edx = 0.
153&set_label("nocpuid");
154&function_end("OPENSSL_ia32_cpuid");
155
# Declare OPENSSL_ia32cap_P through x86asm.pl's external_label before the
# functions below reference it via picmeup (presumably this marks the
# symbol as defined outside this module - see x86asm.pl).
156&external_label("OPENSSL_ia32cap_P");
157
# OPENSSL_rdtsc: return the time-stamp counter in EDX:EAX.
# EDX:EAX is pre-set to 0 and rdtsc is executed only when bit 4 of the
# first dword of OPENSSL_ia32cap_P is set, so the result is 0 otherwise.
# NOTE(review): the second function_begin_B argument looks like an
# external-symbol declaration for one assembler flavour - confirm in
# x86asm.pl.
158&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
159	&xor	("eax","eax");
160	&xor	("edx","edx");
161	&picmeup("ecx","OPENSSL_ia32cap_P");	# ecx = address of the capability vector
162	&bt	(&DWP(0,"ecx"),4);		# test bit 4 of its first dword in memory
163	&jnc	(&label("notsc"));
164	&rdtsc	();
165&set_label("notsc");
166	&ret	();
167&function_end_B("OPENSSL_rdtsc");
168
169# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
170# but it's safe to call it on any [supported] 32-bit platform...
171# Just check for [non-]zero return value...
#
# OPENSSL_instrument_halt: read the time-stamp counter, execute the
# instruction emitted by &halt, read the counter again and return the
# 64-bit difference in EDX:EAX.  EDX:EAX is 0 when any precondition below
# fails: capability bit 4 clear, low two bits of %cs non-zero, or EFLAGS
# bit 9 clear.
172&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
173	&picmeup("ecx","OPENSSL_ia32cap_P");
174	&bt	(&DWP(0,"ecx"),4);
175	&jnc	(&label("nohalt"));	# no TSC
176
177	&data_word(0x9058900e);		# push %cs; pop %eax
178	&and	("eax",3);		# low two bits of the %cs selector
179	&jnz	(&label("nohalt"));	# not enough privileges
180
181	&pushf	();
182	&pop	("eax");
183	&bt	("eax",9);
184	&jnc	(&label("nohalt"));	# interrupts are disabled
185
	# Save the starting counter on the stack (low dword at 0(%esp), high
	# dword at 4(%esp)), halt, then subtract it from the new reading.
186	&rdtsc	();
187	&push	("edx");
188	&push	("eax");
189	&halt	();
190	&rdtsc	();
191
192	&sub	("eax",&DWP(0,"esp"));
193	&sbb	("edx",&DWP(4,"esp"));
194	&add	("esp",8);		# drop the two saved dwords
195	&ret	();
196
197&set_label("nohalt");
198	&xor	("eax","eax");
199	&xor	("edx","edx");
200	&ret	();
201&function_end_B("OPENSSL_instrument_halt");
202
203# Essentially there is only one use for this function. Under DJGPP:
204#
205#	#include <go32.h>
206#	...
207#	i=OPENSSL_far_spin(_dos_ds,0x46c);
208#	...
209# to obtain the number of spins till closest timer interrupt.
#
# OPENSSL_far_spin(selector, offset): the dword at 4(%esp) is loaded into
# %ds and the dword at 8(%esp) is used as an offset.  The function samples
# the dword at that far address and counts loop iterations in EAX until the
# value read there differs from the sample.  %ds is saved and restored
# around the loop.  EAX (and EDX) are 0 when EFLAGS bit 9 is clear.
210
211&function_begin_B("OPENSSL_far_spin");
212	&pushf	();
213	&pop	("eax");
214	&bt	("eax",9);
215	&jnc	(&label("nospin"));	# interrupts are disabled
216
217	&mov	("eax",&DWP(4,"esp"));	# 1st argument: value for %ds
218	&mov	("ecx",&DWP(8,"esp"));	# 2nd argument: offset to watch
219	&data_word (0x90d88e1e);	# push %ds, mov %eax,%ds
220	&xor	("eax","eax");		# spin counter
221	&mov	("edx",&DWP(0,"ecx"));	# initial sample of the watched dword
222	&jmp	(&label("spin"));
223
224	&align	(16);
225&set_label("spin");
226	&inc	("eax");
227	&cmp	("edx",&DWP(0,"ecx"));
228	&je	(&label("spin"));	# keep counting while the value is unchanged
229
230	&data_word (0x1f909090);	# pop	%ds
231	&ret	();
232
233&set_label("nospin");
234	&xor	("eax","eax");
235	&xor	("edx","edx");
236	&ret	();
237&function_end_B("OPENSSL_far_spin");
238
# OPENSSL_wipe_cpu: zero volatile CPU register state and return, in EAX,
# the address 4 bytes above the stack pointer at entry (lea 4(%esp)).
#
# EAX and EDX are zeroed unconditionally.  The first dword of
# OPENSSL_ia32cap_P is loaded into ECX and used as follows:
#   * bit 1 clear           -> skip all FP/XMM wiping ("no_x87");
#   * when built with -DOPENSSL_IA32_SSE2 ($sse2) and bits 26 and 24 are
#     both set              -> xmm0..xmm7 are cleared with pxor;
#   * then eight fldz followed by finit clear the x87/MMX register bank.
#
# Fix: the original emitted "bt DWP(0,ecx),1" AFTER ecx had already been
# replaced by the capability word itself, i.e. it used the flag bits as a
# memory address.  The test is now performed on the register value.
# NOTE(review): the bit index (1) is kept exactly as it was; confirm that
# it is the intended "x87 present" indicator.
239&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
240	&xor	("eax","eax");
241	&xor	("edx","edx");
242	&picmeup("ecx","OPENSSL_ia32cap_P");
243	&mov	("ecx",&DWP(0,"ecx"));	# ecx = capability word (a value, no longer a pointer)
244	&bt	("ecx",1);		# test the loaded value, not memory at [ecx]
245	&jnc	(&label("no_x87"));
246	if ($sse2) {
247		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
248		&cmp	("ecx",1<<26|1<<24);
249		&jne	(&label("no_sse2"));
250		&pxor	("xmm0","xmm0");
251		&pxor	("xmm1","xmm1");
252		&pxor	("xmm2","xmm2");
253		&pxor	("xmm3","xmm3");
254		&pxor	("xmm4","xmm4");
255		&pxor	("xmm5","xmm5");
256		&pxor	("xmm6","xmm6");
257		&pxor	("xmm7","xmm7");
258	&set_label("no_sse2");
259	}
260	# just a bunch of fldz to zap the fp/mm bank followed by finit...
261	&data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
262&set_label("no_x87");
263	&lea	("eax",&DWP(4,"esp"));	# return value: address just above the return address slot
264	&ret	();
265&function_end_B("OPENSSL_wipe_cpu");
266
# OPENSSL_atomic_add(ptr, amount): atomically add the 2nd argument to the
# dword addressed by the 1st argument and return the new value in EAX.
# Implemented as a compare-and-swap retry loop; EBX is saved and restored
# around it.
267&function_begin_B("OPENSSL_atomic_add");
268	&mov	("edx",&DWP(4,"esp"));	# fetch the pointer, 1st arg
269	&mov	("ecx",&DWP(8,"esp"));	# fetch the increment, 2nd arg
270	&push	("ebx");
271	&nop	();
272	&mov	("eax",&DWP(0,"edx"));	# eax = current value (the expected value for cmpxchg)
273&set_label("spin");
274	&lea	("ebx",&DWP(0,"eax","ecx"));	# ebx = eax + ecx, the candidate new value
275	&nop	();
276	&data_word(0x1ab10ff0);	# lock;	cmpxchg	%ebx,(%edx)	# %eax is involved and is always reloaded
277	&jne	(&label("spin"));	# the memory value changed underneath us: retry
278	&mov	("eax","ebx");	# OpenSSL expects the new value
279	&pop	("ebx");
280	&ret	();
281&function_end_B("OPENSSL_atomic_add");
282
283# This function can become handy under Win32 in situations when
284# we don't know which calling convention, __stdcall or __cdecl(*),
285# indirect callee is using. In C it can be deployed as
286#
287#ifdef OPENSSL_CPUID_OBJ
288#	type OPENSSL_indirect_call(void *f,...);
289#	...
290#	OPENSSL_indirect_call(func,[up to $max arguments]);
291#endif
292#
293# (*)	it's designed to work even for __fastcall if number of
294#	arguments is 1 or 2!
#
# Implementation: a frame is set up with EBP, $max dwords are reserved on
# the stack and the $max dwords that follow the function pointer in the
# caller's argument area (12(%ebp) onwards) are copied into them.  The
# first two are also left in ECX and EDX.  The pointer at 8(%ebp) is then
# called and ESP is restored from EBP regardless of how much the callee
# popped.
295&function_begin_B("OPENSSL_indirect_call");
296	{
297	my ($max,$i)=(7,);	# $max has to be chosen as 4*n-1
298				# in order to preserve eventual
299				# stack alignment
300	&push	("ebp");
301	&mov	("ebp","esp");
302	&sub	("esp",$max*4);
	# Arguments 1 and 2 travel through ecx/edx and are also stored on the
	# new stack area.
303	&mov	("ecx",&DWP(12,"ebp"));
304	&mov	(&DWP(0,"esp"),"ecx");
305	&mov	("edx",&DWP(16,"ebp"));
306	&mov	(&DWP(4,"esp"),"edx");
	# Remaining slots 2..$max-1 are copied through eax; this loop runs at
	# generation time and emits one load/store pair per slot.
307	for($i=2;$i<$max;$i++)
308		{
309		# Some copies will be redundant/bogus...
310		&mov	("eax",&DWP(12+$i*4,"ebp"));
311		&mov	(&DWP(0+$i*4,"esp"),"eax");
312		}
313	&call_ptr	(&DWP(8,"ebp"));# make the call...
314	&mov	("esp","ebp");	# ... and just restore the stack pointer
315				# without paying attention to what we called,
316				# (__cdecl *func) or (__stdcall *one).
317	&pop	("ebp");
318	&ret	();
319	}
320&function_end_B("OPENSSL_indirect_call");
321
# OPENSSL_cleanse(ptr, len): store zero bytes over a buffer.
# wparam(0) is loaded into EDX as the write pointer and wparam(1) into ECX
# as the remaining byte count.
#   * count == 0          -> return immediately;
#   * count < 7           -> byte-at-a-time loop ("little");
#   * count >= 7          -> byte stores until EDX is a multiple of 4
#                            ("lot"), then dword stores while at least 4
#                            bytes remain ("aligned"), then any leftover
#                            bytes through "little".
322&function_begin_B("OPENSSL_cleanse");
323	&mov	("edx",&wparam(0));
324	&mov	("ecx",&wparam(1));
325	&xor	("eax","eax");		# eax/al = 0, the value written everywhere below
326	&cmp	("ecx",7);
327	&jae	(&label("lot"));
328	&cmp	("ecx",0);
329	&je	(&label("ret"));
330&set_label("little");
331	&mov	(&BP(0,"edx"),"al");
332	&sub	("ecx",1);
333	&lea	("edx",&DWP(1,"edx"));	# advance with lea so the flags from sub survive
334	&jnz	(&label("little"));
335&set_label("ret");
336	&ret	();
337
	# NOTE(review): the second set_label argument is presumably an
	# alignment request - confirm in x86asm.pl.
338&set_label("lot",16);
339	&test	("edx",3);
340	&jz	(&label("aligned"));
341	&mov	(&BP(0,"edx"),"al");
342	&lea	("ecx",&DWP(-1,"ecx"));
343	&lea	("edx",&DWP(1,"edx"));
344	&jmp	(&label("lot"));
345&set_label("aligned");
346	&mov	(&DWP(0,"edx"),"eax");
347	&lea	("ecx",&DWP(-4,"ecx"));
348	&test	("ecx",-4);		# any bits above the low two set, i.e. >= 4 bytes left?
349	&lea	("edx",&DWP(4,"edx"));	# lea keeps the flags from test for the jnz
350	&jnz	(&label("aligned"));
351	&cmp	("ecx",0);
352	&jne	(&label("little"));	# 1..3 trailing bytes
353	&ret	();
354&function_end_B("OPENSSL_cleanse");
355
# OPENSSL_ia32_rdrand: try rdrand up to 8 times and return the value in EAX.
# The retry loop exits early as soon as rdrand sets the carry flag.
# If EAX is 0 afterwards, the remaining attempt counter in ECX is returned
# instead; that counter is 0 once all 8 attempts have been used, so a zero
# return corresponds to exhausting the retries with EAX still 0.
356&function_begin_B("OPENSSL_ia32_rdrand");
357	&mov	("ecx",8);		# attempt budget, decremented by the loop instruction
358&set_label("loop");
359	&rdrand	("eax");
360	&jc	(&label("break"));	# carry set: leave the retry loop
361	&loop	(&label("loop"));
362&set_label("break");
363	&cmp	("eax",0);
364	&cmove	("eax","ecx");		# substitute ecx when the value is 0
365	&ret	();
366&function_end_B("OPENSSL_ia32_rdrand");
367
# OPENSSL_ia32_rdseed: try rdseed up to 8 times and return the value in EAX.
# The retry loop exits early as soon as rdseed sets the carry flag.
# If EAX is 0 afterwards, the remaining attempt counter in ECX is returned
# instead; that counter is 0 once all 8 attempts have been used, so a zero
# return corresponds to exhausting the retries with EAX still 0.
368&function_begin_B("OPENSSL_ia32_rdseed");
369	&mov	("ecx",8);		# attempt budget, decremented by the loop instruction
370&set_label("loop");
371	&rdseed	("eax");
372	&jc	(&label("break"));	# carry set: leave the retry loop
373	&loop	(&label("loop"));
374&set_label("break");
375	&cmp	("eax",0);
376	&cmove	("eax","ecx");		# substitute ecx when the value is 0
377	&ret	();
378&function_end_B("OPENSSL_ia32_rdseed");
379
# Module trailer.
# initseg is given the name OPENSSL_cpuid_setup (presumably to register it
# as an initialisation routine - see x86asm.pl); that routine is not
# defined in this file.
380&initseg("OPENSSL_cpuid_setup");
381
# Both symbols are passed to x86asm.pl's hidden() (presumably to restrict
# their visibility in the generated object - confirm in x86asm.pl).
382&hidden("OPENSSL_cpuid_setup");
383&hidden("OPENSSL_ia32cap_P");
384
# Finish the module; must stay the last perlasm call in the file.
385&asm_finish();
386