stubRoutines_x86.cpp revision 9364:cd86b5699825
1/*
2 * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "runtime/deoptimization.hpp"
27#include "runtime/frame.inline.hpp"
28#include "runtime/stubRoutines.hpp"
29#include "runtime/thread.inline.hpp"
30#include "crc32c.h"
31
32// Implementation of the platform-specific part of StubRoutines - for
33// a description of how to extend it, see the stubRoutines.hpp file.
34
35address StubRoutines::x86::_verify_mxcsr_entry = NULL;
36address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
37address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
38address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
39
40uint64_t StubRoutines::x86::_crc_by128_masks[] =
41{
42  /* The fields in this structure are arranged so that they can be
43   * picked up two at a time with 128-bit loads.
44   *
45   * Because of flipped bit order for this CRC polynomials
46   * the constant for X**N is left-shifted by 1.  This is because
47   * a 64 x 64 polynomial multiply produces a 127-bit result
48   * but the highest term is always aligned to bit 0 in the container.
49   * Pre-shifting by one fixes this, at the cost of potentially making
50   * the 32-bit constant no longer fit in a 32-bit container (thus the
51   * use of uint64_t, though this is also the size used by the carry-
52   * less multiply instruction.
53   *
54   * In addition, the flipped bit order and highest-term-at-least-bit
55   * multiply changes the constants used.  The 96-bit result will be
56   * aligned to the high-term end of the target 128-bit container,
57   * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
58   * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
59   *
60   * This cause additional problems in the 128-to-64-bit reduction; see the
61   * code for details.  By storing a mask in the otherwise unused half of
62   * a 128-bit constant, bits can be cleared before multiplication without
63   * storing and reloading.  Note that staying on a 128-bit datapath means
64   * that some data is uselessly stored and some unused data is intersected
65   * with an irrelevant constant.
66   */
67
68  ((uint64_t) 0xffffffffUL),     /* low  of K_M_64    */
69  ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64    */
70  ((uint64_t) 0xba8ccbe8U << 1), /* low  of K_160_96  */
71  ((uint64_t) 0x6655004fU << 1), /* high of K_160_96  */
72  ((uint64_t) 0xaa2215eaU << 1), /* low  of K_544_480 */
73  ((uint64_t) 0xe3720acbU << 1)  /* high of K_544_480 */
74};
75
76/**
77 *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
78 */
79juint StubRoutines::x86::_crc_table[] =
80{
81    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
82    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
83    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
84    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
85    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
86    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
87    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
88    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
89    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
90    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
91    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
92    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
93    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
94    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
95    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
96    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
97    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
98    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
99    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
100    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
101    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
102    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
103    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
104    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
105    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
106    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
107    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
108    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
109    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
110    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
111    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
112    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
113    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
114    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
115    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
116    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
117    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
118    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
119    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
120    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
121    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
122    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
123    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
124    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
125    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
126    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
127    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
128    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
129    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
130    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
131    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
132    0x2d02ef8dUL
133};
134
135#define D 32
136#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)
137
138#define TILL_CYCLE 31
139uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]
140
141// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
142// Listing 1: Multiplication of normalized polynomials
143// "a" and "b" occupy D least significant bits.
144uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
145  uint32_t product = 0;
146  uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
147  b_pow_x_table[0] = b;
148  for (int k = 0; k < D; ++k) {
149    // If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result.
150    if ((a & (((uint32_t)1) << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];
151
152    // Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P.
153    if (b_pow_x_table[k] & 1) {
154      // If degree of (b_pow_x_table[k] * x) is D, then
155      // degree of (b_pow_x_table[k] * x - P) is less than D.
156      b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
157    }
158    else {
159      b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
160    }
161  }
162  return product;
163}
164#undef D
165#undef P
166
167// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
168void crc32c_init_pow_2k(void) {
169  // _crc32c_pow_2k_table(0) =
170  // x^(2^k) mod P(x) = x mod P(x) = x
171  // Since we are operating on a reflected values
172  // x = 10b, reflect(x) = 0x40000000
173  _crc32c_pow_2k_table[0] = 0x40000000;
174
175  for (int k = 1; k < TILL_CYCLE; k++) {
176    // _crc32c_pow_2k_table(k+1) = _crc32c_pow_2k_table(k-1)^2 mod P(x)
177    uint32_t tmp = _crc32c_pow_2k_table[k - 1];
178    _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
179  }
180}
181
182// x^N mod P(x)
183uint32_t crc32c_f_pow_n(uint32_t n) {
184  //            result = 1 (polynomial)
185  uint32_t one, result = 0x80000000, i = 0;
186
187  while (one = (n & 1), (n == 1 || n - one > 0)) {
188    if (one) {
189      result = crc32c_multiply(result, _crc32c_pow_2k_table[i]);
190    }
191    n >>= 1;
192    i++;
193  }
194
195  return result;
196}
197
198juint *StubRoutines::x86::_crc32c_table;
199
200void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {
201
202  static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
203
204  crc32c_init_pow_2k();
205
206  pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8);      // 8N * 8 = 64N
207  pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2);  // 128N
208
209  pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
210  pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);
211
212  pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
213  pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
214            crc32c_f_pow_n(CRC32C_LOW * 8 * 2);
215
216  if (is_pclmulqdq_table_supported) {
217    _crc32c_table = pow_n;
218  } else {
219    static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];
220
221    for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
222      static juint X_CONST = pow_n[j];
223      for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
224      // S. Gueron / Information Processing Letters 112 (2012) 184
225      // Algorithm 3: Generating a carry-less multiplication lookup table.
226      // Input: A 32-bit constant, X_CONST.
227      // Output: A table of 256 entries, each one is a 64-bit quadword,
228      // that can be used for computing "byte" * X_CONST, for a given byte.
229        pclmulqdq_table[j * 256 + i] =
230          ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
231          ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
232          ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
233      }
234    }
235    _crc32c_table = (juint*)pclmulqdq_table;
236  }
237}
238