matmul_i8.c revision 1.1.1.1
1/* Implementation of the MATMUL intrinsic 2 Copyright (C) 2002-2019 Free Software Foundation, Inc. 3 Contributed by Paul Brook <paul@nowt.org> 4 5This file is part of the GNU Fortran runtime library (libgfortran). 6 7Libgfortran is free software; you can redistribute it and/or 8modify it under the terms of the GNU General Public 9License as published by the Free Software Foundation; either 10version 3 of the License, or (at your option) any later version. 11 12Libgfortran is distributed in the hope that it will be useful, 13but WITHOUT ANY WARRANTY; without even the implied warranty of 14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15GNU General Public License for more details. 16 17Under Section 7 of GPL version 3, you are granted additional 18permissions described in the GCC Runtime Library Exception, version 193.1, as published by the Free Software Foundation. 20 21You should have received a copy of the GNU General Public License and 22a copy of the GCC Runtime Library Exception along with this program; 23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24<http://www.gnu.org/licenses/>. */ 25 26#include "libgfortran.h" 27#include <string.h> 28#include <assert.h> 29 30 31#if defined (HAVE_GFC_INTEGER_8) 32 33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be 34 passed to us by the front-end, in which case we call it for large 35 matrices. */ 36 37typedef void (*blas_call)(const char *, const char *, const int *, const int *, 38 const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *, 39 const int *, const GFC_INTEGER_8 *, const int *, 40 const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *, 41 int, int); 42 43/* The order of loops is different in the case of plain matrix 44 multiplication C=MATMUL(A,B), and in the frequent special case where 45 the argument A is the temporary result of a TRANSPOSE intrinsic: 46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by 47 looking at their strides. 48 49 The equivalent Fortran pseudo-code is: 50 51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) 52 IF (.NOT.IS_TRANSPOSED(A)) THEN 53 C = 0 54 DO J=1,N 55 DO K=1,COUNT 56 DO I=1,M 57 C(I,J) = C(I,J)+A(I,K)*B(K,J) 58 ELSE 59 DO J=1,N 60 DO I=1,M 61 S = 0 62 DO K=1,COUNT 63 S = S+A(I,K)*B(K,J) 64 C(I,J) = S 65 ENDIF 66*/ 67 68/* If try_blas is set to a nonzero value, then the matmul function will 69 see if there is a way to perform the matrix multiplication by a call 70 to the BLAS gemm function. */ 71 72extern void matmul_i8 (gfc_array_i8 * const restrict retarray, 73 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 74 int blas_limit, blas_call gemm); 75export_proto(matmul_i8); 76 77/* Put exhaustive list of possible architectures here here, ORed together. */ 78 79#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) 80 81#ifdef HAVE_AVX 82static void 83matmul_i8_avx (gfc_array_i8 * const restrict retarray, 84 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 85 int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); 86static void 87matmul_i8_avx (gfc_array_i8 * const restrict retarray, 88 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 89 int blas_limit, blas_call gemm) 90{ 91 const GFC_INTEGER_8 * restrict abase; 92 const GFC_INTEGER_8 * restrict bbase; 93 GFC_INTEGER_8 * restrict dest; 94 95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 96 index_type x, y, n, count, xcount, ycount; 97 98 assert (GFC_DESCRIPTOR_RANK (a) == 2 99 || GFC_DESCRIPTOR_RANK (b) == 2); 100 101/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 102 103 Either A or B (but not both) can be rank 1: 104 105 o One-dimensional argument A is implicitly treated as a row matrix 106 dimensioned [1,count], so xcount=1. 107 108 o One-dimensional argument B is implicitly treated as a column matrix 109 dimensioned [count, 1], so ycount=1. 110*/ 111 112 if (retarray->base_addr == NULL) 113 { 114 if (GFC_DESCRIPTOR_RANK (a) == 1) 115 { 116 GFC_DIMENSION_SET(retarray->dim[0], 0, 117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 118 } 119 else if (GFC_DESCRIPTOR_RANK (b) == 1) 120 { 121 GFC_DIMENSION_SET(retarray->dim[0], 0, 122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 123 } 124 else 125 { 126 GFC_DIMENSION_SET(retarray->dim[0], 0, 127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 128 129 GFC_DIMENSION_SET(retarray->dim[1], 0, 130 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 131 GFC_DESCRIPTOR_EXTENT(retarray,0)); 132 } 133 134 retarray->base_addr 135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); 136 retarray->offset = 0; 137 } 138 else if (unlikely (compile_options.bounds_check)) 139 { 140 index_type ret_extent, arg_extent; 141 142 if (GFC_DESCRIPTOR_RANK (a) == 1) 143 { 144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 146 if (arg_extent != ret_extent) 147 runtime_error ("Array bound mismatch for dimension 1 of " 148 "array (%ld/%ld) ", 149 (long int) ret_extent, (long int) arg_extent); 150 } 151 else if (GFC_DESCRIPTOR_RANK (b) == 1) 152 { 153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 155 if (arg_extent != ret_extent) 156 runtime_error ("Array bound mismatch for dimension 1 of " 157 "array (%ld/%ld) ", 158 (long int) ret_extent, (long int) arg_extent); 159 } 160 else 161 { 162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 164 if (arg_extent != ret_extent) 165 runtime_error ("Array bound mismatch for dimension 1 of " 166 "array (%ld/%ld) ", 167 (long int) ret_extent, (long int) arg_extent); 168 169 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 170 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 171 if (arg_extent != ret_extent) 172 runtime_error ("Array bound mismatch for dimension 2 of " 173 "array (%ld/%ld) ", 174 (long int) ret_extent, (long int) arg_extent); 175 } 176 } 177 178 179 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 180 { 181 /* One-dimensional result may be addressed in the code below 182 either as a row or a column matrix. We want both cases to 183 work. */ 184 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 185 } 186 else 187 { 188 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 189 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 190 } 191 192 193 if (GFC_DESCRIPTOR_RANK (a) == 1) 194 { 195 /* Treat it as a a row matrix A[1,count]. */ 196 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 197 aystride = 1; 198 199 xcount = 1; 200 count = GFC_DESCRIPTOR_EXTENT(a,0); 201 } 202 else 203 { 204 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 205 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 206 207 count = GFC_DESCRIPTOR_EXTENT(a,1); 208 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 209 } 210 211 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 212 { 213 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 214 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 215 "in dimension 1: is %ld, should be %ld", 216 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 217 } 218 219 if (GFC_DESCRIPTOR_RANK (b) == 1) 220 { 221 /* Treat it as a column matrix B[count,1] */ 222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 223 224 /* bystride should never be used for 1-dimensional b. 225 The value is only used for calculation of the 226 memory by the buffer. */ 227 bystride = 256; 228 ycount = 1; 229 } 230 else 231 { 232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 233 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 234 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 235 } 236 237 abase = a->base_addr; 238 bbase = b->base_addr; 239 dest = retarray->base_addr; 240 241 /* Now that everything is set up, we perform the multiplication 242 itself. */ 243 244#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 245#define min(a,b) ((a) <= (b) ? (a) : (b)) 246#define max(a,b) ((a) >= (b) ? (a) : (b)) 247 248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 249 && (bxstride == 1 || bystride == 1) 250 && (((float) xcount) * ((float) ycount) * ((float) count) 251 > POW3(blas_limit))) 252 { 253 const int m = xcount, n = ycount, k = count, ldc = rystride; 254 const GFC_INTEGER_8 one = 1, zero = 0; 255 const int lda = (axstride == 1) ? aystride : axstride, 256 ldb = (bxstride == 1) ? bystride : bxstride; 257 258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 259 { 260 assert (gemm != NULL); 261 const char *transa, *transb; 262 if (try_blas & 2) 263 transa = "C"; 264 else 265 transa = axstride == 1 ? "N" : "T"; 266 267 if (try_blas & 4) 268 transb = "C"; 269 else 270 transb = bxstride == 1 ? "N" : "T"; 271 272 gemm (transa, transb , &m, 273 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 274 &ldc, 1, 1); 275 return; 276 } 277 } 278 279 if (rxstride == 1 && axstride == 1 && bxstride == 1) 280 { 281 /* This block of code implements a tuned matmul, derived from 282 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 283 284 Bo Kagstrom and Per Ling 285 Department of Computing Science 286 Umea University 287 S-901 87 Umea, Sweden 288 289 from netlib.org, translated to C, and modified for matmul.m4. */ 290 291 const GFC_INTEGER_8 *a, *b; 292 GFC_INTEGER_8 *c; 293 const index_type m = xcount, n = ycount, k = count; 294 295 /* System generated locals */ 296 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 297 i1, i2, i3, i4, i5, i6; 298 299 /* Local variables */ 300 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, 301 f13, f14, f23, f24, f33, f34, f43, f44; 302 index_type i, j, l, ii, jj, ll; 303 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 304 GFC_INTEGER_8 *t1; 305 306 a = abase; 307 b = bbase; 308 c = retarray->base_addr; 309 310 /* Parameter adjustments */ 311 c_dim1 = rystride; 312 c_offset = 1 + c_dim1; 313 c -= c_offset; 314 a_dim1 = aystride; 315 a_offset = 1 + a_dim1; 316 a -= a_offset; 317 b_dim1 = bystride; 318 b_offset = 1 + b_dim1; 319 b -= b_offset; 320 321 /* Empty c first. */ 322 for (j=1; j<=n; j++) 323 for (i=1; i<=m; i++) 324 c[i + j * c_dim1] = (GFC_INTEGER_8)0; 325 326 /* Early exit if possible */ 327 if (m == 0 || n == 0 || k == 0) 328 return; 329 330 /* Adjust size of t1 to what is needed. */ 331 index_type t1_dim, a_sz; 332 if (aystride == 1) 333 a_sz = rystride; 334 else 335 a_sz = a_dim1; 336 337 t1_dim = a_sz * 256 + b_dim1; 338 if (t1_dim > 65536) 339 t1_dim = 65536; 340 341 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); 342 343 /* Start turning the crank. */ 344 i1 = n; 345 for (jj = 1; jj <= i1; jj += 512) 346 { 347 /* Computing MIN */ 348 i2 = 512; 349 i3 = n - jj + 1; 350 jsec = min(i2,i3); 351 ujsec = jsec - jsec % 4; 352 i2 = k; 353 for (ll = 1; ll <= i2; ll += 256) 354 { 355 /* Computing MIN */ 356 i3 = 256; 357 i4 = k - ll + 1; 358 lsec = min(i3,i4); 359 ulsec = lsec - lsec % 2; 360 361 i3 = m; 362 for (ii = 1; ii <= i3; ii += 256) 363 { 364 /* Computing MIN */ 365 i4 = 256; 366 i5 = m - ii + 1; 367 isec = min(i4,i5); 368 uisec = isec - isec % 2; 369 i4 = ll + ulsec - 1; 370 for (l = ll; l <= i4; l += 2) 371 { 372 i5 = ii + uisec - 1; 373 for (i = ii; i <= i5; i += 2) 374 { 375 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 376 a[i + l * a_dim1]; 377 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 378 a[i + (l + 1) * a_dim1]; 379 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 380 a[i + 1 + l * a_dim1]; 381 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 382 a[i + 1 + (l + 1) * a_dim1]; 383 } 384 if (uisec < isec) 385 { 386 t1[l - ll + 1 + (isec << 8) - 257] = 387 a[ii + isec - 1 + l * a_dim1]; 388 t1[l - ll + 2 + (isec << 8) - 257] = 389 a[ii + isec - 1 + (l + 1) * a_dim1]; 390 } 391 } 392 if (ulsec < lsec) 393 { 394 i4 = ii + isec - 1; 395 for (i = ii; i<= i4; ++i) 396 { 397 t1[lsec + ((i - ii + 1) << 8) - 257] = 398 a[i + (ll + lsec - 1) * a_dim1]; 399 } 400 } 401 402 uisec = isec - isec % 4; 403 i4 = jj + ujsec - 1; 404 for (j = jj; j <= i4; j += 4) 405 { 406 i5 = ii + uisec - 1; 407 for (i = ii; i <= i5; i += 4) 408 { 409 f11 = c[i + j * c_dim1]; 410 f21 = c[i + 1 + j * c_dim1]; 411 f12 = c[i + (j + 1) * c_dim1]; 412 f22 = c[i + 1 + (j + 1) * c_dim1]; 413 f13 = c[i + (j + 2) * c_dim1]; 414 f23 = c[i + 1 + (j + 2) * c_dim1]; 415 f14 = c[i + (j + 3) * c_dim1]; 416 f24 = c[i + 1 + (j + 3) * c_dim1]; 417 f31 = c[i + 2 + j * c_dim1]; 418 f41 = c[i + 3 + j * c_dim1]; 419 f32 = c[i + 2 + (j + 1) * c_dim1]; 420 f42 = c[i + 3 + (j + 1) * c_dim1]; 421 f33 = c[i + 2 + (j + 2) * c_dim1]; 422 f43 = c[i + 3 + (j + 2) * c_dim1]; 423 f34 = c[i + 2 + (j + 3) * c_dim1]; 424 f44 = c[i + 3 + (j + 3) * c_dim1]; 425 i6 = ll + lsec - 1; 426 for (l = ll; l <= i6; ++l) 427 { 428 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 429 * b[l + j * b_dim1]; 430 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 431 * b[l + j * b_dim1]; 432 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 433 * b[l + (j + 1) * b_dim1]; 434 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 435 * b[l + (j + 1) * b_dim1]; 436 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 437 * b[l + (j + 2) * b_dim1]; 438 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 439 * b[l + (j + 2) * b_dim1]; 440 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 441 * b[l + (j + 3) * b_dim1]; 442 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 443 * b[l + (j + 3) * b_dim1]; 444 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 445 * b[l + j * b_dim1]; 446 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 447 * b[l + j * b_dim1]; 448 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 449 * b[l + (j + 1) * b_dim1]; 450 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 451 * b[l + (j + 1) * b_dim1]; 452 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 453 * b[l + (j + 2) * b_dim1]; 454 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 455 * b[l + (j + 2) * b_dim1]; 456 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 457 * b[l + (j + 3) * b_dim1]; 458 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 459 * b[l + (j + 3) * b_dim1]; 460 } 461 c[i + j * c_dim1] = f11; 462 c[i + 1 + j * c_dim1] = f21; 463 c[i + (j + 1) * c_dim1] = f12; 464 c[i + 1 + (j + 1) * c_dim1] = f22; 465 c[i + (j + 2) * c_dim1] = f13; 466 c[i + 1 + (j + 2) * c_dim1] = f23; 467 c[i + (j + 3) * c_dim1] = f14; 468 c[i + 1 + (j + 3) * c_dim1] = f24; 469 c[i + 2 + j * c_dim1] = f31; 470 c[i + 3 + j * c_dim1] = f41; 471 c[i + 2 + (j + 1) * c_dim1] = f32; 472 c[i + 3 + (j + 1) * c_dim1] = f42; 473 c[i + 2 + (j + 2) * c_dim1] = f33; 474 c[i + 3 + (j + 2) * c_dim1] = f43; 475 c[i + 2 + (j + 3) * c_dim1] = f34; 476 c[i + 3 + (j + 3) * c_dim1] = f44; 477 } 478 if (uisec < isec) 479 { 480 i5 = ii + isec - 1; 481 for (i = ii + uisec; i <= i5; ++i) 482 { 483 f11 = c[i + j * c_dim1]; 484 f12 = c[i + (j + 1) * c_dim1]; 485 f13 = c[i + (j + 2) * c_dim1]; 486 f14 = c[i + (j + 3) * c_dim1]; 487 i6 = ll + lsec - 1; 488 for (l = ll; l <= i6; ++l) 489 { 490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 491 257] * b[l + j * b_dim1]; 492 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 493 257] * b[l + (j + 1) * b_dim1]; 494 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 495 257] * b[l + (j + 2) * b_dim1]; 496 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 497 257] * b[l + (j + 3) * b_dim1]; 498 } 499 c[i + j * c_dim1] = f11; 500 c[i + (j + 1) * c_dim1] = f12; 501 c[i + (j + 2) * c_dim1] = f13; 502 c[i + (j + 3) * c_dim1] = f14; 503 } 504 } 505 } 506 if (ujsec < jsec) 507 { 508 i4 = jj + jsec - 1; 509 for (j = jj + ujsec; j <= i4; ++j) 510 { 511 i5 = ii + uisec - 1; 512 for (i = ii; i <= i5; i += 4) 513 { 514 f11 = c[i + j * c_dim1]; 515 f21 = c[i + 1 + j * c_dim1]; 516 f31 = c[i + 2 + j * c_dim1]; 517 f41 = c[i + 3 + j * c_dim1]; 518 i6 = ll + lsec - 1; 519 for (l = ll; l <= i6; ++l) 520 { 521 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 522 257] * b[l + j * b_dim1]; 523 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 524 257] * b[l + j * b_dim1]; 525 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 526 257] * b[l + j * b_dim1]; 527 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 528 257] * b[l + j * b_dim1]; 529 } 530 c[i + j * c_dim1] = f11; 531 c[i + 1 + j * c_dim1] = f21; 532 c[i + 2 + j * c_dim1] = f31; 533 c[i + 3 + j * c_dim1] = f41; 534 } 535 i5 = ii + isec - 1; 536 for (i = ii + uisec; i <= i5; ++i) 537 { 538 f11 = c[i + j * c_dim1]; 539 i6 = ll + lsec - 1; 540 for (l = ll; l <= i6; ++l) 541 { 542 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 543 257] * b[l + j * b_dim1]; 544 } 545 c[i + j * c_dim1] = f11; 546 } 547 } 548 } 549 } 550 } 551 } 552 free(t1); 553 return; 554 } 555 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 556 { 557 if (GFC_DESCRIPTOR_RANK (a) != 1) 558 { 559 const GFC_INTEGER_8 *restrict abase_x; 560 const GFC_INTEGER_8 *restrict bbase_y; 561 GFC_INTEGER_8 *restrict dest_y; 562 GFC_INTEGER_8 s; 563 564 for (y = 0; y < ycount; y++) 565 { 566 bbase_y = &bbase[y*bystride]; 567 dest_y = &dest[y*rystride]; 568 for (x = 0; x < xcount; x++) 569 { 570 abase_x = &abase[x*axstride]; 571 s = (GFC_INTEGER_8) 0; 572 for (n = 0; n < count; n++) 573 s += abase_x[n] * bbase_y[n]; 574 dest_y[x] = s; 575 } 576 } 577 } 578 else 579 { 580 const GFC_INTEGER_8 *restrict bbase_y; 581 GFC_INTEGER_8 s; 582 583 for (y = 0; y < ycount; y++) 584 { 585 bbase_y = &bbase[y*bystride]; 586 s = (GFC_INTEGER_8) 0; 587 for (n = 0; n < count; n++) 588 s += abase[n*axstride] * bbase_y[n]; 589 dest[y*rystride] = s; 590 } 591 } 592 } 593 else if (axstride < aystride) 594 { 595 for (y = 0; y < ycount; y++) 596 for (x = 0; x < xcount; x++) 597 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; 598 599 for (y = 0; y < ycount; y++) 600 for (n = 0; n < count; n++) 601 for (x = 0; x < xcount; x++) 602 /* dest[x,y] += a[x,n] * b[n,y] */ 603 dest[x*rxstride + y*rystride] += 604 abase[x*axstride + n*aystride] * 605 bbase[n*bxstride + y*bystride]; 606 } 607 else if (GFC_DESCRIPTOR_RANK (a) == 1) 608 { 609 const GFC_INTEGER_8 *restrict bbase_y; 610 GFC_INTEGER_8 s; 611 612 for (y = 0; y < ycount; y++) 613 { 614 bbase_y = &bbase[y*bystride]; 615 s = (GFC_INTEGER_8) 0; 616 for (n = 0; n < count; n++) 617 s += abase[n*axstride] * bbase_y[n*bxstride]; 618 dest[y*rxstride] = s; 619 } 620 } 621 else 622 { 623 const GFC_INTEGER_8 *restrict abase_x; 624 const GFC_INTEGER_8 *restrict bbase_y; 625 GFC_INTEGER_8 *restrict dest_y; 626 GFC_INTEGER_8 s; 627 628 for (y = 0; y < ycount; y++) 629 { 630 bbase_y = &bbase[y*bystride]; 631 dest_y = &dest[y*rystride]; 632 for (x = 0; x < xcount; x++) 633 { 634 abase_x = &abase[x*axstride]; 635 s = (GFC_INTEGER_8) 0; 636 for (n = 0; n < count; n++) 637 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 638 dest_y[x*rxstride] = s; 639 } 640 } 641 } 642} 643#undef POW3 644#undef min 645#undef max 646 647#endif /* HAVE_AVX */ 648 649#ifdef HAVE_AVX2 650static void 651matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, 652 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 653 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); 654static void 655matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, 656 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 657 int blas_limit, blas_call gemm) 658{ 659 const GFC_INTEGER_8 * restrict abase; 660 const GFC_INTEGER_8 * restrict bbase; 661 GFC_INTEGER_8 * restrict dest; 662 663 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 664 index_type x, y, n, count, xcount, ycount; 665 666 assert (GFC_DESCRIPTOR_RANK (a) == 2 667 || GFC_DESCRIPTOR_RANK (b) == 2); 668 669/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 670 671 Either A or B (but not both) can be rank 1: 672 673 o One-dimensional argument A is implicitly treated as a row matrix 674 dimensioned [1,count], so xcount=1. 675 676 o One-dimensional argument B is implicitly treated as a column matrix 677 dimensioned [count, 1], so ycount=1. 678*/ 679 680 if (retarray->base_addr == NULL) 681 { 682 if (GFC_DESCRIPTOR_RANK (a) == 1) 683 { 684 GFC_DIMENSION_SET(retarray->dim[0], 0, 685 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 686 } 687 else if (GFC_DESCRIPTOR_RANK (b) == 1) 688 { 689 GFC_DIMENSION_SET(retarray->dim[0], 0, 690 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 691 } 692 else 693 { 694 GFC_DIMENSION_SET(retarray->dim[0], 0, 695 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 696 697 GFC_DIMENSION_SET(retarray->dim[1], 0, 698 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 699 GFC_DESCRIPTOR_EXTENT(retarray,0)); 700 } 701 702 retarray->base_addr 703 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); 704 retarray->offset = 0; 705 } 706 else if (unlikely (compile_options.bounds_check)) 707 { 708 index_type ret_extent, arg_extent; 709 710 if (GFC_DESCRIPTOR_RANK (a) == 1) 711 { 712 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 713 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 714 if (arg_extent != ret_extent) 715 runtime_error ("Array bound mismatch for dimension 1 of " 716 "array (%ld/%ld) ", 717 (long int) ret_extent, (long int) arg_extent); 718 } 719 else if (GFC_DESCRIPTOR_RANK (b) == 1) 720 { 721 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 722 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 723 if (arg_extent != ret_extent) 724 runtime_error ("Array bound mismatch for dimension 1 of " 725 "array (%ld/%ld) ", 726 (long int) ret_extent, (long int) arg_extent); 727 } 728 else 729 { 730 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 731 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 732 if (arg_extent != ret_extent) 733 runtime_error ("Array bound mismatch for dimension 1 of " 734 "array (%ld/%ld) ", 735 (long int) ret_extent, (long int) arg_extent); 736 737 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 738 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 739 if (arg_extent != ret_extent) 740 runtime_error ("Array bound mismatch for dimension 2 of " 741 "array (%ld/%ld) ", 742 (long int) ret_extent, (long int) arg_extent); 743 } 744 } 745 746 747 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 748 { 749 /* One-dimensional result may be addressed in the code below 750 either as a row or a column matrix. We want both cases to 751 work. */ 752 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 753 } 754 else 755 { 756 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 757 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 758 } 759 760 761 if (GFC_DESCRIPTOR_RANK (a) == 1) 762 { 763 /* Treat it as a a row matrix A[1,count]. */ 764 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 765 aystride = 1; 766 767 xcount = 1; 768 count = GFC_DESCRIPTOR_EXTENT(a,0); 769 } 770 else 771 { 772 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 773 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 774 775 count = GFC_DESCRIPTOR_EXTENT(a,1); 776 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 777 } 778 779 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 780 { 781 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 782 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 783 "in dimension 1: is %ld, should be %ld", 784 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 785 } 786 787 if (GFC_DESCRIPTOR_RANK (b) == 1) 788 { 789 /* Treat it as a column matrix B[count,1] */ 790 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 791 792 /* bystride should never be used for 1-dimensional b. 793 The value is only used for calculation of the 794 memory by the buffer. */ 795 bystride = 256; 796 ycount = 1; 797 } 798 else 799 { 800 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 801 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 802 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 803 } 804 805 abase = a->base_addr; 806 bbase = b->base_addr; 807 dest = retarray->base_addr; 808 809 /* Now that everything is set up, we perform the multiplication 810 itself. */ 811 812#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 813#define min(a,b) ((a) <= (b) ? (a) : (b)) 814#define max(a,b) ((a) >= (b) ? (a) : (b)) 815 816 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 817 && (bxstride == 1 || bystride == 1) 818 && (((float) xcount) * ((float) ycount) * ((float) count) 819 > POW3(blas_limit))) 820 { 821 const int m = xcount, n = ycount, k = count, ldc = rystride; 822 const GFC_INTEGER_8 one = 1, zero = 0; 823 const int lda = (axstride == 1) ? aystride : axstride, 824 ldb = (bxstride == 1) ? bystride : bxstride; 825 826 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 827 { 828 assert (gemm != NULL); 829 const char *transa, *transb; 830 if (try_blas & 2) 831 transa = "C"; 832 else 833 transa = axstride == 1 ? "N" : "T"; 834 835 if (try_blas & 4) 836 transb = "C"; 837 else 838 transb = bxstride == 1 ? "N" : "T"; 839 840 gemm (transa, transb , &m, 841 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 842 &ldc, 1, 1); 843 return; 844 } 845 } 846 847 if (rxstride == 1 && axstride == 1 && bxstride == 1) 848 { 849 /* This block of code implements a tuned matmul, derived from 850 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 851 852 Bo Kagstrom and Per Ling 853 Department of Computing Science 854 Umea University 855 S-901 87 Umea, Sweden 856 857 from netlib.org, translated to C, and modified for matmul.m4. */ 858 859 const GFC_INTEGER_8 *a, *b; 860 GFC_INTEGER_8 *c; 861 const index_type m = xcount, n = ycount, k = count; 862 863 /* System generated locals */ 864 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 865 i1, i2, i3, i4, i5, i6; 866 867 /* Local variables */ 868 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, 869 f13, f14, f23, f24, f33, f34, f43, f44; 870 index_type i, j, l, ii, jj, ll; 871 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 872 GFC_INTEGER_8 *t1; 873 874 a = abase; 875 b = bbase; 876 c = retarray->base_addr; 877 878 /* Parameter adjustments */ 879 c_dim1 = rystride; 880 c_offset = 1 + c_dim1; 881 c -= c_offset; 882 a_dim1 = aystride; 883 a_offset = 1 + a_dim1; 884 a -= a_offset; 885 b_dim1 = bystride; 886 b_offset = 1 + b_dim1; 887 b -= b_offset; 888 889 /* Empty c first. */ 890 for (j=1; j<=n; j++) 891 for (i=1; i<=m; i++) 892 c[i + j * c_dim1] = (GFC_INTEGER_8)0; 893 894 /* Early exit if possible */ 895 if (m == 0 || n == 0 || k == 0) 896 return; 897 898 /* Adjust size of t1 to what is needed. */ 899 index_type t1_dim, a_sz; 900 if (aystride == 1) 901 a_sz = rystride; 902 else 903 a_sz = a_dim1; 904 905 t1_dim = a_sz * 256 + b_dim1; 906 if (t1_dim > 65536) 907 t1_dim = 65536; 908 909 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); 910 911 /* Start turning the crank. */ 912 i1 = n; 913 for (jj = 1; jj <= i1; jj += 512) 914 { 915 /* Computing MIN */ 916 i2 = 512; 917 i3 = n - jj + 1; 918 jsec = min(i2,i3); 919 ujsec = jsec - jsec % 4; 920 i2 = k; 921 for (ll = 1; ll <= i2; ll += 256) 922 { 923 /* Computing MIN */ 924 i3 = 256; 925 i4 = k - ll + 1; 926 lsec = min(i3,i4); 927 ulsec = lsec - lsec % 2; 928 929 i3 = m; 930 for (ii = 1; ii <= i3; ii += 256) 931 { 932 /* Computing MIN */ 933 i4 = 256; 934 i5 = m - ii + 1; 935 isec = min(i4,i5); 936 uisec = isec - isec % 2; 937 i4 = ll + ulsec - 1; 938 for (l = ll; l <= i4; l += 2) 939 { 940 i5 = ii + uisec - 1; 941 for (i = ii; i <= i5; i += 2) 942 { 943 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 944 a[i + l * a_dim1]; 945 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 946 a[i + (l + 1) * a_dim1]; 947 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 948 a[i + 1 + l * a_dim1]; 949 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 950 a[i + 1 + (l + 1) * a_dim1]; 951 } 952 if (uisec < isec) 953 { 954 t1[l - ll + 1 + (isec << 8) - 257] = 955 a[ii + isec - 1 + l * a_dim1]; 956 t1[l - ll + 2 + (isec << 8) - 257] = 957 a[ii + isec - 1 + (l + 1) * a_dim1]; 958 } 959 } 960 if (ulsec < lsec) 961 { 962 i4 = ii + isec - 1; 963 for (i = ii; i<= i4; ++i) 964 { 965 t1[lsec + ((i - ii + 1) << 8) - 257] = 966 a[i + (ll + lsec - 1) * a_dim1]; 967 } 968 } 969 970 uisec = isec - isec % 4; 971 i4 = jj + ujsec - 1; 972 for (j = jj; j <= i4; j += 4) 973 { 974 i5 = ii + uisec - 1; 975 for (i = ii; i <= i5; i += 4) 976 { 977 f11 = c[i + j * c_dim1]; 978 f21 = c[i + 1 + j * c_dim1]; 979 f12 = c[i + (j + 1) * c_dim1]; 980 f22 = c[i + 1 + (j + 1) * c_dim1]; 981 f13 = c[i + (j + 2) * c_dim1]; 982 f23 = c[i + 1 + (j + 2) * c_dim1]; 983 f14 = c[i + (j + 3) * c_dim1]; 984 f24 = c[i + 1 + (j + 3) * c_dim1]; 985 f31 = c[i + 2 + j * c_dim1]; 986 f41 = c[i + 3 + j * c_dim1]; 987 f32 = c[i + 2 + (j + 1) * c_dim1]; 988 f42 = c[i + 3 + (j + 1) * c_dim1]; 989 f33 = c[i + 2 + (j + 2) * c_dim1]; 990 f43 = c[i + 3 + (j + 2) * c_dim1]; 991 f34 = c[i + 2 + (j + 3) * c_dim1]; 992 f44 = c[i + 3 + (j + 3) * c_dim1]; 993 i6 = ll + lsec - 1; 994 for (l = ll; l <= i6; ++l) 995 { 996 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 997 * b[l + j * b_dim1]; 998 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 999 * b[l + j * b_dim1]; 1000 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1001 * b[l + (j + 1) * b_dim1]; 1002 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1003 * b[l + (j + 1) * b_dim1]; 1004 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1005 * b[l + (j + 2) * b_dim1]; 1006 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1007 * b[l + (j + 2) * b_dim1]; 1008 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1009 * b[l + (j + 3) * b_dim1]; 1010 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1011 * b[l + (j + 3) * b_dim1]; 1012 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1013 * b[l + j * b_dim1]; 1014 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1015 * b[l + j * b_dim1]; 1016 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1017 * b[l + (j + 1) * b_dim1]; 1018 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1019 * b[l + (j + 1) * b_dim1]; 1020 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1021 * b[l + (j + 2) * b_dim1]; 1022 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1023 * b[l + (j + 2) * b_dim1]; 1024 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1025 * b[l + (j + 3) * b_dim1]; 1026 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1027 * b[l + (j + 3) * b_dim1]; 1028 } 1029 c[i + j * c_dim1] = f11; 1030 c[i + 1 + j * c_dim1] = f21; 1031 c[i + (j + 1) * c_dim1] = f12; 1032 c[i + 1 + (j + 1) * c_dim1] = f22; 1033 c[i + (j + 2) * c_dim1] = f13; 1034 c[i + 1 + (j + 2) * c_dim1] = f23; 1035 c[i + (j + 3) * c_dim1] = f14; 1036 c[i + 1 + (j + 3) * c_dim1] = f24; 1037 c[i + 2 + j * c_dim1] = f31; 1038 c[i + 3 + j * c_dim1] = f41; 1039 c[i + 2 + (j + 1) * c_dim1] = f32; 1040 c[i + 3 + (j + 1) * c_dim1] = f42; 1041 c[i + 2 + (j + 2) * c_dim1] = f33; 1042 c[i + 3 + (j + 2) * c_dim1] = f43; 1043 c[i + 2 + (j + 3) * c_dim1] = f34; 1044 c[i + 3 + (j + 3) * c_dim1] = f44; 1045 } 1046 if (uisec < isec) 1047 { 1048 i5 = ii + isec - 1; 1049 for (i = ii + uisec; i <= i5; ++i) 1050 { 1051 f11 = c[i + j * c_dim1]; 1052 f12 = c[i + (j + 1) * c_dim1]; 1053 f13 = c[i + (j + 2) * c_dim1]; 1054 f14 = c[i + (j + 3) * c_dim1]; 1055 i6 = ll + lsec - 1; 1056 for (l = ll; l <= i6; ++l) 1057 { 1058 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1059 257] * b[l + j * b_dim1]; 1060 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1061 257] * b[l + (j + 1) * b_dim1]; 1062 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1063 257] * b[l + (j + 2) * b_dim1]; 1064 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1065 257] * b[l + (j + 3) * b_dim1]; 1066 } 1067 c[i + j * c_dim1] = f11; 1068 c[i + (j + 1) * c_dim1] = f12; 1069 c[i + (j + 2) * c_dim1] = f13; 1070 c[i + (j + 3) * c_dim1] = f14; 1071 } 1072 } 1073 } 1074 if (ujsec < jsec) 1075 { 1076 i4 = jj + jsec - 1; 1077 for (j = jj + ujsec; j <= i4; ++j) 1078 { 1079 i5 = ii + uisec - 1; 1080 for (i = ii; i <= i5; i += 4) 1081 { 1082 f11 = c[i + j * c_dim1]; 1083 f21 = c[i + 1 + j * c_dim1]; 1084 f31 = c[i + 2 + j * c_dim1]; 1085 f41 = c[i + 3 + j * c_dim1]; 1086 i6 = ll + lsec - 1; 1087 for (l = ll; l <= i6; ++l) 1088 { 1089 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1090 257] * b[l + j * b_dim1]; 1091 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1092 257] * b[l + j * b_dim1]; 1093 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1094 257] * b[l + j * b_dim1]; 1095 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1096 257] * b[l + j * b_dim1]; 1097 } 1098 c[i + j * c_dim1] = f11; 1099 c[i + 1 + j * c_dim1] = f21; 1100 c[i + 2 + j * c_dim1] = f31; 1101 c[i + 3 + j * c_dim1] = f41; 1102 } 1103 i5 = ii + isec - 1; 1104 for (i = ii + uisec; i <= i5; ++i) 1105 { 1106 f11 = c[i + j * c_dim1]; 1107 i6 = ll + lsec - 1; 1108 for (l = ll; l <= i6; ++l) 1109 { 1110 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1111 257] * b[l + j * b_dim1]; 1112 } 1113 c[i + j * c_dim1] = f11; 1114 } 1115 } 1116 } 1117 } 1118 } 1119 } 1120 free(t1); 1121 return; 1122 } 1123 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1124 { 1125 if (GFC_DESCRIPTOR_RANK (a) != 1) 1126 { 1127 const GFC_INTEGER_8 *restrict abase_x; 1128 const GFC_INTEGER_8 *restrict bbase_y; 1129 GFC_INTEGER_8 *restrict dest_y; 1130 GFC_INTEGER_8 s; 1131 1132 for (y = 0; y < ycount; y++) 1133 { 1134 bbase_y = &bbase[y*bystride]; 1135 dest_y = &dest[y*rystride]; 1136 for (x = 0; x < xcount; x++) 1137 { 1138 abase_x = &abase[x*axstride]; 1139 s = (GFC_INTEGER_8) 0; 1140 for (n = 0; n < count; n++) 1141 s += abase_x[n] * bbase_y[n]; 1142 dest_y[x] = s; 1143 } 1144 } 1145 } 1146 else 1147 { 1148 const GFC_INTEGER_8 *restrict bbase_y; 1149 GFC_INTEGER_8 s; 1150 1151 for (y = 0; y < ycount; y++) 1152 { 1153 bbase_y = &bbase[y*bystride]; 1154 s = (GFC_INTEGER_8) 0; 1155 for (n = 0; n < count; n++) 1156 s += abase[n*axstride] * bbase_y[n]; 1157 dest[y*rystride] = s; 1158 } 1159 } 1160 } 1161 else if (axstride < aystride) 1162 { 1163 for (y = 0; y < ycount; y++) 1164 for (x = 0; x < xcount; x++) 1165 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; 1166 1167 for (y = 0; y < ycount; y++) 1168 for (n = 0; n < count; n++) 1169 for (x = 0; x < xcount; x++) 1170 /* dest[x,y] += a[x,n] * b[n,y] */ 1171 dest[x*rxstride + y*rystride] += 1172 abase[x*axstride + n*aystride] * 1173 bbase[n*bxstride + y*bystride]; 1174 } 1175 else if (GFC_DESCRIPTOR_RANK (a) == 1) 1176 { 1177 const GFC_INTEGER_8 *restrict bbase_y; 1178 GFC_INTEGER_8 s; 1179 1180 for (y = 0; y < ycount; y++) 1181 { 1182 bbase_y = &bbase[y*bystride]; 1183 s = (GFC_INTEGER_8) 0; 1184 for (n = 0; n < count; n++) 1185 s += abase[n*axstride] * bbase_y[n*bxstride]; 1186 dest[y*rxstride] = s; 1187 } 1188 } 1189 else 1190 { 1191 const GFC_INTEGER_8 *restrict abase_x; 1192 const GFC_INTEGER_8 *restrict bbase_y; 1193 GFC_INTEGER_8 *restrict dest_y; 1194 GFC_INTEGER_8 s; 1195 1196 for (y = 0; y < ycount; y++) 1197 { 1198 bbase_y = &bbase[y*bystride]; 1199 dest_y = &dest[y*rystride]; 1200 for (x = 0; x < xcount; x++) 1201 { 1202 abase_x = &abase[x*axstride]; 1203 s = (GFC_INTEGER_8) 0; 1204 for (n = 0; n < count; n++) 1205 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1206 dest_y[x*rxstride] = s; 1207 } 1208 } 1209 } 1210} 1211#undef POW3 1212#undef min 1213#undef max 1214 1215#endif /* HAVE_AVX2 */ 1216 1217#ifdef HAVE_AVX512F 1218static void 1219matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, 1220 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 1221 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); 1222static void 1223matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, 1224 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 1225 int blas_limit, blas_call gemm) 1226{ 1227 const GFC_INTEGER_8 * restrict abase; 1228 const GFC_INTEGER_8 * restrict bbase; 1229 GFC_INTEGER_8 * restrict dest; 1230 1231 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1232 index_type x, y, n, count, xcount, ycount; 1233 1234 assert (GFC_DESCRIPTOR_RANK (a) == 2 1235 || GFC_DESCRIPTOR_RANK (b) == 2); 1236 1237/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1238 1239 Either A or B (but not both) can be rank 1: 1240 1241 o One-dimensional argument A is implicitly treated as a row matrix 1242 dimensioned [1,count], so xcount=1. 1243 1244 o One-dimensional argument B is implicitly treated as a column matrix 1245 dimensioned [count, 1], so ycount=1. 1246*/ 1247 1248 if (retarray->base_addr == NULL) 1249 { 1250 if (GFC_DESCRIPTOR_RANK (a) == 1) 1251 { 1252 GFC_DIMENSION_SET(retarray->dim[0], 0, 1253 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1254 } 1255 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1256 { 1257 GFC_DIMENSION_SET(retarray->dim[0], 0, 1258 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1259 } 1260 else 1261 { 1262 GFC_DIMENSION_SET(retarray->dim[0], 0, 1263 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1264 1265 GFC_DIMENSION_SET(retarray->dim[1], 0, 1266 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1267 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1268 } 1269 1270 retarray->base_addr 1271 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); 1272 retarray->offset = 0; 1273 } 1274 else if (unlikely (compile_options.bounds_check)) 1275 { 1276 index_type ret_extent, arg_extent; 1277 1278 if (GFC_DESCRIPTOR_RANK (a) == 1) 1279 { 1280 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1281 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1282 if (arg_extent != ret_extent) 1283 runtime_error ("Array bound mismatch for dimension 1 of " 1284 "array (%ld/%ld) ", 1285 (long int) ret_extent, (long int) arg_extent); 1286 } 1287 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1288 { 1289 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1290 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1291 if (arg_extent != ret_extent) 1292 runtime_error ("Array bound mismatch for dimension 1 of " 1293 "array (%ld/%ld) ", 1294 (long int) ret_extent, (long int) arg_extent); 1295 } 1296 else 1297 { 1298 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1299 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1300 if (arg_extent != ret_extent) 1301 runtime_error ("Array bound mismatch for dimension 1 of " 1302 "array (%ld/%ld) ", 1303 (long int) ret_extent, (long int) arg_extent); 1304 1305 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1306 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1307 if (arg_extent != ret_extent) 1308 runtime_error ("Array bound mismatch for dimension 2 of " 1309 "array (%ld/%ld) ", 1310 (long int) ret_extent, (long int) arg_extent); 1311 } 1312 } 1313 1314 1315 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1316 { 1317 /* One-dimensional result may be addressed in the code below 1318 either as a row or a column matrix. We want both cases to 1319 work. */ 1320 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1321 } 1322 else 1323 { 1324 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1325 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1326 } 1327 1328 1329 if (GFC_DESCRIPTOR_RANK (a) == 1) 1330 { 1331 /* Treat it as a a row matrix A[1,count]. */ 1332 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1333 aystride = 1; 1334 1335 xcount = 1; 1336 count = GFC_DESCRIPTOR_EXTENT(a,0); 1337 } 1338 else 1339 { 1340 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1341 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1342 1343 count = GFC_DESCRIPTOR_EXTENT(a,1); 1344 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1345 } 1346 1347 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1348 { 1349 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1350 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1351 "in dimension 1: is %ld, should be %ld", 1352 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1353 } 1354 1355 if (GFC_DESCRIPTOR_RANK (b) == 1) 1356 { 1357 /* Treat it as a column matrix B[count,1] */ 1358 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1359 1360 /* bystride should never be used for 1-dimensional b. 1361 The value is only used for calculation of the 1362 memory by the buffer. */ 1363 bystride = 256; 1364 ycount = 1; 1365 } 1366 else 1367 { 1368 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1369 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1370 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1371 } 1372 1373 abase = a->base_addr; 1374 bbase = b->base_addr; 1375 dest = retarray->base_addr; 1376 1377 /* Now that everything is set up, we perform the multiplication 1378 itself. */ 1379 1380#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1381#define min(a,b) ((a) <= (b) ? (a) : (b)) 1382#define max(a,b) ((a) >= (b) ? (a) : (b)) 1383 1384 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1385 && (bxstride == 1 || bystride == 1) 1386 && (((float) xcount) * ((float) ycount) * ((float) count) 1387 > POW3(blas_limit))) 1388 { 1389 const int m = xcount, n = ycount, k = count, ldc = rystride; 1390 const GFC_INTEGER_8 one = 1, zero = 0; 1391 const int lda = (axstride == 1) ? aystride : axstride, 1392 ldb = (bxstride == 1) ? bystride : bxstride; 1393 1394 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1395 { 1396 assert (gemm != NULL); 1397 const char *transa, *transb; 1398 if (try_blas & 2) 1399 transa = "C"; 1400 else 1401 transa = axstride == 1 ? "N" : "T"; 1402 1403 if (try_blas & 4) 1404 transb = "C"; 1405 else 1406 transb = bxstride == 1 ? "N" : "T"; 1407 1408 gemm (transa, transb , &m, 1409 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1410 &ldc, 1, 1); 1411 return; 1412 } 1413 } 1414 1415 if (rxstride == 1 && axstride == 1 && bxstride == 1) 1416 { 1417 /* This block of code implements a tuned matmul, derived from 1418 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 1419 1420 Bo Kagstrom and Per Ling 1421 Department of Computing Science 1422 Umea University 1423 S-901 87 Umea, Sweden 1424 1425 from netlib.org, translated to C, and modified for matmul.m4. */ 1426 1427 const GFC_INTEGER_8 *a, *b; 1428 GFC_INTEGER_8 *c; 1429 const index_type m = xcount, n = ycount, k = count; 1430 1431 /* System generated locals */ 1432 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 1433 i1, i2, i3, i4, i5, i6; 1434 1435 /* Local variables */ 1436 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, 1437 f13, f14, f23, f24, f33, f34, f43, f44; 1438 index_type i, j, l, ii, jj, ll; 1439 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 1440 GFC_INTEGER_8 *t1; 1441 1442 a = abase; 1443 b = bbase; 1444 c = retarray->base_addr; 1445 1446 /* Parameter adjustments */ 1447 c_dim1 = rystride; 1448 c_offset = 1 + c_dim1; 1449 c -= c_offset; 1450 a_dim1 = aystride; 1451 a_offset = 1 + a_dim1; 1452 a -= a_offset; 1453 b_dim1 = bystride; 1454 b_offset = 1 + b_dim1; 1455 b -= b_offset; 1456 1457 /* Empty c first. */ 1458 for (j=1; j<=n; j++) 1459 for (i=1; i<=m; i++) 1460 c[i + j * c_dim1] = (GFC_INTEGER_8)0; 1461 1462 /* Early exit if possible */ 1463 if (m == 0 || n == 0 || k == 0) 1464 return; 1465 1466 /* Adjust size of t1 to what is needed. */ 1467 index_type t1_dim, a_sz; 1468 if (aystride == 1) 1469 a_sz = rystride; 1470 else 1471 a_sz = a_dim1; 1472 1473 t1_dim = a_sz * 256 + b_dim1; 1474 if (t1_dim > 65536) 1475 t1_dim = 65536; 1476 1477 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); 1478 1479 /* Start turning the crank. */ 1480 i1 = n; 1481 for (jj = 1; jj <= i1; jj += 512) 1482 { 1483 /* Computing MIN */ 1484 i2 = 512; 1485 i3 = n - jj + 1; 1486 jsec = min(i2,i3); 1487 ujsec = jsec - jsec % 4; 1488 i2 = k; 1489 for (ll = 1; ll <= i2; ll += 256) 1490 { 1491 /* Computing MIN */ 1492 i3 = 256; 1493 i4 = k - ll + 1; 1494 lsec = min(i3,i4); 1495 ulsec = lsec - lsec % 2; 1496 1497 i3 = m; 1498 for (ii = 1; ii <= i3; ii += 256) 1499 { 1500 /* Computing MIN */ 1501 i4 = 256; 1502 i5 = m - ii + 1; 1503 isec = min(i4,i5); 1504 uisec = isec - isec % 2; 1505 i4 = ll + ulsec - 1; 1506 for (l = ll; l <= i4; l += 2) 1507 { 1508 i5 = ii + uisec - 1; 1509 for (i = ii; i <= i5; i += 2) 1510 { 1511 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 1512 a[i + l * a_dim1]; 1513 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 1514 a[i + (l + 1) * a_dim1]; 1515 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 1516 a[i + 1 + l * a_dim1]; 1517 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 1518 a[i + 1 + (l + 1) * a_dim1]; 1519 } 1520 if (uisec < isec) 1521 { 1522 t1[l - ll + 1 + (isec << 8) - 257] = 1523 a[ii + isec - 1 + l * a_dim1]; 1524 t1[l - ll + 2 + (isec << 8) - 257] = 1525 a[ii + isec - 1 + (l + 1) * a_dim1]; 1526 } 1527 } 1528 if (ulsec < lsec) 1529 { 1530 i4 = ii + isec - 1; 1531 for (i = ii; i<= i4; ++i) 1532 { 1533 t1[lsec + ((i - ii + 1) << 8) - 257] = 1534 a[i + (ll + lsec - 1) * a_dim1]; 1535 } 1536 } 1537 1538 uisec = isec - isec % 4; 1539 i4 = jj + ujsec - 1; 1540 for (j = jj; j <= i4; j += 4) 1541 { 1542 i5 = ii + uisec - 1; 1543 for (i = ii; i <= i5; i += 4) 1544 { 1545 f11 = c[i + j * c_dim1]; 1546 f21 = c[i + 1 + j * c_dim1]; 1547 f12 = c[i + (j + 1) * c_dim1]; 1548 f22 = c[i + 1 + (j + 1) * c_dim1]; 1549 f13 = c[i + (j + 2) * c_dim1]; 1550 f23 = c[i + 1 + (j + 2) * c_dim1]; 1551 f14 = c[i + (j + 3) * c_dim1]; 1552 f24 = c[i + 1 + (j + 3) * c_dim1]; 1553 f31 = c[i + 2 + j * c_dim1]; 1554 f41 = c[i + 3 + j * c_dim1]; 1555 f32 = c[i + 2 + (j + 1) * c_dim1]; 1556 f42 = c[i + 3 + (j + 1) * c_dim1]; 1557 f33 = c[i + 2 + (j + 2) * c_dim1]; 1558 f43 = c[i + 3 + (j + 2) * c_dim1]; 1559 f34 = c[i + 2 + (j + 3) * c_dim1]; 1560 f44 = c[i + 3 + (j + 3) * c_dim1]; 1561 i6 = ll + lsec - 1; 1562 for (l = ll; l <= i6; ++l) 1563 { 1564 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1565 * b[l + j * b_dim1]; 1566 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1567 * b[l + j * b_dim1]; 1568 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1569 * b[l + (j + 1) * b_dim1]; 1570 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1571 * b[l + (j + 1) * b_dim1]; 1572 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1573 * b[l + (j + 2) * b_dim1]; 1574 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1575 * b[l + (j + 2) * b_dim1]; 1576 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1577 * b[l + (j + 3) * b_dim1]; 1578 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1579 * b[l + (j + 3) * b_dim1]; 1580 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1581 * b[l + j * b_dim1]; 1582 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1583 * b[l + j * b_dim1]; 1584 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1585 * b[l + (j + 1) * b_dim1]; 1586 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1587 * b[l + (j + 1) * b_dim1]; 1588 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1589 * b[l + (j + 2) * b_dim1]; 1590 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1591 * b[l + (j + 2) * b_dim1]; 1592 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1593 * b[l + (j + 3) * b_dim1]; 1594 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1595 * b[l + (j + 3) * b_dim1]; 1596 } 1597 c[i + j * c_dim1] = f11; 1598 c[i + 1 + j * c_dim1] = f21; 1599 c[i + (j + 1) * c_dim1] = f12; 1600 c[i + 1 + (j + 1) * c_dim1] = f22; 1601 c[i + (j + 2) * c_dim1] = f13; 1602 c[i + 1 + (j + 2) * c_dim1] = f23; 1603 c[i + (j + 3) * c_dim1] = f14; 1604 c[i + 1 + (j + 3) * c_dim1] = f24; 1605 c[i + 2 + j * c_dim1] = f31; 1606 c[i + 3 + j * c_dim1] = f41; 1607 c[i + 2 + (j + 1) * c_dim1] = f32; 1608 c[i + 3 + (j + 1) * c_dim1] = f42; 1609 c[i + 2 + (j + 2) * c_dim1] = f33; 1610 c[i + 3 + (j + 2) * c_dim1] = f43; 1611 c[i + 2 + (j + 3) * c_dim1] = f34; 1612 c[i + 3 + (j + 3) * c_dim1] = f44; 1613 } 1614 if (uisec < isec) 1615 { 1616 i5 = ii + isec - 1; 1617 for (i = ii + uisec; i <= i5; ++i) 1618 { 1619 f11 = c[i + j * c_dim1]; 1620 f12 = c[i + (j + 1) * c_dim1]; 1621 f13 = c[i + (j + 2) * c_dim1]; 1622 f14 = c[i + (j + 3) * c_dim1]; 1623 i6 = ll + lsec - 1; 1624 for (l = ll; l <= i6; ++l) 1625 { 1626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1627 257] * b[l + j * b_dim1]; 1628 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1629 257] * b[l + (j + 1) * b_dim1]; 1630 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1631 257] * b[l + (j + 2) * b_dim1]; 1632 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1633 257] * b[l + (j + 3) * b_dim1]; 1634 } 1635 c[i + j * c_dim1] = f11; 1636 c[i + (j + 1) * c_dim1] = f12; 1637 c[i + (j + 2) * c_dim1] = f13; 1638 c[i + (j + 3) * c_dim1] = f14; 1639 } 1640 } 1641 } 1642 if (ujsec < jsec) 1643 { 1644 i4 = jj + jsec - 1; 1645 for (j = jj + ujsec; j <= i4; ++j) 1646 { 1647 i5 = ii + uisec - 1; 1648 for (i = ii; i <= i5; i += 4) 1649 { 1650 f11 = c[i + j * c_dim1]; 1651 f21 = c[i + 1 + j * c_dim1]; 1652 f31 = c[i + 2 + j * c_dim1]; 1653 f41 = c[i + 3 + j * c_dim1]; 1654 i6 = ll + lsec - 1; 1655 for (l = ll; l <= i6; ++l) 1656 { 1657 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1658 257] * b[l + j * b_dim1]; 1659 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1660 257] * b[l + j * b_dim1]; 1661 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1662 257] * b[l + j * b_dim1]; 1663 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1664 257] * b[l + j * b_dim1]; 1665 } 1666 c[i + j * c_dim1] = f11; 1667 c[i + 1 + j * c_dim1] = f21; 1668 c[i + 2 + j * c_dim1] = f31; 1669 c[i + 3 + j * c_dim1] = f41; 1670 } 1671 i5 = ii + isec - 1; 1672 for (i = ii + uisec; i <= i5; ++i) 1673 { 1674 f11 = c[i + j * c_dim1]; 1675 i6 = ll + lsec - 1; 1676 for (l = ll; l <= i6; ++l) 1677 { 1678 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1679 257] * b[l + j * b_dim1]; 1680 } 1681 c[i + j * c_dim1] = f11; 1682 } 1683 } 1684 } 1685 } 1686 } 1687 } 1688 free(t1); 1689 return; 1690 } 1691 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1692 { 1693 if (GFC_DESCRIPTOR_RANK (a) != 1) 1694 { 1695 const GFC_INTEGER_8 *restrict abase_x; 1696 const GFC_INTEGER_8 *restrict bbase_y; 1697 GFC_INTEGER_8 *restrict dest_y; 1698 GFC_INTEGER_8 s; 1699 1700 for (y = 0; y < ycount; y++) 1701 { 1702 bbase_y = &bbase[y*bystride]; 1703 dest_y = &dest[y*rystride]; 1704 for (x = 0; x < xcount; x++) 1705 { 1706 abase_x = &abase[x*axstride]; 1707 s = (GFC_INTEGER_8) 0; 1708 for (n = 0; n < count; n++) 1709 s += abase_x[n] * bbase_y[n]; 1710 dest_y[x] = s; 1711 } 1712 } 1713 } 1714 else 1715 { 1716 const GFC_INTEGER_8 *restrict bbase_y; 1717 GFC_INTEGER_8 s; 1718 1719 for (y = 0; y < ycount; y++) 1720 { 1721 bbase_y = &bbase[y*bystride]; 1722 s = (GFC_INTEGER_8) 0; 1723 for (n = 0; n < count; n++) 1724 s += abase[n*axstride] * bbase_y[n]; 1725 dest[y*rystride] = s; 1726 } 1727 } 1728 } 1729 else if (axstride < aystride) 1730 { 1731 for (y = 0; y < ycount; y++) 1732 for (x = 0; x < xcount; x++) 1733 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; 1734 1735 for (y = 0; y < ycount; y++) 1736 for (n = 0; n < count; n++) 1737 for (x = 0; x < xcount; x++) 1738 /* dest[x,y] += a[x,n] * b[n,y] */ 1739 dest[x*rxstride + y*rystride] += 1740 abase[x*axstride + n*aystride] * 1741 bbase[n*bxstride + y*bystride]; 1742 } 1743 else if (GFC_DESCRIPTOR_RANK (a) == 1) 1744 { 1745 const GFC_INTEGER_8 *restrict bbase_y; 1746 GFC_INTEGER_8 s; 1747 1748 for (y = 0; y < ycount; y++) 1749 { 1750 bbase_y = &bbase[y*bystride]; 1751 s = (GFC_INTEGER_8) 0; 1752 for (n = 0; n < count; n++) 1753 s += abase[n*axstride] * bbase_y[n*bxstride]; 1754 dest[y*rxstride] = s; 1755 } 1756 } 1757 else 1758 { 1759 const GFC_INTEGER_8 *restrict abase_x; 1760 const GFC_INTEGER_8 *restrict bbase_y; 1761 GFC_INTEGER_8 *restrict dest_y; 1762 GFC_INTEGER_8 s; 1763 1764 for (y = 0; y < ycount; y++) 1765 { 1766 bbase_y = &bbase[y*bystride]; 1767 dest_y = &dest[y*rystride]; 1768 for (x = 0; x < xcount; x++) 1769 { 1770 abase_x = &abase[x*axstride]; 1771 s = (GFC_INTEGER_8) 0; 1772 for (n = 0; n < count; n++) 1773 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1774 dest_y[x*rxstride] = s; 1775 } 1776 } 1777 } 1778} 1779#undef POW3 1780#undef min 1781#undef max 1782 1783#endif /* HAVE_AVX512F */ 1784 1785/* AMD-specifix funtions with AVX128 and FMA3/FMA4. */ 1786 1787#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 1788void 1789matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray, 1790 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 1791 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); 1792internal_proto(matmul_i8_avx128_fma3); 1793#endif 1794 1795#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 1796void 1797matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray, 1798 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 1799 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); 1800internal_proto(matmul_i8_avx128_fma4); 1801#endif 1802 1803/* Function to fall back to if there is no special processor-specific version. */ 1804static void 1805matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, 1806 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 1807 int blas_limit, blas_call gemm) 1808{ 1809 const GFC_INTEGER_8 * restrict abase; 1810 const GFC_INTEGER_8 * restrict bbase; 1811 GFC_INTEGER_8 * restrict dest; 1812 1813 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1814 index_type x, y, n, count, xcount, ycount; 1815 1816 assert (GFC_DESCRIPTOR_RANK (a) == 2 1817 || GFC_DESCRIPTOR_RANK (b) == 2); 1818 1819/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1820 1821 Either A or B (but not both) can be rank 1: 1822 1823 o One-dimensional argument A is implicitly treated as a row matrix 1824 dimensioned [1,count], so xcount=1. 1825 1826 o One-dimensional argument B is implicitly treated as a column matrix 1827 dimensioned [count, 1], so ycount=1. 1828*/ 1829 1830 if (retarray->base_addr == NULL) 1831 { 1832 if (GFC_DESCRIPTOR_RANK (a) == 1) 1833 { 1834 GFC_DIMENSION_SET(retarray->dim[0], 0, 1835 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1836 } 1837 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1838 { 1839 GFC_DIMENSION_SET(retarray->dim[0], 0, 1840 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1841 } 1842 else 1843 { 1844 GFC_DIMENSION_SET(retarray->dim[0], 0, 1845 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1846 1847 GFC_DIMENSION_SET(retarray->dim[1], 0, 1848 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1849 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1850 } 1851 1852 retarray->base_addr 1853 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); 1854 retarray->offset = 0; 1855 } 1856 else if (unlikely (compile_options.bounds_check)) 1857 { 1858 index_type ret_extent, arg_extent; 1859 1860 if (GFC_DESCRIPTOR_RANK (a) == 1) 1861 { 1862 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1863 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1864 if (arg_extent != ret_extent) 1865 runtime_error ("Array bound mismatch for dimension 1 of " 1866 "array (%ld/%ld) ", 1867 (long int) ret_extent, (long int) arg_extent); 1868 } 1869 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1870 { 1871 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1872 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1873 if (arg_extent != ret_extent) 1874 runtime_error ("Array bound mismatch for dimension 1 of " 1875 "array (%ld/%ld) ", 1876 (long int) ret_extent, (long int) arg_extent); 1877 } 1878 else 1879 { 1880 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1881 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1882 if (arg_extent != ret_extent) 1883 runtime_error ("Array bound mismatch for dimension 1 of " 1884 "array (%ld/%ld) ", 1885 (long int) ret_extent, (long int) arg_extent); 1886 1887 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1888 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1889 if (arg_extent != ret_extent) 1890 runtime_error ("Array bound mismatch for dimension 2 of " 1891 "array (%ld/%ld) ", 1892 (long int) ret_extent, (long int) arg_extent); 1893 } 1894 } 1895 1896 1897 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1898 { 1899 /* One-dimensional result may be addressed in the code below 1900 either as a row or a column matrix. We want both cases to 1901 work. */ 1902 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1903 } 1904 else 1905 { 1906 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1907 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1908 } 1909 1910 1911 if (GFC_DESCRIPTOR_RANK (a) == 1) 1912 { 1913 /* Treat it as a a row matrix A[1,count]. */ 1914 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1915 aystride = 1; 1916 1917 xcount = 1; 1918 count = GFC_DESCRIPTOR_EXTENT(a,0); 1919 } 1920 else 1921 { 1922 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1923 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1924 1925 count = GFC_DESCRIPTOR_EXTENT(a,1); 1926 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1927 } 1928 1929 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1930 { 1931 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1932 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1933 "in dimension 1: is %ld, should be %ld", 1934 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1935 } 1936 1937 if (GFC_DESCRIPTOR_RANK (b) == 1) 1938 { 1939 /* Treat it as a column matrix B[count,1] */ 1940 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1941 1942 /* bystride should never be used for 1-dimensional b. 1943 The value is only used for calculation of the 1944 memory by the buffer. */ 1945 bystride = 256; 1946 ycount = 1; 1947 } 1948 else 1949 { 1950 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1951 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1952 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1953 } 1954 1955 abase = a->base_addr; 1956 bbase = b->base_addr; 1957 dest = retarray->base_addr; 1958 1959 /* Now that everything is set up, we perform the multiplication 1960 itself. */ 1961 1962#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1963#define min(a,b) ((a) <= (b) ? (a) : (b)) 1964#define max(a,b) ((a) >= (b) ? (a) : (b)) 1965 1966 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1967 && (bxstride == 1 || bystride == 1) 1968 && (((float) xcount) * ((float) ycount) * ((float) count) 1969 > POW3(blas_limit))) 1970 { 1971 const int m = xcount, n = ycount, k = count, ldc = rystride; 1972 const GFC_INTEGER_8 one = 1, zero = 0; 1973 const int lda = (axstride == 1) ? aystride : axstride, 1974 ldb = (bxstride == 1) ? bystride : bxstride; 1975 1976 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1977 { 1978 assert (gemm != NULL); 1979 const char *transa, *transb; 1980 if (try_blas & 2) 1981 transa = "C"; 1982 else 1983 transa = axstride == 1 ? "N" : "T"; 1984 1985 if (try_blas & 4) 1986 transb = "C"; 1987 else 1988 transb = bxstride == 1 ? "N" : "T"; 1989 1990 gemm (transa, transb , &m, 1991 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1992 &ldc, 1, 1); 1993 return; 1994 } 1995 } 1996 1997 if (rxstride == 1 && axstride == 1 && bxstride == 1) 1998 { 1999 /* This block of code implements a tuned matmul, derived from 2000 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2001 2002 Bo Kagstrom and Per Ling 2003 Department of Computing Science 2004 Umea University 2005 S-901 87 Umea, Sweden 2006 2007 from netlib.org, translated to C, and modified for matmul.m4. */ 2008 2009 const GFC_INTEGER_8 *a, *b; 2010 GFC_INTEGER_8 *c; 2011 const index_type m = xcount, n = ycount, k = count; 2012 2013 /* System generated locals */ 2014 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2015 i1, i2, i3, i4, i5, i6; 2016 2017 /* Local variables */ 2018 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, 2019 f13, f14, f23, f24, f33, f34, f43, f44; 2020 index_type i, j, l, ii, jj, ll; 2021 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2022 GFC_INTEGER_8 *t1; 2023 2024 a = abase; 2025 b = bbase; 2026 c = retarray->base_addr; 2027 2028 /* Parameter adjustments */ 2029 c_dim1 = rystride; 2030 c_offset = 1 + c_dim1; 2031 c -= c_offset; 2032 a_dim1 = aystride; 2033 a_offset = 1 + a_dim1; 2034 a -= a_offset; 2035 b_dim1 = bystride; 2036 b_offset = 1 + b_dim1; 2037 b -= b_offset; 2038 2039 /* Empty c first. */ 2040 for (j=1; j<=n; j++) 2041 for (i=1; i<=m; i++) 2042 c[i + j * c_dim1] = (GFC_INTEGER_8)0; 2043 2044 /* Early exit if possible */ 2045 if (m == 0 || n == 0 || k == 0) 2046 return; 2047 2048 /* Adjust size of t1 to what is needed. */ 2049 index_type t1_dim, a_sz; 2050 if (aystride == 1) 2051 a_sz = rystride; 2052 else 2053 a_sz = a_dim1; 2054 2055 t1_dim = a_sz * 256 + b_dim1; 2056 if (t1_dim > 65536) 2057 t1_dim = 65536; 2058 2059 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); 2060 2061 /* Start turning the crank. */ 2062 i1 = n; 2063 for (jj = 1; jj <= i1; jj += 512) 2064 { 2065 /* Computing MIN */ 2066 i2 = 512; 2067 i3 = n - jj + 1; 2068 jsec = min(i2,i3); 2069 ujsec = jsec - jsec % 4; 2070 i2 = k; 2071 for (ll = 1; ll <= i2; ll += 256) 2072 { 2073 /* Computing MIN */ 2074 i3 = 256; 2075 i4 = k - ll + 1; 2076 lsec = min(i3,i4); 2077 ulsec = lsec - lsec % 2; 2078 2079 i3 = m; 2080 for (ii = 1; ii <= i3; ii += 256) 2081 { 2082 /* Computing MIN */ 2083 i4 = 256; 2084 i5 = m - ii + 1; 2085 isec = min(i4,i5); 2086 uisec = isec - isec % 2; 2087 i4 = ll + ulsec - 1; 2088 for (l = ll; l <= i4; l += 2) 2089 { 2090 i5 = ii + uisec - 1; 2091 for (i = ii; i <= i5; i += 2) 2092 { 2093 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2094 a[i + l * a_dim1]; 2095 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2096 a[i + (l + 1) * a_dim1]; 2097 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2098 a[i + 1 + l * a_dim1]; 2099 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2100 a[i + 1 + (l + 1) * a_dim1]; 2101 } 2102 if (uisec < isec) 2103 { 2104 t1[l - ll + 1 + (isec << 8) - 257] = 2105 a[ii + isec - 1 + l * a_dim1]; 2106 t1[l - ll + 2 + (isec << 8) - 257] = 2107 a[ii + isec - 1 + (l + 1) * a_dim1]; 2108 } 2109 } 2110 if (ulsec < lsec) 2111 { 2112 i4 = ii + isec - 1; 2113 for (i = ii; i<= i4; ++i) 2114 { 2115 t1[lsec + ((i - ii + 1) << 8) - 257] = 2116 a[i + (ll + lsec - 1) * a_dim1]; 2117 } 2118 } 2119 2120 uisec = isec - isec % 4; 2121 i4 = jj + ujsec - 1; 2122 for (j = jj; j <= i4; j += 4) 2123 { 2124 i5 = ii + uisec - 1; 2125 for (i = ii; i <= i5; i += 4) 2126 { 2127 f11 = c[i + j * c_dim1]; 2128 f21 = c[i + 1 + j * c_dim1]; 2129 f12 = c[i + (j + 1) * c_dim1]; 2130 f22 = c[i + 1 + (j + 1) * c_dim1]; 2131 f13 = c[i + (j + 2) * c_dim1]; 2132 f23 = c[i + 1 + (j + 2) * c_dim1]; 2133 f14 = c[i + (j + 3) * c_dim1]; 2134 f24 = c[i + 1 + (j + 3) * c_dim1]; 2135 f31 = c[i + 2 + j * c_dim1]; 2136 f41 = c[i + 3 + j * c_dim1]; 2137 f32 = c[i + 2 + (j + 1) * c_dim1]; 2138 f42 = c[i + 3 + (j + 1) * c_dim1]; 2139 f33 = c[i + 2 + (j + 2) * c_dim1]; 2140 f43 = c[i + 3 + (j + 2) * c_dim1]; 2141 f34 = c[i + 2 + (j + 3) * c_dim1]; 2142 f44 = c[i + 3 + (j + 3) * c_dim1]; 2143 i6 = ll + lsec - 1; 2144 for (l = ll; l <= i6; ++l) 2145 { 2146 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2147 * b[l + j * b_dim1]; 2148 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2149 * b[l + j * b_dim1]; 2150 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2151 * b[l + (j + 1) * b_dim1]; 2152 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2153 * b[l + (j + 1) * b_dim1]; 2154 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2155 * b[l + (j + 2) * b_dim1]; 2156 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2157 * b[l + (j + 2) * b_dim1]; 2158 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2159 * b[l + (j + 3) * b_dim1]; 2160 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2161 * b[l + (j + 3) * b_dim1]; 2162 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2163 * b[l + j * b_dim1]; 2164 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2165 * b[l + j * b_dim1]; 2166 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2167 * b[l + (j + 1) * b_dim1]; 2168 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2169 * b[l + (j + 1) * b_dim1]; 2170 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2171 * b[l + (j + 2) * b_dim1]; 2172 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2173 * b[l + (j + 2) * b_dim1]; 2174 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2175 * b[l + (j + 3) * b_dim1]; 2176 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2177 * b[l + (j + 3) * b_dim1]; 2178 } 2179 c[i + j * c_dim1] = f11; 2180 c[i + 1 + j * c_dim1] = f21; 2181 c[i + (j + 1) * c_dim1] = f12; 2182 c[i + 1 + (j + 1) * c_dim1] = f22; 2183 c[i + (j + 2) * c_dim1] = f13; 2184 c[i + 1 + (j + 2) * c_dim1] = f23; 2185 c[i + (j + 3) * c_dim1] = f14; 2186 c[i + 1 + (j + 3) * c_dim1] = f24; 2187 c[i + 2 + j * c_dim1] = f31; 2188 c[i + 3 + j * c_dim1] = f41; 2189 c[i + 2 + (j + 1) * c_dim1] = f32; 2190 c[i + 3 + (j + 1) * c_dim1] = f42; 2191 c[i + 2 + (j + 2) * c_dim1] = f33; 2192 c[i + 3 + (j + 2) * c_dim1] = f43; 2193 c[i + 2 + (j + 3) * c_dim1] = f34; 2194 c[i + 3 + (j + 3) * c_dim1] = f44; 2195 } 2196 if (uisec < isec) 2197 { 2198 i5 = ii + isec - 1; 2199 for (i = ii + uisec; i <= i5; ++i) 2200 { 2201 f11 = c[i + j * c_dim1]; 2202 f12 = c[i + (j + 1) * c_dim1]; 2203 f13 = c[i + (j + 2) * c_dim1]; 2204 f14 = c[i + (j + 3) * c_dim1]; 2205 i6 = ll + lsec - 1; 2206 for (l = ll; l <= i6; ++l) 2207 { 2208 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2209 257] * b[l + j * b_dim1]; 2210 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2211 257] * b[l + (j + 1) * b_dim1]; 2212 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2213 257] * b[l + (j + 2) * b_dim1]; 2214 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2215 257] * b[l + (j + 3) * b_dim1]; 2216 } 2217 c[i + j * c_dim1] = f11; 2218 c[i + (j + 1) * c_dim1] = f12; 2219 c[i + (j + 2) * c_dim1] = f13; 2220 c[i + (j + 3) * c_dim1] = f14; 2221 } 2222 } 2223 } 2224 if (ujsec < jsec) 2225 { 2226 i4 = jj + jsec - 1; 2227 for (j = jj + ujsec; j <= i4; ++j) 2228 { 2229 i5 = ii + uisec - 1; 2230 for (i = ii; i <= i5; i += 4) 2231 { 2232 f11 = c[i + j * c_dim1]; 2233 f21 = c[i + 1 + j * c_dim1]; 2234 f31 = c[i + 2 + j * c_dim1]; 2235 f41 = c[i + 3 + j * c_dim1]; 2236 i6 = ll + lsec - 1; 2237 for (l = ll; l <= i6; ++l) 2238 { 2239 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2240 257] * b[l + j * b_dim1]; 2241 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2242 257] * b[l + j * b_dim1]; 2243 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2244 257] * b[l + j * b_dim1]; 2245 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2246 257] * b[l + j * b_dim1]; 2247 } 2248 c[i + j * c_dim1] = f11; 2249 c[i + 1 + j * c_dim1] = f21; 2250 c[i + 2 + j * c_dim1] = f31; 2251 c[i + 3 + j * c_dim1] = f41; 2252 } 2253 i5 = ii + isec - 1; 2254 for (i = ii + uisec; i <= i5; ++i) 2255 { 2256 f11 = c[i + j * c_dim1]; 2257 i6 = ll + lsec - 1; 2258 for (l = ll; l <= i6; ++l) 2259 { 2260 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2261 257] * b[l + j * b_dim1]; 2262 } 2263 c[i + j * c_dim1] = f11; 2264 } 2265 } 2266 } 2267 } 2268 } 2269 } 2270 free(t1); 2271 return; 2272 } 2273 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2274 { 2275 if (GFC_DESCRIPTOR_RANK (a) != 1) 2276 { 2277 const GFC_INTEGER_8 *restrict abase_x; 2278 const GFC_INTEGER_8 *restrict bbase_y; 2279 GFC_INTEGER_8 *restrict dest_y; 2280 GFC_INTEGER_8 s; 2281 2282 for (y = 0; y < ycount; y++) 2283 { 2284 bbase_y = &bbase[y*bystride]; 2285 dest_y = &dest[y*rystride]; 2286 for (x = 0; x < xcount; x++) 2287 { 2288 abase_x = &abase[x*axstride]; 2289 s = (GFC_INTEGER_8) 0; 2290 for (n = 0; n < count; n++) 2291 s += abase_x[n] * bbase_y[n]; 2292 dest_y[x] = s; 2293 } 2294 } 2295 } 2296 else 2297 { 2298 const GFC_INTEGER_8 *restrict bbase_y; 2299 GFC_INTEGER_8 s; 2300 2301 for (y = 0; y < ycount; y++) 2302 { 2303 bbase_y = &bbase[y*bystride]; 2304 s = (GFC_INTEGER_8) 0; 2305 for (n = 0; n < count; n++) 2306 s += abase[n*axstride] * bbase_y[n]; 2307 dest[y*rystride] = s; 2308 } 2309 } 2310 } 2311 else if (axstride < aystride) 2312 { 2313 for (y = 0; y < ycount; y++) 2314 for (x = 0; x < xcount; x++) 2315 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; 2316 2317 for (y = 0; y < ycount; y++) 2318 for (n = 0; n < count; n++) 2319 for (x = 0; x < xcount; x++) 2320 /* dest[x,y] += a[x,n] * b[n,y] */ 2321 dest[x*rxstride + y*rystride] += 2322 abase[x*axstride + n*aystride] * 2323 bbase[n*bxstride + y*bystride]; 2324 } 2325 else if (GFC_DESCRIPTOR_RANK (a) == 1) 2326 { 2327 const GFC_INTEGER_8 *restrict bbase_y; 2328 GFC_INTEGER_8 s; 2329 2330 for (y = 0; y < ycount; y++) 2331 { 2332 bbase_y = &bbase[y*bystride]; 2333 s = (GFC_INTEGER_8) 0; 2334 for (n = 0; n < count; n++) 2335 s += abase[n*axstride] * bbase_y[n*bxstride]; 2336 dest[y*rxstride] = s; 2337 } 2338 } 2339 else 2340 { 2341 const GFC_INTEGER_8 *restrict abase_x; 2342 const GFC_INTEGER_8 *restrict bbase_y; 2343 GFC_INTEGER_8 *restrict dest_y; 2344 GFC_INTEGER_8 s; 2345 2346 for (y = 0; y < ycount; y++) 2347 { 2348 bbase_y = &bbase[y*bystride]; 2349 dest_y = &dest[y*rystride]; 2350 for (x = 0; x < xcount; x++) 2351 { 2352 abase_x = &abase[x*axstride]; 2353 s = (GFC_INTEGER_8) 0; 2354 for (n = 0; n < count; n++) 2355 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 2356 dest_y[x*rxstride] = s; 2357 } 2358 } 2359 } 2360} 2361#undef POW3 2362#undef min 2363#undef max 2364 2365 2366/* Compiling main function, with selection code for the processor. */ 2367 2368/* Currently, this is i386 only. Adjust for other architectures. */ 2369 2370#include <config/i386/cpuinfo.h> 2371void matmul_i8 (gfc_array_i8 * const restrict retarray, 2372 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 2373 int blas_limit, blas_call gemm) 2374{ 2375 static void (*matmul_p) (gfc_array_i8 * const restrict retarray, 2376 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 2377 int blas_limit, blas_call gemm); 2378 2379 void (*matmul_fn) (gfc_array_i8 * const restrict retarray, 2380 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 2381 int blas_limit, blas_call gemm); 2382 2383 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); 2384 if (matmul_fn == NULL) 2385 { 2386 matmul_fn = matmul_i8_vanilla; 2387 if (__cpu_model.__cpu_vendor == VENDOR_INTEL) 2388 { 2389 /* Run down the available processors in order of preference. */ 2390#ifdef HAVE_AVX512F 2391 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) 2392 { 2393 matmul_fn = matmul_i8_avx512f; 2394 goto store; 2395 } 2396 2397#endif /* HAVE_AVX512F */ 2398 2399#ifdef HAVE_AVX2 2400 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) 2401 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) 2402 { 2403 matmul_fn = matmul_i8_avx2; 2404 goto store; 2405 } 2406 2407#endif 2408 2409#ifdef HAVE_AVX 2410 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) 2411 { 2412 matmul_fn = matmul_i8_avx; 2413 goto store; 2414 } 2415#endif /* HAVE_AVX */ 2416 } 2417 else if (__cpu_model.__cpu_vendor == VENDOR_AMD) 2418 { 2419#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 2420 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) 2421 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) 2422 { 2423 matmul_fn = matmul_i8_avx128_fma3; 2424 goto store; 2425 } 2426#endif 2427#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 2428 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) 2429 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4))) 2430 { 2431 matmul_fn = matmul_i8_avx128_fma4; 2432 goto store; 2433 } 2434#endif 2435 2436 } 2437 store: 2438 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED); 2439 } 2440 2441 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm); 2442} 2443 2444#else /* Just the vanilla function. */ 2445 2446void 2447matmul_i8 (gfc_array_i8 * const restrict retarray, 2448 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, 2449 int blas_limit, blas_call gemm) 2450{ 2451 const GFC_INTEGER_8 * restrict abase; 2452 const GFC_INTEGER_8 * restrict bbase; 2453 GFC_INTEGER_8 * restrict dest; 2454 2455 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 2456 index_type x, y, n, count, xcount, ycount; 2457 2458 assert (GFC_DESCRIPTOR_RANK (a) == 2 2459 || GFC_DESCRIPTOR_RANK (b) == 2); 2460 2461/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 2462 2463 Either A or B (but not both) can be rank 1: 2464 2465 o One-dimensional argument A is implicitly treated as a row matrix 2466 dimensioned [1,count], so xcount=1. 2467 2468 o One-dimensional argument B is implicitly treated as a column matrix 2469 dimensioned [count, 1], so ycount=1. 2470*/ 2471 2472 if (retarray->base_addr == NULL) 2473 { 2474 if (GFC_DESCRIPTOR_RANK (a) == 1) 2475 { 2476 GFC_DIMENSION_SET(retarray->dim[0], 0, 2477 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 2478 } 2479 else if (GFC_DESCRIPTOR_RANK (b) == 1) 2480 { 2481 GFC_DIMENSION_SET(retarray->dim[0], 0, 2482 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 2483 } 2484 else 2485 { 2486 GFC_DIMENSION_SET(retarray->dim[0], 0, 2487 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 2488 2489 GFC_DIMENSION_SET(retarray->dim[1], 0, 2490 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 2491 GFC_DESCRIPTOR_EXTENT(retarray,0)); 2492 } 2493 2494 retarray->base_addr 2495 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); 2496 retarray->offset = 0; 2497 } 2498 else if (unlikely (compile_options.bounds_check)) 2499 { 2500 index_type ret_extent, arg_extent; 2501 2502 if (GFC_DESCRIPTOR_RANK (a) == 1) 2503 { 2504 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 2505 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2506 if (arg_extent != ret_extent) 2507 runtime_error ("Array bound mismatch for dimension 1 of " 2508 "array (%ld/%ld) ", 2509 (long int) ret_extent, (long int) arg_extent); 2510 } 2511 else if (GFC_DESCRIPTOR_RANK (b) == 1) 2512 { 2513 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 2514 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2515 if (arg_extent != ret_extent) 2516 runtime_error ("Array bound mismatch for dimension 1 of " 2517 "array (%ld/%ld) ", 2518 (long int) ret_extent, (long int) arg_extent); 2519 } 2520 else 2521 { 2522 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 2523 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2524 if (arg_extent != ret_extent) 2525 runtime_error ("Array bound mismatch for dimension 1 of " 2526 "array (%ld/%ld) ", 2527 (long int) ret_extent, (long int) arg_extent); 2528 2529 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 2530 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 2531 if (arg_extent != ret_extent) 2532 runtime_error ("Array bound mismatch for dimension 2 of " 2533 "array (%ld/%ld) ", 2534 (long int) ret_extent, (long int) arg_extent); 2535 } 2536 } 2537 2538 2539 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 2540 { 2541 /* One-dimensional result may be addressed in the code below 2542 either as a row or a column matrix. We want both cases to 2543 work. */ 2544 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 2545 } 2546 else 2547 { 2548 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 2549 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 2550 } 2551 2552 2553 if (GFC_DESCRIPTOR_RANK (a) == 1) 2554 { 2555 /* Treat it as a a row matrix A[1,count]. */ 2556 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 2557 aystride = 1; 2558 2559 xcount = 1; 2560 count = GFC_DESCRIPTOR_EXTENT(a,0); 2561 } 2562 else 2563 { 2564 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 2565 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 2566 2567 count = GFC_DESCRIPTOR_EXTENT(a,1); 2568 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 2569 } 2570 2571 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 2572 { 2573 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 2574 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 2575 "in dimension 1: is %ld, should be %ld", 2576 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 2577 } 2578 2579 if (GFC_DESCRIPTOR_RANK (b) == 1) 2580 { 2581 /* Treat it as a column matrix B[count,1] */ 2582 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2583 2584 /* bystride should never be used for 1-dimensional b. 2585 The value is only used for calculation of the 2586 memory by the buffer. */ 2587 bystride = 256; 2588 ycount = 1; 2589 } 2590 else 2591 { 2592 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2593 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 2594 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 2595 } 2596 2597 abase = a->base_addr; 2598 bbase = b->base_addr; 2599 dest = retarray->base_addr; 2600 2601 /* Now that everything is set up, we perform the multiplication 2602 itself. */ 2603 2604#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 2605#define min(a,b) ((a) <= (b) ? (a) : (b)) 2606#define max(a,b) ((a) >= (b) ? (a) : (b)) 2607 2608 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 2609 && (bxstride == 1 || bystride == 1) 2610 && (((float) xcount) * ((float) ycount) * ((float) count) 2611 > POW3(blas_limit))) 2612 { 2613 const int m = xcount, n = ycount, k = count, ldc = rystride; 2614 const GFC_INTEGER_8 one = 1, zero = 0; 2615 const int lda = (axstride == 1) ? aystride : axstride, 2616 ldb = (bxstride == 1) ? bystride : bxstride; 2617 2618 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 2619 { 2620 assert (gemm != NULL); 2621 const char *transa, *transb; 2622 if (try_blas & 2) 2623 transa = "C"; 2624 else 2625 transa = axstride == 1 ? "N" : "T"; 2626 2627 if (try_blas & 4) 2628 transb = "C"; 2629 else 2630 transb = bxstride == 1 ? "N" : "T"; 2631 2632 gemm (transa, transb , &m, 2633 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 2634 &ldc, 1, 1); 2635 return; 2636 } 2637 } 2638 2639 if (rxstride == 1 && axstride == 1 && bxstride == 1) 2640 { 2641 /* This block of code implements a tuned matmul, derived from 2642 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2643 2644 Bo Kagstrom and Per Ling 2645 Department of Computing Science 2646 Umea University 2647 S-901 87 Umea, Sweden 2648 2649 from netlib.org, translated to C, and modified for matmul.m4. */ 2650 2651 const GFC_INTEGER_8 *a, *b; 2652 GFC_INTEGER_8 *c; 2653 const index_type m = xcount, n = ycount, k = count; 2654 2655 /* System generated locals */ 2656 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2657 i1, i2, i3, i4, i5, i6; 2658 2659 /* Local variables */ 2660 GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, 2661 f13, f14, f23, f24, f33, f34, f43, f44; 2662 index_type i, j, l, ii, jj, ll; 2663 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2664 GFC_INTEGER_8 *t1; 2665 2666 a = abase; 2667 b = bbase; 2668 c = retarray->base_addr; 2669 2670 /* Parameter adjustments */ 2671 c_dim1 = rystride; 2672 c_offset = 1 + c_dim1; 2673 c -= c_offset; 2674 a_dim1 = aystride; 2675 a_offset = 1 + a_dim1; 2676 a -= a_offset; 2677 b_dim1 = bystride; 2678 b_offset = 1 + b_dim1; 2679 b -= b_offset; 2680 2681 /* Empty c first. */ 2682 for (j=1; j<=n; j++) 2683 for (i=1; i<=m; i++) 2684 c[i + j * c_dim1] = (GFC_INTEGER_8)0; 2685 2686 /* Early exit if possible */ 2687 if (m == 0 || n == 0 || k == 0) 2688 return; 2689 2690 /* Adjust size of t1 to what is needed. */ 2691 index_type t1_dim, a_sz; 2692 if (aystride == 1) 2693 a_sz = rystride; 2694 else 2695 a_sz = a_dim1; 2696 2697 t1_dim = a_sz * 256 + b_dim1; 2698 if (t1_dim > 65536) 2699 t1_dim = 65536; 2700 2701 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); 2702 2703 /* Start turning the crank. */ 2704 i1 = n; 2705 for (jj = 1; jj <= i1; jj += 512) 2706 { 2707 /* Computing MIN */ 2708 i2 = 512; 2709 i3 = n - jj + 1; 2710 jsec = min(i2,i3); 2711 ujsec = jsec - jsec % 4; 2712 i2 = k; 2713 for (ll = 1; ll <= i2; ll += 256) 2714 { 2715 /* Computing MIN */ 2716 i3 = 256; 2717 i4 = k - ll + 1; 2718 lsec = min(i3,i4); 2719 ulsec = lsec - lsec % 2; 2720 2721 i3 = m; 2722 for (ii = 1; ii <= i3; ii += 256) 2723 { 2724 /* Computing MIN */ 2725 i4 = 256; 2726 i5 = m - ii + 1; 2727 isec = min(i4,i5); 2728 uisec = isec - isec % 2; 2729 i4 = ll + ulsec - 1; 2730 for (l = ll; l <= i4; l += 2) 2731 { 2732 i5 = ii + uisec - 1; 2733 for (i = ii; i <= i5; i += 2) 2734 { 2735 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2736 a[i + l * a_dim1]; 2737 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2738 a[i + (l + 1) * a_dim1]; 2739 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2740 a[i + 1 + l * a_dim1]; 2741 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2742 a[i + 1 + (l + 1) * a_dim1]; 2743 } 2744 if (uisec < isec) 2745 { 2746 t1[l - ll + 1 + (isec << 8) - 257] = 2747 a[ii + isec - 1 + l * a_dim1]; 2748 t1[l - ll + 2 + (isec << 8) - 257] = 2749 a[ii + isec - 1 + (l + 1) * a_dim1]; 2750 } 2751 } 2752 if (ulsec < lsec) 2753 { 2754 i4 = ii + isec - 1; 2755 for (i = ii; i<= i4; ++i) 2756 { 2757 t1[lsec + ((i - ii + 1) << 8) - 257] = 2758 a[i + (ll + lsec - 1) * a_dim1]; 2759 } 2760 } 2761 2762 uisec = isec - isec % 4; 2763 i4 = jj + ujsec - 1; 2764 for (j = jj; j <= i4; j += 4) 2765 { 2766 i5 = ii + uisec - 1; 2767 for (i = ii; i <= i5; i += 4) 2768 { 2769 f11 = c[i + j * c_dim1]; 2770 f21 = c[i + 1 + j * c_dim1]; 2771 f12 = c[i + (j + 1) * c_dim1]; 2772 f22 = c[i + 1 + (j + 1) * c_dim1]; 2773 f13 = c[i + (j + 2) * c_dim1]; 2774 f23 = c[i + 1 + (j + 2) * c_dim1]; 2775 f14 = c[i + (j + 3) * c_dim1]; 2776 f24 = c[i + 1 + (j + 3) * c_dim1]; 2777 f31 = c[i + 2 + j * c_dim1]; 2778 f41 = c[i + 3 + j * c_dim1]; 2779 f32 = c[i + 2 + (j + 1) * c_dim1]; 2780 f42 = c[i + 3 + (j + 1) * c_dim1]; 2781 f33 = c[i + 2 + (j + 2) * c_dim1]; 2782 f43 = c[i + 3 + (j + 2) * c_dim1]; 2783 f34 = c[i + 2 + (j + 3) * c_dim1]; 2784 f44 = c[i + 3 + (j + 3) * c_dim1]; 2785 i6 = ll + lsec - 1; 2786 for (l = ll; l <= i6; ++l) 2787 { 2788 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2789 * b[l + j * b_dim1]; 2790 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2791 * b[l + j * b_dim1]; 2792 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2793 * b[l + (j + 1) * b_dim1]; 2794 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2795 * b[l + (j + 1) * b_dim1]; 2796 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2797 * b[l + (j + 2) * b_dim1]; 2798 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2799 * b[l + (j + 2) * b_dim1]; 2800 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2801 * b[l + (j + 3) * b_dim1]; 2802 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2803 * b[l + (j + 3) * b_dim1]; 2804 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2805 * b[l + j * b_dim1]; 2806 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2807 * b[l + j * b_dim1]; 2808 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2809 * b[l + (j + 1) * b_dim1]; 2810 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2811 * b[l + (j + 1) * b_dim1]; 2812 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2813 * b[l + (j + 2) * b_dim1]; 2814 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2815 * b[l + (j + 2) * b_dim1]; 2816 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2817 * b[l + (j + 3) * b_dim1]; 2818 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2819 * b[l + (j + 3) * b_dim1]; 2820 } 2821 c[i + j * c_dim1] = f11; 2822 c[i + 1 + j * c_dim1] = f21; 2823 c[i + (j + 1) * c_dim1] = f12; 2824 c[i + 1 + (j + 1) * c_dim1] = f22; 2825 c[i + (j + 2) * c_dim1] = f13; 2826 c[i + 1 + (j + 2) * c_dim1] = f23; 2827 c[i + (j + 3) * c_dim1] = f14; 2828 c[i + 1 + (j + 3) * c_dim1] = f24; 2829 c[i + 2 + j * c_dim1] = f31; 2830 c[i + 3 + j * c_dim1] = f41; 2831 c[i + 2 + (j + 1) * c_dim1] = f32; 2832 c[i + 3 + (j + 1) * c_dim1] = f42; 2833 c[i + 2 + (j + 2) * c_dim1] = f33; 2834 c[i + 3 + (j + 2) * c_dim1] = f43; 2835 c[i + 2 + (j + 3) * c_dim1] = f34; 2836 c[i + 3 + (j + 3) * c_dim1] = f44; 2837 } 2838 if (uisec < isec) 2839 { 2840 i5 = ii + isec - 1; 2841 for (i = ii + uisec; i <= i5; ++i) 2842 { 2843 f11 = c[i + j * c_dim1]; 2844 f12 = c[i + (j + 1) * c_dim1]; 2845 f13 = c[i + (j + 2) * c_dim1]; 2846 f14 = c[i + (j + 3) * c_dim1]; 2847 i6 = ll + lsec - 1; 2848 for (l = ll; l <= i6; ++l) 2849 { 2850 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2851 257] * b[l + j * b_dim1]; 2852 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2853 257] * b[l + (j + 1) * b_dim1]; 2854 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2855 257] * b[l + (j + 2) * b_dim1]; 2856 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2857 257] * b[l + (j + 3) * b_dim1]; 2858 } 2859 c[i + j * c_dim1] = f11; 2860 c[i + (j + 1) * c_dim1] = f12; 2861 c[i + (j + 2) * c_dim1] = f13; 2862 c[i + (j + 3) * c_dim1] = f14; 2863 } 2864 } 2865 } 2866 if (ujsec < jsec) 2867 { 2868 i4 = jj + jsec - 1; 2869 for (j = jj + ujsec; j <= i4; ++j) 2870 { 2871 i5 = ii + uisec - 1; 2872 for (i = ii; i <= i5; i += 4) 2873 { 2874 f11 = c[i + j * c_dim1]; 2875 f21 = c[i + 1 + j * c_dim1]; 2876 f31 = c[i + 2 + j * c_dim1]; 2877 f41 = c[i + 3 + j * c_dim1]; 2878 i6 = ll + lsec - 1; 2879 for (l = ll; l <= i6; ++l) 2880 { 2881 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2882 257] * b[l + j * b_dim1]; 2883 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2884 257] * b[l + j * b_dim1]; 2885 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2886 257] * b[l + j * b_dim1]; 2887 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2888 257] * b[l + j * b_dim1]; 2889 } 2890 c[i + j * c_dim1] = f11; 2891 c[i + 1 + j * c_dim1] = f21; 2892 c[i + 2 + j * c_dim1] = f31; 2893 c[i + 3 + j * c_dim1] = f41; 2894 } 2895 i5 = ii + isec - 1; 2896 for (i = ii + uisec; i <= i5; ++i) 2897 { 2898 f11 = c[i + j * c_dim1]; 2899 i6 = ll + lsec - 1; 2900 for (l = ll; l <= i6; ++l) 2901 { 2902 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2903 257] * b[l + j * b_dim1]; 2904 } 2905 c[i + j * c_dim1] = f11; 2906 } 2907 } 2908 } 2909 } 2910 } 2911 } 2912 free(t1); 2913 return; 2914 } 2915 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2916 { 2917 if (GFC_DESCRIPTOR_RANK (a) != 1) 2918 { 2919 const GFC_INTEGER_8 *restrict abase_x; 2920 const GFC_INTEGER_8 *restrict bbase_y; 2921 GFC_INTEGER_8 *restrict dest_y; 2922 GFC_INTEGER_8 s; 2923 2924 for (y = 0; y < ycount; y++) 2925 { 2926 bbase_y = &bbase[y*bystride]; 2927 dest_y = &dest[y*rystride]; 2928 for (x = 0; x < xcount; x++) 2929 { 2930 abase_x = &abase[x*axstride]; 2931 s = (GFC_INTEGER_8) 0; 2932 for (n = 0; n < count; n++) 2933 s += abase_x[n] * bbase_y[n]; 2934 dest_y[x] = s; 2935 } 2936 } 2937 } 2938 else 2939 { 2940 const GFC_INTEGER_8 *restrict bbase_y; 2941 GFC_INTEGER_8 s; 2942 2943 for (y = 0; y < ycount; y++) 2944 { 2945 bbase_y = &bbase[y*bystride]; 2946 s = (GFC_INTEGER_8) 0; 2947 for (n = 0; n < count; n++) 2948 s += abase[n*axstride] * bbase_y[n]; 2949 dest[y*rystride] = s; 2950 } 2951 } 2952 } 2953 else if (axstride < aystride) 2954 { 2955 for (y = 0; y < ycount; y++) 2956 for (x = 0; x < xcount; x++) 2957 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; 2958 2959 for (y = 0; y < ycount; y++) 2960 for (n = 0; n < count; n++) 2961 for (x = 0; x < xcount; x++) 2962 /* dest[x,y] += a[x,n] * b[n,y] */ 2963 dest[x*rxstride + y*rystride] += 2964 abase[x*axstride + n*aystride] * 2965 bbase[n*bxstride + y*bystride]; 2966 } 2967 else if (GFC_DESCRIPTOR_RANK (a) == 1) 2968 { 2969 const GFC_INTEGER_8 *restrict bbase_y; 2970 GFC_INTEGER_8 s; 2971 2972 for (y = 0; y < ycount; y++) 2973 { 2974 bbase_y = &bbase[y*bystride]; 2975 s = (GFC_INTEGER_8) 0; 2976 for (n = 0; n < count; n++) 2977 s += abase[n*axstride] * bbase_y[n*bxstride]; 2978 dest[y*rxstride] = s; 2979 } 2980 } 2981 else 2982 { 2983 const GFC_INTEGER_8 *restrict abase_x; 2984 const GFC_INTEGER_8 *restrict bbase_y; 2985 GFC_INTEGER_8 *restrict dest_y; 2986 GFC_INTEGER_8 s; 2987 2988 for (y = 0; y < ycount; y++) 2989 { 2990 bbase_y = &bbase[y*bystride]; 2991 dest_y = &dest[y*rystride]; 2992 for (x = 0; x < xcount; x++) 2993 { 2994 abase_x = &abase[x*axstride]; 2995 s = (GFC_INTEGER_8) 0; 2996 for (n = 0; n < count; n++) 2997 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 2998 dest_y[x*rxstride] = s; 2999 } 3000 } 3001 } 3002} 3003#undef POW3 3004#undef min 3005#undef max 3006 3007#endif 3008#endif 3009 3010