/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2022 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>


#if defined (HAVE_GFC_INTEGER_2)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.

   The signature mirrors the Fortran-callable GEMM binding: transa/transb
   flags, m/n/k dimensions, alpha, A/lda, B/ldb, beta, C/ldc, followed by
   the two hidden character-length arguments for transa and transb.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
                          const int *, const GFC_INTEGER_2 *, const int *,
                          const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_i2 (gfc_array_i2 * const restrict retarray,
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_i2);

/* Put exhaustive list of possible architectures here here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

#ifdef HAVE_AVX
/* AVX-targeted clone of the MATMUL worker for INTEGER(KIND=2).  The
   __target__("avx") attribute makes GCC compile this translation with AVX
   instructions enabled; presumably the matmul_i2 entry point selects this
   variant at run time — the dispatch code is outside this chunk.  */
static void
matmul_i2_avx (gfc_array_i2 * const restrict retarray,
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
matmul_i2_avx (gfc_array_i2 * const restrict retarray,
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_INTEGER_2 * restrict abase;
  const GFC_INTEGER_2 * restrict bbase;
  GFC_INTEGER_2 * restrict dest;

  /* Strides (in elements) along the two dimensions of each descriptor,
     and the effective matrix dimensions after rank-1 promotion.  */
  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  /* At most one of A and B may be rank 1.  */
  assert (GFC_DESCRIPTOR_RANK (a) == 2
	  || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
*/

  if (retarray->base_addr == NULL)
    {
      /* Result not yet allocated: shape it from A and B and allocate.  */
      if (GFC_DESCRIPTOR_RANK (a) == 1)
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
	}
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
	}
      else
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

	  GFC_DIMENSION_SET(retarray->dim[1], 0,
			    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
			    GFC_DESCRIPTOR_EXTENT(retarray,0));
	}

      retarray->base_addr
	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      /* -fcheck=bounds: verify the caller-supplied result extents match
	 what MATMUL's arguments imply.  */
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);

	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 2 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
    }


  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
	 either as a row or a column matrix.  We want both cases to
	 work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  /* The inner extents of A and B must agree; a mismatch is only ignored
     when both are empty.  */
  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
		       "in dimension 1: is %ld, should be %ld",
		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1] */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
	 The value is only used for calculation of the
	 memory by the buffer.  */
      bystride = 256;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;

  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  /* Hand the work to the external gemm if the caller asked for it, the
     operand layouts are GEMM-compatible (unit stride in at least one
     dimension of each operand), and the problem is larger than
     blas_limit cubed.  */
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_INTEGER_2 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
		ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
	{
	  assert (gemm != NULL);
	  /* Bits 2 and 4 of try_blas request conjugate-transpose
	     semantics for A and B respectively; otherwise derive
	     N/T from the strides.  */
	  const char *transa, *transb;
	  if (try_blas & 2)
	    transa = "C";
	  else
	    transa = axstride == 1 ? "N" : "T";

	  if (try_blas & 4)
	    transb = "C";
	  else
	    transb = bxstride == 1 ? "N" : "T";

	  gemm (transa, transb , &m,
		&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
		&ldc, 1, 1);
	  return;
	}
    }

  if (rxstride == 1 && axstride == 1 && bxstride == 1
      && GFC_DESCRIPTOR_RANK (b) != 1)
    {
      /* This block of code implements a tuned matmul, derived from
	 Superscalar GEMM-based level 3 BLAS,  Beta version 0.1

               Bo Kagstrom and Per Ling
               Department of Computing Science
               Umea University
               S-901 87 Umea, Sweden

	 from netlib.org, translated to C, and modified for matmul.m4.  */

      const GFC_INTEGER_2 *a, *b;
      GFC_INTEGER_2 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
		 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
		 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
      GFC_INTEGER_2 *t1;

      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments: shift the base pointers so that the
	 translated Fortran code below can use 1-based indexing.  */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
	for (i=1; i<=m; i++)
	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
	return;

      /* Adjust size of t1 to what is needed.  */
      index_type t1_dim, a_sz;
      if (aystride == 1)
	a_sz = rystride;
      else
	a_sz = a_dim1;

      /* Cap at 65536 elements — one full 256x256 block, matching the
	 256-wide blocking and the "<< 8" addressing used below.  */
      t1_dim = a_sz * 256 + b_dim1;
      if (t1_dim > 65536)
	t1_dim = 65536;

      /* NOTE(review): malloc result is used below without a NULL
	 check — confirm against libgfortran OOM policy.  */
      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));

      /* Start turning the crank.  Blocking: 512 columns of C (jj),
	 256 ks (ll), 256 rows (ii); inner kernels are unrolled 4x4
	 with scalar accumulators f11..f44, with cleanup loops for the
	 non-multiple remainders.  */
      i1 = n;
      for (jj = 1; jj <= i1; jj += 512)
	{
	  /* Computing MIN */
	  i2 = 512;
	  i3 = n - jj + 1;
	  jsec = min(i2,i3);
	  ujsec = jsec - jsec % 4;
	  i2 = k;
	  for (ll = 1; ll <= i2; ll += 256)
	    {
	      /* Computing MIN */
	      i3 = 256;
	      i4 = k - ll + 1;
	      lsec = min(i3,i4);
	      ulsec = lsec - lsec % 2;

	      i3 = m;
	      for (ii = 1; ii <= i3; ii += 256)
		{
		  /* Computing MIN */
		  i4 = 256;
		  i5 = m - ii + 1;
		  isec = min(i4,i5);
		  uisec = isec - isec % 2;
		  i4 = ll + ulsec - 1;
		  /* Copy the current block of A transposed into t1 so
		     the multiply loops read it with stride 1.  */
		  for (l = ll; l <= i4; l += 2)
		    {
		      i5 = ii + uisec - 1;
		      for (i = ii; i <= i5; i += 2)
			{
			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
			    a[i + l * a_dim1];
			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
			    a[i + (l + 1) * a_dim1];
			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
			    a[i + 1 + l * a_dim1];
			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
			    a[i + 1 + (l + 1) * a_dim1];
			}
		      if (uisec < isec)
			{
			  t1[l - ll + 1 + (isec << 8) - 257] =
			    a[ii + isec - 1 + l * a_dim1];
			  t1[l - ll + 2 + (isec << 8) - 257] =
			    a[ii + isec - 1 + (l + 1) * a_dim1];
			}
		    }
		  if (ulsec < lsec)
		    {
		      i4 = ii + isec - 1;
		      for (i = ii; i<= i4; ++i)
			{
			  t1[lsec + ((i - ii + 1) << 8) - 257] =
			    a[i + (ll + lsec - 1) * a_dim1];
			}
		    }

		  uisec = isec - isec % 4;
		  i4 = jj + ujsec - 1;
		  for (j = jj; j <= i4; j += 4)
		    {
		      i5 = ii + uisec - 1;
		      for (i = ii; i <= i5; i += 4)
			{
			  f11 = c[i + j * c_dim1];
			  f21 = c[i + 1 + j * c_dim1];
			  f12 = c[i + (j + 1) * c_dim1];
			  f22 = c[i + 1 + (j + 1) * c_dim1];
			  f13 = c[i + (j + 2) * c_dim1];
			  f23 = c[i + 1 + (j + 2) * c_dim1];
			  f14 = c[i + (j + 3) * c_dim1];
			  f24 = c[i + 1 + (j + 3) * c_dim1];
			  f31 = c[i + 2 + j * c_dim1];
			  f41 = c[i + 3 + j * c_dim1];
			  f32 = c[i + 2 + (j + 1) * c_dim1];
			  f42 = c[i + 3 + (j + 1) * c_dim1];
			  f33 = c[i + 2 + (j + 2) * c_dim1];
			  f43 = c[i + 3 + (j + 2) * c_dim1];
			  f34 = c[i + 2 + (j + 3) * c_dim1];
			  f44 = c[i + 3 + (j + 3) * c_dim1];
			  i6 = ll + lsec - 1;
			  for (l = ll; l <= i6; ++l)
			    {
			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + j * b_dim1];
			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + j * b_dim1];
			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + j * b_dim1];
			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + j * b_dim1];
			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			    }
			  c[i + j * c_dim1] = f11;
			  c[i + 1 + j * c_dim1] = f21;
			  c[i + (j + 1) * c_dim1] = f12;
			  c[i + 1 + (j + 1) * c_dim1] = f22;
			  c[i + (j + 2) * c_dim1] = f13;
			  c[i + 1 + (j + 2) * c_dim1] = f23;
			  c[i + (j + 3) * c_dim1] = f14;
			  c[i + 1 + (j + 3) * c_dim1] = f24;
			  c[i + 2 + j * c_dim1] = f31;
			  c[i + 3 + j * c_dim1] = f41;
			  c[i + 2 + (j + 1) * c_dim1] = f32;
			  c[i + 3 + (j + 1) * c_dim1] = f42;
			  c[i + 2 + (j + 2) * c_dim1] = f33;
			  c[i + 3 + (j + 2) * c_dim1] = f43;
			  c[i + 2 + (j + 3) * c_dim1] = f34;
			  c[i + 3 + (j + 3) * c_dim1] = f44;
			}
		      /* Remainder rows (isec not a multiple of 4).  */
		      if (uisec < isec)
			{
			  i5 = ii + isec - 1;
			  for (i = ii + uisec; i <= i5; ++i)
			    {
			      f11 = c[i + j * c_dim1];
			      f12 = c[i + (j + 1) * c_dim1];
			      f13 = c[i + (j + 2) * c_dim1];
			      f14 = c[i + (j + 3) * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 1) * b_dim1];
				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 2) * b_dim1];
				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 3) * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			      c[i + (j + 1) * c_dim1] = f12;
			      c[i + (j + 2) * c_dim1] = f13;
			      c[i + (j + 3) * c_dim1] = f14;
			    }
			}
		    }
		  /* Remainder columns (jsec not a multiple of 4).  */
		  if (ujsec < jsec)
		    {
		      i4 = jj + jsec - 1;
		      for (j = jj + ujsec; j <= i4; ++j)
			{
			  i5 = ii + uisec - 1;
			  for (i = ii; i <= i5; i += 4)
			    {
			      f11 = c[i + j * c_dim1];
			      f21 = c[i + 1 + j * c_dim1];
			      f31 = c[i + 2 + j * c_dim1];
			      f41 = c[i + 3 + j * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
					  257] * b[l + j * b_dim1];
				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
					  257] * b[l + j * b_dim1];
				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
					  257] * b[l + j * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			      c[i + 1 + j * c_dim1] = f21;
			      c[i + 2 + j * c_dim1] = f31;
			      c[i + 3 + j * c_dim1] = f41;
			    }
			  i5 = ii + isec - 1;
			  for (i = ii + uisec; i <= i5; ++i)
			    {
			      f11 = c[i + j * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			    }
			}
		    }
		}
	    }
	}
      free(t1);
      return;
    }
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      /* A is (or behaves like) a transposed temporary: dot-product
	 ordering with unit-stride inner loops.  */
      if (GFC_DESCRIPTOR_RANK (a) != 1)
	{
	  const GFC_INTEGER_2 *restrict abase_x;
	  const GFC_INTEGER_2 *restrict bbase_y;
	  GFC_INTEGER_2 *restrict dest_y;
	  GFC_INTEGER_2 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      dest_y = &dest[y*rystride];
	      for (x = 0; x < xcount; x++)
		{
		  abase_x = &abase[x*axstride];
		  s = (GFC_INTEGER_2) 0;
		  for (n = 0; n < count; n++)
		    s += abase_x[n] * bbase_y[n];
		  dest_y[x] = s;
		}
	    }
	}
      else
	{
	  const GFC_INTEGER_2 *restrict bbase_y;
	  GFC_INTEGER_2 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      s = (GFC_INTEGER_2) 0;
	      for (n = 0; n < count; n++)
		s += abase[n*axstride] * bbase_y[n];
	      dest[y*rystride] = s;
	    }
	}
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Row vector times matrix, general strides.  */
      const GFC_INTEGER_2 *restrict bbase_y;
      GFC_INTEGER_2 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  s = (GFC_INTEGER_2) 0;
	  for (n = 0; n < count; n++)
	    s += abase[n*axstride] * bbase_y[n*bxstride];
	  dest[y*rxstride] = s;
	}
    }
  else if (axstride < aystride)
    {
      /* A stored row-major relative to iteration: zero C, then
	 accumulate in an order that walks A with its small stride.  */
      for (y = 0; y < ycount; y++)
	for (x = 0; x < xcount; x++)
	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;

      for (y = 0; y < ycount; y++)
	for (n = 0; n < count; n++)
	  for (x = 0; x < xcount; x++)
	    /* dest[x,y] += a[x,n] * b[n,y] */
	    dest[x*rxstride + y*rystride] +=
				abase[x*axstride + n*aystride] *
				bbase[n*bxstride + y*bystride];
    }
  else
    {
      /* Fully general fallback: dot-product ordering with explicit
	 strides everywhere.  */
      const GFC_INTEGER_2 *restrict abase_x;
      const GFC_INTEGER_2 *restrict bbase_y;
      GFC_INTEGER_2 *restrict dest_y;
      GFC_INTEGER_2 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  dest_y = &dest[y*rystride];
	  for (x = 0; x < xcount; x++)
	    {
	      abase_x = &abase[x*axstride];
	      s = (GFC_INTEGER_2) 0;
	      for (n = 0; n < count; n++)
		s += abase_x[n*aystride] * bbase_y[n*bxstride];
	      dest_y[x*rxstride] = s;
	    }
	}
    }
}
#undef POW3
#undef min
#undef max

#endif /* HAVE_AVX */

#ifdef HAVE_AVX2
/* AVX2+FMA-targeted clone; identical algorithm to matmul_i2_avx, only the
   target attribute differs.  */
static void
matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_INTEGER_2 * restrict abase;
  const GFC_INTEGER_2 * restrict bbase;
  GFC_INTEGER_2 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
	  || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
*/

  if (retarray->base_addr == NULL)
    {
      /* Result not yet allocated: shape it from A and B and allocate.  */
      if (GFC_DESCRIPTOR_RANK (a) == 1)
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
	}
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
	}
      else
	{
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
			    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

	  GFC_DIMENSION_SET(retarray->dim[1], 0,
			    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
			    GFC_DESCRIPTOR_EXTENT(retarray,0));
	}

      retarray->base_addr
	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      /* -fcheck=bounds: verify the caller-supplied result extents match
	 what MATMUL's arguments imply.  */
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 1 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);

	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
	  if (arg_extent != ret_extent)
	    runtime_error ("Array bound mismatch for dimension 2 of "
			   "array (%ld/%ld) ",
			   (long int) ret_extent, (long int) arg_extent);
	}
    }


  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
	 either as a row or a column matrix.  We want both cases to
	 work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  /* The inner extents of A and B must agree; a mismatch is only ignored
     when both are empty.  */
  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
		       "in dimension 1: is %ld, should be %ld",
		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1] */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
	 The value is only used for calculation of the
	 memory by the buffer.  */
      bystride = 256;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;

  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  /* Hand the work to the external gemm if the caller asked for it, the
     operand layouts are GEMM-compatible, and the problem is larger than
     blas_limit cubed.  */
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_INTEGER_2 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
		ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
	{
	  assert (gemm != NULL);
	  /* Bits 2 and 4 of try_blas request conjugate-transpose
	     semantics for A and B respectively; otherwise derive
	     N/T from the strides.  */
	  const char *transa, *transb;
	  if (try_blas & 2)
	    transa = "C";
	  else
	    transa = axstride == 1 ? "N" : "T";

	  if (try_blas & 4)
	    transb = "C";
	  else
	    transb = bxstride == 1 ? "N" : "T";

	  gemm (transa, transb , &m,
		&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
		&ldc, 1, 1);
	  return;
	}
    }

  if (rxstride == 1 && axstride == 1 && bxstride == 1
      && GFC_DESCRIPTOR_RANK (b) != 1)
    {
      /* This block of code implements a tuned matmul, derived from
	 Superscalar GEMM-based level 3 BLAS,  Beta version 0.1

               Bo Kagstrom and Per Ling
               Department of Computing Science
               Umea University
               S-901 87 Umea, Sweden

	 from netlib.org, translated to C, and modified for matmul.m4.  */

      const GFC_INTEGER_2 *a, *b;
      GFC_INTEGER_2 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
		 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
		 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
      GFC_INTEGER_2 *t1;

      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments: shift the base pointers so that the
	 translated Fortran code below can use 1-based indexing.  */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
	for (i=1; i<=m; i++)
	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
	return;

      /* Adjust size of t1 to what is needed.  */
      index_type t1_dim, a_sz;
      if (aystride == 1)
	a_sz = rystride;
      else
	a_sz = a_dim1;

      /* Cap at 65536 elements — one full 256x256 block, matching the
	 256-wide blocking and the "<< 8" addressing used below.  */
      t1_dim = a_sz * 256 + b_dim1;
      if (t1_dim > 65536)
	t1_dim = 65536;

      /* NOTE(review): malloc result is used below without a NULL
	 check — confirm against libgfortran OOM policy.  */
      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));

      /* Start turning the crank.  Blocking: 512 columns of C (jj),
	 256 ks (ll), 256 rows (ii); inner kernels are unrolled 4x4
	 with scalar accumulators f11..f44, with cleanup loops for the
	 non-multiple remainders.  */
      i1 = n;
      for (jj = 1; jj <= i1; jj += 512)
	{
	  /* Computing MIN */
	  i2 = 512;
	  i3 = n - jj + 1;
	  jsec = min(i2,i3);
	  ujsec = jsec - jsec % 4;
	  i2 = k;
	  for (ll = 1; ll <= i2; ll += 256)
	    {
	      /* Computing MIN */
	      i3 = 256;
	      i4 = k - ll + 1;
	      lsec = min(i3,i4);
	      ulsec = lsec - lsec % 2;

	      i3 = m;
	      for (ii = 1; ii <= i3; ii += 256)
		{
		  /* Computing MIN */
		  i4 = 256;
		  i5 = m - ii + 1;
		  isec = min(i4,i5);
		  uisec = isec - isec % 2;
		  i4 = ll + ulsec - 1;
		  /* Copy the current block of A transposed into t1 so
		     the multiply loops read it with stride 1.  */
		  for (l = ll; l <= i4; l += 2)
		    {
		      i5 = ii + uisec - 1;
		      for (i = ii; i <= i5; i += 2)
			{
			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
			    a[i + l * a_dim1];
			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
			    a[i + (l + 1) * a_dim1];
			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
			    a[i + 1 + l * a_dim1];
			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
			    a[i + 1 + (l + 1) * a_dim1];
			}
		      if (uisec < isec)
			{
			  t1[l - ll + 1 + (isec << 8) - 257] =
			    a[ii + isec - 1 + l * a_dim1];
			  t1[l - ll + 2 + (isec << 8) - 257] =
			    a[ii + isec - 1 + (l + 1) * a_dim1];
			}
		    }
		  if (ulsec < lsec)
		    {
		      i4 = ii + isec - 1;
		      for (i = ii; i<= i4; ++i)
			{
			  t1[lsec + ((i - ii + 1) << 8) - 257] =
			    a[i + (ll + lsec - 1) * a_dim1];
			}
		    }

		  uisec = isec - isec % 4;
		  i4 = jj + ujsec - 1;
		  for (j = jj; j <= i4; j += 4)
		    {
		      i5 = ii + uisec - 1;
		      for (i = ii; i <= i5; i += 4)
			{
			  f11 = c[i + j * c_dim1];
			  f21 = c[i + 1 + j * c_dim1];
			  f12 = c[i + (j + 1) * c_dim1];
			  f22 = c[i + 1 + (j + 1) * c_dim1];
			  f13 = c[i + (j + 2) * c_dim1];
			  f23 = c[i + 1 + (j + 2) * c_dim1];
			  f14 = c[i + (j + 3) * c_dim1];
			  f24 = c[i + 1 + (j + 3) * c_dim1];
			  f31 = c[i + 2 + j * c_dim1];
			  f41 = c[i + 3 + j * c_dim1];
			  f32 = c[i + 2 + (j + 1) * c_dim1];
			  f42 = c[i + 3 + (j + 1) * c_dim1];
			  f33 = c[i + 2 + (j + 2) * c_dim1];
			  f43 = c[i + 3 + (j + 2) * c_dim1];
			  f34 = c[i + 2 + (j + 3) * c_dim1];
			  f44 = c[i + 3 + (j + 3) * c_dim1];
			  i6 = ll + lsec - 1;
			  for (l = ll; l <= i6; ++l)
			    {
			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + j * b_dim1];
			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + j * b_dim1];
			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + j * b_dim1];
			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + j * b_dim1];
			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			    }
			  c[i + j * c_dim1] = f11;
			  c[i + 1 + j * c_dim1] = f21;
			  c[i + (j + 1) * c_dim1] = f12;
			  c[i + 1 + (j + 1) * c_dim1] = f22;
			  c[i + (j + 2) * c_dim1] = f13;
			  c[i + 1 + (j + 2) * c_dim1] = f23;
			  c[i + (j + 3) * c_dim1] = f14;
			  c[i + 1 + (j + 3) * c_dim1] = f24;
			  c[i + 2 + j * c_dim1] = f31;
			  c[i + 3 + j * c_dim1] = f41;
			  c[i + 2 + (j + 1) * c_dim1] = f32;
			  c[i + 3 + (j + 1) * c_dim1] = f42;
			  c[i + 2 + (j + 2) * c_dim1] = f33;
			  c[i + 3 + (j + 2) * c_dim1] = f43;
			  c[i + 2 + (j + 3) * c_dim1] = f34;
			  c[i + 3 + (j + 3) * c_dim1] = f44;
			}
		      /* Remainder rows (isec not a multiple of 4).  */
		      if (uisec < isec)
			{
			  i5 = ii + isec - 1;
			  for (i = ii + uisec; i <= i5; ++i)
			    {
			      f11 = c[i + j * c_dim1];
			      f12 = c[i + (j + 1) * c_dim1];
			      f13 = c[i + (j + 2) * c_dim1];
			      f14 = c[i + (j + 3) * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 1) * b_dim1];
				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 2) * b_dim1];
				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 3) * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			      c[i + (j + 1) * c_dim1] = f12;
			      c[i + (j + 2) * c_dim1] = f13;
			      c[i + (j + 3) * c_dim1] = f14;
			    }
			}
		    }
		  /* Remainder columns (jsec not a multiple of 4).  */
		  if (ujsec < jsec)
		    {
		      i4 = jj + jsec - 1;
		      for (j = jj + ujsec; j <= i4; ++j)
			{
			  i5 = ii + uisec - 1;
			  for (i = ii; i <= i5; i += 4)
			    {
			      f11 = c[i + j * c_dim1];
			      f21 = c[i + 1 + j * c_dim1];
			      f31 = c[i + 2 + j * c_dim1];
			      f41 = c[i + 3 + j * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
					  257] * b[l + j * b_dim1];
				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
					  257] * b[l + j * b_dim1];
				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
					  257] * b[l + j * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			      c[i + 1 + j * c_dim1] = f21;
			      c[i + 2 + j * c_dim1] = f31;
			      c[i + 3 + j * c_dim1] = f41;
			    }
			  i5 = ii + isec - 1;
			  for (i = ii + uisec; i <= i5; ++i)
			    {
			      f11 = c[i + j * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			    }
			}
		    }
		}
	    }
	}
      free(t1);
      return;
    }
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      /* A is (or behaves like) a transposed temporary: dot-product
	 ordering with unit-stride inner loops.  */
      if (GFC_DESCRIPTOR_RANK (a) != 1)
	{
	  const GFC_INTEGER_2 *restrict abase_x;
	  const GFC_INTEGER_2 *restrict bbase_y;
	  GFC_INTEGER_2 *restrict dest_y;
	  GFC_INTEGER_2 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      dest_y = &dest[y*rystride];
	      for (x = 0; x < xcount; x++)
		{
		  abase_x = &abase[x*axstride];
		  s = (GFC_INTEGER_2) 0;
		  for (n = 0; n < count; n++)
		    s += abase_x[n] * bbase_y[n];
		  dest_y[x] = s;
		}
	    }
	}
      else
	{
	  const GFC_INTEGER_2 *restrict bbase_y;
	  GFC_INTEGER_2 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      s = (GFC_INTEGER_2) 0;
	      for (n = 0; n < count; n++)
		s += abase[n*axstride] * bbase_y[n];
	      dest[y*rystride] = s;
	    }
	}
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Row vector times matrix, general strides.  */
      const GFC_INTEGER_2 *restrict bbase_y;
      GFC_INTEGER_2 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  s = (GFC_INTEGER_2) 0;
	  for (n = 0; n < count; n++)
	    s += abase[n*axstride] * bbase_y[n*bxstride];
	  dest[y*rxstride] = s;
	}
    }
  else if (axstride < aystride)
    {
      /* A stored row-major relative to iteration: zero C, then
	 accumulate in an order that walks A with its small stride.  */
      for (y = 0; y < ycount; y++)
	for (x = 0; x < xcount; x++)
	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;

      for (y = 0; y < ycount; y++)
	for (n = 0; n < count; n++)
	  for (x = 0; x < xcount; x++)
	    /* dest[x,y] += a[x,n] * b[n,y] */
	    dest[x*rxstride + y*rystride] +=
				abase[x*axstride + n*aystride] *
				bbase[n*bxstride + y*bystride];
    }
  else
    {
      /* Fully general fallback: dot-product ordering with explicit
	 strides everywhere.  */
      const GFC_INTEGER_2 *restrict abase_x;
      const GFC_INTEGER_2 *restrict bbase_y;
      GFC_INTEGER_2 *restrict dest_y;
      GFC_INTEGER_2 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  dest_y = &dest[y*rystride];
	  for (x = 0; x < xcount; x++)
	    {
	      abase_x = &abase[x*axstride];
	      s = (GFC_INTEGER_2) 0;
	      for (n = 0; n < count; n++)
		s += abase_x[n*aystride] * bbase_y[n*bxstride];
	      dest_y[x*rxstride] = s;
	    }
	}
    }
}
#undef POW3
#undef min
#undef max

#endif /* HAVE_AVX2 */

#ifdef HAVE_AVX512F
/* AVX-512F-targeted clone; same algorithm again, different target
   attribute.  (Definition continues beyond this chunk.)  */
static void
matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_INTEGER_2 * restrict abase;
  const GFC_INTEGER_2 * restrict bbase;
  GFC_INTEGER_2 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
	  || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
1248*/ 1249 1250 if (retarray->base_addr == NULL) 1251 { 1252 if (GFC_DESCRIPTOR_RANK (a) == 1) 1253 { 1254 GFC_DIMENSION_SET(retarray->dim[0], 0, 1255 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1256 } 1257 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1258 { 1259 GFC_DIMENSION_SET(retarray->dim[0], 0, 1260 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1261 } 1262 else 1263 { 1264 GFC_DIMENSION_SET(retarray->dim[0], 0, 1265 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1266 1267 GFC_DIMENSION_SET(retarray->dim[1], 0, 1268 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1269 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1270 } 1271 1272 retarray->base_addr 1273 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); 1274 retarray->offset = 0; 1275 } 1276 else if (unlikely (compile_options.bounds_check)) 1277 { 1278 index_type ret_extent, arg_extent; 1279 1280 if (GFC_DESCRIPTOR_RANK (a) == 1) 1281 { 1282 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1283 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1284 if (arg_extent != ret_extent) 1285 runtime_error ("Array bound mismatch for dimension 1 of " 1286 "array (%ld/%ld) ", 1287 (long int) ret_extent, (long int) arg_extent); 1288 } 1289 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1290 { 1291 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1292 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1293 if (arg_extent != ret_extent) 1294 runtime_error ("Array bound mismatch for dimension 1 of " 1295 "array (%ld/%ld) ", 1296 (long int) ret_extent, (long int) arg_extent); 1297 } 1298 else 1299 { 1300 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1301 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1302 if (arg_extent != ret_extent) 1303 runtime_error ("Array bound mismatch for dimension 1 of " 1304 "array (%ld/%ld) ", 1305 (long int) ret_extent, (long int) arg_extent); 1306 1307 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1308 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1309 if (arg_extent != ret_extent) 1310 runtime_error ("Array bound mismatch for dimension 2 of " 1311 "array (%ld/%ld) ", 
1312 (long int) ret_extent, (long int) arg_extent); 1313 } 1314 } 1315 1316 1317 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1318 { 1319 /* One-dimensional result may be addressed in the code below 1320 either as a row or a column matrix. We want both cases to 1321 work. */ 1322 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1323 } 1324 else 1325 { 1326 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1327 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1328 } 1329 1330 1331 if (GFC_DESCRIPTOR_RANK (a) == 1) 1332 { 1333 /* Treat it as a a row matrix A[1,count]. */ 1334 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1335 aystride = 1; 1336 1337 xcount = 1; 1338 count = GFC_DESCRIPTOR_EXTENT(a,0); 1339 } 1340 else 1341 { 1342 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1343 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1344 1345 count = GFC_DESCRIPTOR_EXTENT(a,1); 1346 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1347 } 1348 1349 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1350 { 1351 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1352 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1353 "in dimension 1: is %ld, should be %ld", 1354 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1355 } 1356 1357 if (GFC_DESCRIPTOR_RANK (b) == 1) 1358 { 1359 /* Treat it as a column matrix B[count,1] */ 1360 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1361 1362 /* bystride should never be used for 1-dimensional b. 1363 The value is only used for calculation of the 1364 memory by the buffer. */ 1365 bystride = 256; 1366 ycount = 1; 1367 } 1368 else 1369 { 1370 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1371 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1372 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1373 } 1374 1375 abase = a->base_addr; 1376 bbase = b->base_addr; 1377 dest = retarray->base_addr; 1378 1379 /* Now that everything is set up, we perform the multiplication 1380 itself. */ 1381 1382#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1383#define min(a,b) ((a) <= (b) ? 
(a) : (b)) 1384#define max(a,b) ((a) >= (b) ? (a) : (b)) 1385 1386 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1387 && (bxstride == 1 || bystride == 1) 1388 && (((float) xcount) * ((float) ycount) * ((float) count) 1389 > POW3(blas_limit))) 1390 { 1391 const int m = xcount, n = ycount, k = count, ldc = rystride; 1392 const GFC_INTEGER_2 one = 1, zero = 0; 1393 const int lda = (axstride == 1) ? aystride : axstride, 1394 ldb = (bxstride == 1) ? bystride : bxstride; 1395 1396 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1397 { 1398 assert (gemm != NULL); 1399 const char *transa, *transb; 1400 if (try_blas & 2) 1401 transa = "C"; 1402 else 1403 transa = axstride == 1 ? "N" : "T"; 1404 1405 if (try_blas & 4) 1406 transb = "C"; 1407 else 1408 transb = bxstride == 1 ? "N" : "T"; 1409 1410 gemm (transa, transb , &m, 1411 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1412 &ldc, 1, 1); 1413 return; 1414 } 1415 } 1416 1417 if (rxstride == 1 && axstride == 1 && bxstride == 1 1418 && GFC_DESCRIPTOR_RANK (b) != 1) 1419 { 1420 /* This block of code implements a tuned matmul, derived from 1421 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 1422 1423 Bo Kagstrom and Per Ling 1424 Department of Computing Science 1425 Umea University 1426 S-901 87 Umea, Sweden 1427 1428 from netlib.org, translated to C, and modified for matmul.m4. 
*/ 1429 1430 const GFC_INTEGER_2 *a, *b; 1431 GFC_INTEGER_2 *c; 1432 const index_type m = xcount, n = ycount, k = count; 1433 1434 /* System generated locals */ 1435 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 1436 i1, i2, i3, i4, i5, i6; 1437 1438 /* Local variables */ 1439 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, 1440 f13, f14, f23, f24, f33, f34, f43, f44; 1441 index_type i, j, l, ii, jj, ll; 1442 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 1443 GFC_INTEGER_2 *t1; 1444 1445 a = abase; 1446 b = bbase; 1447 c = retarray->base_addr; 1448 1449 /* Parameter adjustments */ 1450 c_dim1 = rystride; 1451 c_offset = 1 + c_dim1; 1452 c -= c_offset; 1453 a_dim1 = aystride; 1454 a_offset = 1 + a_dim1; 1455 a -= a_offset; 1456 b_dim1 = bystride; 1457 b_offset = 1 + b_dim1; 1458 b -= b_offset; 1459 1460 /* Empty c first. */ 1461 for (j=1; j<=n; j++) 1462 for (i=1; i<=m; i++) 1463 c[i + j * c_dim1] = (GFC_INTEGER_2)0; 1464 1465 /* Early exit if possible */ 1466 if (m == 0 || n == 0 || k == 0) 1467 return; 1468 1469 /* Adjust size of t1 to what is needed. */ 1470 index_type t1_dim, a_sz; 1471 if (aystride == 1) 1472 a_sz = rystride; 1473 else 1474 a_sz = a_dim1; 1475 1476 t1_dim = a_sz * 256 + b_dim1; 1477 if (t1_dim > 65536) 1478 t1_dim = 65536; 1479 1480 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); 1481 1482 /* Start turning the crank. 
*/ 1483 i1 = n; 1484 for (jj = 1; jj <= i1; jj += 512) 1485 { 1486 /* Computing MIN */ 1487 i2 = 512; 1488 i3 = n - jj + 1; 1489 jsec = min(i2,i3); 1490 ujsec = jsec - jsec % 4; 1491 i2 = k; 1492 for (ll = 1; ll <= i2; ll += 256) 1493 { 1494 /* Computing MIN */ 1495 i3 = 256; 1496 i4 = k - ll + 1; 1497 lsec = min(i3,i4); 1498 ulsec = lsec - lsec % 2; 1499 1500 i3 = m; 1501 for (ii = 1; ii <= i3; ii += 256) 1502 { 1503 /* Computing MIN */ 1504 i4 = 256; 1505 i5 = m - ii + 1; 1506 isec = min(i4,i5); 1507 uisec = isec - isec % 2; 1508 i4 = ll + ulsec - 1; 1509 for (l = ll; l <= i4; l += 2) 1510 { 1511 i5 = ii + uisec - 1; 1512 for (i = ii; i <= i5; i += 2) 1513 { 1514 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 1515 a[i + l * a_dim1]; 1516 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 1517 a[i + (l + 1) * a_dim1]; 1518 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 1519 a[i + 1 + l * a_dim1]; 1520 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 1521 a[i + 1 + (l + 1) * a_dim1]; 1522 } 1523 if (uisec < isec) 1524 { 1525 t1[l - ll + 1 + (isec << 8) - 257] = 1526 a[ii + isec - 1 + l * a_dim1]; 1527 t1[l - ll + 2 + (isec << 8) - 257] = 1528 a[ii + isec - 1 + (l + 1) * a_dim1]; 1529 } 1530 } 1531 if (ulsec < lsec) 1532 { 1533 i4 = ii + isec - 1; 1534 for (i = ii; i<= i4; ++i) 1535 { 1536 t1[lsec + ((i - ii + 1) << 8) - 257] = 1537 a[i + (ll + lsec - 1) * a_dim1]; 1538 } 1539 } 1540 1541 uisec = isec - isec % 4; 1542 i4 = jj + ujsec - 1; 1543 for (j = jj; j <= i4; j += 4) 1544 { 1545 i5 = ii + uisec - 1; 1546 for (i = ii; i <= i5; i += 4) 1547 { 1548 f11 = c[i + j * c_dim1]; 1549 f21 = c[i + 1 + j * c_dim1]; 1550 f12 = c[i + (j + 1) * c_dim1]; 1551 f22 = c[i + 1 + (j + 1) * c_dim1]; 1552 f13 = c[i + (j + 2) * c_dim1]; 1553 f23 = c[i + 1 + (j + 2) * c_dim1]; 1554 f14 = c[i + (j + 3) * c_dim1]; 1555 f24 = c[i + 1 + (j + 3) * c_dim1]; 1556 f31 = c[i + 2 + j * c_dim1]; 1557 f41 = c[i + 3 + j * c_dim1]; 1558 f32 = c[i + 2 + (j + 1) * c_dim1]; 1559 f42 = c[i + 3 + (j + 1) * c_dim1]; 
1560 f33 = c[i + 2 + (j + 2) * c_dim1]; 1561 f43 = c[i + 3 + (j + 2) * c_dim1]; 1562 f34 = c[i + 2 + (j + 3) * c_dim1]; 1563 f44 = c[i + 3 + (j + 3) * c_dim1]; 1564 i6 = ll + lsec - 1; 1565 for (l = ll; l <= i6; ++l) 1566 { 1567 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1568 * b[l + j * b_dim1]; 1569 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1570 * b[l + j * b_dim1]; 1571 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1572 * b[l + (j + 1) * b_dim1]; 1573 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1574 * b[l + (j + 1) * b_dim1]; 1575 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1576 * b[l + (j + 2) * b_dim1]; 1577 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1578 * b[l + (j + 2) * b_dim1]; 1579 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 1580 * b[l + (j + 3) * b_dim1]; 1581 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 1582 * b[l + (j + 3) * b_dim1]; 1583 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1584 * b[l + j * b_dim1]; 1585 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1586 * b[l + j * b_dim1]; 1587 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1588 * b[l + (j + 1) * b_dim1]; 1589 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1590 * b[l + (j + 1) * b_dim1]; 1591 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1592 * b[l + (j + 2) * b_dim1]; 1593 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1594 * b[l + (j + 2) * b_dim1]; 1595 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 1596 * b[l + (j + 3) * b_dim1]; 1597 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 1598 * b[l + (j + 3) * b_dim1]; 1599 } 1600 c[i + j * c_dim1] = f11; 1601 c[i + 1 + j * c_dim1] = f21; 1602 c[i + (j + 1) * c_dim1] = f12; 1603 c[i + 1 + (j + 1) * c_dim1] = f22; 1604 c[i + (j + 2) * c_dim1] = f13; 1605 c[i + 1 + (j + 2) * c_dim1] = f23; 1606 c[i + (j + 3) * c_dim1] = f14; 1607 c[i + 1 + (j + 3) * c_dim1] = f24; 1608 c[i + 2 + j * c_dim1] = f31; 1609 c[i + 3 + j * c_dim1] = f41; 1610 c[i + 2 + (j + 1) * c_dim1] = f32; 
1611 c[i + 3 + (j + 1) * c_dim1] = f42; 1612 c[i + 2 + (j + 2) * c_dim1] = f33; 1613 c[i + 3 + (j + 2) * c_dim1] = f43; 1614 c[i + 2 + (j + 3) * c_dim1] = f34; 1615 c[i + 3 + (j + 3) * c_dim1] = f44; 1616 } 1617 if (uisec < isec) 1618 { 1619 i5 = ii + isec - 1; 1620 for (i = ii + uisec; i <= i5; ++i) 1621 { 1622 f11 = c[i + j * c_dim1]; 1623 f12 = c[i + (j + 1) * c_dim1]; 1624 f13 = c[i + (j + 2) * c_dim1]; 1625 f14 = c[i + (j + 3) * c_dim1]; 1626 i6 = ll + lsec - 1; 1627 for (l = ll; l <= i6; ++l) 1628 { 1629 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1630 257] * b[l + j * b_dim1]; 1631 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1632 257] * b[l + (j + 1) * b_dim1]; 1633 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1634 257] * b[l + (j + 2) * b_dim1]; 1635 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1636 257] * b[l + (j + 3) * b_dim1]; 1637 } 1638 c[i + j * c_dim1] = f11; 1639 c[i + (j + 1) * c_dim1] = f12; 1640 c[i + (j + 2) * c_dim1] = f13; 1641 c[i + (j + 3) * c_dim1] = f14; 1642 } 1643 } 1644 } 1645 if (ujsec < jsec) 1646 { 1647 i4 = jj + jsec - 1; 1648 for (j = jj + ujsec; j <= i4; ++j) 1649 { 1650 i5 = ii + uisec - 1; 1651 for (i = ii; i <= i5; i += 4) 1652 { 1653 f11 = c[i + j * c_dim1]; 1654 f21 = c[i + 1 + j * c_dim1]; 1655 f31 = c[i + 2 + j * c_dim1]; 1656 f41 = c[i + 3 + j * c_dim1]; 1657 i6 = ll + lsec - 1; 1658 for (l = ll; l <= i6; ++l) 1659 { 1660 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1661 257] * b[l + j * b_dim1]; 1662 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 1663 257] * b[l + j * b_dim1]; 1664 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 1665 257] * b[l + j * b_dim1]; 1666 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 1667 257] * b[l + j * b_dim1]; 1668 } 1669 c[i + j * c_dim1] = f11; 1670 c[i + 1 + j * c_dim1] = f21; 1671 c[i + 2 + j * c_dim1] = f31; 1672 c[i + 3 + j * c_dim1] = f41; 1673 } 1674 i5 = ii + isec - 1; 1675 for (i = ii + uisec; i <= i5; ++i) 1676 { 1677 f11 = c[i + j * c_dim1]; 1678 i6 = ll + lsec - 1; 1679 for (l = ll; 
l <= i6; ++l) 1680 { 1681 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 1682 257] * b[l + j * b_dim1]; 1683 } 1684 c[i + j * c_dim1] = f11; 1685 } 1686 } 1687 } 1688 } 1689 } 1690 } 1691 free(t1); 1692 return; 1693 } 1694 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 1695 { 1696 if (GFC_DESCRIPTOR_RANK (a) != 1) 1697 { 1698 const GFC_INTEGER_2 *restrict abase_x; 1699 const GFC_INTEGER_2 *restrict bbase_y; 1700 GFC_INTEGER_2 *restrict dest_y; 1701 GFC_INTEGER_2 s; 1702 1703 for (y = 0; y < ycount; y++) 1704 { 1705 bbase_y = &bbase[y*bystride]; 1706 dest_y = &dest[y*rystride]; 1707 for (x = 0; x < xcount; x++) 1708 { 1709 abase_x = &abase[x*axstride]; 1710 s = (GFC_INTEGER_2) 0; 1711 for (n = 0; n < count; n++) 1712 s += abase_x[n] * bbase_y[n]; 1713 dest_y[x] = s; 1714 } 1715 } 1716 } 1717 else 1718 { 1719 const GFC_INTEGER_2 *restrict bbase_y; 1720 GFC_INTEGER_2 s; 1721 1722 for (y = 0; y < ycount; y++) 1723 { 1724 bbase_y = &bbase[y*bystride]; 1725 s = (GFC_INTEGER_2) 0; 1726 for (n = 0; n < count; n++) 1727 s += abase[n*axstride] * bbase_y[n]; 1728 dest[y*rystride] = s; 1729 } 1730 } 1731 } 1732 else if (GFC_DESCRIPTOR_RANK (a) == 1) 1733 { 1734 const GFC_INTEGER_2 *restrict bbase_y; 1735 GFC_INTEGER_2 s; 1736 1737 for (y = 0; y < ycount; y++) 1738 { 1739 bbase_y = &bbase[y*bystride]; 1740 s = (GFC_INTEGER_2) 0; 1741 for (n = 0; n < count; n++) 1742 s += abase[n*axstride] * bbase_y[n*bxstride]; 1743 dest[y*rxstride] = s; 1744 } 1745 } 1746 else if (axstride < aystride) 1747 { 1748 for (y = 0; y < ycount; y++) 1749 for (x = 0; x < xcount; x++) 1750 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; 1751 1752 for (y = 0; y < ycount; y++) 1753 for (n = 0; n < count; n++) 1754 for (x = 0; x < xcount; x++) 1755 /* dest[x,y] += a[x,n] * b[n,y] */ 1756 dest[x*rxstride + y*rystride] += 1757 abase[x*axstride + n*aystride] * 1758 bbase[n*bxstride + y*bystride]; 1759 } 1760 else 1761 { 1762 const GFC_INTEGER_2 *restrict abase_x; 1763 const GFC_INTEGER_2 *restrict 
bbase_y; 1764 GFC_INTEGER_2 *restrict dest_y; 1765 GFC_INTEGER_2 s; 1766 1767 for (y = 0; y < ycount; y++) 1768 { 1769 bbase_y = &bbase[y*bystride]; 1770 dest_y = &dest[y*rystride]; 1771 for (x = 0; x < xcount; x++) 1772 { 1773 abase_x = &abase[x*axstride]; 1774 s = (GFC_INTEGER_2) 0; 1775 for (n = 0; n < count; n++) 1776 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 1777 dest_y[x*rxstride] = s; 1778 } 1779 } 1780 } 1781} 1782#undef POW3 1783#undef min 1784#undef max 1785 1786#endif /* HAVE_AVX512F */ 1787 1788/* AMD-specifix funtions with AVX128 and FMA3/FMA4. */ 1789 1790#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 1791void 1792matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray, 1793 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1794 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); 1795internal_proto(matmul_i2_avx128_fma3); 1796#endif 1797 1798#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 1799void 1800matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray, 1801 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1802 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); 1803internal_proto(matmul_i2_avx128_fma4); 1804#endif 1805 1806/* Function to fall back to if there is no special processor-specific version. 
*/ 1807static void 1808matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, 1809 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 1810 int blas_limit, blas_call gemm) 1811{ 1812 const GFC_INTEGER_2 * restrict abase; 1813 const GFC_INTEGER_2 * restrict bbase; 1814 GFC_INTEGER_2 * restrict dest; 1815 1816 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 1817 index_type x, y, n, count, xcount, ycount; 1818 1819 assert (GFC_DESCRIPTOR_RANK (a) == 2 1820 || GFC_DESCRIPTOR_RANK (b) == 2); 1821 1822/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 1823 1824 Either A or B (but not both) can be rank 1: 1825 1826 o One-dimensional argument A is implicitly treated as a row matrix 1827 dimensioned [1,count], so xcount=1. 1828 1829 o One-dimensional argument B is implicitly treated as a column matrix 1830 dimensioned [count, 1], so ycount=1. 1831*/ 1832 1833 if (retarray->base_addr == NULL) 1834 { 1835 if (GFC_DESCRIPTOR_RANK (a) == 1) 1836 { 1837 GFC_DIMENSION_SET(retarray->dim[0], 0, 1838 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 1839 } 1840 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1841 { 1842 GFC_DIMENSION_SET(retarray->dim[0], 0, 1843 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1844 } 1845 else 1846 { 1847 GFC_DIMENSION_SET(retarray->dim[0], 0, 1848 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 1849 1850 GFC_DIMENSION_SET(retarray->dim[1], 0, 1851 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1852 GFC_DESCRIPTOR_EXTENT(retarray,0)); 1853 } 1854 1855 retarray->base_addr 1856 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); 1857 retarray->offset = 0; 1858 } 1859 else if (unlikely (compile_options.bounds_check)) 1860 { 1861 index_type ret_extent, arg_extent; 1862 1863 if (GFC_DESCRIPTOR_RANK (a) == 1) 1864 { 1865 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1866 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1867 if (arg_extent != ret_extent) 1868 runtime_error ("Array bound mismatch for dimension 1 of " 1869 "array (%ld/%ld) ", 
1870 (long int) ret_extent, (long int) arg_extent); 1871 } 1872 else if (GFC_DESCRIPTOR_RANK (b) == 1) 1873 { 1874 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1875 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1876 if (arg_extent != ret_extent) 1877 runtime_error ("Array bound mismatch for dimension 1 of " 1878 "array (%ld/%ld) ", 1879 (long int) ret_extent, (long int) arg_extent); 1880 } 1881 else 1882 { 1883 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 1884 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 1885 if (arg_extent != ret_extent) 1886 runtime_error ("Array bound mismatch for dimension 1 of " 1887 "array (%ld/%ld) ", 1888 (long int) ret_extent, (long int) arg_extent); 1889 1890 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 1891 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 1892 if (arg_extent != ret_extent) 1893 runtime_error ("Array bound mismatch for dimension 2 of " 1894 "array (%ld/%ld) ", 1895 (long int) ret_extent, (long int) arg_extent); 1896 } 1897 } 1898 1899 1900 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 1901 { 1902 /* One-dimensional result may be addressed in the code below 1903 either as a row or a column matrix. We want both cases to 1904 work. */ 1905 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1906 } 1907 else 1908 { 1909 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 1910 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 1911 } 1912 1913 1914 if (GFC_DESCRIPTOR_RANK (a) == 1) 1915 { 1916 /* Treat it as a a row matrix A[1,count]. 
*/ 1917 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1918 aystride = 1; 1919 1920 xcount = 1; 1921 count = GFC_DESCRIPTOR_EXTENT(a,0); 1922 } 1923 else 1924 { 1925 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 1926 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 1927 1928 count = GFC_DESCRIPTOR_EXTENT(a,1); 1929 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 1930 } 1931 1932 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 1933 { 1934 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 1935 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 1936 "in dimension 1: is %ld, should be %ld", 1937 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 1938 } 1939 1940 if (GFC_DESCRIPTOR_RANK (b) == 1) 1941 { 1942 /* Treat it as a column matrix B[count,1] */ 1943 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1944 1945 /* bystride should never be used for 1-dimensional b. 1946 The value is only used for calculation of the 1947 memory by the buffer. */ 1948 bystride = 256; 1949 ycount = 1; 1950 } 1951 else 1952 { 1953 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 1954 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 1955 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 1956 } 1957 1958 abase = a->base_addr; 1959 bbase = b->base_addr; 1960 dest = retarray->base_addr; 1961 1962 /* Now that everything is set up, we perform the multiplication 1963 itself. */ 1964 1965#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 1966#define min(a,b) ((a) <= (b) ? (a) : (b)) 1967#define max(a,b) ((a) >= (b) ? (a) : (b)) 1968 1969 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 1970 && (bxstride == 1 || bystride == 1) 1971 && (((float) xcount) * ((float) ycount) * ((float) count) 1972 > POW3(blas_limit))) 1973 { 1974 const int m = xcount, n = ycount, k = count, ldc = rystride; 1975 const GFC_INTEGER_2 one = 1, zero = 0; 1976 const int lda = (axstride == 1) ? aystride : axstride, 1977 ldb = (bxstride == 1) ? 
bystride : bxstride; 1978 1979 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 1980 { 1981 assert (gemm != NULL); 1982 const char *transa, *transb; 1983 if (try_blas & 2) 1984 transa = "C"; 1985 else 1986 transa = axstride == 1 ? "N" : "T"; 1987 1988 if (try_blas & 4) 1989 transb = "C"; 1990 else 1991 transb = bxstride == 1 ? "N" : "T"; 1992 1993 gemm (transa, transb , &m, 1994 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 1995 &ldc, 1, 1); 1996 return; 1997 } 1998 } 1999 2000 if (rxstride == 1 && axstride == 1 && bxstride == 1 2001 && GFC_DESCRIPTOR_RANK (b) != 1) 2002 { 2003 /* This block of code implements a tuned matmul, derived from 2004 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2005 2006 Bo Kagstrom and Per Ling 2007 Department of Computing Science 2008 Umea University 2009 S-901 87 Umea, Sweden 2010 2011 from netlib.org, translated to C, and modified for matmul.m4. */ 2012 2013 const GFC_INTEGER_2 *a, *b; 2014 GFC_INTEGER_2 *c; 2015 const index_type m = xcount, n = ycount, k = count; 2016 2017 /* System generated locals */ 2018 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2019 i1, i2, i3, i4, i5, i6; 2020 2021 /* Local variables */ 2022 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, 2023 f13, f14, f23, f24, f33, f34, f43, f44; 2024 index_type i, j, l, ii, jj, ll; 2025 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2026 GFC_INTEGER_2 *t1; 2027 2028 a = abase; 2029 b = bbase; 2030 c = retarray->base_addr; 2031 2032 /* Parameter adjustments */ 2033 c_dim1 = rystride; 2034 c_offset = 1 + c_dim1; 2035 c -= c_offset; 2036 a_dim1 = aystride; 2037 a_offset = 1 + a_dim1; 2038 a -= a_offset; 2039 b_dim1 = bystride; 2040 b_offset = 1 + b_dim1; 2041 b -= b_offset; 2042 2043 /* Empty c first. 
*/ 2044 for (j=1; j<=n; j++) 2045 for (i=1; i<=m; i++) 2046 c[i + j * c_dim1] = (GFC_INTEGER_2)0; 2047 2048 /* Early exit if possible */ 2049 if (m == 0 || n == 0 || k == 0) 2050 return; 2051 2052 /* Adjust size of t1 to what is needed. */ 2053 index_type t1_dim, a_sz; 2054 if (aystride == 1) 2055 a_sz = rystride; 2056 else 2057 a_sz = a_dim1; 2058 2059 t1_dim = a_sz * 256 + b_dim1; 2060 if (t1_dim > 65536) 2061 t1_dim = 65536; 2062 2063 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); 2064 2065 /* Start turning the crank. */ 2066 i1 = n; 2067 for (jj = 1; jj <= i1; jj += 512) 2068 { 2069 /* Computing MIN */ 2070 i2 = 512; 2071 i3 = n - jj + 1; 2072 jsec = min(i2,i3); 2073 ujsec = jsec - jsec % 4; 2074 i2 = k; 2075 for (ll = 1; ll <= i2; ll += 256) 2076 { 2077 /* Computing MIN */ 2078 i3 = 256; 2079 i4 = k - ll + 1; 2080 lsec = min(i3,i4); 2081 ulsec = lsec - lsec % 2; 2082 2083 i3 = m; 2084 for (ii = 1; ii <= i3; ii += 256) 2085 { 2086 /* Computing MIN */ 2087 i4 = 256; 2088 i5 = m - ii + 1; 2089 isec = min(i4,i5); 2090 uisec = isec - isec % 2; 2091 i4 = ll + ulsec - 1; 2092 for (l = ll; l <= i4; l += 2) 2093 { 2094 i5 = ii + uisec - 1; 2095 for (i = ii; i <= i5; i += 2) 2096 { 2097 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2098 a[i + l * a_dim1]; 2099 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2100 a[i + (l + 1) * a_dim1]; 2101 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2102 a[i + 1 + l * a_dim1]; 2103 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2104 a[i + 1 + (l + 1) * a_dim1]; 2105 } 2106 if (uisec < isec) 2107 { 2108 t1[l - ll + 1 + (isec << 8) - 257] = 2109 a[ii + isec - 1 + l * a_dim1]; 2110 t1[l - ll + 2 + (isec << 8) - 257] = 2111 a[ii + isec - 1 + (l + 1) * a_dim1]; 2112 } 2113 } 2114 if (ulsec < lsec) 2115 { 2116 i4 = ii + isec - 1; 2117 for (i = ii; i<= i4; ++i) 2118 { 2119 t1[lsec + ((i - ii + 1) << 8) - 257] = 2120 a[i + (ll + lsec - 1) * a_dim1]; 2121 } 2122 } 2123 2124 uisec = isec - isec % 4; 2125 i4 = jj + ujsec - 1; 2126 for (j = jj; j <= 
i4; j += 4) 2127 { 2128 i5 = ii + uisec - 1; 2129 for (i = ii; i <= i5; i += 4) 2130 { 2131 f11 = c[i + j * c_dim1]; 2132 f21 = c[i + 1 + j * c_dim1]; 2133 f12 = c[i + (j + 1) * c_dim1]; 2134 f22 = c[i + 1 + (j + 1) * c_dim1]; 2135 f13 = c[i + (j + 2) * c_dim1]; 2136 f23 = c[i + 1 + (j + 2) * c_dim1]; 2137 f14 = c[i + (j + 3) * c_dim1]; 2138 f24 = c[i + 1 + (j + 3) * c_dim1]; 2139 f31 = c[i + 2 + j * c_dim1]; 2140 f41 = c[i + 3 + j * c_dim1]; 2141 f32 = c[i + 2 + (j + 1) * c_dim1]; 2142 f42 = c[i + 3 + (j + 1) * c_dim1]; 2143 f33 = c[i + 2 + (j + 2) * c_dim1]; 2144 f43 = c[i + 3 + (j + 2) * c_dim1]; 2145 f34 = c[i + 2 + (j + 3) * c_dim1]; 2146 f44 = c[i + 3 + (j + 3) * c_dim1]; 2147 i6 = ll + lsec - 1; 2148 for (l = ll; l <= i6; ++l) 2149 { 2150 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2151 * b[l + j * b_dim1]; 2152 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2153 * b[l + j * b_dim1]; 2154 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2155 * b[l + (j + 1) * b_dim1]; 2156 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2157 * b[l + (j + 1) * b_dim1]; 2158 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2159 * b[l + (j + 2) * b_dim1]; 2160 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2161 * b[l + (j + 2) * b_dim1]; 2162 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2163 * b[l + (j + 3) * b_dim1]; 2164 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2165 * b[l + (j + 3) * b_dim1]; 2166 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2167 * b[l + j * b_dim1]; 2168 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2169 * b[l + j * b_dim1]; 2170 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2171 * b[l + (j + 1) * b_dim1]; 2172 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2173 * b[l + (j + 1) * b_dim1]; 2174 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2175 * b[l + (j + 2) * b_dim1]; 2176 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2177 * b[l + (j + 2) * b_dim1]; 2178 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 
2179 * b[l + (j + 3) * b_dim1]; 2180 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2181 * b[l + (j + 3) * b_dim1]; 2182 } 2183 c[i + j * c_dim1] = f11; 2184 c[i + 1 + j * c_dim1] = f21; 2185 c[i + (j + 1) * c_dim1] = f12; 2186 c[i + 1 + (j + 1) * c_dim1] = f22; 2187 c[i + (j + 2) * c_dim1] = f13; 2188 c[i + 1 + (j + 2) * c_dim1] = f23; 2189 c[i + (j + 3) * c_dim1] = f14; 2190 c[i + 1 + (j + 3) * c_dim1] = f24; 2191 c[i + 2 + j * c_dim1] = f31; 2192 c[i + 3 + j * c_dim1] = f41; 2193 c[i + 2 + (j + 1) * c_dim1] = f32; 2194 c[i + 3 + (j + 1) * c_dim1] = f42; 2195 c[i + 2 + (j + 2) * c_dim1] = f33; 2196 c[i + 3 + (j + 2) * c_dim1] = f43; 2197 c[i + 2 + (j + 3) * c_dim1] = f34; 2198 c[i + 3 + (j + 3) * c_dim1] = f44; 2199 } 2200 if (uisec < isec) 2201 { 2202 i5 = ii + isec - 1; 2203 for (i = ii + uisec; i <= i5; ++i) 2204 { 2205 f11 = c[i + j * c_dim1]; 2206 f12 = c[i + (j + 1) * c_dim1]; 2207 f13 = c[i + (j + 2) * c_dim1]; 2208 f14 = c[i + (j + 3) * c_dim1]; 2209 i6 = ll + lsec - 1; 2210 for (l = ll; l <= i6; ++l) 2211 { 2212 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2213 257] * b[l + j * b_dim1]; 2214 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2215 257] * b[l + (j + 1) * b_dim1]; 2216 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2217 257] * b[l + (j + 2) * b_dim1]; 2218 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2219 257] * b[l + (j + 3) * b_dim1]; 2220 } 2221 c[i + j * c_dim1] = f11; 2222 c[i + (j + 1) * c_dim1] = f12; 2223 c[i + (j + 2) * c_dim1] = f13; 2224 c[i + (j + 3) * c_dim1] = f14; 2225 } 2226 } 2227 } 2228 if (ujsec < jsec) 2229 { 2230 i4 = jj + jsec - 1; 2231 for (j = jj + ujsec; j <= i4; ++j) 2232 { 2233 i5 = ii + uisec - 1; 2234 for (i = ii; i <= i5; i += 4) 2235 { 2236 f11 = c[i + j * c_dim1]; 2237 f21 = c[i + 1 + j * c_dim1]; 2238 f31 = c[i + 2 + j * c_dim1]; 2239 f41 = c[i + 3 + j * c_dim1]; 2240 i6 = ll + lsec - 1; 2241 for (l = ll; l <= i6; ++l) 2242 { 2243 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2244 257] * b[l + j * b_dim1]; 2245 
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2246 257] * b[l + j * b_dim1]; 2247 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2248 257] * b[l + j * b_dim1]; 2249 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2250 257] * b[l + j * b_dim1]; 2251 } 2252 c[i + j * c_dim1] = f11; 2253 c[i + 1 + j * c_dim1] = f21; 2254 c[i + 2 + j * c_dim1] = f31; 2255 c[i + 3 + j * c_dim1] = f41; 2256 } 2257 i5 = ii + isec - 1; 2258 for (i = ii + uisec; i <= i5; ++i) 2259 { 2260 f11 = c[i + j * c_dim1]; 2261 i6 = ll + lsec - 1; 2262 for (l = ll; l <= i6; ++l) 2263 { 2264 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2265 257] * b[l + j * b_dim1]; 2266 } 2267 c[i + j * c_dim1] = f11; 2268 } 2269 } 2270 } 2271 } 2272 } 2273 } 2274 free(t1); 2275 return; 2276 } 2277 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2278 { 2279 if (GFC_DESCRIPTOR_RANK (a) != 1) 2280 { 2281 const GFC_INTEGER_2 *restrict abase_x; 2282 const GFC_INTEGER_2 *restrict bbase_y; 2283 GFC_INTEGER_2 *restrict dest_y; 2284 GFC_INTEGER_2 s; 2285 2286 for (y = 0; y < ycount; y++) 2287 { 2288 bbase_y = &bbase[y*bystride]; 2289 dest_y = &dest[y*rystride]; 2290 for (x = 0; x < xcount; x++) 2291 { 2292 abase_x = &abase[x*axstride]; 2293 s = (GFC_INTEGER_2) 0; 2294 for (n = 0; n < count; n++) 2295 s += abase_x[n] * bbase_y[n]; 2296 dest_y[x] = s; 2297 } 2298 } 2299 } 2300 else 2301 { 2302 const GFC_INTEGER_2 *restrict bbase_y; 2303 GFC_INTEGER_2 s; 2304 2305 for (y = 0; y < ycount; y++) 2306 { 2307 bbase_y = &bbase[y*bystride]; 2308 s = (GFC_INTEGER_2) 0; 2309 for (n = 0; n < count; n++) 2310 s += abase[n*axstride] * bbase_y[n]; 2311 dest[y*rystride] = s; 2312 } 2313 } 2314 } 2315 else if (GFC_DESCRIPTOR_RANK (a) == 1) 2316 { 2317 const GFC_INTEGER_2 *restrict bbase_y; 2318 GFC_INTEGER_2 s; 2319 2320 for (y = 0; y < ycount; y++) 2321 { 2322 bbase_y = &bbase[y*bystride]; 2323 s = (GFC_INTEGER_2) 0; 2324 for (n = 0; n < count; n++) 2325 s += abase[n*axstride] * bbase_y[n*bxstride]; 2326 dest[y*rxstride] = s; 2327 } 2328 
}
  else if (axstride < aystride)
    {
      /* A looks like a transposed temporary (its column stride is smaller
         than its row stride, cf. the TRANSPOSE discussion at the top of the
         file): zero C first, then accumulate in an order that walks the
         operands with their small strides innermost.  */
      for (y = 0; y < ycount; y++)
        for (x = 0; x < xcount; x++)
          dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;

      for (y = 0; y < ycount; y++)
        for (n = 0; n < count; n++)
          for (x = 0; x < xcount; x++)
            /* dest[x,y] += a[x,n] * b[n,y] */
            dest[x*rxstride + y*rystride] +=
                                        abase[x*axstride + n*aystride] *
                                        bbase[n*bxstride + y*bystride];
    }
  else
    {
      /* Fully general strides: straightforward dot-product order,
         one element of the result per innermost loop.  */
      const GFC_INTEGER_2 *restrict abase_x;
      const GFC_INTEGER_2 *restrict bbase_y;
      GFC_INTEGER_2 *restrict dest_y;
      GFC_INTEGER_2 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          dest_y = &dest[y*rystride];
          for (x = 0; x < xcount; x++)
            {
              abase_x = &abase[x*axstride];
              s = (GFC_INTEGER_2) 0;
              for (n = 0; n < count; n++)
                s += abase_x[n*aystride] * bbase_y[n*bxstride];
              dest_y[x*rxstride] = s;
            }
        }
    }
}
#undef POW3
#undef min
#undef max


/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

/* Public entry point: picks the best available implementation for the
   CPU we are running on the first time it is called, caches the choice,
   and forwards every call to it.  */

void matmul_i2 (gfc_array_i2 * const restrict retarray, 
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  /* Cached implementation pointer; NULL until the first call resolves it.  */
  static void (*matmul_p) (gfc_array_i2 * const restrict retarray, 
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  void (*matmul_fn) (gfc_array_i2 * const restrict retarray, 
	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);

  /* Relaxed ordering suffices here: racing threads all derive the same
     function pointer from the same CPU, so any stored value is usable.  */
  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      /* Fall back to the plain implementation unless a better one is
         both compiled in and supported by the running processor.  */
      matmul_fn = matmul_i2_vanilla;
      if (__builtin_cpu_is ("intel"))
	{
          /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
	  if (__builtin_cpu_supports ("avx512f"))
	    {
	      matmul_fn = matmul_i2_avx512f;
	      goto store;
	    }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
	  if (__builtin_cpu_supports ("avx2")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_i2_avx2;
	      goto store;
	    }

#endif

#ifdef HAVE_AVX
	  if (__builtin_cpu_supports ("avx"))
	    {
	      matmul_fn = matmul_i2_avx;
	      goto store;
	    }
#endif  /* HAVE_AVX */
	}
      else if (__builtin_cpu_is ("amd"))
	{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma"))
	    {
	      matmul_fn = matmul_i2_avx128_fma3;
	      goto store;
	    }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	  if (__builtin_cpu_supports ("avx")
	      && __builtin_cpu_supports ("fma4"))
	    {
	      matmul_fn = matmul_i2_avx128_fma4;
	      goto store;
	    }
#endif

	}
   store:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
   }

   (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}

#else  /* Just the vanilla function.
*/ 2448 2449void 2450matmul_i2 (gfc_array_i2 * const restrict retarray, 2451 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, 2452 int blas_limit, blas_call gemm) 2453{ 2454 const GFC_INTEGER_2 * restrict abase; 2455 const GFC_INTEGER_2 * restrict bbase; 2456 GFC_INTEGER_2 * restrict dest; 2457 2458 index_type rxstride, rystride, axstride, aystride, bxstride, bystride; 2459 index_type x, y, n, count, xcount, ycount; 2460 2461 assert (GFC_DESCRIPTOR_RANK (a) == 2 2462 || GFC_DESCRIPTOR_RANK (b) == 2); 2463 2464/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] 2465 2466 Either A or B (but not both) can be rank 1: 2467 2468 o One-dimensional argument A is implicitly treated as a row matrix 2469 dimensioned [1,count], so xcount=1. 2470 2471 o One-dimensional argument B is implicitly treated as a column matrix 2472 dimensioned [count, 1], so ycount=1. 2473*/ 2474 2475 if (retarray->base_addr == NULL) 2476 { 2477 if (GFC_DESCRIPTOR_RANK (a) == 1) 2478 { 2479 GFC_DIMENSION_SET(retarray->dim[0], 0, 2480 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); 2481 } 2482 else if (GFC_DESCRIPTOR_RANK (b) == 1) 2483 { 2484 GFC_DIMENSION_SET(retarray->dim[0], 0, 2485 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 2486 } 2487 else 2488 { 2489 GFC_DIMENSION_SET(retarray->dim[0], 0, 2490 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); 2491 2492 GFC_DIMENSION_SET(retarray->dim[1], 0, 2493 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 2494 GFC_DESCRIPTOR_EXTENT(retarray,0)); 2495 } 2496 2497 retarray->base_addr 2498 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); 2499 retarray->offset = 0; 2500 } 2501 else if (unlikely (compile_options.bounds_check)) 2502 { 2503 index_type ret_extent, arg_extent; 2504 2505 if (GFC_DESCRIPTOR_RANK (a) == 1) 2506 { 2507 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 2508 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2509 if (arg_extent != ret_extent) 2510 runtime_error ("Array bound mismatch for dimension 1 of " 2511 "array (%ld/%ld) ", 2512 (long 
int) ret_extent, (long int) arg_extent); 2513 } 2514 else if (GFC_DESCRIPTOR_RANK (b) == 1) 2515 { 2516 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 2517 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2518 if (arg_extent != ret_extent) 2519 runtime_error ("Array bound mismatch for dimension 1 of " 2520 "array (%ld/%ld) ", 2521 (long int) ret_extent, (long int) arg_extent); 2522 } 2523 else 2524 { 2525 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); 2526 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); 2527 if (arg_extent != ret_extent) 2528 runtime_error ("Array bound mismatch for dimension 1 of " 2529 "array (%ld/%ld) ", 2530 (long int) ret_extent, (long int) arg_extent); 2531 2532 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); 2533 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); 2534 if (arg_extent != ret_extent) 2535 runtime_error ("Array bound mismatch for dimension 2 of " 2536 "array (%ld/%ld) ", 2537 (long int) ret_extent, (long int) arg_extent); 2538 } 2539 } 2540 2541 2542 if (GFC_DESCRIPTOR_RANK (retarray) == 1) 2543 { 2544 /* One-dimensional result may be addressed in the code below 2545 either as a row or a column matrix. We want both cases to 2546 work. */ 2547 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); 2548 } 2549 else 2550 { 2551 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); 2552 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); 2553 } 2554 2555 2556 if (GFC_DESCRIPTOR_RANK (a) == 1) 2557 { 2558 /* Treat it as a a row matrix A[1,count]. 
*/ 2559 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 2560 aystride = 1; 2561 2562 xcount = 1; 2563 count = GFC_DESCRIPTOR_EXTENT(a,0); 2564 } 2565 else 2566 { 2567 axstride = GFC_DESCRIPTOR_STRIDE(a,0); 2568 aystride = GFC_DESCRIPTOR_STRIDE(a,1); 2569 2570 count = GFC_DESCRIPTOR_EXTENT(a,1); 2571 xcount = GFC_DESCRIPTOR_EXTENT(a,0); 2572 } 2573 2574 if (count != GFC_DESCRIPTOR_EXTENT(b,0)) 2575 { 2576 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) 2577 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic " 2578 "in dimension 1: is %ld, should be %ld", 2579 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count); 2580 } 2581 2582 if (GFC_DESCRIPTOR_RANK (b) == 1) 2583 { 2584 /* Treat it as a column matrix B[count,1] */ 2585 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2586 2587 /* bystride should never be used for 1-dimensional b. 2588 The value is only used for calculation of the 2589 memory by the buffer. */ 2590 bystride = 256; 2591 ycount = 1; 2592 } 2593 else 2594 { 2595 bxstride = GFC_DESCRIPTOR_STRIDE(b,0); 2596 bystride = GFC_DESCRIPTOR_STRIDE(b,1); 2597 ycount = GFC_DESCRIPTOR_EXTENT(b,1); 2598 } 2599 2600 abase = a->base_addr; 2601 bbase = b->base_addr; 2602 dest = retarray->base_addr; 2603 2604 /* Now that everything is set up, we perform the multiplication 2605 itself. */ 2606 2607#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) 2608#define min(a,b) ((a) <= (b) ? (a) : (b)) 2609#define max(a,b) ((a) >= (b) ? (a) : (b)) 2610 2611 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) 2612 && (bxstride == 1 || bystride == 1) 2613 && (((float) xcount) * ((float) ycount) * ((float) count) 2614 > POW3(blas_limit))) 2615 { 2616 const int m = xcount, n = ycount, k = count, ldc = rystride; 2617 const GFC_INTEGER_2 one = 1, zero = 0; 2618 const int lda = (axstride == 1) ? aystride : axstride, 2619 ldb = (bxstride == 1) ? 
bystride : bxstride; 2620 2621 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) 2622 { 2623 assert (gemm != NULL); 2624 const char *transa, *transb; 2625 if (try_blas & 2) 2626 transa = "C"; 2627 else 2628 transa = axstride == 1 ? "N" : "T"; 2629 2630 if (try_blas & 4) 2631 transb = "C"; 2632 else 2633 transb = bxstride == 1 ? "N" : "T"; 2634 2635 gemm (transa, transb , &m, 2636 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, 2637 &ldc, 1, 1); 2638 return; 2639 } 2640 } 2641 2642 if (rxstride == 1 && axstride == 1 && bxstride == 1 2643 && GFC_DESCRIPTOR_RANK (b) != 1) 2644 { 2645 /* This block of code implements a tuned matmul, derived from 2646 Superscalar GEMM-based level 3 BLAS, Beta version 0.1 2647 2648 Bo Kagstrom and Per Ling 2649 Department of Computing Science 2650 Umea University 2651 S-901 87 Umea, Sweden 2652 2653 from netlib.org, translated to C, and modified for matmul.m4. */ 2654 2655 const GFC_INTEGER_2 *a, *b; 2656 GFC_INTEGER_2 *c; 2657 const index_type m = xcount, n = ycount, k = count; 2658 2659 /* System generated locals */ 2660 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, 2661 i1, i2, i3, i4, i5, i6; 2662 2663 /* Local variables */ 2664 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, 2665 f13, f14, f23, f24, f33, f34, f43, f44; 2666 index_type i, j, l, ii, jj, ll; 2667 index_type isec, jsec, lsec, uisec, ujsec, ulsec; 2668 GFC_INTEGER_2 *t1; 2669 2670 a = abase; 2671 b = bbase; 2672 c = retarray->base_addr; 2673 2674 /* Parameter adjustments */ 2675 c_dim1 = rystride; 2676 c_offset = 1 + c_dim1; 2677 c -= c_offset; 2678 a_dim1 = aystride; 2679 a_offset = 1 + a_dim1; 2680 a -= a_offset; 2681 b_dim1 = bystride; 2682 b_offset = 1 + b_dim1; 2683 b -= b_offset; 2684 2685 /* Empty c first. 
*/ 2686 for (j=1; j<=n; j++) 2687 for (i=1; i<=m; i++) 2688 c[i + j * c_dim1] = (GFC_INTEGER_2)0; 2689 2690 /* Early exit if possible */ 2691 if (m == 0 || n == 0 || k == 0) 2692 return; 2693 2694 /* Adjust size of t1 to what is needed. */ 2695 index_type t1_dim, a_sz; 2696 if (aystride == 1) 2697 a_sz = rystride; 2698 else 2699 a_sz = a_dim1; 2700 2701 t1_dim = a_sz * 256 + b_dim1; 2702 if (t1_dim > 65536) 2703 t1_dim = 65536; 2704 2705 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); 2706 2707 /* Start turning the crank. */ 2708 i1 = n; 2709 for (jj = 1; jj <= i1; jj += 512) 2710 { 2711 /* Computing MIN */ 2712 i2 = 512; 2713 i3 = n - jj + 1; 2714 jsec = min(i2,i3); 2715 ujsec = jsec - jsec % 4; 2716 i2 = k; 2717 for (ll = 1; ll <= i2; ll += 256) 2718 { 2719 /* Computing MIN */ 2720 i3 = 256; 2721 i4 = k - ll + 1; 2722 lsec = min(i3,i4); 2723 ulsec = lsec - lsec % 2; 2724 2725 i3 = m; 2726 for (ii = 1; ii <= i3; ii += 256) 2727 { 2728 /* Computing MIN */ 2729 i4 = 256; 2730 i5 = m - ii + 1; 2731 isec = min(i4,i5); 2732 uisec = isec - isec % 2; 2733 i4 = ll + ulsec - 1; 2734 for (l = ll; l <= i4; l += 2) 2735 { 2736 i5 = ii + uisec - 1; 2737 for (i = ii; i <= i5; i += 2) 2738 { 2739 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = 2740 a[i + l * a_dim1]; 2741 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = 2742 a[i + (l + 1) * a_dim1]; 2743 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = 2744 a[i + 1 + l * a_dim1]; 2745 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = 2746 a[i + 1 + (l + 1) * a_dim1]; 2747 } 2748 if (uisec < isec) 2749 { 2750 t1[l - ll + 1 + (isec << 8) - 257] = 2751 a[ii + isec - 1 + l * a_dim1]; 2752 t1[l - ll + 2 + (isec << 8) - 257] = 2753 a[ii + isec - 1 + (l + 1) * a_dim1]; 2754 } 2755 } 2756 if (ulsec < lsec) 2757 { 2758 i4 = ii + isec - 1; 2759 for (i = ii; i<= i4; ++i) 2760 { 2761 t1[lsec + ((i - ii + 1) << 8) - 257] = 2762 a[i + (ll + lsec - 1) * a_dim1]; 2763 } 2764 } 2765 2766 uisec = isec - isec % 4; 2767 i4 = jj + ujsec - 1; 2768 for (j = jj; j <= 
i4; j += 4) 2769 { 2770 i5 = ii + uisec - 1; 2771 for (i = ii; i <= i5; i += 4) 2772 { 2773 f11 = c[i + j * c_dim1]; 2774 f21 = c[i + 1 + j * c_dim1]; 2775 f12 = c[i + (j + 1) * c_dim1]; 2776 f22 = c[i + 1 + (j + 1) * c_dim1]; 2777 f13 = c[i + (j + 2) * c_dim1]; 2778 f23 = c[i + 1 + (j + 2) * c_dim1]; 2779 f14 = c[i + (j + 3) * c_dim1]; 2780 f24 = c[i + 1 + (j + 3) * c_dim1]; 2781 f31 = c[i + 2 + j * c_dim1]; 2782 f41 = c[i + 3 + j * c_dim1]; 2783 f32 = c[i + 2 + (j + 1) * c_dim1]; 2784 f42 = c[i + 3 + (j + 1) * c_dim1]; 2785 f33 = c[i + 2 + (j + 2) * c_dim1]; 2786 f43 = c[i + 3 + (j + 2) * c_dim1]; 2787 f34 = c[i + 2 + (j + 3) * c_dim1]; 2788 f44 = c[i + 3 + (j + 3) * c_dim1]; 2789 i6 = ll + lsec - 1; 2790 for (l = ll; l <= i6; ++l) 2791 { 2792 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2793 * b[l + j * b_dim1]; 2794 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2795 * b[l + j * b_dim1]; 2796 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2797 * b[l + (j + 1) * b_dim1]; 2798 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2799 * b[l + (j + 1) * b_dim1]; 2800 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2801 * b[l + (j + 2) * b_dim1]; 2802 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2803 * b[l + (j + 2) * b_dim1]; 2804 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] 2805 * b[l + (j + 3) * b_dim1]; 2806 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] 2807 * b[l + (j + 3) * b_dim1]; 2808 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2809 * b[l + j * b_dim1]; 2810 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2811 * b[l + j * b_dim1]; 2812 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2813 * b[l + (j + 1) * b_dim1]; 2814 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2815 * b[l + (j + 1) * b_dim1]; 2816 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 2817 * b[l + (j + 2) * b_dim1]; 2818 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2819 * b[l + (j + 2) * b_dim1]; 2820 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] 
2821 * b[l + (j + 3) * b_dim1]; 2822 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] 2823 * b[l + (j + 3) * b_dim1]; 2824 } 2825 c[i + j * c_dim1] = f11; 2826 c[i + 1 + j * c_dim1] = f21; 2827 c[i + (j + 1) * c_dim1] = f12; 2828 c[i + 1 + (j + 1) * c_dim1] = f22; 2829 c[i + (j + 2) * c_dim1] = f13; 2830 c[i + 1 + (j + 2) * c_dim1] = f23; 2831 c[i + (j + 3) * c_dim1] = f14; 2832 c[i + 1 + (j + 3) * c_dim1] = f24; 2833 c[i + 2 + j * c_dim1] = f31; 2834 c[i + 3 + j * c_dim1] = f41; 2835 c[i + 2 + (j + 1) * c_dim1] = f32; 2836 c[i + 3 + (j + 1) * c_dim1] = f42; 2837 c[i + 2 + (j + 2) * c_dim1] = f33; 2838 c[i + 3 + (j + 2) * c_dim1] = f43; 2839 c[i + 2 + (j + 3) * c_dim1] = f34; 2840 c[i + 3 + (j + 3) * c_dim1] = f44; 2841 } 2842 if (uisec < isec) 2843 { 2844 i5 = ii + isec - 1; 2845 for (i = ii + uisec; i <= i5; ++i) 2846 { 2847 f11 = c[i + j * c_dim1]; 2848 f12 = c[i + (j + 1) * c_dim1]; 2849 f13 = c[i + (j + 2) * c_dim1]; 2850 f14 = c[i + (j + 3) * c_dim1]; 2851 i6 = ll + lsec - 1; 2852 for (l = ll; l <= i6; ++l) 2853 { 2854 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2855 257] * b[l + j * b_dim1]; 2856 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2857 257] * b[l + (j + 1) * b_dim1]; 2858 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2859 257] * b[l + (j + 2) * b_dim1]; 2860 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2861 257] * b[l + (j + 3) * b_dim1]; 2862 } 2863 c[i + j * c_dim1] = f11; 2864 c[i + (j + 1) * c_dim1] = f12; 2865 c[i + (j + 2) * c_dim1] = f13; 2866 c[i + (j + 3) * c_dim1] = f14; 2867 } 2868 } 2869 } 2870 if (ujsec < jsec) 2871 { 2872 i4 = jj + jsec - 1; 2873 for (j = jj + ujsec; j <= i4; ++j) 2874 { 2875 i5 = ii + uisec - 1; 2876 for (i = ii; i <= i5; i += 4) 2877 { 2878 f11 = c[i + j * c_dim1]; 2879 f21 = c[i + 1 + j * c_dim1]; 2880 f31 = c[i + 2 + j * c_dim1]; 2881 f41 = c[i + 3 + j * c_dim1]; 2882 i6 = ll + lsec - 1; 2883 for (l = ll; l <= i6; ++l) 2884 { 2885 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2886 257] * b[l + j * b_dim1]; 2887 
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 2888 257] * b[l + j * b_dim1]; 2889 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 2890 257] * b[l + j * b_dim1]; 2891 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 2892 257] * b[l + j * b_dim1]; 2893 } 2894 c[i + j * c_dim1] = f11; 2895 c[i + 1 + j * c_dim1] = f21; 2896 c[i + 2 + j * c_dim1] = f31; 2897 c[i + 3 + j * c_dim1] = f41; 2898 } 2899 i5 = ii + isec - 1; 2900 for (i = ii + uisec; i <= i5; ++i) 2901 { 2902 f11 = c[i + j * c_dim1]; 2903 i6 = ll + lsec - 1; 2904 for (l = ll; l <= i6; ++l) 2905 { 2906 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 2907 257] * b[l + j * b_dim1]; 2908 } 2909 c[i + j * c_dim1] = f11; 2910 } 2911 } 2912 } 2913 } 2914 } 2915 } 2916 free(t1); 2917 return; 2918 } 2919 else if (rxstride == 1 && aystride == 1 && bxstride == 1) 2920 { 2921 if (GFC_DESCRIPTOR_RANK (a) != 1) 2922 { 2923 const GFC_INTEGER_2 *restrict abase_x; 2924 const GFC_INTEGER_2 *restrict bbase_y; 2925 GFC_INTEGER_2 *restrict dest_y; 2926 GFC_INTEGER_2 s; 2927 2928 for (y = 0; y < ycount; y++) 2929 { 2930 bbase_y = &bbase[y*bystride]; 2931 dest_y = &dest[y*rystride]; 2932 for (x = 0; x < xcount; x++) 2933 { 2934 abase_x = &abase[x*axstride]; 2935 s = (GFC_INTEGER_2) 0; 2936 for (n = 0; n < count; n++) 2937 s += abase_x[n] * bbase_y[n]; 2938 dest_y[x] = s; 2939 } 2940 } 2941 } 2942 else 2943 { 2944 const GFC_INTEGER_2 *restrict bbase_y; 2945 GFC_INTEGER_2 s; 2946 2947 for (y = 0; y < ycount; y++) 2948 { 2949 bbase_y = &bbase[y*bystride]; 2950 s = (GFC_INTEGER_2) 0; 2951 for (n = 0; n < count; n++) 2952 s += abase[n*axstride] * bbase_y[n]; 2953 dest[y*rystride] = s; 2954 } 2955 } 2956 } 2957 else if (GFC_DESCRIPTOR_RANK (a) == 1) 2958 { 2959 const GFC_INTEGER_2 *restrict bbase_y; 2960 GFC_INTEGER_2 s; 2961 2962 for (y = 0; y < ycount; y++) 2963 { 2964 bbase_y = &bbase[y*bystride]; 2965 s = (GFC_INTEGER_2) 0; 2966 for (n = 0; n < count; n++) 2967 s += abase[n*axstride] * bbase_y[n*bxstride]; 2968 dest[y*rxstride] = s; 2969 } 2970 
} 2971 else if (axstride < aystride) 2972 { 2973 for (y = 0; y < ycount; y++) 2974 for (x = 0; x < xcount; x++) 2975 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; 2976 2977 for (y = 0; y < ycount; y++) 2978 for (n = 0; n < count; n++) 2979 for (x = 0; x < xcount; x++) 2980 /* dest[x,y] += a[x,n] * b[n,y] */ 2981 dest[x*rxstride + y*rystride] += 2982 abase[x*axstride + n*aystride] * 2983 bbase[n*bxstride + y*bystride]; 2984 } 2985 else 2986 { 2987 const GFC_INTEGER_2 *restrict abase_x; 2988 const GFC_INTEGER_2 *restrict bbase_y; 2989 GFC_INTEGER_2 *restrict dest_y; 2990 GFC_INTEGER_2 s; 2991 2992 for (y = 0; y < ycount; y++) 2993 { 2994 bbase_y = &bbase[y*bystride]; 2995 dest_y = &dest[y*rystride]; 2996 for (x = 0; x < xcount; x++) 2997 { 2998 abase_x = &abase[x*axstride]; 2999 s = (GFC_INTEGER_2) 0; 3000 for (n = 0; n < count; n++) 3001 s += abase_x[n*aystride] * bbase_y[n*bxstride]; 3002 dest_y[x*rxstride] = s; 3003 } 3004 } 3005 } 3006} 3007#undef POW3 3008#undef min 3009#undef max 3010 3011#endif 3012#endif 3013 3014