1/* Implementation of the MATMUL intrinsic
2   Copyright (C) 2002-2022 Free Software Foundation, Inc.
3   Contributed by Paul Brook <paul@nowt.org>
4
5This file is part of the GNU Fortran runtime library (libgfortran).
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
10version 3 of the License, or (at your option) any later version.
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
26#include "libgfortran.h"
27#include <string.h>
28#include <assert.h>
29
30
31#if defined (HAVE_GFC_INTEGER_2)
32
33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34   passed to us by the front-end, in which case we call it for large
35   matrices.  */
36
37typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38                          const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
39                          const int *, const GFC_INTEGER_2 *, const int *,
40                          const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
41                          int, int);
42
43/* The order of loops is different in the case of plain matrix
44   multiplication C=MATMUL(A,B), and in the frequent special case where
45   the argument A is the temporary result of a TRANSPOSE intrinsic:
46   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47   looking at their strides.
48
49   The equivalent Fortran pseudo-code is:
50
51   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52   IF (.NOT.IS_TRANSPOSED(A)) THEN
53     C = 0
54     DO J=1,N
55       DO K=1,COUNT
56         DO I=1,M
57           C(I,J) = C(I,J)+A(I,K)*B(K,J)
58   ELSE
59     DO J=1,N
60       DO I=1,M
61         S = 0
62         DO K=1,COUNT
63           S = S+A(I,K)*B(K,J)
64         C(I,J) = S
65   ENDIF
66*/
67
68/* If try_blas is set to a nonzero value, then the matmul function will
69   see if there is a way to perform the matrix multiplication by a call
70   to the BLAS gemm function.  */
71
72extern void matmul_i2 (gfc_array_i2 * const restrict retarray,
73	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
74	int blas_limit, blas_call gemm);
75export_proto(matmul_i2);
76
77/* Put exhaustive list of possible architectures here here, ORed together.  */
78
79#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81#ifdef HAVE_AVX
82static void
83matmul_i2_avx (gfc_array_i2 * const restrict retarray,
84	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
85	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86static void
87matmul_i2_avx (gfc_array_i2 * const restrict retarray,
88	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
89	int blas_limit, blas_call gemm)
90{
91  const GFC_INTEGER_2 * restrict abase;
92  const GFC_INTEGER_2 * restrict bbase;
93  GFC_INTEGER_2 * restrict dest;
94
95  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96  index_type x, y, n, count, xcount, ycount;
97
98  assert (GFC_DESCRIPTOR_RANK (a) == 2
99          || GFC_DESCRIPTOR_RANK (b) == 2);
100
101/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103   Either A or B (but not both) can be rank 1:
104
105   o One-dimensional argument A is implicitly treated as a row matrix
106     dimensioned [1,count], so xcount=1.
107
108   o One-dimensional argument B is implicitly treated as a column matrix
109     dimensioned [count, 1], so ycount=1.
110*/
111
112  if (retarray->base_addr == NULL)
113    {
114      if (GFC_DESCRIPTOR_RANK (a) == 1)
115        {
116	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118        }
119      else if (GFC_DESCRIPTOR_RANK (b) == 1)
120        {
121	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123        }
124      else
125        {
126	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129          GFC_DIMENSION_SET(retarray->dim[1], 0,
130	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132        }
133
134      retarray->base_addr
135	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
136      retarray->offset = 0;
137    }
138  else if (unlikely (compile_options.bounds_check))
139    {
140      index_type ret_extent, arg_extent;
141
142      if (GFC_DESCRIPTOR_RANK (a) == 1)
143	{
144	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146	  if (arg_extent != ret_extent)
147	    runtime_error ("Array bound mismatch for dimension 1 of "
148	    		   "array (%ld/%ld) ",
149			   (long int) ret_extent, (long int) arg_extent);
150	}
151      else if (GFC_DESCRIPTOR_RANK (b) == 1)
152	{
153	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155	  if (arg_extent != ret_extent)
156	    runtime_error ("Array bound mismatch for dimension 1 of "
157	    		   "array (%ld/%ld) ",
158			   (long int) ret_extent, (long int) arg_extent);
159	}
160      else
161	{
162	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164	  if (arg_extent != ret_extent)
165	    runtime_error ("Array bound mismatch for dimension 1 of "
166	    		   "array (%ld/%ld) ",
167			   (long int) ret_extent, (long int) arg_extent);
168
169	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171	  if (arg_extent != ret_extent)
172	    runtime_error ("Array bound mismatch for dimension 2 of "
173	    		   "array (%ld/%ld) ",
174			   (long int) ret_extent, (long int) arg_extent);
175	}
176    }
177
178
179  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180    {
181      /* One-dimensional result may be addressed in the code below
182	 either as a row or a column matrix. We want both cases to
183	 work. */
184      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185    }
186  else
187    {
188      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190    }
191
192
193  if (GFC_DESCRIPTOR_RANK (a) == 1)
194    {
195      /* Treat it as a a row matrix A[1,count]. */
196      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197      aystride = 1;
198
199      xcount = 1;
200      count = GFC_DESCRIPTOR_EXTENT(a,0);
201    }
202  else
203    {
204      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206
207      count = GFC_DESCRIPTOR_EXTENT(a,1);
208      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209    }
210
211  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212    {
213      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215		       "in dimension 1: is %ld, should be %ld",
216		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217    }
218
219  if (GFC_DESCRIPTOR_RANK (b) == 1)
220    {
221      /* Treat it as a column matrix B[count,1] */
222      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224      /* bystride should never be used for 1-dimensional b.
225         The value is only used for calculation of the
226         memory by the buffer.  */
227      bystride = 256;
228      ycount = 1;
229    }
230  else
231    {
232      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235    }
236
237  abase = a->base_addr;
238  bbase = b->base_addr;
239  dest = retarray->base_addr;
240
241  /* Now that everything is set up, we perform the multiplication
242     itself.  */
243
244#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245#define min(a,b) ((a) <= (b) ? (a) : (b))
246#define max(a,b) ((a) >= (b) ? (a) : (b))
247
248  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249      && (bxstride == 1 || bystride == 1)
250      && (((float) xcount) * ((float) ycount) * ((float) count)
251          > POW3(blas_limit)))
252    {
253      const int m = xcount, n = ycount, k = count, ldc = rystride;
254      const GFC_INTEGER_2 one = 1, zero = 0;
255      const int lda = (axstride == 1) ? aystride : axstride,
256		ldb = (bxstride == 1) ? bystride : bxstride;
257
258      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259	{
260	  assert (gemm != NULL);
261	  const char *transa, *transb;
262	  if (try_blas & 2)
263	    transa = "C";
264	  else
265	    transa = axstride == 1 ? "N" : "T";
266
267	  if (try_blas & 4)
268	    transb = "C";
269	  else
270	    transb = bxstride == 1 ? "N" : "T";
271
272	  gemm (transa, transb , &m,
273		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
274		&ldc, 1, 1);
275	  return;
276	}
277    }
278
279  if (rxstride == 1 && axstride == 1 && bxstride == 1
280      && GFC_DESCRIPTOR_RANK (b) != 1)
281    {
282      /* This block of code implements a tuned matmul, derived from
283         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
284
285               Bo Kagstrom and Per Ling
286               Department of Computing Science
287               Umea University
288               S-901 87 Umea, Sweden
289
290	 from netlib.org, translated to C, and modified for matmul.m4.  */
291
292      const GFC_INTEGER_2 *a, *b;
293      GFC_INTEGER_2 *c;
294      const index_type m = xcount, n = ycount, k = count;
295
296      /* System generated locals */
297      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298		 i1, i2, i3, i4, i5, i6;
299
300      /* Local variables */
301      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
302		 f13, f14, f23, f24, f33, f34, f43, f44;
303      index_type i, j, l, ii, jj, ll;
304      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305      GFC_INTEGER_2 *t1;
306
307      a = abase;
308      b = bbase;
309      c = retarray->base_addr;
310
311      /* Parameter adjustments */
312      c_dim1 = rystride;
313      c_offset = 1 + c_dim1;
314      c -= c_offset;
315      a_dim1 = aystride;
316      a_offset = 1 + a_dim1;
317      a -= a_offset;
318      b_dim1 = bystride;
319      b_offset = 1 + b_dim1;
320      b -= b_offset;
321
322      /* Empty c first.  */
323      for (j=1; j<=n; j++)
324	for (i=1; i<=m; i++)
325	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
326
327      /* Early exit if possible */
328      if (m == 0 || n == 0 || k == 0)
329	return;
330
331      /* Adjust size of t1 to what is needed.  */
332      index_type t1_dim, a_sz;
333      if (aystride == 1)
334        a_sz = rystride;
335      else
336        a_sz = a_dim1;
337
338      t1_dim = a_sz * 256 + b_dim1;
339      if (t1_dim > 65536)
340	t1_dim = 65536;
341
342      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
343
344      /* Start turning the crank. */
345      i1 = n;
346      for (jj = 1; jj <= i1; jj += 512)
347	{
348	  /* Computing MIN */
349	  i2 = 512;
350	  i3 = n - jj + 1;
351	  jsec = min(i2,i3);
352	  ujsec = jsec - jsec % 4;
353	  i2 = k;
354	  for (ll = 1; ll <= i2; ll += 256)
355	    {
356	      /* Computing MIN */
357	      i3 = 256;
358	      i4 = k - ll + 1;
359	      lsec = min(i3,i4);
360	      ulsec = lsec - lsec % 2;
361
362	      i3 = m;
363	      for (ii = 1; ii <= i3; ii += 256)
364		{
365		  /* Computing MIN */
366		  i4 = 256;
367		  i5 = m - ii + 1;
368		  isec = min(i4,i5);
369		  uisec = isec - isec % 2;
370		  i4 = ll + ulsec - 1;
371		  for (l = ll; l <= i4; l += 2)
372		    {
373		      i5 = ii + uisec - 1;
374		      for (i = ii; i <= i5; i += 2)
375			{
376			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377					a[i + l * a_dim1];
378			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379					a[i + (l + 1) * a_dim1];
380			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381					a[i + 1 + l * a_dim1];
382			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383					a[i + 1 + (l + 1) * a_dim1];
384			}
385		      if (uisec < isec)
386			{
387			  t1[l - ll + 1 + (isec << 8) - 257] =
388				    a[ii + isec - 1 + l * a_dim1];
389			  t1[l - ll + 2 + (isec << 8) - 257] =
390				    a[ii + isec - 1 + (l + 1) * a_dim1];
391			}
392		    }
393		  if (ulsec < lsec)
394		    {
395		      i4 = ii + isec - 1;
396		      for (i = ii; i<= i4; ++i)
397			{
398			  t1[lsec + ((i - ii + 1) << 8) - 257] =
399				    a[i + (ll + lsec - 1) * a_dim1];
400			}
401		    }
402
403		  uisec = isec - isec % 4;
404		  i4 = jj + ujsec - 1;
405		  for (j = jj; j <= i4; j += 4)
406		    {
407		      i5 = ii + uisec - 1;
408		      for (i = ii; i <= i5; i += 4)
409			{
410			  f11 = c[i + j * c_dim1];
411			  f21 = c[i + 1 + j * c_dim1];
412			  f12 = c[i + (j + 1) * c_dim1];
413			  f22 = c[i + 1 + (j + 1) * c_dim1];
414			  f13 = c[i + (j + 2) * c_dim1];
415			  f23 = c[i + 1 + (j + 2) * c_dim1];
416			  f14 = c[i + (j + 3) * c_dim1];
417			  f24 = c[i + 1 + (j + 3) * c_dim1];
418			  f31 = c[i + 2 + j * c_dim1];
419			  f41 = c[i + 3 + j * c_dim1];
420			  f32 = c[i + 2 + (j + 1) * c_dim1];
421			  f42 = c[i + 3 + (j + 1) * c_dim1];
422			  f33 = c[i + 2 + (j + 2) * c_dim1];
423			  f43 = c[i + 3 + (j + 2) * c_dim1];
424			  f34 = c[i + 2 + (j + 3) * c_dim1];
425			  f44 = c[i + 3 + (j + 3) * c_dim1];
426			  i6 = ll + lsec - 1;
427			  for (l = ll; l <= i6; ++l)
428			    {
429			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430				      * b[l + j * b_dim1];
431			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432				      * b[l + j * b_dim1];
433			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434				      * b[l + (j + 1) * b_dim1];
435			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436				      * b[l + (j + 1) * b_dim1];
437			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438				      * b[l + (j + 2) * b_dim1];
439			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440				      * b[l + (j + 2) * b_dim1];
441			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442				      * b[l + (j + 3) * b_dim1];
443			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444				      * b[l + (j + 3) * b_dim1];
445			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446				      * b[l + j * b_dim1];
447			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448				      * b[l + j * b_dim1];
449			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450				      * b[l + (j + 1) * b_dim1];
451			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452				      * b[l + (j + 1) * b_dim1];
453			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454				      * b[l + (j + 2) * b_dim1];
455			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456				      * b[l + (j + 2) * b_dim1];
457			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458				      * b[l + (j + 3) * b_dim1];
459			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460				      * b[l + (j + 3) * b_dim1];
461			    }
462			  c[i + j * c_dim1] = f11;
463			  c[i + 1 + j * c_dim1] = f21;
464			  c[i + (j + 1) * c_dim1] = f12;
465			  c[i + 1 + (j + 1) * c_dim1] = f22;
466			  c[i + (j + 2) * c_dim1] = f13;
467			  c[i + 1 + (j + 2) * c_dim1] = f23;
468			  c[i + (j + 3) * c_dim1] = f14;
469			  c[i + 1 + (j + 3) * c_dim1] = f24;
470			  c[i + 2 + j * c_dim1] = f31;
471			  c[i + 3 + j * c_dim1] = f41;
472			  c[i + 2 + (j + 1) * c_dim1] = f32;
473			  c[i + 3 + (j + 1) * c_dim1] = f42;
474			  c[i + 2 + (j + 2) * c_dim1] = f33;
475			  c[i + 3 + (j + 2) * c_dim1] = f43;
476			  c[i + 2 + (j + 3) * c_dim1] = f34;
477			  c[i + 3 + (j + 3) * c_dim1] = f44;
478			}
479		      if (uisec < isec)
480			{
481			  i5 = ii + isec - 1;
482			  for (i = ii + uisec; i <= i5; ++i)
483			    {
484			      f11 = c[i + j * c_dim1];
485			      f12 = c[i + (j + 1) * c_dim1];
486			      f13 = c[i + (j + 2) * c_dim1];
487			      f14 = c[i + (j + 3) * c_dim1];
488			      i6 = ll + lsec - 1;
489			      for (l = ll; l <= i6; ++l)
490				{
491				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492					  257] * b[l + j * b_dim1];
493				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494					  257] * b[l + (j + 1) * b_dim1];
495				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496					  257] * b[l + (j + 2) * b_dim1];
497				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498					  257] * b[l + (j + 3) * b_dim1];
499				}
500			      c[i + j * c_dim1] = f11;
501			      c[i + (j + 1) * c_dim1] = f12;
502			      c[i + (j + 2) * c_dim1] = f13;
503			      c[i + (j + 3) * c_dim1] = f14;
504			    }
505			}
506		    }
507		  if (ujsec < jsec)
508		    {
509		      i4 = jj + jsec - 1;
510		      for (j = jj + ujsec; j <= i4; ++j)
511			{
512			  i5 = ii + uisec - 1;
513			  for (i = ii; i <= i5; i += 4)
514			    {
515			      f11 = c[i + j * c_dim1];
516			      f21 = c[i + 1 + j * c_dim1];
517			      f31 = c[i + 2 + j * c_dim1];
518			      f41 = c[i + 3 + j * c_dim1];
519			      i6 = ll + lsec - 1;
520			      for (l = ll; l <= i6; ++l)
521				{
522				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523					  257] * b[l + j * b_dim1];
524				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525					  257] * b[l + j * b_dim1];
526				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527					  257] * b[l + j * b_dim1];
528				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529					  257] * b[l + j * b_dim1];
530				}
531			      c[i + j * c_dim1] = f11;
532			      c[i + 1 + j * c_dim1] = f21;
533			      c[i + 2 + j * c_dim1] = f31;
534			      c[i + 3 + j * c_dim1] = f41;
535			    }
536			  i5 = ii + isec - 1;
537			  for (i = ii + uisec; i <= i5; ++i)
538			    {
539			      f11 = c[i + j * c_dim1];
540			      i6 = ll + lsec - 1;
541			      for (l = ll; l <= i6; ++l)
542				{
543				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544					  257] * b[l + j * b_dim1];
545				}
546			      c[i + j * c_dim1] = f11;
547			    }
548			}
549		    }
550		}
551	    }
552	}
553      free(t1);
554      return;
555    }
556  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557    {
558      if (GFC_DESCRIPTOR_RANK (a) != 1)
559	{
560	  const GFC_INTEGER_2 *restrict abase_x;
561	  const GFC_INTEGER_2 *restrict bbase_y;
562	  GFC_INTEGER_2 *restrict dest_y;
563	  GFC_INTEGER_2 s;
564
565	  for (y = 0; y < ycount; y++)
566	    {
567	      bbase_y = &bbase[y*bystride];
568	      dest_y = &dest[y*rystride];
569	      for (x = 0; x < xcount; x++)
570		{
571		  abase_x = &abase[x*axstride];
572		  s = (GFC_INTEGER_2) 0;
573		  for (n = 0; n < count; n++)
574		    s += abase_x[n] * bbase_y[n];
575		  dest_y[x] = s;
576		}
577	    }
578	}
579      else
580	{
581	  const GFC_INTEGER_2 *restrict bbase_y;
582	  GFC_INTEGER_2 s;
583
584	  for (y = 0; y < ycount; y++)
585	    {
586	      bbase_y = &bbase[y*bystride];
587	      s = (GFC_INTEGER_2) 0;
588	      for (n = 0; n < count; n++)
589		s += abase[n*axstride] * bbase_y[n];
590	      dest[y*rystride] = s;
591	    }
592	}
593    }
594  else if (GFC_DESCRIPTOR_RANK (a) == 1)
595    {
596      const GFC_INTEGER_2 *restrict bbase_y;
597      GFC_INTEGER_2 s;
598
599      for (y = 0; y < ycount; y++)
600	{
601	  bbase_y = &bbase[y*bystride];
602	  s = (GFC_INTEGER_2) 0;
603	  for (n = 0; n < count; n++)
604	    s += abase[n*axstride] * bbase_y[n*bxstride];
605	  dest[y*rxstride] = s;
606	}
607    }
608  else if (axstride < aystride)
609    {
610      for (y = 0; y < ycount; y++)
611	for (x = 0; x < xcount; x++)
612	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
613
614      for (y = 0; y < ycount; y++)
615	for (n = 0; n < count; n++)
616	  for (x = 0; x < xcount; x++)
617	    /* dest[x,y] += a[x,n] * b[n,y] */
618	    dest[x*rxstride + y*rystride] +=
619					abase[x*axstride + n*aystride] *
620					bbase[n*bxstride + y*bystride];
621    }
622  else
623    {
624      const GFC_INTEGER_2 *restrict abase_x;
625      const GFC_INTEGER_2 *restrict bbase_y;
626      GFC_INTEGER_2 *restrict dest_y;
627      GFC_INTEGER_2 s;
628
629      for (y = 0; y < ycount; y++)
630	{
631	  bbase_y = &bbase[y*bystride];
632	  dest_y = &dest[y*rystride];
633	  for (x = 0; x < xcount; x++)
634	    {
635	      abase_x = &abase[x*axstride];
636	      s = (GFC_INTEGER_2) 0;
637	      for (n = 0; n < count; n++)
638		s += abase_x[n*aystride] * bbase_y[n*bxstride];
639	      dest_y[x*rxstride] = s;
640	    }
641	}
642    }
643}
644#undef POW3
645#undef min
646#undef max
647
648#endif /* HAVE_AVX */
649
650#ifdef HAVE_AVX2
651static void
652matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
653	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
654	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655static void
656matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
657	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
658	int blas_limit, blas_call gemm)
659{
660  const GFC_INTEGER_2 * restrict abase;
661  const GFC_INTEGER_2 * restrict bbase;
662  GFC_INTEGER_2 * restrict dest;
663
664  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665  index_type x, y, n, count, xcount, ycount;
666
667  assert (GFC_DESCRIPTOR_RANK (a) == 2
668          || GFC_DESCRIPTOR_RANK (b) == 2);
669
670/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671
672   Either A or B (but not both) can be rank 1:
673
674   o One-dimensional argument A is implicitly treated as a row matrix
675     dimensioned [1,count], so xcount=1.
676
677   o One-dimensional argument B is implicitly treated as a column matrix
678     dimensioned [count, 1], so ycount=1.
679*/
680
681  if (retarray->base_addr == NULL)
682    {
683      if (GFC_DESCRIPTOR_RANK (a) == 1)
684        {
685	  GFC_DIMENSION_SET(retarray->dim[0], 0,
686	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687        }
688      else if (GFC_DESCRIPTOR_RANK (b) == 1)
689        {
690	  GFC_DIMENSION_SET(retarray->dim[0], 0,
691	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692        }
693      else
694        {
695	  GFC_DIMENSION_SET(retarray->dim[0], 0,
696	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697
698          GFC_DIMENSION_SET(retarray->dim[1], 0,
699	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700			    GFC_DESCRIPTOR_EXTENT(retarray,0));
701        }
702
703      retarray->base_addr
704	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
705      retarray->offset = 0;
706    }
707  else if (unlikely (compile_options.bounds_check))
708    {
709      index_type ret_extent, arg_extent;
710
711      if (GFC_DESCRIPTOR_RANK (a) == 1)
712	{
713	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715	  if (arg_extent != ret_extent)
716	    runtime_error ("Array bound mismatch for dimension 1 of "
717	    		   "array (%ld/%ld) ",
718			   (long int) ret_extent, (long int) arg_extent);
719	}
720      else if (GFC_DESCRIPTOR_RANK (b) == 1)
721	{
722	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724	  if (arg_extent != ret_extent)
725	    runtime_error ("Array bound mismatch for dimension 1 of "
726	    		   "array (%ld/%ld) ",
727			   (long int) ret_extent, (long int) arg_extent);
728	}
729      else
730	{
731	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733	  if (arg_extent != ret_extent)
734	    runtime_error ("Array bound mismatch for dimension 1 of "
735	    		   "array (%ld/%ld) ",
736			   (long int) ret_extent, (long int) arg_extent);
737
738	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740	  if (arg_extent != ret_extent)
741	    runtime_error ("Array bound mismatch for dimension 2 of "
742	    		   "array (%ld/%ld) ",
743			   (long int) ret_extent, (long int) arg_extent);
744	}
745    }
746
747
748  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749    {
750      /* One-dimensional result may be addressed in the code below
751	 either as a row or a column matrix. We want both cases to
752	 work. */
753      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754    }
755  else
756    {
757      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759    }
760
761
762  if (GFC_DESCRIPTOR_RANK (a) == 1)
763    {
764      /* Treat it as a a row matrix A[1,count]. */
765      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766      aystride = 1;
767
768      xcount = 1;
769      count = GFC_DESCRIPTOR_EXTENT(a,0);
770    }
771  else
772    {
773      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775
776      count = GFC_DESCRIPTOR_EXTENT(a,1);
777      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778    }
779
780  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781    {
782      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784		       "in dimension 1: is %ld, should be %ld",
785		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786    }
787
788  if (GFC_DESCRIPTOR_RANK (b) == 1)
789    {
790      /* Treat it as a column matrix B[count,1] */
791      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792
793      /* bystride should never be used for 1-dimensional b.
794         The value is only used for calculation of the
795         memory by the buffer.  */
796      bystride = 256;
797      ycount = 1;
798    }
799  else
800    {
801      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804    }
805
806  abase = a->base_addr;
807  bbase = b->base_addr;
808  dest = retarray->base_addr;
809
810  /* Now that everything is set up, we perform the multiplication
811     itself.  */
812
813#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814#define min(a,b) ((a) <= (b) ? (a) : (b))
815#define max(a,b) ((a) >= (b) ? (a) : (b))
816
817  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818      && (bxstride == 1 || bystride == 1)
819      && (((float) xcount) * ((float) ycount) * ((float) count)
820          > POW3(blas_limit)))
821    {
822      const int m = xcount, n = ycount, k = count, ldc = rystride;
823      const GFC_INTEGER_2 one = 1, zero = 0;
824      const int lda = (axstride == 1) ? aystride : axstride,
825		ldb = (bxstride == 1) ? bystride : bxstride;
826
827      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828	{
829	  assert (gemm != NULL);
830	  const char *transa, *transb;
831	  if (try_blas & 2)
832	    transa = "C";
833	  else
834	    transa = axstride == 1 ? "N" : "T";
835
836	  if (try_blas & 4)
837	    transb = "C";
838	  else
839	    transb = bxstride == 1 ? "N" : "T";
840
841	  gemm (transa, transb , &m,
842		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
843		&ldc, 1, 1);
844	  return;
845	}
846    }
847
848  if (rxstride == 1 && axstride == 1 && bxstride == 1
849      && GFC_DESCRIPTOR_RANK (b) != 1)
850    {
851      /* This block of code implements a tuned matmul, derived from
852         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
853
854               Bo Kagstrom and Per Ling
855               Department of Computing Science
856               Umea University
857               S-901 87 Umea, Sweden
858
859	 from netlib.org, translated to C, and modified for matmul.m4.  */
860
861      const GFC_INTEGER_2 *a, *b;
862      GFC_INTEGER_2 *c;
863      const index_type m = xcount, n = ycount, k = count;
864
865      /* System generated locals */
866      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867		 i1, i2, i3, i4, i5, i6;
868
869      /* Local variables */
870      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
871		 f13, f14, f23, f24, f33, f34, f43, f44;
872      index_type i, j, l, ii, jj, ll;
873      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874      GFC_INTEGER_2 *t1;
875
876      a = abase;
877      b = bbase;
878      c = retarray->base_addr;
879
880      /* Parameter adjustments */
881      c_dim1 = rystride;
882      c_offset = 1 + c_dim1;
883      c -= c_offset;
884      a_dim1 = aystride;
885      a_offset = 1 + a_dim1;
886      a -= a_offset;
887      b_dim1 = bystride;
888      b_offset = 1 + b_dim1;
889      b -= b_offset;
890
891      /* Empty c first.  */
892      for (j=1; j<=n; j++)
893	for (i=1; i<=m; i++)
894	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
895
896      /* Early exit if possible */
897      if (m == 0 || n == 0 || k == 0)
898	return;
899
900      /* Adjust size of t1 to what is needed.  */
901      index_type t1_dim, a_sz;
902      if (aystride == 1)
903        a_sz = rystride;
904      else
905        a_sz = a_dim1;
906
907      t1_dim = a_sz * 256 + b_dim1;
908      if (t1_dim > 65536)
909	t1_dim = 65536;
910
911      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
912
913      /* Start turning the crank. */
914      i1 = n;
915      for (jj = 1; jj <= i1; jj += 512)
916	{
917	  /* Computing MIN */
918	  i2 = 512;
919	  i3 = n - jj + 1;
920	  jsec = min(i2,i3);
921	  ujsec = jsec - jsec % 4;
922	  i2 = k;
923	  for (ll = 1; ll <= i2; ll += 256)
924	    {
925	      /* Computing MIN */
926	      i3 = 256;
927	      i4 = k - ll + 1;
928	      lsec = min(i3,i4);
929	      ulsec = lsec - lsec % 2;
930
931	      i3 = m;
932	      for (ii = 1; ii <= i3; ii += 256)
933		{
934		  /* Computing MIN */
935		  i4 = 256;
936		  i5 = m - ii + 1;
937		  isec = min(i4,i5);
938		  uisec = isec - isec % 2;
939		  i4 = ll + ulsec - 1;
940		  for (l = ll; l <= i4; l += 2)
941		    {
942		      i5 = ii + uisec - 1;
943		      for (i = ii; i <= i5; i += 2)
944			{
945			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946					a[i + l * a_dim1];
947			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948					a[i + (l + 1) * a_dim1];
949			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950					a[i + 1 + l * a_dim1];
951			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952					a[i + 1 + (l + 1) * a_dim1];
953			}
954		      if (uisec < isec)
955			{
956			  t1[l - ll + 1 + (isec << 8) - 257] =
957				    a[ii + isec - 1 + l * a_dim1];
958			  t1[l - ll + 2 + (isec << 8) - 257] =
959				    a[ii + isec - 1 + (l + 1) * a_dim1];
960			}
961		    }
962		  if (ulsec < lsec)
963		    {
964		      i4 = ii + isec - 1;
965		      for (i = ii; i<= i4; ++i)
966			{
967			  t1[lsec + ((i - ii + 1) << 8) - 257] =
968				    a[i + (ll + lsec - 1) * a_dim1];
969			}
970		    }
971
972		  uisec = isec - isec % 4;
973		  i4 = jj + ujsec - 1;
974		  for (j = jj; j <= i4; j += 4)
975		    {
976		      i5 = ii + uisec - 1;
977		      for (i = ii; i <= i5; i += 4)
978			{
979			  f11 = c[i + j * c_dim1];
980			  f21 = c[i + 1 + j * c_dim1];
981			  f12 = c[i + (j + 1) * c_dim1];
982			  f22 = c[i + 1 + (j + 1) * c_dim1];
983			  f13 = c[i + (j + 2) * c_dim1];
984			  f23 = c[i + 1 + (j + 2) * c_dim1];
985			  f14 = c[i + (j + 3) * c_dim1];
986			  f24 = c[i + 1 + (j + 3) * c_dim1];
987			  f31 = c[i + 2 + j * c_dim1];
988			  f41 = c[i + 3 + j * c_dim1];
989			  f32 = c[i + 2 + (j + 1) * c_dim1];
990			  f42 = c[i + 3 + (j + 1) * c_dim1];
991			  f33 = c[i + 2 + (j + 2) * c_dim1];
992			  f43 = c[i + 3 + (j + 2) * c_dim1];
993			  f34 = c[i + 2 + (j + 3) * c_dim1];
994			  f44 = c[i + 3 + (j + 3) * c_dim1];
995			  i6 = ll + lsec - 1;
996			  for (l = ll; l <= i6; ++l)
997			    {
998			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999				      * b[l + j * b_dim1];
1000			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001				      * b[l + j * b_dim1];
1002			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003				      * b[l + (j + 1) * b_dim1];
1004			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005				      * b[l + (j + 1) * b_dim1];
1006			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007				      * b[l + (j + 2) * b_dim1];
1008			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009				      * b[l + (j + 2) * b_dim1];
1010			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011				      * b[l + (j + 3) * b_dim1];
1012			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013				      * b[l + (j + 3) * b_dim1];
1014			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015				      * b[l + j * b_dim1];
1016			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017				      * b[l + j * b_dim1];
1018			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019				      * b[l + (j + 1) * b_dim1];
1020			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021				      * b[l + (j + 1) * b_dim1];
1022			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023				      * b[l + (j + 2) * b_dim1];
1024			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025				      * b[l + (j + 2) * b_dim1];
1026			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027				      * b[l + (j + 3) * b_dim1];
1028			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029				      * b[l + (j + 3) * b_dim1];
1030			    }
1031			  c[i + j * c_dim1] = f11;
1032			  c[i + 1 + j * c_dim1] = f21;
1033			  c[i + (j + 1) * c_dim1] = f12;
1034			  c[i + 1 + (j + 1) * c_dim1] = f22;
1035			  c[i + (j + 2) * c_dim1] = f13;
1036			  c[i + 1 + (j + 2) * c_dim1] = f23;
1037			  c[i + (j + 3) * c_dim1] = f14;
1038			  c[i + 1 + (j + 3) * c_dim1] = f24;
1039			  c[i + 2 + j * c_dim1] = f31;
1040			  c[i + 3 + j * c_dim1] = f41;
1041			  c[i + 2 + (j + 1) * c_dim1] = f32;
1042			  c[i + 3 + (j + 1) * c_dim1] = f42;
1043			  c[i + 2 + (j + 2) * c_dim1] = f33;
1044			  c[i + 3 + (j + 2) * c_dim1] = f43;
1045			  c[i + 2 + (j + 3) * c_dim1] = f34;
1046			  c[i + 3 + (j + 3) * c_dim1] = f44;
1047			}
1048		      if (uisec < isec)
1049			{
1050			  i5 = ii + isec - 1;
1051			  for (i = ii + uisec; i <= i5; ++i)
1052			    {
1053			      f11 = c[i + j * c_dim1];
1054			      f12 = c[i + (j + 1) * c_dim1];
1055			      f13 = c[i + (j + 2) * c_dim1];
1056			      f14 = c[i + (j + 3) * c_dim1];
1057			      i6 = ll + lsec - 1;
1058			      for (l = ll; l <= i6; ++l)
1059				{
1060				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061					  257] * b[l + j * b_dim1];
1062				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063					  257] * b[l + (j + 1) * b_dim1];
1064				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065					  257] * b[l + (j + 2) * b_dim1];
1066				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067					  257] * b[l + (j + 3) * b_dim1];
1068				}
1069			      c[i + j * c_dim1] = f11;
1070			      c[i + (j + 1) * c_dim1] = f12;
1071			      c[i + (j + 2) * c_dim1] = f13;
1072			      c[i + (j + 3) * c_dim1] = f14;
1073			    }
1074			}
1075		    }
1076		  if (ujsec < jsec)
1077		    {
1078		      i4 = jj + jsec - 1;
1079		      for (j = jj + ujsec; j <= i4; ++j)
1080			{
1081			  i5 = ii + uisec - 1;
1082			  for (i = ii; i <= i5; i += 4)
1083			    {
1084			      f11 = c[i + j * c_dim1];
1085			      f21 = c[i + 1 + j * c_dim1];
1086			      f31 = c[i + 2 + j * c_dim1];
1087			      f41 = c[i + 3 + j * c_dim1];
1088			      i6 = ll + lsec - 1;
1089			      for (l = ll; l <= i6; ++l)
1090				{
1091				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092					  257] * b[l + j * b_dim1];
1093				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094					  257] * b[l + j * b_dim1];
1095				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096					  257] * b[l + j * b_dim1];
1097				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098					  257] * b[l + j * b_dim1];
1099				}
1100			      c[i + j * c_dim1] = f11;
1101			      c[i + 1 + j * c_dim1] = f21;
1102			      c[i + 2 + j * c_dim1] = f31;
1103			      c[i + 3 + j * c_dim1] = f41;
1104			    }
1105			  i5 = ii + isec - 1;
1106			  for (i = ii + uisec; i <= i5; ++i)
1107			    {
1108			      f11 = c[i + j * c_dim1];
1109			      i6 = ll + lsec - 1;
1110			      for (l = ll; l <= i6; ++l)
1111				{
1112				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113					  257] * b[l + j * b_dim1];
1114				}
1115			      c[i + j * c_dim1] = f11;
1116			    }
1117			}
1118		    }
1119		}
1120	    }
1121	}
1122      free(t1);
1123      return;
1124    }
1125  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126    {
1127      if (GFC_DESCRIPTOR_RANK (a) != 1)
1128	{
1129	  const GFC_INTEGER_2 *restrict abase_x;
1130	  const GFC_INTEGER_2 *restrict bbase_y;
1131	  GFC_INTEGER_2 *restrict dest_y;
1132	  GFC_INTEGER_2 s;
1133
1134	  for (y = 0; y < ycount; y++)
1135	    {
1136	      bbase_y = &bbase[y*bystride];
1137	      dest_y = &dest[y*rystride];
1138	      for (x = 0; x < xcount; x++)
1139		{
1140		  abase_x = &abase[x*axstride];
1141		  s = (GFC_INTEGER_2) 0;
1142		  for (n = 0; n < count; n++)
1143		    s += abase_x[n] * bbase_y[n];
1144		  dest_y[x] = s;
1145		}
1146	    }
1147	}
1148      else
1149	{
1150	  const GFC_INTEGER_2 *restrict bbase_y;
1151	  GFC_INTEGER_2 s;
1152
1153	  for (y = 0; y < ycount; y++)
1154	    {
1155	      bbase_y = &bbase[y*bystride];
1156	      s = (GFC_INTEGER_2) 0;
1157	      for (n = 0; n < count; n++)
1158		s += abase[n*axstride] * bbase_y[n];
1159	      dest[y*rystride] = s;
1160	    }
1161	}
1162    }
1163  else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164    {
1165      const GFC_INTEGER_2 *restrict bbase_y;
1166      GFC_INTEGER_2 s;
1167
1168      for (y = 0; y < ycount; y++)
1169	{
1170	  bbase_y = &bbase[y*bystride];
1171	  s = (GFC_INTEGER_2) 0;
1172	  for (n = 0; n < count; n++)
1173	    s += abase[n*axstride] * bbase_y[n*bxstride];
1174	  dest[y*rxstride] = s;
1175	}
1176    }
1177  else if (axstride < aystride)
1178    {
1179      for (y = 0; y < ycount; y++)
1180	for (x = 0; x < xcount; x++)
1181	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1182
1183      for (y = 0; y < ycount; y++)
1184	for (n = 0; n < count; n++)
1185	  for (x = 0; x < xcount; x++)
1186	    /* dest[x,y] += a[x,n] * b[n,y] */
1187	    dest[x*rxstride + y*rystride] +=
1188					abase[x*axstride + n*aystride] *
1189					bbase[n*bxstride + y*bystride];
1190    }
1191  else
1192    {
1193      const GFC_INTEGER_2 *restrict abase_x;
1194      const GFC_INTEGER_2 *restrict bbase_y;
1195      GFC_INTEGER_2 *restrict dest_y;
1196      GFC_INTEGER_2 s;
1197
1198      for (y = 0; y < ycount; y++)
1199	{
1200	  bbase_y = &bbase[y*bystride];
1201	  dest_y = &dest[y*rystride];
1202	  for (x = 0; x < xcount; x++)
1203	    {
1204	      abase_x = &abase[x*axstride];
1205	      s = (GFC_INTEGER_2) 0;
1206	      for (n = 0; n < count; n++)
1207		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208	      dest_y[x*rxstride] = s;
1209	    }
1210	}
1211    }
1212}
1213#undef POW3
1214#undef min
1215#undef max
1216
1217#endif /* HAVE_AVX2 */
1218
1219#ifdef HAVE_AVX512F
1220static void
1221matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
1222	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1223	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224static void
1225matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
1226	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1227	int blas_limit, blas_call gemm)
1228{
1229  const GFC_INTEGER_2 * restrict abase;
1230  const GFC_INTEGER_2 * restrict bbase;
1231  GFC_INTEGER_2 * restrict dest;
1232
1233  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234  index_type x, y, n, count, xcount, ycount;
1235
1236  assert (GFC_DESCRIPTOR_RANK (a) == 2
1237          || GFC_DESCRIPTOR_RANK (b) == 2);
1238
1239/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240
1241   Either A or B (but not both) can be rank 1:
1242
1243   o One-dimensional argument A is implicitly treated as a row matrix
1244     dimensioned [1,count], so xcount=1.
1245
1246   o One-dimensional argument B is implicitly treated as a column matrix
1247     dimensioned [count, 1], so ycount=1.
1248*/
1249
1250  if (retarray->base_addr == NULL)
1251    {
1252      if (GFC_DESCRIPTOR_RANK (a) == 1)
1253        {
1254	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1255	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256        }
1257      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258        {
1259	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1260	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261        }
1262      else
1263        {
1264	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1265	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266
1267          GFC_DIMENSION_SET(retarray->dim[1], 0,
1268	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1270        }
1271
1272      retarray->base_addr
1273	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
1274      retarray->offset = 0;
1275    }
1276  else if (unlikely (compile_options.bounds_check))
1277    {
1278      index_type ret_extent, arg_extent;
1279
1280      if (GFC_DESCRIPTOR_RANK (a) == 1)
1281	{
1282	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284	  if (arg_extent != ret_extent)
1285	    runtime_error ("Array bound mismatch for dimension 1 of "
1286	    		   "array (%ld/%ld) ",
1287			   (long int) ret_extent, (long int) arg_extent);
1288	}
1289      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290	{
1291	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293	  if (arg_extent != ret_extent)
1294	    runtime_error ("Array bound mismatch for dimension 1 of "
1295	    		   "array (%ld/%ld) ",
1296			   (long int) ret_extent, (long int) arg_extent);
1297	}
1298      else
1299	{
1300	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302	  if (arg_extent != ret_extent)
1303	    runtime_error ("Array bound mismatch for dimension 1 of "
1304	    		   "array (%ld/%ld) ",
1305			   (long int) ret_extent, (long int) arg_extent);
1306
1307	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309	  if (arg_extent != ret_extent)
1310	    runtime_error ("Array bound mismatch for dimension 2 of "
1311	    		   "array (%ld/%ld) ",
1312			   (long int) ret_extent, (long int) arg_extent);
1313	}
1314    }
1315
1316
1317  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318    {
1319      /* One-dimensional result may be addressed in the code below
1320	 either as a row or a column matrix. We want both cases to
1321	 work. */
1322      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323    }
1324  else
1325    {
1326      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328    }
1329
1330
1331  if (GFC_DESCRIPTOR_RANK (a) == 1)
1332    {
1333      /* Treat it as a a row matrix A[1,count]. */
1334      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335      aystride = 1;
1336
1337      xcount = 1;
1338      count = GFC_DESCRIPTOR_EXTENT(a,0);
1339    }
1340  else
1341    {
1342      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344
1345      count = GFC_DESCRIPTOR_EXTENT(a,1);
1346      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347    }
1348
1349  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350    {
1351      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353		       "in dimension 1: is %ld, should be %ld",
1354		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355    }
1356
1357  if (GFC_DESCRIPTOR_RANK (b) == 1)
1358    {
1359      /* Treat it as a column matrix B[count,1] */
1360      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361
1362      /* bystride should never be used for 1-dimensional b.
1363         The value is only used for calculation of the
1364         memory by the buffer.  */
1365      bystride = 256;
1366      ycount = 1;
1367    }
1368  else
1369    {
1370      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373    }
1374
1375  abase = a->base_addr;
1376  bbase = b->base_addr;
1377  dest = retarray->base_addr;
1378
1379  /* Now that everything is set up, we perform the multiplication
1380     itself.  */
1381
1382#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383#define min(a,b) ((a) <= (b) ? (a) : (b))
1384#define max(a,b) ((a) >= (b) ? (a) : (b))
1385
1386  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387      && (bxstride == 1 || bystride == 1)
1388      && (((float) xcount) * ((float) ycount) * ((float) count)
1389          > POW3(blas_limit)))
1390    {
1391      const int m = xcount, n = ycount, k = count, ldc = rystride;
1392      const GFC_INTEGER_2 one = 1, zero = 0;
1393      const int lda = (axstride == 1) ? aystride : axstride,
1394		ldb = (bxstride == 1) ? bystride : bxstride;
1395
1396      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397	{
1398	  assert (gemm != NULL);
1399	  const char *transa, *transb;
1400	  if (try_blas & 2)
1401	    transa = "C";
1402	  else
1403	    transa = axstride == 1 ? "N" : "T";
1404
1405	  if (try_blas & 4)
1406	    transb = "C";
1407	  else
1408	    transb = bxstride == 1 ? "N" : "T";
1409
1410	  gemm (transa, transb , &m,
1411		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1412		&ldc, 1, 1);
1413	  return;
1414	}
1415    }
1416
1417  if (rxstride == 1 && axstride == 1 && bxstride == 1
1418      && GFC_DESCRIPTOR_RANK (b) != 1)
1419    {
1420      /* This block of code implements a tuned matmul, derived from
1421         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1422
1423               Bo Kagstrom and Per Ling
1424               Department of Computing Science
1425               Umea University
1426               S-901 87 Umea, Sweden
1427
1428	 from netlib.org, translated to C, and modified for matmul.m4.  */
1429
1430      const GFC_INTEGER_2 *a, *b;
1431      GFC_INTEGER_2 *c;
1432      const index_type m = xcount, n = ycount, k = count;
1433
1434      /* System generated locals */
1435      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436		 i1, i2, i3, i4, i5, i6;
1437
1438      /* Local variables */
1439      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
1440		 f13, f14, f23, f24, f33, f34, f43, f44;
1441      index_type i, j, l, ii, jj, ll;
1442      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443      GFC_INTEGER_2 *t1;
1444
1445      a = abase;
1446      b = bbase;
1447      c = retarray->base_addr;
1448
1449      /* Parameter adjustments */
1450      c_dim1 = rystride;
1451      c_offset = 1 + c_dim1;
1452      c -= c_offset;
1453      a_dim1 = aystride;
1454      a_offset = 1 + a_dim1;
1455      a -= a_offset;
1456      b_dim1 = bystride;
1457      b_offset = 1 + b_dim1;
1458      b -= b_offset;
1459
1460      /* Empty c first.  */
1461      for (j=1; j<=n; j++)
1462	for (i=1; i<=m; i++)
1463	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
1464
1465      /* Early exit if possible */
1466      if (m == 0 || n == 0 || k == 0)
1467	return;
1468
1469      /* Adjust size of t1 to what is needed.  */
1470      index_type t1_dim, a_sz;
1471      if (aystride == 1)
1472        a_sz = rystride;
1473      else
1474        a_sz = a_dim1;
1475
1476      t1_dim = a_sz * 256 + b_dim1;
1477      if (t1_dim > 65536)
1478	t1_dim = 65536;
1479
1480      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
1481
1482      /* Start turning the crank. */
1483      i1 = n;
1484      for (jj = 1; jj <= i1; jj += 512)
1485	{
1486	  /* Computing MIN */
1487	  i2 = 512;
1488	  i3 = n - jj + 1;
1489	  jsec = min(i2,i3);
1490	  ujsec = jsec - jsec % 4;
1491	  i2 = k;
1492	  for (ll = 1; ll <= i2; ll += 256)
1493	    {
1494	      /* Computing MIN */
1495	      i3 = 256;
1496	      i4 = k - ll + 1;
1497	      lsec = min(i3,i4);
1498	      ulsec = lsec - lsec % 2;
1499
1500	      i3 = m;
1501	      for (ii = 1; ii <= i3; ii += 256)
1502		{
1503		  /* Computing MIN */
1504		  i4 = 256;
1505		  i5 = m - ii + 1;
1506		  isec = min(i4,i5);
1507		  uisec = isec - isec % 2;
1508		  i4 = ll + ulsec - 1;
1509		  for (l = ll; l <= i4; l += 2)
1510		    {
1511		      i5 = ii + uisec - 1;
1512		      for (i = ii; i <= i5; i += 2)
1513			{
1514			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515					a[i + l * a_dim1];
1516			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517					a[i + (l + 1) * a_dim1];
1518			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519					a[i + 1 + l * a_dim1];
1520			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521					a[i + 1 + (l + 1) * a_dim1];
1522			}
1523		      if (uisec < isec)
1524			{
1525			  t1[l - ll + 1 + (isec << 8) - 257] =
1526				    a[ii + isec - 1 + l * a_dim1];
1527			  t1[l - ll + 2 + (isec << 8) - 257] =
1528				    a[ii + isec - 1 + (l + 1) * a_dim1];
1529			}
1530		    }
1531		  if (ulsec < lsec)
1532		    {
1533		      i4 = ii + isec - 1;
1534		      for (i = ii; i<= i4; ++i)
1535			{
1536			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1537				    a[i + (ll + lsec - 1) * a_dim1];
1538			}
1539		    }
1540
1541		  uisec = isec - isec % 4;
1542		  i4 = jj + ujsec - 1;
1543		  for (j = jj; j <= i4; j += 4)
1544		    {
1545		      i5 = ii + uisec - 1;
1546		      for (i = ii; i <= i5; i += 4)
1547			{
1548			  f11 = c[i + j * c_dim1];
1549			  f21 = c[i + 1 + j * c_dim1];
1550			  f12 = c[i + (j + 1) * c_dim1];
1551			  f22 = c[i + 1 + (j + 1) * c_dim1];
1552			  f13 = c[i + (j + 2) * c_dim1];
1553			  f23 = c[i + 1 + (j + 2) * c_dim1];
1554			  f14 = c[i + (j + 3) * c_dim1];
1555			  f24 = c[i + 1 + (j + 3) * c_dim1];
1556			  f31 = c[i + 2 + j * c_dim1];
1557			  f41 = c[i + 3 + j * c_dim1];
1558			  f32 = c[i + 2 + (j + 1) * c_dim1];
1559			  f42 = c[i + 3 + (j + 1) * c_dim1];
1560			  f33 = c[i + 2 + (j + 2) * c_dim1];
1561			  f43 = c[i + 3 + (j + 2) * c_dim1];
1562			  f34 = c[i + 2 + (j + 3) * c_dim1];
1563			  f44 = c[i + 3 + (j + 3) * c_dim1];
1564			  i6 = ll + lsec - 1;
1565			  for (l = ll; l <= i6; ++l)
1566			    {
1567			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568				      * b[l + j * b_dim1];
1569			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570				      * b[l + j * b_dim1];
1571			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572				      * b[l + (j + 1) * b_dim1];
1573			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574				      * b[l + (j + 1) * b_dim1];
1575			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576				      * b[l + (j + 2) * b_dim1];
1577			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578				      * b[l + (j + 2) * b_dim1];
1579			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580				      * b[l + (j + 3) * b_dim1];
1581			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582				      * b[l + (j + 3) * b_dim1];
1583			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584				      * b[l + j * b_dim1];
1585			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586				      * b[l + j * b_dim1];
1587			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588				      * b[l + (j + 1) * b_dim1];
1589			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590				      * b[l + (j + 1) * b_dim1];
1591			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592				      * b[l + (j + 2) * b_dim1];
1593			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594				      * b[l + (j + 2) * b_dim1];
1595			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596				      * b[l + (j + 3) * b_dim1];
1597			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598				      * b[l + (j + 3) * b_dim1];
1599			    }
1600			  c[i + j * c_dim1] = f11;
1601			  c[i + 1 + j * c_dim1] = f21;
1602			  c[i + (j + 1) * c_dim1] = f12;
1603			  c[i + 1 + (j + 1) * c_dim1] = f22;
1604			  c[i + (j + 2) * c_dim1] = f13;
1605			  c[i + 1 + (j + 2) * c_dim1] = f23;
1606			  c[i + (j + 3) * c_dim1] = f14;
1607			  c[i + 1 + (j + 3) * c_dim1] = f24;
1608			  c[i + 2 + j * c_dim1] = f31;
1609			  c[i + 3 + j * c_dim1] = f41;
1610			  c[i + 2 + (j + 1) * c_dim1] = f32;
1611			  c[i + 3 + (j + 1) * c_dim1] = f42;
1612			  c[i + 2 + (j + 2) * c_dim1] = f33;
1613			  c[i + 3 + (j + 2) * c_dim1] = f43;
1614			  c[i + 2 + (j + 3) * c_dim1] = f34;
1615			  c[i + 3 + (j + 3) * c_dim1] = f44;
1616			}
1617		      if (uisec < isec)
1618			{
1619			  i5 = ii + isec - 1;
1620			  for (i = ii + uisec; i <= i5; ++i)
1621			    {
1622			      f11 = c[i + j * c_dim1];
1623			      f12 = c[i + (j + 1) * c_dim1];
1624			      f13 = c[i + (j + 2) * c_dim1];
1625			      f14 = c[i + (j + 3) * c_dim1];
1626			      i6 = ll + lsec - 1;
1627			      for (l = ll; l <= i6; ++l)
1628				{
1629				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630					  257] * b[l + j * b_dim1];
1631				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632					  257] * b[l + (j + 1) * b_dim1];
1633				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634					  257] * b[l + (j + 2) * b_dim1];
1635				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636					  257] * b[l + (j + 3) * b_dim1];
1637				}
1638			      c[i + j * c_dim1] = f11;
1639			      c[i + (j + 1) * c_dim1] = f12;
1640			      c[i + (j + 2) * c_dim1] = f13;
1641			      c[i + (j + 3) * c_dim1] = f14;
1642			    }
1643			}
1644		    }
1645		  if (ujsec < jsec)
1646		    {
1647		      i4 = jj + jsec - 1;
1648		      for (j = jj + ujsec; j <= i4; ++j)
1649			{
1650			  i5 = ii + uisec - 1;
1651			  for (i = ii; i <= i5; i += 4)
1652			    {
1653			      f11 = c[i + j * c_dim1];
1654			      f21 = c[i + 1 + j * c_dim1];
1655			      f31 = c[i + 2 + j * c_dim1];
1656			      f41 = c[i + 3 + j * c_dim1];
1657			      i6 = ll + lsec - 1;
1658			      for (l = ll; l <= i6; ++l)
1659				{
1660				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661					  257] * b[l + j * b_dim1];
1662				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663					  257] * b[l + j * b_dim1];
1664				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665					  257] * b[l + j * b_dim1];
1666				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667					  257] * b[l + j * b_dim1];
1668				}
1669			      c[i + j * c_dim1] = f11;
1670			      c[i + 1 + j * c_dim1] = f21;
1671			      c[i + 2 + j * c_dim1] = f31;
1672			      c[i + 3 + j * c_dim1] = f41;
1673			    }
1674			  i5 = ii + isec - 1;
1675			  for (i = ii + uisec; i <= i5; ++i)
1676			    {
1677			      f11 = c[i + j * c_dim1];
1678			      i6 = ll + lsec - 1;
1679			      for (l = ll; l <= i6; ++l)
1680				{
1681				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682					  257] * b[l + j * b_dim1];
1683				}
1684			      c[i + j * c_dim1] = f11;
1685			    }
1686			}
1687		    }
1688		}
1689	    }
1690	}
1691      free(t1);
1692      return;
1693    }
1694  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695    {
1696      if (GFC_DESCRIPTOR_RANK (a) != 1)
1697	{
1698	  const GFC_INTEGER_2 *restrict abase_x;
1699	  const GFC_INTEGER_2 *restrict bbase_y;
1700	  GFC_INTEGER_2 *restrict dest_y;
1701	  GFC_INTEGER_2 s;
1702
1703	  for (y = 0; y < ycount; y++)
1704	    {
1705	      bbase_y = &bbase[y*bystride];
1706	      dest_y = &dest[y*rystride];
1707	      for (x = 0; x < xcount; x++)
1708		{
1709		  abase_x = &abase[x*axstride];
1710		  s = (GFC_INTEGER_2) 0;
1711		  for (n = 0; n < count; n++)
1712		    s += abase_x[n] * bbase_y[n];
1713		  dest_y[x] = s;
1714		}
1715	    }
1716	}
1717      else
1718	{
1719	  const GFC_INTEGER_2 *restrict bbase_y;
1720	  GFC_INTEGER_2 s;
1721
1722	  for (y = 0; y < ycount; y++)
1723	    {
1724	      bbase_y = &bbase[y*bystride];
1725	      s = (GFC_INTEGER_2) 0;
1726	      for (n = 0; n < count; n++)
1727		s += abase[n*axstride] * bbase_y[n];
1728	      dest[y*rystride] = s;
1729	    }
1730	}
1731    }
1732  else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733    {
1734      const GFC_INTEGER_2 *restrict bbase_y;
1735      GFC_INTEGER_2 s;
1736
1737      for (y = 0; y < ycount; y++)
1738	{
1739	  bbase_y = &bbase[y*bystride];
1740	  s = (GFC_INTEGER_2) 0;
1741	  for (n = 0; n < count; n++)
1742	    s += abase[n*axstride] * bbase_y[n*bxstride];
1743	  dest[y*rxstride] = s;
1744	}
1745    }
1746  else if (axstride < aystride)
1747    {
1748      for (y = 0; y < ycount; y++)
1749	for (x = 0; x < xcount; x++)
1750	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1751
1752      for (y = 0; y < ycount; y++)
1753	for (n = 0; n < count; n++)
1754	  for (x = 0; x < xcount; x++)
1755	    /* dest[x,y] += a[x,n] * b[n,y] */
1756	    dest[x*rxstride + y*rystride] +=
1757					abase[x*axstride + n*aystride] *
1758					bbase[n*bxstride + y*bystride];
1759    }
1760  else
1761    {
1762      const GFC_INTEGER_2 *restrict abase_x;
1763      const GFC_INTEGER_2 *restrict bbase_y;
1764      GFC_INTEGER_2 *restrict dest_y;
1765      GFC_INTEGER_2 s;
1766
1767      for (y = 0; y < ycount; y++)
1768	{
1769	  bbase_y = &bbase[y*bystride];
1770	  dest_y = &dest[y*rystride];
1771	  for (x = 0; x < xcount; x++)
1772	    {
1773	      abase_x = &abase[x*axstride];
1774	      s = (GFC_INTEGER_2) 0;
1775	      for (n = 0; n < count; n++)
1776		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777	      dest_y[x*rxstride] = s;
1778	    }
1779	}
1780    }
1781}
1782#undef POW3
1783#undef min
1784#undef max
1785
1786#endif  /* HAVE_AVX512F */
1787
1788/* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1789
1790#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791void
1792matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
1793	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1794	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795internal_proto(matmul_i2_avx128_fma3);
1796#endif
1797
1798#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799void
1800matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
1801	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1802	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803internal_proto(matmul_i2_avx128_fma4);
1804#endif
1805
1806/* Function to fall back to if there is no special processor-specific version.  */
1807static void
1808matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
1809	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1810	int blas_limit, blas_call gemm)
1811{
1812  const GFC_INTEGER_2 * restrict abase;
1813  const GFC_INTEGER_2 * restrict bbase;
1814  GFC_INTEGER_2 * restrict dest;
1815
1816  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817  index_type x, y, n, count, xcount, ycount;
1818
1819  assert (GFC_DESCRIPTOR_RANK (a) == 2
1820          || GFC_DESCRIPTOR_RANK (b) == 2);
1821
1822/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823
1824   Either A or B (but not both) can be rank 1:
1825
1826   o One-dimensional argument A is implicitly treated as a row matrix
1827     dimensioned [1,count], so xcount=1.
1828
1829   o One-dimensional argument B is implicitly treated as a column matrix
1830     dimensioned [count, 1], so ycount=1.
1831*/
1832
1833  if (retarray->base_addr == NULL)
1834    {
1835      if (GFC_DESCRIPTOR_RANK (a) == 1)
1836        {
1837	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1838	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839        }
1840      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841        {
1842	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1843	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844        }
1845      else
1846        {
1847	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1848	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849
1850          GFC_DIMENSION_SET(retarray->dim[1], 0,
1851	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1853        }
1854
1855      retarray->base_addr
1856	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
1857      retarray->offset = 0;
1858    }
1859  else if (unlikely (compile_options.bounds_check))
1860    {
1861      index_type ret_extent, arg_extent;
1862
1863      if (GFC_DESCRIPTOR_RANK (a) == 1)
1864	{
1865	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867	  if (arg_extent != ret_extent)
1868	    runtime_error ("Array bound mismatch for dimension 1 of "
1869	    		   "array (%ld/%ld) ",
1870			   (long int) ret_extent, (long int) arg_extent);
1871	}
1872      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873	{
1874	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876	  if (arg_extent != ret_extent)
1877	    runtime_error ("Array bound mismatch for dimension 1 of "
1878	    		   "array (%ld/%ld) ",
1879			   (long int) ret_extent, (long int) arg_extent);
1880	}
1881      else
1882	{
1883	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885	  if (arg_extent != ret_extent)
1886	    runtime_error ("Array bound mismatch for dimension 1 of "
1887	    		   "array (%ld/%ld) ",
1888			   (long int) ret_extent, (long int) arg_extent);
1889
1890	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892	  if (arg_extent != ret_extent)
1893	    runtime_error ("Array bound mismatch for dimension 2 of "
1894	    		   "array (%ld/%ld) ",
1895			   (long int) ret_extent, (long int) arg_extent);
1896	}
1897    }
1898
1899
1900  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901    {
1902      /* One-dimensional result may be addressed in the code below
1903	 either as a row or a column matrix. We want both cases to
1904	 work. */
1905      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906    }
1907  else
1908    {
1909      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911    }
1912
1913
1914  if (GFC_DESCRIPTOR_RANK (a) == 1)
1915    {
1916      /* Treat it as a a row matrix A[1,count]. */
1917      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918      aystride = 1;
1919
1920      xcount = 1;
1921      count = GFC_DESCRIPTOR_EXTENT(a,0);
1922    }
1923  else
1924    {
1925      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927
1928      count = GFC_DESCRIPTOR_EXTENT(a,1);
1929      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930    }
1931
1932  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933    {
1934      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936		       "in dimension 1: is %ld, should be %ld",
1937		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938    }
1939
1940  if (GFC_DESCRIPTOR_RANK (b) == 1)
1941    {
1942      /* Treat it as a column matrix B[count,1] */
1943      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944
1945      /* bystride should never be used for 1-dimensional b.
1946         The value is only used for calculation of the
1947         memory by the buffer.  */
1948      bystride = 256;
1949      ycount = 1;
1950    }
1951  else
1952    {
1953      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956    }
1957
1958  abase = a->base_addr;
1959  bbase = b->base_addr;
1960  dest = retarray->base_addr;
1961
1962  /* Now that everything is set up, we perform the multiplication
1963     itself.  */
1964
1965#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966#define min(a,b) ((a) <= (b) ? (a) : (b))
1967#define max(a,b) ((a) >= (b) ? (a) : (b))
1968
1969  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970      && (bxstride == 1 || bystride == 1)
1971      && (((float) xcount) * ((float) ycount) * ((float) count)
1972          > POW3(blas_limit)))
1973    {
1974      const int m = xcount, n = ycount, k = count, ldc = rystride;
1975      const GFC_INTEGER_2 one = 1, zero = 0;
1976      const int lda = (axstride == 1) ? aystride : axstride,
1977		ldb = (bxstride == 1) ? bystride : bxstride;
1978
1979      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980	{
1981	  assert (gemm != NULL);
1982	  const char *transa, *transb;
1983	  if (try_blas & 2)
1984	    transa = "C";
1985	  else
1986	    transa = axstride == 1 ? "N" : "T";
1987
1988	  if (try_blas & 4)
1989	    transb = "C";
1990	  else
1991	    transb = bxstride == 1 ? "N" : "T";
1992
1993	  gemm (transa, transb , &m,
1994		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1995		&ldc, 1, 1);
1996	  return;
1997	}
1998    }
1999
2000  if (rxstride == 1 && axstride == 1 && bxstride == 1
2001      && GFC_DESCRIPTOR_RANK (b) != 1)
2002    {
2003      /* This block of code implements a tuned matmul, derived from
2004         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2005
2006               Bo Kagstrom and Per Ling
2007               Department of Computing Science
2008               Umea University
2009               S-901 87 Umea, Sweden
2010
2011	 from netlib.org, translated to C, and modified for matmul.m4.  */
2012
2013      const GFC_INTEGER_2 *a, *b;
2014      GFC_INTEGER_2 *c;
2015      const index_type m = xcount, n = ycount, k = count;
2016
2017      /* System generated locals */
2018      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019		 i1, i2, i3, i4, i5, i6;
2020
2021      /* Local variables */
2022      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
2023		 f13, f14, f23, f24, f33, f34, f43, f44;
2024      index_type i, j, l, ii, jj, ll;
2025      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026      GFC_INTEGER_2 *t1;
2027
2028      a = abase;
2029      b = bbase;
2030      c = retarray->base_addr;
2031
2032      /* Parameter adjustments */
2033      c_dim1 = rystride;
2034      c_offset = 1 + c_dim1;
2035      c -= c_offset;
2036      a_dim1 = aystride;
2037      a_offset = 1 + a_dim1;
2038      a -= a_offset;
2039      b_dim1 = bystride;
2040      b_offset = 1 + b_dim1;
2041      b -= b_offset;
2042
2043      /* Empty c first.  */
2044      for (j=1; j<=n; j++)
2045	for (i=1; i<=m; i++)
2046	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
2047
2048      /* Early exit if possible */
2049      if (m == 0 || n == 0 || k == 0)
2050	return;
2051
2052      /* Adjust size of t1 to what is needed.  */
2053      index_type t1_dim, a_sz;
2054      if (aystride == 1)
2055        a_sz = rystride;
2056      else
2057        a_sz = a_dim1;
2058
2059      t1_dim = a_sz * 256 + b_dim1;
2060      if (t1_dim > 65536)
2061	t1_dim = 65536;
2062
2063      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
2064
2065      /* Start turning the crank. */
2066      i1 = n;
2067      for (jj = 1; jj <= i1; jj += 512)
2068	{
2069	  /* Computing MIN */
2070	  i2 = 512;
2071	  i3 = n - jj + 1;
2072	  jsec = min(i2,i3);
2073	  ujsec = jsec - jsec % 4;
2074	  i2 = k;
2075	  for (ll = 1; ll <= i2; ll += 256)
2076	    {
2077	      /* Computing MIN */
2078	      i3 = 256;
2079	      i4 = k - ll + 1;
2080	      lsec = min(i3,i4);
2081	      ulsec = lsec - lsec % 2;
2082
2083	      i3 = m;
2084	      for (ii = 1; ii <= i3; ii += 256)
2085		{
2086		  /* Computing MIN */
2087		  i4 = 256;
2088		  i5 = m - ii + 1;
2089		  isec = min(i4,i5);
2090		  uisec = isec - isec % 2;
2091		  i4 = ll + ulsec - 1;
2092		  for (l = ll; l <= i4; l += 2)
2093		    {
2094		      i5 = ii + uisec - 1;
2095		      for (i = ii; i <= i5; i += 2)
2096			{
2097			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098					a[i + l * a_dim1];
2099			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100					a[i + (l + 1) * a_dim1];
2101			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102					a[i + 1 + l * a_dim1];
2103			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104					a[i + 1 + (l + 1) * a_dim1];
2105			}
2106		      if (uisec < isec)
2107			{
2108			  t1[l - ll + 1 + (isec << 8) - 257] =
2109				    a[ii + isec - 1 + l * a_dim1];
2110			  t1[l - ll + 2 + (isec << 8) - 257] =
2111				    a[ii + isec - 1 + (l + 1) * a_dim1];
2112			}
2113		    }
2114		  if (ulsec < lsec)
2115		    {
2116		      i4 = ii + isec - 1;
2117		      for (i = ii; i<= i4; ++i)
2118			{
2119			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2120				    a[i + (ll + lsec - 1) * a_dim1];
2121			}
2122		    }
2123
2124		  uisec = isec - isec % 4;
2125		  i4 = jj + ujsec - 1;
2126		  for (j = jj; j <= i4; j += 4)
2127		    {
2128		      i5 = ii + uisec - 1;
2129		      for (i = ii; i <= i5; i += 4)
2130			{
2131			  f11 = c[i + j * c_dim1];
2132			  f21 = c[i + 1 + j * c_dim1];
2133			  f12 = c[i + (j + 1) * c_dim1];
2134			  f22 = c[i + 1 + (j + 1) * c_dim1];
2135			  f13 = c[i + (j + 2) * c_dim1];
2136			  f23 = c[i + 1 + (j + 2) * c_dim1];
2137			  f14 = c[i + (j + 3) * c_dim1];
2138			  f24 = c[i + 1 + (j + 3) * c_dim1];
2139			  f31 = c[i + 2 + j * c_dim1];
2140			  f41 = c[i + 3 + j * c_dim1];
2141			  f32 = c[i + 2 + (j + 1) * c_dim1];
2142			  f42 = c[i + 3 + (j + 1) * c_dim1];
2143			  f33 = c[i + 2 + (j + 2) * c_dim1];
2144			  f43 = c[i + 3 + (j + 2) * c_dim1];
2145			  f34 = c[i + 2 + (j + 3) * c_dim1];
2146			  f44 = c[i + 3 + (j + 3) * c_dim1];
2147			  i6 = ll + lsec - 1;
2148			  for (l = ll; l <= i6; ++l)
2149			    {
2150			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151				      * b[l + j * b_dim1];
2152			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153				      * b[l + j * b_dim1];
2154			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155				      * b[l + (j + 1) * b_dim1];
2156			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157				      * b[l + (j + 1) * b_dim1];
2158			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159				      * b[l + (j + 2) * b_dim1];
2160			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161				      * b[l + (j + 2) * b_dim1];
2162			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163				      * b[l + (j + 3) * b_dim1];
2164			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165				      * b[l + (j + 3) * b_dim1];
2166			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167				      * b[l + j * b_dim1];
2168			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169				      * b[l + j * b_dim1];
2170			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171				      * b[l + (j + 1) * b_dim1];
2172			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173				      * b[l + (j + 1) * b_dim1];
2174			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175				      * b[l + (j + 2) * b_dim1];
2176			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177				      * b[l + (j + 2) * b_dim1];
2178			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179				      * b[l + (j + 3) * b_dim1];
2180			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181				      * b[l + (j + 3) * b_dim1];
2182			    }
2183			  c[i + j * c_dim1] = f11;
2184			  c[i + 1 + j * c_dim1] = f21;
2185			  c[i + (j + 1) * c_dim1] = f12;
2186			  c[i + 1 + (j + 1) * c_dim1] = f22;
2187			  c[i + (j + 2) * c_dim1] = f13;
2188			  c[i + 1 + (j + 2) * c_dim1] = f23;
2189			  c[i + (j + 3) * c_dim1] = f14;
2190			  c[i + 1 + (j + 3) * c_dim1] = f24;
2191			  c[i + 2 + j * c_dim1] = f31;
2192			  c[i + 3 + j * c_dim1] = f41;
2193			  c[i + 2 + (j + 1) * c_dim1] = f32;
2194			  c[i + 3 + (j + 1) * c_dim1] = f42;
2195			  c[i + 2 + (j + 2) * c_dim1] = f33;
2196			  c[i + 3 + (j + 2) * c_dim1] = f43;
2197			  c[i + 2 + (j + 3) * c_dim1] = f34;
2198			  c[i + 3 + (j + 3) * c_dim1] = f44;
2199			}
2200		      if (uisec < isec)
2201			{
2202			  i5 = ii + isec - 1;
2203			  for (i = ii + uisec; i <= i5; ++i)
2204			    {
2205			      f11 = c[i + j * c_dim1];
2206			      f12 = c[i + (j + 1) * c_dim1];
2207			      f13 = c[i + (j + 2) * c_dim1];
2208			      f14 = c[i + (j + 3) * c_dim1];
2209			      i6 = ll + lsec - 1;
2210			      for (l = ll; l <= i6; ++l)
2211				{
2212				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213					  257] * b[l + j * b_dim1];
2214				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215					  257] * b[l + (j + 1) * b_dim1];
2216				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217					  257] * b[l + (j + 2) * b_dim1];
2218				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219					  257] * b[l + (j + 3) * b_dim1];
2220				}
2221			      c[i + j * c_dim1] = f11;
2222			      c[i + (j + 1) * c_dim1] = f12;
2223			      c[i + (j + 2) * c_dim1] = f13;
2224			      c[i + (j + 3) * c_dim1] = f14;
2225			    }
2226			}
2227		    }
2228		  if (ujsec < jsec)
2229		    {
2230		      i4 = jj + jsec - 1;
2231		      for (j = jj + ujsec; j <= i4; ++j)
2232			{
2233			  i5 = ii + uisec - 1;
2234			  for (i = ii; i <= i5; i += 4)
2235			    {
2236			      f11 = c[i + j * c_dim1];
2237			      f21 = c[i + 1 + j * c_dim1];
2238			      f31 = c[i + 2 + j * c_dim1];
2239			      f41 = c[i + 3 + j * c_dim1];
2240			      i6 = ll + lsec - 1;
2241			      for (l = ll; l <= i6; ++l)
2242				{
2243				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244					  257] * b[l + j * b_dim1];
2245				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246					  257] * b[l + j * b_dim1];
2247				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248					  257] * b[l + j * b_dim1];
2249				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250					  257] * b[l + j * b_dim1];
2251				}
2252			      c[i + j * c_dim1] = f11;
2253			      c[i + 1 + j * c_dim1] = f21;
2254			      c[i + 2 + j * c_dim1] = f31;
2255			      c[i + 3 + j * c_dim1] = f41;
2256			    }
2257			  i5 = ii + isec - 1;
2258			  for (i = ii + uisec; i <= i5; ++i)
2259			    {
2260			      f11 = c[i + j * c_dim1];
2261			      i6 = ll + lsec - 1;
2262			      for (l = ll; l <= i6; ++l)
2263				{
2264				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265					  257] * b[l + j * b_dim1];
2266				}
2267			      c[i + j * c_dim1] = f11;
2268			    }
2269			}
2270		    }
2271		}
2272	    }
2273	}
2274      free(t1);
2275      return;
2276    }
2277  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278    {
2279      if (GFC_DESCRIPTOR_RANK (a) != 1)
2280	{
2281	  const GFC_INTEGER_2 *restrict abase_x;
2282	  const GFC_INTEGER_2 *restrict bbase_y;
2283	  GFC_INTEGER_2 *restrict dest_y;
2284	  GFC_INTEGER_2 s;
2285
2286	  for (y = 0; y < ycount; y++)
2287	    {
2288	      bbase_y = &bbase[y*bystride];
2289	      dest_y = &dest[y*rystride];
2290	      for (x = 0; x < xcount; x++)
2291		{
2292		  abase_x = &abase[x*axstride];
2293		  s = (GFC_INTEGER_2) 0;
2294		  for (n = 0; n < count; n++)
2295		    s += abase_x[n] * bbase_y[n];
2296		  dest_y[x] = s;
2297		}
2298	    }
2299	}
2300      else
2301	{
2302	  const GFC_INTEGER_2 *restrict bbase_y;
2303	  GFC_INTEGER_2 s;
2304
2305	  for (y = 0; y < ycount; y++)
2306	    {
2307	      bbase_y = &bbase[y*bystride];
2308	      s = (GFC_INTEGER_2) 0;
2309	      for (n = 0; n < count; n++)
2310		s += abase[n*axstride] * bbase_y[n];
2311	      dest[y*rystride] = s;
2312	    }
2313	}
2314    }
2315  else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316    {
2317      const GFC_INTEGER_2 *restrict bbase_y;
2318      GFC_INTEGER_2 s;
2319
2320      for (y = 0; y < ycount; y++)
2321	{
2322	  bbase_y = &bbase[y*bystride];
2323	  s = (GFC_INTEGER_2) 0;
2324	  for (n = 0; n < count; n++)
2325	    s += abase[n*axstride] * bbase_y[n*bxstride];
2326	  dest[y*rxstride] = s;
2327	}
2328    }
2329  else if (axstride < aystride)
2330    {
2331      for (y = 0; y < ycount; y++)
2332	for (x = 0; x < xcount; x++)
2333	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
2334
2335      for (y = 0; y < ycount; y++)
2336	for (n = 0; n < count; n++)
2337	  for (x = 0; x < xcount; x++)
2338	    /* dest[x,y] += a[x,n] * b[n,y] */
2339	    dest[x*rxstride + y*rystride] +=
2340					abase[x*axstride + n*aystride] *
2341					bbase[n*bxstride + y*bystride];
2342    }
2343  else
2344    {
2345      const GFC_INTEGER_2 *restrict abase_x;
2346      const GFC_INTEGER_2 *restrict bbase_y;
2347      GFC_INTEGER_2 *restrict dest_y;
2348      GFC_INTEGER_2 s;
2349
2350      for (y = 0; y < ycount; y++)
2351	{
2352	  bbase_y = &bbase[y*bystride];
2353	  dest_y = &dest[y*rystride];
2354	  for (x = 0; x < xcount; x++)
2355	    {
2356	      abase_x = &abase[x*axstride];
2357	      s = (GFC_INTEGER_2) 0;
2358	      for (n = 0; n < count; n++)
2359		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360	      dest_y[x*rxstride] = s;
2361	    }
2362	}
2363    }
2364}
2365#undef POW3
2366#undef min
2367#undef max
2368
2369
2370/* Compiling main function, with selection code for the processor.  */
2371
2372/* Currently, this is i386 only.  Adjust for other architectures.  */
2373
2374void matmul_i2 (gfc_array_i2 * const restrict retarray,
2375	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2376	int blas_limit, blas_call gemm)
2377{
2378  static void (*matmul_p) (gfc_array_i2 * const restrict retarray,
2379	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2380	int blas_limit, blas_call gemm);
2381
2382  void (*matmul_fn) (gfc_array_i2 * const restrict retarray,
2383	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2384	int blas_limit, blas_call gemm);
2385
2386  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2387  if (matmul_fn == NULL)
2388    {
2389      matmul_fn = matmul_i2_vanilla;
2390      if (__builtin_cpu_is ("intel"))
2391	{
2392          /* Run down the available processors in order of preference.  */
2393#ifdef HAVE_AVX512F
2394	  if (__builtin_cpu_supports ("avx512f"))
2395	    {
2396	      matmul_fn = matmul_i2_avx512f;
2397	      goto store;
2398	    }
2399
2400#endif  /* HAVE_AVX512F */
2401
2402#ifdef HAVE_AVX2
2403	  if (__builtin_cpu_supports ("avx2")
2404	      && __builtin_cpu_supports ("fma"))
2405	    {
2406	      matmul_fn = matmul_i2_avx2;
2407	      goto store;
2408	    }
2409
2410#endif
2411
2412#ifdef HAVE_AVX
2413	  if (__builtin_cpu_supports ("avx"))
2414 	    {
2415              matmul_fn = matmul_i2_avx;
2416	      goto store;
2417	    }
2418#endif  /* HAVE_AVX */
2419        }
2420    else if (__builtin_cpu_is ("amd"))
2421      {
2422#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2423	if (__builtin_cpu_supports ("avx")
2424	    && __builtin_cpu_supports ("fma"))
2425	  {
2426            matmul_fn = matmul_i2_avx128_fma3;
2427	    goto store;
2428	  }
2429#endif
2430#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2431	if (__builtin_cpu_supports ("avx")
2432	    && __builtin_cpu_supports ("fma4"))
2433	  {
2434            matmul_fn = matmul_i2_avx128_fma4;
2435	    goto store;
2436	  }
2437#endif
2438
2439      }
2440   store:
2441      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2442   }
2443
2444   (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2445}
2446
2447#else  /* Just the vanilla function.  */
2448
2449void
2450matmul_i2 (gfc_array_i2 * const restrict retarray,
2451	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2452	int blas_limit, blas_call gemm)
2453{
2454  const GFC_INTEGER_2 * restrict abase;
2455  const GFC_INTEGER_2 * restrict bbase;
2456  GFC_INTEGER_2 * restrict dest;
2457
2458  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2459  index_type x, y, n, count, xcount, ycount;
2460
2461  assert (GFC_DESCRIPTOR_RANK (a) == 2
2462          || GFC_DESCRIPTOR_RANK (b) == 2);
2463
2464/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2465
2466   Either A or B (but not both) can be rank 1:
2467
2468   o One-dimensional argument A is implicitly treated as a row matrix
2469     dimensioned [1,count], so xcount=1.
2470
2471   o One-dimensional argument B is implicitly treated as a column matrix
2472     dimensioned [count, 1], so ycount=1.
2473*/
2474
2475  if (retarray->base_addr == NULL)
2476    {
2477      if (GFC_DESCRIPTOR_RANK (a) == 1)
2478        {
2479	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2480	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2481        }
2482      else if (GFC_DESCRIPTOR_RANK (b) == 1)
2483        {
2484	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2485	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2486        }
2487      else
2488        {
2489	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2490	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2491
2492          GFC_DIMENSION_SET(retarray->dim[1], 0,
2493	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2494			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2495        }
2496
2497      retarray->base_addr
2498	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
2499      retarray->offset = 0;
2500    }
2501  else if (unlikely (compile_options.bounds_check))
2502    {
2503      index_type ret_extent, arg_extent;
2504
2505      if (GFC_DESCRIPTOR_RANK (a) == 1)
2506	{
2507	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2508	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2509	  if (arg_extent != ret_extent)
2510	    runtime_error ("Array bound mismatch for dimension 1 of "
2511	    		   "array (%ld/%ld) ",
2512			   (long int) ret_extent, (long int) arg_extent);
2513	}
2514      else if (GFC_DESCRIPTOR_RANK (b) == 1)
2515	{
2516	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2517	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2518	  if (arg_extent != ret_extent)
2519	    runtime_error ("Array bound mismatch for dimension 1 of "
2520	    		   "array (%ld/%ld) ",
2521			   (long int) ret_extent, (long int) arg_extent);
2522	}
2523      else
2524	{
2525	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2526	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2527	  if (arg_extent != ret_extent)
2528	    runtime_error ("Array bound mismatch for dimension 1 of "
2529	    		   "array (%ld/%ld) ",
2530			   (long int) ret_extent, (long int) arg_extent);
2531
2532	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2533	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2534	  if (arg_extent != ret_extent)
2535	    runtime_error ("Array bound mismatch for dimension 2 of "
2536	    		   "array (%ld/%ld) ",
2537			   (long int) ret_extent, (long int) arg_extent);
2538	}
2539    }
2540
2541
2542  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2543    {
2544      /* One-dimensional result may be addressed in the code below
2545	 either as a row or a column matrix. We want both cases to
2546	 work. */
2547      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2548    }
2549  else
2550    {
2551      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2552      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2553    }
2554
2555
2556  if (GFC_DESCRIPTOR_RANK (a) == 1)
2557    {
2558      /* Treat it as a a row matrix A[1,count]. */
2559      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2560      aystride = 1;
2561
2562      xcount = 1;
2563      count = GFC_DESCRIPTOR_EXTENT(a,0);
2564    }
2565  else
2566    {
2567      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2568      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2569
2570      count = GFC_DESCRIPTOR_EXTENT(a,1);
2571      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2572    }
2573
2574  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2575    {
2576      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2577	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2578		       "in dimension 1: is %ld, should be %ld",
2579		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2580    }
2581
2582  if (GFC_DESCRIPTOR_RANK (b) == 1)
2583    {
2584      /* Treat it as a column matrix B[count,1] */
2585      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2586
2587      /* bystride should never be used for 1-dimensional b.
2588         The value is only used for calculation of the
2589         memory by the buffer.  */
2590      bystride = 256;
2591      ycount = 1;
2592    }
2593  else
2594    {
2595      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2596      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2597      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2598    }
2599
2600  abase = a->base_addr;
2601  bbase = b->base_addr;
2602  dest = retarray->base_addr;
2603
2604  /* Now that everything is set up, we perform the multiplication
2605     itself.  */
2606
2607#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2608#define min(a,b) ((a) <= (b) ? (a) : (b))
2609#define max(a,b) ((a) >= (b) ? (a) : (b))
2610
2611  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2612      && (bxstride == 1 || bystride == 1)
2613      && (((float) xcount) * ((float) ycount) * ((float) count)
2614          > POW3(blas_limit)))
2615    {
2616      const int m = xcount, n = ycount, k = count, ldc = rystride;
2617      const GFC_INTEGER_2 one = 1, zero = 0;
2618      const int lda = (axstride == 1) ? aystride : axstride,
2619		ldb = (bxstride == 1) ? bystride : bxstride;
2620
2621      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2622	{
2623	  assert (gemm != NULL);
2624	  const char *transa, *transb;
2625	  if (try_blas & 2)
2626	    transa = "C";
2627	  else
2628	    transa = axstride == 1 ? "N" : "T";
2629
2630	  if (try_blas & 4)
2631	    transb = "C";
2632	  else
2633	    transb = bxstride == 1 ? "N" : "T";
2634
2635	  gemm (transa, transb , &m,
2636		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2637		&ldc, 1, 1);
2638	  return;
2639	}
2640    }
2641
2642  if (rxstride == 1 && axstride == 1 && bxstride == 1
2643      && GFC_DESCRIPTOR_RANK (b) != 1)
2644    {
2645      /* This block of code implements a tuned matmul, derived from
2646         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2647
2648               Bo Kagstrom and Per Ling
2649               Department of Computing Science
2650               Umea University
2651               S-901 87 Umea, Sweden
2652
2653	 from netlib.org, translated to C, and modified for matmul.m4.  */
2654
2655      const GFC_INTEGER_2 *a, *b;
2656      GFC_INTEGER_2 *c;
2657      const index_type m = xcount, n = ycount, k = count;
2658
2659      /* System generated locals */
2660      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2661		 i1, i2, i3, i4, i5, i6;
2662
2663      /* Local variables */
2664      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
2665		 f13, f14, f23, f24, f33, f34, f43, f44;
2666      index_type i, j, l, ii, jj, ll;
2667      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2668      GFC_INTEGER_2 *t1;
2669
2670      a = abase;
2671      b = bbase;
2672      c = retarray->base_addr;
2673
2674      /* Parameter adjustments */
2675      c_dim1 = rystride;
2676      c_offset = 1 + c_dim1;
2677      c -= c_offset;
2678      a_dim1 = aystride;
2679      a_offset = 1 + a_dim1;
2680      a -= a_offset;
2681      b_dim1 = bystride;
2682      b_offset = 1 + b_dim1;
2683      b -= b_offset;
2684
2685      /* Empty c first.  */
2686      for (j=1; j<=n; j++)
2687	for (i=1; i<=m; i++)
2688	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
2689
2690      /* Early exit if possible */
2691      if (m == 0 || n == 0 || k == 0)
2692	return;
2693
2694      /* Adjust size of t1 to what is needed.  */
2695      index_type t1_dim, a_sz;
2696      if (aystride == 1)
2697        a_sz = rystride;
2698      else
2699        a_sz = a_dim1;
2700
2701      t1_dim = a_sz * 256 + b_dim1;
2702      if (t1_dim > 65536)
2703	t1_dim = 65536;
2704
2705      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
2706
2707      /* Start turning the crank. */
2708      i1 = n;
2709      for (jj = 1; jj <= i1; jj += 512)
2710	{
2711	  /* Computing MIN */
2712	  i2 = 512;
2713	  i3 = n - jj + 1;
2714	  jsec = min(i2,i3);
2715	  ujsec = jsec - jsec % 4;
2716	  i2 = k;
2717	  for (ll = 1; ll <= i2; ll += 256)
2718	    {
2719	      /* Computing MIN */
2720	      i3 = 256;
2721	      i4 = k - ll + 1;
2722	      lsec = min(i3,i4);
2723	      ulsec = lsec - lsec % 2;
2724
2725	      i3 = m;
2726	      for (ii = 1; ii <= i3; ii += 256)
2727		{
2728		  /* Computing MIN */
2729		  i4 = 256;
2730		  i5 = m - ii + 1;
2731		  isec = min(i4,i5);
2732		  uisec = isec - isec % 2;
2733		  i4 = ll + ulsec - 1;
2734		  for (l = ll; l <= i4; l += 2)
2735		    {
2736		      i5 = ii + uisec - 1;
2737		      for (i = ii; i <= i5; i += 2)
2738			{
2739			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2740					a[i + l * a_dim1];
2741			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2742					a[i + (l + 1) * a_dim1];
2743			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2744					a[i + 1 + l * a_dim1];
2745			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2746					a[i + 1 + (l + 1) * a_dim1];
2747			}
2748		      if (uisec < isec)
2749			{
2750			  t1[l - ll + 1 + (isec << 8) - 257] =
2751				    a[ii + isec - 1 + l * a_dim1];
2752			  t1[l - ll + 2 + (isec << 8) - 257] =
2753				    a[ii + isec - 1 + (l + 1) * a_dim1];
2754			}
2755		    }
2756		  if (ulsec < lsec)
2757		    {
2758		      i4 = ii + isec - 1;
2759		      for (i = ii; i<= i4; ++i)
2760			{
2761			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2762				    a[i + (ll + lsec - 1) * a_dim1];
2763			}
2764		    }
2765
2766		  uisec = isec - isec % 4;
2767		  i4 = jj + ujsec - 1;
2768		  for (j = jj; j <= i4; j += 4)
2769		    {
2770		      i5 = ii + uisec - 1;
2771		      for (i = ii; i <= i5; i += 4)
2772			{
2773			  f11 = c[i + j * c_dim1];
2774			  f21 = c[i + 1 + j * c_dim1];
2775			  f12 = c[i + (j + 1) * c_dim1];
2776			  f22 = c[i + 1 + (j + 1) * c_dim1];
2777			  f13 = c[i + (j + 2) * c_dim1];
2778			  f23 = c[i + 1 + (j + 2) * c_dim1];
2779			  f14 = c[i + (j + 3) * c_dim1];
2780			  f24 = c[i + 1 + (j + 3) * c_dim1];
2781			  f31 = c[i + 2 + j * c_dim1];
2782			  f41 = c[i + 3 + j * c_dim1];
2783			  f32 = c[i + 2 + (j + 1) * c_dim1];
2784			  f42 = c[i + 3 + (j + 1) * c_dim1];
2785			  f33 = c[i + 2 + (j + 2) * c_dim1];
2786			  f43 = c[i + 3 + (j + 2) * c_dim1];
2787			  f34 = c[i + 2 + (j + 3) * c_dim1];
2788			  f44 = c[i + 3 + (j + 3) * c_dim1];
2789			  i6 = ll + lsec - 1;
2790			  for (l = ll; l <= i6; ++l)
2791			    {
2792			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2793				      * b[l + j * b_dim1];
2794			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2795				      * b[l + j * b_dim1];
2796			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2797				      * b[l + (j + 1) * b_dim1];
2798			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2799				      * b[l + (j + 1) * b_dim1];
2800			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2801				      * b[l + (j + 2) * b_dim1];
2802			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2803				      * b[l + (j + 2) * b_dim1];
2804			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2805				      * b[l + (j + 3) * b_dim1];
2806			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2807				      * b[l + (j + 3) * b_dim1];
2808			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2809				      * b[l + j * b_dim1];
2810			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2811				      * b[l + j * b_dim1];
2812			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2813				      * b[l + (j + 1) * b_dim1];
2814			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2815				      * b[l + (j + 1) * b_dim1];
2816			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2817				      * b[l + (j + 2) * b_dim1];
2818			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2819				      * b[l + (j + 2) * b_dim1];
2820			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2821				      * b[l + (j + 3) * b_dim1];
2822			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2823				      * b[l + (j + 3) * b_dim1];
2824			    }
2825			  c[i + j * c_dim1] = f11;
2826			  c[i + 1 + j * c_dim1] = f21;
2827			  c[i + (j + 1) * c_dim1] = f12;
2828			  c[i + 1 + (j + 1) * c_dim1] = f22;
2829			  c[i + (j + 2) * c_dim1] = f13;
2830			  c[i + 1 + (j + 2) * c_dim1] = f23;
2831			  c[i + (j + 3) * c_dim1] = f14;
2832			  c[i + 1 + (j + 3) * c_dim1] = f24;
2833			  c[i + 2 + j * c_dim1] = f31;
2834			  c[i + 3 + j * c_dim1] = f41;
2835			  c[i + 2 + (j + 1) * c_dim1] = f32;
2836			  c[i + 3 + (j + 1) * c_dim1] = f42;
2837			  c[i + 2 + (j + 2) * c_dim1] = f33;
2838			  c[i + 3 + (j + 2) * c_dim1] = f43;
2839			  c[i + 2 + (j + 3) * c_dim1] = f34;
2840			  c[i + 3 + (j + 3) * c_dim1] = f44;
2841			}
2842		      if (uisec < isec)
2843			{
2844			  i5 = ii + isec - 1;
2845			  for (i = ii + uisec; i <= i5; ++i)
2846			    {
2847			      f11 = c[i + j * c_dim1];
2848			      f12 = c[i + (j + 1) * c_dim1];
2849			      f13 = c[i + (j + 2) * c_dim1];
2850			      f14 = c[i + (j + 3) * c_dim1];
2851			      i6 = ll + lsec - 1;
2852			      for (l = ll; l <= i6; ++l)
2853				{
2854				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2855					  257] * b[l + j * b_dim1];
2856				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2857					  257] * b[l + (j + 1) * b_dim1];
2858				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2859					  257] * b[l + (j + 2) * b_dim1];
2860				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2861					  257] * b[l + (j + 3) * b_dim1];
2862				}
2863			      c[i + j * c_dim1] = f11;
2864			      c[i + (j + 1) * c_dim1] = f12;
2865			      c[i + (j + 2) * c_dim1] = f13;
2866			      c[i + (j + 3) * c_dim1] = f14;
2867			    }
2868			}
2869		    }
2870		  if (ujsec < jsec)
2871		    {
2872		      i4 = jj + jsec - 1;
2873		      for (j = jj + ujsec; j <= i4; ++j)
2874			{
2875			  i5 = ii + uisec - 1;
2876			  for (i = ii; i <= i5; i += 4)
2877			    {
2878			      f11 = c[i + j * c_dim1];
2879			      f21 = c[i + 1 + j * c_dim1];
2880			      f31 = c[i + 2 + j * c_dim1];
2881			      f41 = c[i + 3 + j * c_dim1];
2882			      i6 = ll + lsec - 1;
2883			      for (l = ll; l <= i6; ++l)
2884				{
2885				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2886					  257] * b[l + j * b_dim1];
2887				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2888					  257] * b[l + j * b_dim1];
2889				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2890					  257] * b[l + j * b_dim1];
2891				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2892					  257] * b[l + j * b_dim1];
2893				}
2894			      c[i + j * c_dim1] = f11;
2895			      c[i + 1 + j * c_dim1] = f21;
2896			      c[i + 2 + j * c_dim1] = f31;
2897			      c[i + 3 + j * c_dim1] = f41;
2898			    }
2899			  i5 = ii + isec - 1;
2900			  for (i = ii + uisec; i <= i5; ++i)
2901			    {
2902			      f11 = c[i + j * c_dim1];
2903			      i6 = ll + lsec - 1;
2904			      for (l = ll; l <= i6; ++l)
2905				{
2906				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2907					  257] * b[l + j * b_dim1];
2908				}
2909			      c[i + j * c_dim1] = f11;
2910			    }
2911			}
2912		    }
2913		}
2914	    }
2915	}
2916      free(t1);
2917      return;
2918    }
2919  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2920    {
2921      if (GFC_DESCRIPTOR_RANK (a) != 1)
2922	{
2923	  const GFC_INTEGER_2 *restrict abase_x;
2924	  const GFC_INTEGER_2 *restrict bbase_y;
2925	  GFC_INTEGER_2 *restrict dest_y;
2926	  GFC_INTEGER_2 s;
2927
2928	  for (y = 0; y < ycount; y++)
2929	    {
2930	      bbase_y = &bbase[y*bystride];
2931	      dest_y = &dest[y*rystride];
2932	      for (x = 0; x < xcount; x++)
2933		{
2934		  abase_x = &abase[x*axstride];
2935		  s = (GFC_INTEGER_2) 0;
2936		  for (n = 0; n < count; n++)
2937		    s += abase_x[n] * bbase_y[n];
2938		  dest_y[x] = s;
2939		}
2940	    }
2941	}
2942      else
2943	{
2944	  const GFC_INTEGER_2 *restrict bbase_y;
2945	  GFC_INTEGER_2 s;
2946
2947	  for (y = 0; y < ycount; y++)
2948	    {
2949	      bbase_y = &bbase[y*bystride];
2950	      s = (GFC_INTEGER_2) 0;
2951	      for (n = 0; n < count; n++)
2952		s += abase[n*axstride] * bbase_y[n];
2953	      dest[y*rystride] = s;
2954	    }
2955	}
2956    }
2957  else if (GFC_DESCRIPTOR_RANK (a) == 1)
2958    {
2959      const GFC_INTEGER_2 *restrict bbase_y;
2960      GFC_INTEGER_2 s;
2961
2962      for (y = 0; y < ycount; y++)
2963	{
2964	  bbase_y = &bbase[y*bystride];
2965	  s = (GFC_INTEGER_2) 0;
2966	  for (n = 0; n < count; n++)
2967	    s += abase[n*axstride] * bbase_y[n*bxstride];
2968	  dest[y*rxstride] = s;
2969	}
2970    }
2971  else if (axstride < aystride)
2972    {
2973      for (y = 0; y < ycount; y++)
2974	for (x = 0; x < xcount; x++)
2975	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
2976
2977      for (y = 0; y < ycount; y++)
2978	for (n = 0; n < count; n++)
2979	  for (x = 0; x < xcount; x++)
2980	    /* dest[x,y] += a[x,n] * b[n,y] */
2981	    dest[x*rxstride + y*rystride] +=
2982					abase[x*axstride + n*aystride] *
2983					bbase[n*bxstride + y*bystride];
2984    }
2985  else
2986    {
2987      const GFC_INTEGER_2 *restrict abase_x;
2988      const GFC_INTEGER_2 *restrict bbase_y;
2989      GFC_INTEGER_2 *restrict dest_y;
2990      GFC_INTEGER_2 s;
2991
2992      for (y = 0; y < ycount; y++)
2993	{
2994	  bbase_y = &bbase[y*bystride];
2995	  dest_y = &dest[y*rystride];
2996	  for (x = 0; x < xcount; x++)
2997	    {
2998	      abase_x = &abase[x*axstride];
2999	      s = (GFC_INTEGER_2) 0;
3000	      for (n = 0; n < count; n++)
3001		s += abase_x[n*aystride] * bbase_y[n*bxstride];
3002	      dest_y[x*rxstride] = s;
3003	    }
3004	}
3005    }
3006}
3007#undef POW3
3008#undef min
3009#undef max
3010
3011#endif
3012#endif
3013
3014