1/* Implementation of the MATMUL intrinsic
2   Copyright (C) 2002-2020 Free Software Foundation, Inc.
3   Contributed by Paul Brook <paul@nowt.org>
4
5This file is part of the GNU Fortran runtime library (libgfortran).
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
10version 3 of the License, or (at your option) any later version.
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
26#include "libgfortran.h"
27#include <string.h>
28#include <assert.h>
29
30
31#if defined (HAVE_GFC_REAL_4)
32
33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34   passed to us by the front-end, in which case we call it for large
35   matrices.  */
36
37typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38                          const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
39                          const int *, const GFC_REAL_4 *, const int *,
40                          const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
41                          int, int);
42
43/* The order of loops is different in the case of plain matrix
44   multiplication C=MATMUL(A,B), and in the frequent special case where
45   the argument A is the temporary result of a TRANSPOSE intrinsic:
46   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
47   looking at their strides.
48
49   The equivalent Fortran pseudo-code is:
50
51   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52   IF (.NOT.IS_TRANSPOSED(A)) THEN
53     C = 0
54     DO J=1,N
55       DO K=1,COUNT
56         DO I=1,M
57           C(I,J) = C(I,J)+A(I,K)*B(K,J)
58   ELSE
59     DO J=1,N
60       DO I=1,M
61         S = 0
62         DO K=1,COUNT
63           S = S+A(I,K)*B(K,J)
64         C(I,J) = S
65   ENDIF
66*/
67
68/* If try_blas is set to a nonzero value, then the matmul function will
69   see if there is a way to perform the matrix multiplication by a call
70   to the BLAS gemm function.  */
71
72extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
73	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
74	int blas_limit, blas_call gemm);
75export_proto(matmul_r4);
76
77/* Put exhaustive list of possible architectures here here, ORed together.  */
78
79#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81#ifdef HAVE_AVX
82static void
83matmul_r4_avx (gfc_array_r4 * const restrict retarray,
84	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
85	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86static void
87matmul_r4_avx (gfc_array_r4 * const restrict retarray,
88	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
89	int blas_limit, blas_call gemm)
90{
91  const GFC_REAL_4 * restrict abase;
92  const GFC_REAL_4 * restrict bbase;
93  GFC_REAL_4 * restrict dest;
94
95  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96  index_type x, y, n, count, xcount, ycount;
97
98  assert (GFC_DESCRIPTOR_RANK (a) == 2
99          || GFC_DESCRIPTOR_RANK (b) == 2);
100
101/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103   Either A or B (but not both) can be rank 1:
104
105   o One-dimensional argument A is implicitly treated as a row matrix
106     dimensioned [1,count], so xcount=1.
107
108   o One-dimensional argument B is implicitly treated as a column matrix
109     dimensioned [count, 1], so ycount=1.
110*/
111
112  if (retarray->base_addr == NULL)
113    {
114      if (GFC_DESCRIPTOR_RANK (a) == 1)
115        {
116	  GFC_DIMENSION_SET(retarray->dim[0], 0,
117	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118        }
119      else if (GFC_DESCRIPTOR_RANK (b) == 1)
120        {
121	  GFC_DIMENSION_SET(retarray->dim[0], 0,
122	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123        }
124      else
125        {
126	  GFC_DIMENSION_SET(retarray->dim[0], 0,
127	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129          GFC_DIMENSION_SET(retarray->dim[1], 0,
130	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131			    GFC_DESCRIPTOR_EXTENT(retarray,0));
132        }
133
134      retarray->base_addr
135	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
136      retarray->offset = 0;
137    }
138  else if (unlikely (compile_options.bounds_check))
139    {
140      index_type ret_extent, arg_extent;
141
142      if (GFC_DESCRIPTOR_RANK (a) == 1)
143	{
144	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146	  if (arg_extent != ret_extent)
147	    runtime_error ("Array bound mismatch for dimension 1 of "
148	    		   "array (%ld/%ld) ",
149			   (long int) ret_extent, (long int) arg_extent);
150	}
151      else if (GFC_DESCRIPTOR_RANK (b) == 1)
152	{
153	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155	  if (arg_extent != ret_extent)
156	    runtime_error ("Array bound mismatch for dimension 1 of "
157	    		   "array (%ld/%ld) ",
158			   (long int) ret_extent, (long int) arg_extent);
159	}
160      else
161	{
162	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164	  if (arg_extent != ret_extent)
165	    runtime_error ("Array bound mismatch for dimension 1 of "
166	    		   "array (%ld/%ld) ",
167			   (long int) ret_extent, (long int) arg_extent);
168
169	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171	  if (arg_extent != ret_extent)
172	    runtime_error ("Array bound mismatch for dimension 2 of "
173	    		   "array (%ld/%ld) ",
174			   (long int) ret_extent, (long int) arg_extent);
175	}
176    }
177
178
179  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180    {
181      /* One-dimensional result may be addressed in the code below
182	 either as a row or a column matrix. We want both cases to
183	 work. */
184      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185    }
186  else
187    {
188      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190    }
191
192
193  if (GFC_DESCRIPTOR_RANK (a) == 1)
194    {
195      /* Treat it as a a row matrix A[1,count]. */
196      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197      aystride = 1;
198
199      xcount = 1;
200      count = GFC_DESCRIPTOR_EXTENT(a,0);
201    }
202  else
203    {
204      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206
207      count = GFC_DESCRIPTOR_EXTENT(a,1);
208      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209    }
210
211  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212    {
213      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215		       "in dimension 1: is %ld, should be %ld",
216		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217    }
218
219  if (GFC_DESCRIPTOR_RANK (b) == 1)
220    {
221      /* Treat it as a column matrix B[count,1] */
222      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224      /* bystride should never be used for 1-dimensional b.
225         The value is only used for calculation of the
226         memory by the buffer.  */
227      bystride = 256;
228      ycount = 1;
229    }
230  else
231    {
232      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235    }
236
237  abase = a->base_addr;
238  bbase = b->base_addr;
239  dest = retarray->base_addr;
240
241  /* Now that everything is set up, we perform the multiplication
242     itself.  */
243
244#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245#define min(a,b) ((a) <= (b) ? (a) : (b))
246#define max(a,b) ((a) >= (b) ? (a) : (b))
247
248  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249      && (bxstride == 1 || bystride == 1)
250      && (((float) xcount) * ((float) ycount) * ((float) count)
251          > POW3(blas_limit)))
252    {
253      const int m = xcount, n = ycount, k = count, ldc = rystride;
254      const GFC_REAL_4 one = 1, zero = 0;
255      const int lda = (axstride == 1) ? aystride : axstride,
256		ldb = (bxstride == 1) ? bystride : bxstride;
257
258      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259	{
260	  assert (gemm != NULL);
261	  const char *transa, *transb;
262	  if (try_blas & 2)
263	    transa = "C";
264	  else
265	    transa = axstride == 1 ? "N" : "T";
266
267	  if (try_blas & 4)
268	    transb = "C";
269	  else
270	    transb = bxstride == 1 ? "N" : "T";
271
272	  gemm (transa, transb , &m,
273		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
274		&ldc, 1, 1);
275	  return;
276	}
277    }
278
279  if (rxstride == 1 && axstride == 1 && bxstride == 1
280      && GFC_DESCRIPTOR_RANK (b) != 1)
281    {
282      /* This block of code implements a tuned matmul, derived from
283         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
284
285               Bo Kagstrom and Per Ling
286               Department of Computing Science
287               Umea University
288               S-901 87 Umea, Sweden
289
290	 from netlib.org, translated to C, and modified for matmul.m4.  */
291
292      const GFC_REAL_4 *a, *b;
293      GFC_REAL_4 *c;
294      const index_type m = xcount, n = ycount, k = count;
295
296      /* System generated locals */
297      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
298		 i1, i2, i3, i4, i5, i6;
299
300      /* Local variables */
301      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
302		 f13, f14, f23, f24, f33, f34, f43, f44;
303      index_type i, j, l, ii, jj, ll;
304      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
305      GFC_REAL_4 *t1;
306
307      a = abase;
308      b = bbase;
309      c = retarray->base_addr;
310
311      /* Parameter adjustments */
312      c_dim1 = rystride;
313      c_offset = 1 + c_dim1;
314      c -= c_offset;
315      a_dim1 = aystride;
316      a_offset = 1 + a_dim1;
317      a -= a_offset;
318      b_dim1 = bystride;
319      b_offset = 1 + b_dim1;
320      b -= b_offset;
321
322      /* Empty c first.  */
323      for (j=1; j<=n; j++)
324	for (i=1; i<=m; i++)
325	  c[i + j * c_dim1] = (GFC_REAL_4)0;
326
327      /* Early exit if possible */
328      if (m == 0 || n == 0 || k == 0)
329	return;
330
331      /* Adjust size of t1 to what is needed.  */
332      index_type t1_dim, a_sz;
333      if (aystride == 1)
334        a_sz = rystride;
335      else
336        a_sz = a_dim1;
337
338      t1_dim = a_sz * 256 + b_dim1;
339      if (t1_dim > 65536)
340	t1_dim = 65536;
341
342      t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
343
344      /* Start turning the crank. */
345      i1 = n;
346      for (jj = 1; jj <= i1; jj += 512)
347	{
348	  /* Computing MIN */
349	  i2 = 512;
350	  i3 = n - jj + 1;
351	  jsec = min(i2,i3);
352	  ujsec = jsec - jsec % 4;
353	  i2 = k;
354	  for (ll = 1; ll <= i2; ll += 256)
355	    {
356	      /* Computing MIN */
357	      i3 = 256;
358	      i4 = k - ll + 1;
359	      lsec = min(i3,i4);
360	      ulsec = lsec - lsec % 2;
361
362	      i3 = m;
363	      for (ii = 1; ii <= i3; ii += 256)
364		{
365		  /* Computing MIN */
366		  i4 = 256;
367		  i5 = m - ii + 1;
368		  isec = min(i4,i5);
369		  uisec = isec - isec % 2;
370		  i4 = ll + ulsec - 1;
371		  for (l = ll; l <= i4; l += 2)
372		    {
373		      i5 = ii + uisec - 1;
374		      for (i = ii; i <= i5; i += 2)
375			{
376			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
377					a[i + l * a_dim1];
378			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
379					a[i + (l + 1) * a_dim1];
380			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
381					a[i + 1 + l * a_dim1];
382			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
383					a[i + 1 + (l + 1) * a_dim1];
384			}
385		      if (uisec < isec)
386			{
387			  t1[l - ll + 1 + (isec << 8) - 257] =
388				    a[ii + isec - 1 + l * a_dim1];
389			  t1[l - ll + 2 + (isec << 8) - 257] =
390				    a[ii + isec - 1 + (l + 1) * a_dim1];
391			}
392		    }
393		  if (ulsec < lsec)
394		    {
395		      i4 = ii + isec - 1;
396		      for (i = ii; i<= i4; ++i)
397			{
398			  t1[lsec + ((i - ii + 1) << 8) - 257] =
399				    a[i + (ll + lsec - 1) * a_dim1];
400			}
401		    }
402
403		  uisec = isec - isec % 4;
404		  i4 = jj + ujsec - 1;
405		  for (j = jj; j <= i4; j += 4)
406		    {
407		      i5 = ii + uisec - 1;
408		      for (i = ii; i <= i5; i += 4)
409			{
410			  f11 = c[i + j * c_dim1];
411			  f21 = c[i + 1 + j * c_dim1];
412			  f12 = c[i + (j + 1) * c_dim1];
413			  f22 = c[i + 1 + (j + 1) * c_dim1];
414			  f13 = c[i + (j + 2) * c_dim1];
415			  f23 = c[i + 1 + (j + 2) * c_dim1];
416			  f14 = c[i + (j + 3) * c_dim1];
417			  f24 = c[i + 1 + (j + 3) * c_dim1];
418			  f31 = c[i + 2 + j * c_dim1];
419			  f41 = c[i + 3 + j * c_dim1];
420			  f32 = c[i + 2 + (j + 1) * c_dim1];
421			  f42 = c[i + 3 + (j + 1) * c_dim1];
422			  f33 = c[i + 2 + (j + 2) * c_dim1];
423			  f43 = c[i + 3 + (j + 2) * c_dim1];
424			  f34 = c[i + 2 + (j + 3) * c_dim1];
425			  f44 = c[i + 3 + (j + 3) * c_dim1];
426			  i6 = ll + lsec - 1;
427			  for (l = ll; l <= i6; ++l)
428			    {
429			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430				      * b[l + j * b_dim1];
431			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432				      * b[l + j * b_dim1];
433			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
434				      * b[l + (j + 1) * b_dim1];
435			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
436				      * b[l + (j + 1) * b_dim1];
437			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
438				      * b[l + (j + 2) * b_dim1];
439			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
440				      * b[l + (j + 2) * b_dim1];
441			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
442				      * b[l + (j + 3) * b_dim1];
443			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
444				      * b[l + (j + 3) * b_dim1];
445			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446				      * b[l + j * b_dim1];
447			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448				      * b[l + j * b_dim1];
449			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
450				      * b[l + (j + 1) * b_dim1];
451			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
452				      * b[l + (j + 1) * b_dim1];
453			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
454				      * b[l + (j + 2) * b_dim1];
455			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
456				      * b[l + (j + 2) * b_dim1];
457			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
458				      * b[l + (j + 3) * b_dim1];
459			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
460				      * b[l + (j + 3) * b_dim1];
461			    }
462			  c[i + j * c_dim1] = f11;
463			  c[i + 1 + j * c_dim1] = f21;
464			  c[i + (j + 1) * c_dim1] = f12;
465			  c[i + 1 + (j + 1) * c_dim1] = f22;
466			  c[i + (j + 2) * c_dim1] = f13;
467			  c[i + 1 + (j + 2) * c_dim1] = f23;
468			  c[i + (j + 3) * c_dim1] = f14;
469			  c[i + 1 + (j + 3) * c_dim1] = f24;
470			  c[i + 2 + j * c_dim1] = f31;
471			  c[i + 3 + j * c_dim1] = f41;
472			  c[i + 2 + (j + 1) * c_dim1] = f32;
473			  c[i + 3 + (j + 1) * c_dim1] = f42;
474			  c[i + 2 + (j + 2) * c_dim1] = f33;
475			  c[i + 3 + (j + 2) * c_dim1] = f43;
476			  c[i + 2 + (j + 3) * c_dim1] = f34;
477			  c[i + 3 + (j + 3) * c_dim1] = f44;
478			}
479		      if (uisec < isec)
480			{
481			  i5 = ii + isec - 1;
482			  for (i = ii + uisec; i <= i5; ++i)
483			    {
484			      f11 = c[i + j * c_dim1];
485			      f12 = c[i + (j + 1) * c_dim1];
486			      f13 = c[i + (j + 2) * c_dim1];
487			      f14 = c[i + (j + 3) * c_dim1];
488			      i6 = ll + lsec - 1;
489			      for (l = ll; l <= i6; ++l)
490				{
491				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492					  257] * b[l + j * b_dim1];
493				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
494					  257] * b[l + (j + 1) * b_dim1];
495				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
496					  257] * b[l + (j + 2) * b_dim1];
497				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498					  257] * b[l + (j + 3) * b_dim1];
499				}
500			      c[i + j * c_dim1] = f11;
501			      c[i + (j + 1) * c_dim1] = f12;
502			      c[i + (j + 2) * c_dim1] = f13;
503			      c[i + (j + 3) * c_dim1] = f14;
504			    }
505			}
506		    }
507		  if (ujsec < jsec)
508		    {
509		      i4 = jj + jsec - 1;
510		      for (j = jj + ujsec; j <= i4; ++j)
511			{
512			  i5 = ii + uisec - 1;
513			  for (i = ii; i <= i5; i += 4)
514			    {
515			      f11 = c[i + j * c_dim1];
516			      f21 = c[i + 1 + j * c_dim1];
517			      f31 = c[i + 2 + j * c_dim1];
518			      f41 = c[i + 3 + j * c_dim1];
519			      i6 = ll + lsec - 1;
520			      for (l = ll; l <= i6; ++l)
521				{
522				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
523					  257] * b[l + j * b_dim1];
524				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
525					  257] * b[l + j * b_dim1];
526				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
527					  257] * b[l + j * b_dim1];
528				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
529					  257] * b[l + j * b_dim1];
530				}
531			      c[i + j * c_dim1] = f11;
532			      c[i + 1 + j * c_dim1] = f21;
533			      c[i + 2 + j * c_dim1] = f31;
534			      c[i + 3 + j * c_dim1] = f41;
535			    }
536			  i5 = ii + isec - 1;
537			  for (i = ii + uisec; i <= i5; ++i)
538			    {
539			      f11 = c[i + j * c_dim1];
540			      i6 = ll + lsec - 1;
541			      for (l = ll; l <= i6; ++l)
542				{
543				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
544					  257] * b[l + j * b_dim1];
545				}
546			      c[i + j * c_dim1] = f11;
547			    }
548			}
549		    }
550		}
551	    }
552	}
553      free(t1);
554      return;
555    }
556  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
557    {
558      if (GFC_DESCRIPTOR_RANK (a) != 1)
559	{
560	  const GFC_REAL_4 *restrict abase_x;
561	  const GFC_REAL_4 *restrict bbase_y;
562	  GFC_REAL_4 *restrict dest_y;
563	  GFC_REAL_4 s;
564
565	  for (y = 0; y < ycount; y++)
566	    {
567	      bbase_y = &bbase[y*bystride];
568	      dest_y = &dest[y*rystride];
569	      for (x = 0; x < xcount; x++)
570		{
571		  abase_x = &abase[x*axstride];
572		  s = (GFC_REAL_4) 0;
573		  for (n = 0; n < count; n++)
574		    s += abase_x[n] * bbase_y[n];
575		  dest_y[x] = s;
576		}
577	    }
578	}
579      else
580	{
581	  const GFC_REAL_4 *restrict bbase_y;
582	  GFC_REAL_4 s;
583
584	  for (y = 0; y < ycount; y++)
585	    {
586	      bbase_y = &bbase[y*bystride];
587	      s = (GFC_REAL_4) 0;
588	      for (n = 0; n < count; n++)
589		s += abase[n*axstride] * bbase_y[n];
590	      dest[y*rystride] = s;
591	    }
592	}
593    }
594  else if (GFC_DESCRIPTOR_RANK (a) == 1)
595    {
596      const GFC_REAL_4 *restrict bbase_y;
597      GFC_REAL_4 s;
598
599      for (y = 0; y < ycount; y++)
600	{
601	  bbase_y = &bbase[y*bystride];
602	  s = (GFC_REAL_4) 0;
603	  for (n = 0; n < count; n++)
604	    s += abase[n*axstride] * bbase_y[n*bxstride];
605	  dest[y*rxstride] = s;
606	}
607    }
608  else if (axstride < aystride)
609    {
610      for (y = 0; y < ycount; y++)
611	for (x = 0; x < xcount; x++)
612	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
613
614      for (y = 0; y < ycount; y++)
615	for (n = 0; n < count; n++)
616	  for (x = 0; x < xcount; x++)
617	    /* dest[x,y] += a[x,n] * b[n,y] */
618	    dest[x*rxstride + y*rystride] +=
619					abase[x*axstride + n*aystride] *
620					bbase[n*bxstride + y*bystride];
621    }
622  else
623    {
624      const GFC_REAL_4 *restrict abase_x;
625      const GFC_REAL_4 *restrict bbase_y;
626      GFC_REAL_4 *restrict dest_y;
627      GFC_REAL_4 s;
628
629      for (y = 0; y < ycount; y++)
630	{
631	  bbase_y = &bbase[y*bystride];
632	  dest_y = &dest[y*rystride];
633	  for (x = 0; x < xcount; x++)
634	    {
635	      abase_x = &abase[x*axstride];
636	      s = (GFC_REAL_4) 0;
637	      for (n = 0; n < count; n++)
638		s += abase_x[n*aystride] * bbase_y[n*bxstride];
639	      dest_y[x*rxstride] = s;
640	    }
641	}
642    }
643}
644#undef POW3
645#undef min
646#undef max
647
648#endif /* HAVE_AVX */
649
650#ifdef HAVE_AVX2
651static void
652matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
653	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
654	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
655static void
656matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
657	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
658	int blas_limit, blas_call gemm)
659{
660  const GFC_REAL_4 * restrict abase;
661  const GFC_REAL_4 * restrict bbase;
662  GFC_REAL_4 * restrict dest;
663
664  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
665  index_type x, y, n, count, xcount, ycount;
666
667  assert (GFC_DESCRIPTOR_RANK (a) == 2
668          || GFC_DESCRIPTOR_RANK (b) == 2);
669
670/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
671
672   Either A or B (but not both) can be rank 1:
673
674   o One-dimensional argument A is implicitly treated as a row matrix
675     dimensioned [1,count], so xcount=1.
676
677   o One-dimensional argument B is implicitly treated as a column matrix
678     dimensioned [count, 1], so ycount=1.
679*/
680
681  if (retarray->base_addr == NULL)
682    {
683      if (GFC_DESCRIPTOR_RANK (a) == 1)
684        {
685	  GFC_DIMENSION_SET(retarray->dim[0], 0,
686	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
687        }
688      else if (GFC_DESCRIPTOR_RANK (b) == 1)
689        {
690	  GFC_DIMENSION_SET(retarray->dim[0], 0,
691	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
692        }
693      else
694        {
695	  GFC_DIMENSION_SET(retarray->dim[0], 0,
696	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
697
698          GFC_DIMENSION_SET(retarray->dim[1], 0,
699	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
700			    GFC_DESCRIPTOR_EXTENT(retarray,0));
701        }
702
703      retarray->base_addr
704	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
705      retarray->offset = 0;
706    }
707  else if (unlikely (compile_options.bounds_check))
708    {
709      index_type ret_extent, arg_extent;
710
711      if (GFC_DESCRIPTOR_RANK (a) == 1)
712	{
713	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715	  if (arg_extent != ret_extent)
716	    runtime_error ("Array bound mismatch for dimension 1 of "
717	    		   "array (%ld/%ld) ",
718			   (long int) ret_extent, (long int) arg_extent);
719	}
720      else if (GFC_DESCRIPTOR_RANK (b) == 1)
721	{
722	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
723	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
724	  if (arg_extent != ret_extent)
725	    runtime_error ("Array bound mismatch for dimension 1 of "
726	    		   "array (%ld/%ld) ",
727			   (long int) ret_extent, (long int) arg_extent);
728	}
729      else
730	{
731	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
732	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
733	  if (arg_extent != ret_extent)
734	    runtime_error ("Array bound mismatch for dimension 1 of "
735	    		   "array (%ld/%ld) ",
736			   (long int) ret_extent, (long int) arg_extent);
737
738	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
739	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
740	  if (arg_extent != ret_extent)
741	    runtime_error ("Array bound mismatch for dimension 2 of "
742	    		   "array (%ld/%ld) ",
743			   (long int) ret_extent, (long int) arg_extent);
744	}
745    }
746
747
748  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
749    {
750      /* One-dimensional result may be addressed in the code below
751	 either as a row or a column matrix. We want both cases to
752	 work. */
753      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
754    }
755  else
756    {
757      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
758      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
759    }
760
761
762  if (GFC_DESCRIPTOR_RANK (a) == 1)
763    {
764      /* Treat it as a a row matrix A[1,count]. */
765      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
766      aystride = 1;
767
768      xcount = 1;
769      count = GFC_DESCRIPTOR_EXTENT(a,0);
770    }
771  else
772    {
773      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
774      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
775
776      count = GFC_DESCRIPTOR_EXTENT(a,1);
777      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
778    }
779
780  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
781    {
782      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
783	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
784		       "in dimension 1: is %ld, should be %ld",
785		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
786    }
787
788  if (GFC_DESCRIPTOR_RANK (b) == 1)
789    {
790      /* Treat it as a column matrix B[count,1] */
791      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
792
793      /* bystride should never be used for 1-dimensional b.
794         The value is only used for calculation of the
795         memory by the buffer.  */
796      bystride = 256;
797      ycount = 1;
798    }
799  else
800    {
801      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
802      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
803      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
804    }
805
806  abase = a->base_addr;
807  bbase = b->base_addr;
808  dest = retarray->base_addr;
809
810  /* Now that everything is set up, we perform the multiplication
811     itself.  */
812
813#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
814#define min(a,b) ((a) <= (b) ? (a) : (b))
815#define max(a,b) ((a) >= (b) ? (a) : (b))
816
817  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
818      && (bxstride == 1 || bystride == 1)
819      && (((float) xcount) * ((float) ycount) * ((float) count)
820          > POW3(blas_limit)))
821    {
822      const int m = xcount, n = ycount, k = count, ldc = rystride;
823      const GFC_REAL_4 one = 1, zero = 0;
824      const int lda = (axstride == 1) ? aystride : axstride,
825		ldb = (bxstride == 1) ? bystride : bxstride;
826
827      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
828	{
829	  assert (gemm != NULL);
830	  const char *transa, *transb;
831	  if (try_blas & 2)
832	    transa = "C";
833	  else
834	    transa = axstride == 1 ? "N" : "T";
835
836	  if (try_blas & 4)
837	    transb = "C";
838	  else
839	    transb = bxstride == 1 ? "N" : "T";
840
841	  gemm (transa, transb , &m,
842		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
843		&ldc, 1, 1);
844	  return;
845	}
846    }
847
848  if (rxstride == 1 && axstride == 1 && bxstride == 1
849      && GFC_DESCRIPTOR_RANK (b) != 1)
850    {
851      /* This block of code implements a tuned matmul, derived from
852         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
853
854               Bo Kagstrom and Per Ling
855               Department of Computing Science
856               Umea University
857               S-901 87 Umea, Sweden
858
859	 from netlib.org, translated to C, and modified for matmul.m4.  */
860
861      const GFC_REAL_4 *a, *b;
862      GFC_REAL_4 *c;
863      const index_type m = xcount, n = ycount, k = count;
864
865      /* System generated locals */
866      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
867		 i1, i2, i3, i4, i5, i6;
868
869      /* Local variables */
870      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
871		 f13, f14, f23, f24, f33, f34, f43, f44;
872      index_type i, j, l, ii, jj, ll;
873      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
874      GFC_REAL_4 *t1;
875
876      a = abase;
877      b = bbase;
878      c = retarray->base_addr;
879
880      /* Parameter adjustments */
881      c_dim1 = rystride;
882      c_offset = 1 + c_dim1;
883      c -= c_offset;
884      a_dim1 = aystride;
885      a_offset = 1 + a_dim1;
886      a -= a_offset;
887      b_dim1 = bystride;
888      b_offset = 1 + b_dim1;
889      b -= b_offset;
890
891      /* Empty c first.  */
892      for (j=1; j<=n; j++)
893	for (i=1; i<=m; i++)
894	  c[i + j * c_dim1] = (GFC_REAL_4)0;
895
896      /* Early exit if possible */
897      if (m == 0 || n == 0 || k == 0)
898	return;
899
900      /* Adjust size of t1 to what is needed.  */
901      index_type t1_dim, a_sz;
902      if (aystride == 1)
903        a_sz = rystride;
904      else
905        a_sz = a_dim1;
906
907      t1_dim = a_sz * 256 + b_dim1;
908      if (t1_dim > 65536)
909	t1_dim = 65536;
910
911      t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
912
913      /* Start turning the crank. */
914      i1 = n;
915      for (jj = 1; jj <= i1; jj += 512)
916	{
917	  /* Computing MIN */
918	  i2 = 512;
919	  i3 = n - jj + 1;
920	  jsec = min(i2,i3);
921	  ujsec = jsec - jsec % 4;
922	  i2 = k;
923	  for (ll = 1; ll <= i2; ll += 256)
924	    {
925	      /* Computing MIN */
926	      i3 = 256;
927	      i4 = k - ll + 1;
928	      lsec = min(i3,i4);
929	      ulsec = lsec - lsec % 2;
930
931	      i3 = m;
932	      for (ii = 1; ii <= i3; ii += 256)
933		{
934		  /* Computing MIN */
935		  i4 = 256;
936		  i5 = m - ii + 1;
937		  isec = min(i4,i5);
938		  uisec = isec - isec % 2;
939		  i4 = ll + ulsec - 1;
940		  for (l = ll; l <= i4; l += 2)
941		    {
942		      i5 = ii + uisec - 1;
943		      for (i = ii; i <= i5; i += 2)
944			{
945			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
946					a[i + l * a_dim1];
947			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
948					a[i + (l + 1) * a_dim1];
949			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
950					a[i + 1 + l * a_dim1];
951			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
952					a[i + 1 + (l + 1) * a_dim1];
953			}
954		      if (uisec < isec)
955			{
956			  t1[l - ll + 1 + (isec << 8) - 257] =
957				    a[ii + isec - 1 + l * a_dim1];
958			  t1[l - ll + 2 + (isec << 8) - 257] =
959				    a[ii + isec - 1 + (l + 1) * a_dim1];
960			}
961		    }
962		  if (ulsec < lsec)
963		    {
964		      i4 = ii + isec - 1;
965		      for (i = ii; i<= i4; ++i)
966			{
967			  t1[lsec + ((i - ii + 1) << 8) - 257] =
968				    a[i + (ll + lsec - 1) * a_dim1];
969			}
970		    }
971
972		  uisec = isec - isec % 4;
973		  i4 = jj + ujsec - 1;
974		  for (j = jj; j <= i4; j += 4)
975		    {
976		      i5 = ii + uisec - 1;
977		      for (i = ii; i <= i5; i += 4)
978			{
979			  f11 = c[i + j * c_dim1];
980			  f21 = c[i + 1 + j * c_dim1];
981			  f12 = c[i + (j + 1) * c_dim1];
982			  f22 = c[i + 1 + (j + 1) * c_dim1];
983			  f13 = c[i + (j + 2) * c_dim1];
984			  f23 = c[i + 1 + (j + 2) * c_dim1];
985			  f14 = c[i + (j + 3) * c_dim1];
986			  f24 = c[i + 1 + (j + 3) * c_dim1];
987			  f31 = c[i + 2 + j * c_dim1];
988			  f41 = c[i + 3 + j * c_dim1];
989			  f32 = c[i + 2 + (j + 1) * c_dim1];
990			  f42 = c[i + 3 + (j + 1) * c_dim1];
991			  f33 = c[i + 2 + (j + 2) * c_dim1];
992			  f43 = c[i + 3 + (j + 2) * c_dim1];
993			  f34 = c[i + 2 + (j + 3) * c_dim1];
994			  f44 = c[i + 3 + (j + 3) * c_dim1];
995			  i6 = ll + lsec - 1;
996			  for (l = ll; l <= i6; ++l)
997			    {
998			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
999				      * b[l + j * b_dim1];
1000			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1001				      * b[l + j * b_dim1];
1002			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1003				      * b[l + (j + 1) * b_dim1];
1004			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1005				      * b[l + (j + 1) * b_dim1];
1006			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1007				      * b[l + (j + 2) * b_dim1];
1008			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1009				      * b[l + (j + 2) * b_dim1];
1010			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1011				      * b[l + (j + 3) * b_dim1];
1012			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1013				      * b[l + (j + 3) * b_dim1];
1014			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1015				      * b[l + j * b_dim1];
1016			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1017				      * b[l + j * b_dim1];
1018			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1019				      * b[l + (j + 1) * b_dim1];
1020			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1021				      * b[l + (j + 1) * b_dim1];
1022			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1023				      * b[l + (j + 2) * b_dim1];
1024			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1025				      * b[l + (j + 2) * b_dim1];
1026			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1027				      * b[l + (j + 3) * b_dim1];
1028			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1029				      * b[l + (j + 3) * b_dim1];
1030			    }
1031			  c[i + j * c_dim1] = f11;
1032			  c[i + 1 + j * c_dim1] = f21;
1033			  c[i + (j + 1) * c_dim1] = f12;
1034			  c[i + 1 + (j + 1) * c_dim1] = f22;
1035			  c[i + (j + 2) * c_dim1] = f13;
1036			  c[i + 1 + (j + 2) * c_dim1] = f23;
1037			  c[i + (j + 3) * c_dim1] = f14;
1038			  c[i + 1 + (j + 3) * c_dim1] = f24;
1039			  c[i + 2 + j * c_dim1] = f31;
1040			  c[i + 3 + j * c_dim1] = f41;
1041			  c[i + 2 + (j + 1) * c_dim1] = f32;
1042			  c[i + 3 + (j + 1) * c_dim1] = f42;
1043			  c[i + 2 + (j + 2) * c_dim1] = f33;
1044			  c[i + 3 + (j + 2) * c_dim1] = f43;
1045			  c[i + 2 + (j + 3) * c_dim1] = f34;
1046			  c[i + 3 + (j + 3) * c_dim1] = f44;
1047			}
1048		      if (uisec < isec)
1049			{
1050			  i5 = ii + isec - 1;
1051			  for (i = ii + uisec; i <= i5; ++i)
1052			    {
1053			      f11 = c[i + j * c_dim1];
1054			      f12 = c[i + (j + 1) * c_dim1];
1055			      f13 = c[i + (j + 2) * c_dim1];
1056			      f14 = c[i + (j + 3) * c_dim1];
1057			      i6 = ll + lsec - 1;
1058			      for (l = ll; l <= i6; ++l)
1059				{
1060				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061					  257] * b[l + j * b_dim1];
1062				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063					  257] * b[l + (j + 1) * b_dim1];
1064				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065					  257] * b[l + (j + 2) * b_dim1];
1066				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1067					  257] * b[l + (j + 3) * b_dim1];
1068				}
1069			      c[i + j * c_dim1] = f11;
1070			      c[i + (j + 1) * c_dim1] = f12;
1071			      c[i + (j + 2) * c_dim1] = f13;
1072			      c[i + (j + 3) * c_dim1] = f14;
1073			    }
1074			}
1075		    }
1076		  if (ujsec < jsec)
1077		    {
1078		      i4 = jj + jsec - 1;
1079		      for (j = jj + ujsec; j <= i4; ++j)
1080			{
1081			  i5 = ii + uisec - 1;
1082			  for (i = ii; i <= i5; i += 4)
1083			    {
1084			      f11 = c[i + j * c_dim1];
1085			      f21 = c[i + 1 + j * c_dim1];
1086			      f31 = c[i + 2 + j * c_dim1];
1087			      f41 = c[i + 3 + j * c_dim1];
1088			      i6 = ll + lsec - 1;
1089			      for (l = ll; l <= i6; ++l)
1090				{
1091				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1092					  257] * b[l + j * b_dim1];
1093				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1094					  257] * b[l + j * b_dim1];
1095				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1096					  257] * b[l + j * b_dim1];
1097				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1098					  257] * b[l + j * b_dim1];
1099				}
1100			      c[i + j * c_dim1] = f11;
1101			      c[i + 1 + j * c_dim1] = f21;
1102			      c[i + 2 + j * c_dim1] = f31;
1103			      c[i + 3 + j * c_dim1] = f41;
1104			    }
1105			  i5 = ii + isec - 1;
1106			  for (i = ii + uisec; i <= i5; ++i)
1107			    {
1108			      f11 = c[i + j * c_dim1];
1109			      i6 = ll + lsec - 1;
1110			      for (l = ll; l <= i6; ++l)
1111				{
1112				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1113					  257] * b[l + j * b_dim1];
1114				}
1115			      c[i + j * c_dim1] = f11;
1116			    }
1117			}
1118		    }
1119		}
1120	    }
1121	}
1122      free(t1);
1123      return;
1124    }
1125  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1126    {
1127      if (GFC_DESCRIPTOR_RANK (a) != 1)
1128	{
1129	  const GFC_REAL_4 *restrict abase_x;
1130	  const GFC_REAL_4 *restrict bbase_y;
1131	  GFC_REAL_4 *restrict dest_y;
1132	  GFC_REAL_4 s;
1133
1134	  for (y = 0; y < ycount; y++)
1135	    {
1136	      bbase_y = &bbase[y*bystride];
1137	      dest_y = &dest[y*rystride];
1138	      for (x = 0; x < xcount; x++)
1139		{
1140		  abase_x = &abase[x*axstride];
1141		  s = (GFC_REAL_4) 0;
1142		  for (n = 0; n < count; n++)
1143		    s += abase_x[n] * bbase_y[n];
1144		  dest_y[x] = s;
1145		}
1146	    }
1147	}
1148      else
1149	{
1150	  const GFC_REAL_4 *restrict bbase_y;
1151	  GFC_REAL_4 s;
1152
1153	  for (y = 0; y < ycount; y++)
1154	    {
1155	      bbase_y = &bbase[y*bystride];
1156	      s = (GFC_REAL_4) 0;
1157	      for (n = 0; n < count; n++)
1158		s += abase[n*axstride] * bbase_y[n];
1159	      dest[y*rystride] = s;
1160	    }
1161	}
1162    }
1163  else if (GFC_DESCRIPTOR_RANK (a) == 1)
1164    {
1165      const GFC_REAL_4 *restrict bbase_y;
1166      GFC_REAL_4 s;
1167
1168      for (y = 0; y < ycount; y++)
1169	{
1170	  bbase_y = &bbase[y*bystride];
1171	  s = (GFC_REAL_4) 0;
1172	  for (n = 0; n < count; n++)
1173	    s += abase[n*axstride] * bbase_y[n*bxstride];
1174	  dest[y*rxstride] = s;
1175	}
1176    }
1177  else if (axstride < aystride)
1178    {
1179      for (y = 0; y < ycount; y++)
1180	for (x = 0; x < xcount; x++)
1181	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1182
1183      for (y = 0; y < ycount; y++)
1184	for (n = 0; n < count; n++)
1185	  for (x = 0; x < xcount; x++)
1186	    /* dest[x,y] += a[x,n] * b[n,y] */
1187	    dest[x*rxstride + y*rystride] +=
1188					abase[x*axstride + n*aystride] *
1189					bbase[n*bxstride + y*bystride];
1190    }
1191  else
1192    {
1193      const GFC_REAL_4 *restrict abase_x;
1194      const GFC_REAL_4 *restrict bbase_y;
1195      GFC_REAL_4 *restrict dest_y;
1196      GFC_REAL_4 s;
1197
1198      for (y = 0; y < ycount; y++)
1199	{
1200	  bbase_y = &bbase[y*bystride];
1201	  dest_y = &dest[y*rystride];
1202	  for (x = 0; x < xcount; x++)
1203	    {
1204	      abase_x = &abase[x*axstride];
1205	      s = (GFC_REAL_4) 0;
1206	      for (n = 0; n < count; n++)
1207		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1208	      dest_y[x*rxstride] = s;
1209	    }
1210	}
1211    }
1212}
1213#undef POW3
1214#undef min
1215#undef max
1216
1217#endif /* HAVE_AVX2 */
1218
1219#ifdef HAVE_AVX512F
1220static void
1221matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1222	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1223	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1224static void
1225matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1226	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1227	int blas_limit, blas_call gemm)
1228{
1229  const GFC_REAL_4 * restrict abase;
1230  const GFC_REAL_4 * restrict bbase;
1231  GFC_REAL_4 * restrict dest;
1232
1233  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1234  index_type x, y, n, count, xcount, ycount;
1235
1236  assert (GFC_DESCRIPTOR_RANK (a) == 2
1237          || GFC_DESCRIPTOR_RANK (b) == 2);
1238
1239/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1240
1241   Either A or B (but not both) can be rank 1:
1242
1243   o One-dimensional argument A is implicitly treated as a row matrix
1244     dimensioned [1,count], so xcount=1.
1245
1246   o One-dimensional argument B is implicitly treated as a column matrix
1247     dimensioned [count, 1], so ycount=1.
1248*/
1249
1250  if (retarray->base_addr == NULL)
1251    {
1252      if (GFC_DESCRIPTOR_RANK (a) == 1)
1253        {
1254	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1255	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1256        }
1257      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258        {
1259	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1260	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1261        }
1262      else
1263        {
1264	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1265	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1266
1267          GFC_DIMENSION_SET(retarray->dim[1], 0,
1268	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1269			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1270        }
1271
1272      retarray->base_addr
1273	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1274      retarray->offset = 0;
1275    }
1276  else if (unlikely (compile_options.bounds_check))
1277    {
1278      index_type ret_extent, arg_extent;
1279
1280      if (GFC_DESCRIPTOR_RANK (a) == 1)
1281	{
1282	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1283	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1284	  if (arg_extent != ret_extent)
1285	    runtime_error ("Array bound mismatch for dimension 1 of "
1286	    		   "array (%ld/%ld) ",
1287			   (long int) ret_extent, (long int) arg_extent);
1288	}
1289      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1290	{
1291	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1292	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1293	  if (arg_extent != ret_extent)
1294	    runtime_error ("Array bound mismatch for dimension 1 of "
1295	    		   "array (%ld/%ld) ",
1296			   (long int) ret_extent, (long int) arg_extent);
1297	}
1298      else
1299	{
1300	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1301	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1302	  if (arg_extent != ret_extent)
1303	    runtime_error ("Array bound mismatch for dimension 1 of "
1304	    		   "array (%ld/%ld) ",
1305			   (long int) ret_extent, (long int) arg_extent);
1306
1307	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1308	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1309	  if (arg_extent != ret_extent)
1310	    runtime_error ("Array bound mismatch for dimension 2 of "
1311	    		   "array (%ld/%ld) ",
1312			   (long int) ret_extent, (long int) arg_extent);
1313	}
1314    }
1315
1316
1317  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1318    {
1319      /* One-dimensional result may be addressed in the code below
1320	 either as a row or a column matrix. We want both cases to
1321	 work. */
1322      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1323    }
1324  else
1325    {
1326      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1327      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1328    }
1329
1330
1331  if (GFC_DESCRIPTOR_RANK (a) == 1)
1332    {
1333      /* Treat it as a a row matrix A[1,count]. */
1334      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1335      aystride = 1;
1336
1337      xcount = 1;
1338      count = GFC_DESCRIPTOR_EXTENT(a,0);
1339    }
1340  else
1341    {
1342      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1343      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1344
1345      count = GFC_DESCRIPTOR_EXTENT(a,1);
1346      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1347    }
1348
1349  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1350    {
1351      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1352	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1353		       "in dimension 1: is %ld, should be %ld",
1354		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1355    }
1356
1357  if (GFC_DESCRIPTOR_RANK (b) == 1)
1358    {
1359      /* Treat it as a column matrix B[count,1] */
1360      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1361
1362      /* bystride should never be used for 1-dimensional b.
1363         The value is only used for calculation of the
1364         memory by the buffer.  */
1365      bystride = 256;
1366      ycount = 1;
1367    }
1368  else
1369    {
1370      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1371      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1372      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1373    }
1374
1375  abase = a->base_addr;
1376  bbase = b->base_addr;
1377  dest = retarray->base_addr;
1378
1379  /* Now that everything is set up, we perform the multiplication
1380     itself.  */
1381
1382#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1383#define min(a,b) ((a) <= (b) ? (a) : (b))
1384#define max(a,b) ((a) >= (b) ? (a) : (b))
1385
1386  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1387      && (bxstride == 1 || bystride == 1)
1388      && (((float) xcount) * ((float) ycount) * ((float) count)
1389          > POW3(blas_limit)))
1390    {
1391      const int m = xcount, n = ycount, k = count, ldc = rystride;
1392      const GFC_REAL_4 one = 1, zero = 0;
1393      const int lda = (axstride == 1) ? aystride : axstride,
1394		ldb = (bxstride == 1) ? bystride : bxstride;
1395
1396      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1397	{
1398	  assert (gemm != NULL);
1399	  const char *transa, *transb;
1400	  if (try_blas & 2)
1401	    transa = "C";
1402	  else
1403	    transa = axstride == 1 ? "N" : "T";
1404
1405	  if (try_blas & 4)
1406	    transb = "C";
1407	  else
1408	    transb = bxstride == 1 ? "N" : "T";
1409
1410	  gemm (transa, transb , &m,
1411		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1412		&ldc, 1, 1);
1413	  return;
1414	}
1415    }
1416
1417  if (rxstride == 1 && axstride == 1 && bxstride == 1
1418      && GFC_DESCRIPTOR_RANK (b) != 1)
1419    {
1420      /* This block of code implements a tuned matmul, derived from
1421         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1422
1423               Bo Kagstrom and Per Ling
1424               Department of Computing Science
1425               Umea University
1426               S-901 87 Umea, Sweden
1427
1428	 from netlib.org, translated to C, and modified for matmul.m4.  */
1429
1430      const GFC_REAL_4 *a, *b;
1431      GFC_REAL_4 *c;
1432      const index_type m = xcount, n = ycount, k = count;
1433
1434      /* System generated locals */
1435      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1436		 i1, i2, i3, i4, i5, i6;
1437
1438      /* Local variables */
1439      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1440		 f13, f14, f23, f24, f33, f34, f43, f44;
1441      index_type i, j, l, ii, jj, ll;
1442      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1443      GFC_REAL_4 *t1;
1444
1445      a = abase;
1446      b = bbase;
1447      c = retarray->base_addr;
1448
1449      /* Parameter adjustments */
1450      c_dim1 = rystride;
1451      c_offset = 1 + c_dim1;
1452      c -= c_offset;
1453      a_dim1 = aystride;
1454      a_offset = 1 + a_dim1;
1455      a -= a_offset;
1456      b_dim1 = bystride;
1457      b_offset = 1 + b_dim1;
1458      b -= b_offset;
1459
1460      /* Empty c first.  */
1461      for (j=1; j<=n; j++)
1462	for (i=1; i<=m; i++)
1463	  c[i + j * c_dim1] = (GFC_REAL_4)0;
1464
1465      /* Early exit if possible */
1466      if (m == 0 || n == 0 || k == 0)
1467	return;
1468
1469      /* Adjust size of t1 to what is needed.  */
1470      index_type t1_dim, a_sz;
1471      if (aystride == 1)
1472        a_sz = rystride;
1473      else
1474        a_sz = a_dim1;
1475
1476      t1_dim = a_sz * 256 + b_dim1;
1477      if (t1_dim > 65536)
1478	t1_dim = 65536;
1479
1480      t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
1481
1482      /* Start turning the crank. */
1483      i1 = n;
1484      for (jj = 1; jj <= i1; jj += 512)
1485	{
1486	  /* Computing MIN */
1487	  i2 = 512;
1488	  i3 = n - jj + 1;
1489	  jsec = min(i2,i3);
1490	  ujsec = jsec - jsec % 4;
1491	  i2 = k;
1492	  for (ll = 1; ll <= i2; ll += 256)
1493	    {
1494	      /* Computing MIN */
1495	      i3 = 256;
1496	      i4 = k - ll + 1;
1497	      lsec = min(i3,i4);
1498	      ulsec = lsec - lsec % 2;
1499
1500	      i3 = m;
1501	      for (ii = 1; ii <= i3; ii += 256)
1502		{
1503		  /* Computing MIN */
1504		  i4 = 256;
1505		  i5 = m - ii + 1;
1506		  isec = min(i4,i5);
1507		  uisec = isec - isec % 2;
1508		  i4 = ll + ulsec - 1;
1509		  for (l = ll; l <= i4; l += 2)
1510		    {
1511		      i5 = ii + uisec - 1;
1512		      for (i = ii; i <= i5; i += 2)
1513			{
1514			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1515					a[i + l * a_dim1];
1516			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1517					a[i + (l + 1) * a_dim1];
1518			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1519					a[i + 1 + l * a_dim1];
1520			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1521					a[i + 1 + (l + 1) * a_dim1];
1522			}
1523		      if (uisec < isec)
1524			{
1525			  t1[l - ll + 1 + (isec << 8) - 257] =
1526				    a[ii + isec - 1 + l * a_dim1];
1527			  t1[l - ll + 2 + (isec << 8) - 257] =
1528				    a[ii + isec - 1 + (l + 1) * a_dim1];
1529			}
1530		    }
1531		  if (ulsec < lsec)
1532		    {
1533		      i4 = ii + isec - 1;
1534		      for (i = ii; i<= i4; ++i)
1535			{
1536			  t1[lsec + ((i - ii + 1) << 8) - 257] =
1537				    a[i + (ll + lsec - 1) * a_dim1];
1538			}
1539		    }
1540
1541		  uisec = isec - isec % 4;
1542		  i4 = jj + ujsec - 1;
1543		  for (j = jj; j <= i4; j += 4)
1544		    {
1545		      i5 = ii + uisec - 1;
1546		      for (i = ii; i <= i5; i += 4)
1547			{
1548			  f11 = c[i + j * c_dim1];
1549			  f21 = c[i + 1 + j * c_dim1];
1550			  f12 = c[i + (j + 1) * c_dim1];
1551			  f22 = c[i + 1 + (j + 1) * c_dim1];
1552			  f13 = c[i + (j + 2) * c_dim1];
1553			  f23 = c[i + 1 + (j + 2) * c_dim1];
1554			  f14 = c[i + (j + 3) * c_dim1];
1555			  f24 = c[i + 1 + (j + 3) * c_dim1];
1556			  f31 = c[i + 2 + j * c_dim1];
1557			  f41 = c[i + 3 + j * c_dim1];
1558			  f32 = c[i + 2 + (j + 1) * c_dim1];
1559			  f42 = c[i + 3 + (j + 1) * c_dim1];
1560			  f33 = c[i + 2 + (j + 2) * c_dim1];
1561			  f43 = c[i + 3 + (j + 2) * c_dim1];
1562			  f34 = c[i + 2 + (j + 3) * c_dim1];
1563			  f44 = c[i + 3 + (j + 3) * c_dim1];
1564			  i6 = ll + lsec - 1;
1565			  for (l = ll; l <= i6; ++l)
1566			    {
1567			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1568				      * b[l + j * b_dim1];
1569			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1570				      * b[l + j * b_dim1];
1571			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1572				      * b[l + (j + 1) * b_dim1];
1573			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1574				      * b[l + (j + 1) * b_dim1];
1575			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1576				      * b[l + (j + 2) * b_dim1];
1577			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1578				      * b[l + (j + 2) * b_dim1];
1579			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1580				      * b[l + (j + 3) * b_dim1];
1581			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1582				      * b[l + (j + 3) * b_dim1];
1583			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1584				      * b[l + j * b_dim1];
1585			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1586				      * b[l + j * b_dim1];
1587			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1588				      * b[l + (j + 1) * b_dim1];
1589			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1590				      * b[l + (j + 1) * b_dim1];
1591			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1592				      * b[l + (j + 2) * b_dim1];
1593			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1594				      * b[l + (j + 2) * b_dim1];
1595			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1596				      * b[l + (j + 3) * b_dim1];
1597			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1598				      * b[l + (j + 3) * b_dim1];
1599			    }
1600			  c[i + j * c_dim1] = f11;
1601			  c[i + 1 + j * c_dim1] = f21;
1602			  c[i + (j + 1) * c_dim1] = f12;
1603			  c[i + 1 + (j + 1) * c_dim1] = f22;
1604			  c[i + (j + 2) * c_dim1] = f13;
1605			  c[i + 1 + (j + 2) * c_dim1] = f23;
1606			  c[i + (j + 3) * c_dim1] = f14;
1607			  c[i + 1 + (j + 3) * c_dim1] = f24;
1608			  c[i + 2 + j * c_dim1] = f31;
1609			  c[i + 3 + j * c_dim1] = f41;
1610			  c[i + 2 + (j + 1) * c_dim1] = f32;
1611			  c[i + 3 + (j + 1) * c_dim1] = f42;
1612			  c[i + 2 + (j + 2) * c_dim1] = f33;
1613			  c[i + 3 + (j + 2) * c_dim1] = f43;
1614			  c[i + 2 + (j + 3) * c_dim1] = f34;
1615			  c[i + 3 + (j + 3) * c_dim1] = f44;
1616			}
1617		      if (uisec < isec)
1618			{
1619			  i5 = ii + isec - 1;
1620			  for (i = ii + uisec; i <= i5; ++i)
1621			    {
1622			      f11 = c[i + j * c_dim1];
1623			      f12 = c[i + (j + 1) * c_dim1];
1624			      f13 = c[i + (j + 2) * c_dim1];
1625			      f14 = c[i + (j + 3) * c_dim1];
1626			      i6 = ll + lsec - 1;
1627			      for (l = ll; l <= i6; ++l)
1628				{
1629				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1630					  257] * b[l + j * b_dim1];
1631				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1632					  257] * b[l + (j + 1) * b_dim1];
1633				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1634					  257] * b[l + (j + 2) * b_dim1];
1635				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1636					  257] * b[l + (j + 3) * b_dim1];
1637				}
1638			      c[i + j * c_dim1] = f11;
1639			      c[i + (j + 1) * c_dim1] = f12;
1640			      c[i + (j + 2) * c_dim1] = f13;
1641			      c[i + (j + 3) * c_dim1] = f14;
1642			    }
1643			}
1644		    }
1645		  if (ujsec < jsec)
1646		    {
1647		      i4 = jj + jsec - 1;
1648		      for (j = jj + ujsec; j <= i4; ++j)
1649			{
1650			  i5 = ii + uisec - 1;
1651			  for (i = ii; i <= i5; i += 4)
1652			    {
1653			      f11 = c[i + j * c_dim1];
1654			      f21 = c[i + 1 + j * c_dim1];
1655			      f31 = c[i + 2 + j * c_dim1];
1656			      f41 = c[i + 3 + j * c_dim1];
1657			      i6 = ll + lsec - 1;
1658			      for (l = ll; l <= i6; ++l)
1659				{
1660				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1661					  257] * b[l + j * b_dim1];
1662				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1663					  257] * b[l + j * b_dim1];
1664				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1665					  257] * b[l + j * b_dim1];
1666				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1667					  257] * b[l + j * b_dim1];
1668				}
1669			      c[i + j * c_dim1] = f11;
1670			      c[i + 1 + j * c_dim1] = f21;
1671			      c[i + 2 + j * c_dim1] = f31;
1672			      c[i + 3 + j * c_dim1] = f41;
1673			    }
1674			  i5 = ii + isec - 1;
1675			  for (i = ii + uisec; i <= i5; ++i)
1676			    {
1677			      f11 = c[i + j * c_dim1];
1678			      i6 = ll + lsec - 1;
1679			      for (l = ll; l <= i6; ++l)
1680				{
1681				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1682					  257] * b[l + j * b_dim1];
1683				}
1684			      c[i + j * c_dim1] = f11;
1685			    }
1686			}
1687		    }
1688		}
1689	    }
1690	}
1691      free(t1);
1692      return;
1693    }
1694  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1695    {
1696      if (GFC_DESCRIPTOR_RANK (a) != 1)
1697	{
1698	  const GFC_REAL_4 *restrict abase_x;
1699	  const GFC_REAL_4 *restrict bbase_y;
1700	  GFC_REAL_4 *restrict dest_y;
1701	  GFC_REAL_4 s;
1702
1703	  for (y = 0; y < ycount; y++)
1704	    {
1705	      bbase_y = &bbase[y*bystride];
1706	      dest_y = &dest[y*rystride];
1707	      for (x = 0; x < xcount; x++)
1708		{
1709		  abase_x = &abase[x*axstride];
1710		  s = (GFC_REAL_4) 0;
1711		  for (n = 0; n < count; n++)
1712		    s += abase_x[n] * bbase_y[n];
1713		  dest_y[x] = s;
1714		}
1715	    }
1716	}
1717      else
1718	{
1719	  const GFC_REAL_4 *restrict bbase_y;
1720	  GFC_REAL_4 s;
1721
1722	  for (y = 0; y < ycount; y++)
1723	    {
1724	      bbase_y = &bbase[y*bystride];
1725	      s = (GFC_REAL_4) 0;
1726	      for (n = 0; n < count; n++)
1727		s += abase[n*axstride] * bbase_y[n];
1728	      dest[y*rystride] = s;
1729	    }
1730	}
1731    }
1732  else if (GFC_DESCRIPTOR_RANK (a) == 1)
1733    {
1734      const GFC_REAL_4 *restrict bbase_y;
1735      GFC_REAL_4 s;
1736
1737      for (y = 0; y < ycount; y++)
1738	{
1739	  bbase_y = &bbase[y*bystride];
1740	  s = (GFC_REAL_4) 0;
1741	  for (n = 0; n < count; n++)
1742	    s += abase[n*axstride] * bbase_y[n*bxstride];
1743	  dest[y*rxstride] = s;
1744	}
1745    }
1746  else if (axstride < aystride)
1747    {
1748      for (y = 0; y < ycount; y++)
1749	for (x = 0; x < xcount; x++)
1750	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1751
1752      for (y = 0; y < ycount; y++)
1753	for (n = 0; n < count; n++)
1754	  for (x = 0; x < xcount; x++)
1755	    /* dest[x,y] += a[x,n] * b[n,y] */
1756	    dest[x*rxstride + y*rystride] +=
1757					abase[x*axstride + n*aystride] *
1758					bbase[n*bxstride + y*bystride];
1759    }
1760  else
1761    {
1762      const GFC_REAL_4 *restrict abase_x;
1763      const GFC_REAL_4 *restrict bbase_y;
1764      GFC_REAL_4 *restrict dest_y;
1765      GFC_REAL_4 s;
1766
1767      for (y = 0; y < ycount; y++)
1768	{
1769	  bbase_y = &bbase[y*bystride];
1770	  dest_y = &dest[y*rystride];
1771	  for (x = 0; x < xcount; x++)
1772	    {
1773	      abase_x = &abase[x*axstride];
1774	      s = (GFC_REAL_4) 0;
1775	      for (n = 0; n < count; n++)
1776		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1777	      dest_y[x*rxstride] = s;
1778	    }
1779	}
1780    }
1781}
1782#undef POW3
1783#undef min
1784#undef max
1785
1786#endif  /* HAVE_AVX512F */
1787
1788/* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1789
1790#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1791void
1792matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
1793	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1794	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1795internal_proto(matmul_r4_avx128_fma3);
1796#endif
1797
1798#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1799void
1800matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
1801	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1802	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1803internal_proto(matmul_r4_avx128_fma4);
1804#endif
1805
1806/* Function to fall back to if there is no special processor-specific version.  */
1807static void
1808matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1809	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1810	int blas_limit, blas_call gemm)
1811{
1812  const GFC_REAL_4 * restrict abase;
1813  const GFC_REAL_4 * restrict bbase;
1814  GFC_REAL_4 * restrict dest;
1815
1816  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1817  index_type x, y, n, count, xcount, ycount;
1818
1819  assert (GFC_DESCRIPTOR_RANK (a) == 2
1820          || GFC_DESCRIPTOR_RANK (b) == 2);
1821
1822/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1823
1824   Either A or B (but not both) can be rank 1:
1825
1826   o One-dimensional argument A is implicitly treated as a row matrix
1827     dimensioned [1,count], so xcount=1.
1828
1829   o One-dimensional argument B is implicitly treated as a column matrix
1830     dimensioned [count, 1], so ycount=1.
1831*/
1832
1833  if (retarray->base_addr == NULL)
1834    {
1835      if (GFC_DESCRIPTOR_RANK (a) == 1)
1836        {
1837	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1838	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1839        }
1840      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1841        {
1842	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1843	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1844        }
1845      else
1846        {
1847	  GFC_DIMENSION_SET(retarray->dim[0], 0,
1848	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1849
1850          GFC_DIMENSION_SET(retarray->dim[1], 0,
1851	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1852			    GFC_DESCRIPTOR_EXTENT(retarray,0));
1853        }
1854
1855      retarray->base_addr
1856	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1857      retarray->offset = 0;
1858    }
1859  else if (unlikely (compile_options.bounds_check))
1860    {
1861      index_type ret_extent, arg_extent;
1862
1863      if (GFC_DESCRIPTOR_RANK (a) == 1)
1864	{
1865	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1866	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1867	  if (arg_extent != ret_extent)
1868	    runtime_error ("Array bound mismatch for dimension 1 of "
1869	    		   "array (%ld/%ld) ",
1870			   (long int) ret_extent, (long int) arg_extent);
1871	}
1872      else if (GFC_DESCRIPTOR_RANK (b) == 1)
1873	{
1874	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1875	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1876	  if (arg_extent != ret_extent)
1877	    runtime_error ("Array bound mismatch for dimension 1 of "
1878	    		   "array (%ld/%ld) ",
1879			   (long int) ret_extent, (long int) arg_extent);
1880	}
1881      else
1882	{
1883	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1884	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1885	  if (arg_extent != ret_extent)
1886	    runtime_error ("Array bound mismatch for dimension 1 of "
1887	    		   "array (%ld/%ld) ",
1888			   (long int) ret_extent, (long int) arg_extent);
1889
1890	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1891	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1892	  if (arg_extent != ret_extent)
1893	    runtime_error ("Array bound mismatch for dimension 2 of "
1894	    		   "array (%ld/%ld) ",
1895			   (long int) ret_extent, (long int) arg_extent);
1896	}
1897    }
1898
1899
1900  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1901    {
1902      /* One-dimensional result may be addressed in the code below
1903	 either as a row or a column matrix. We want both cases to
1904	 work. */
1905      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1906    }
1907  else
1908    {
1909      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1910      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1911    }
1912
1913
1914  if (GFC_DESCRIPTOR_RANK (a) == 1)
1915    {
1916      /* Treat it as a a row matrix A[1,count]. */
1917      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1918      aystride = 1;
1919
1920      xcount = 1;
1921      count = GFC_DESCRIPTOR_EXTENT(a,0);
1922    }
1923  else
1924    {
1925      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1926      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1927
1928      count = GFC_DESCRIPTOR_EXTENT(a,1);
1929      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1930    }
1931
1932  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1933    {
1934      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1935	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1936		       "in dimension 1: is %ld, should be %ld",
1937		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1938    }
1939
1940  if (GFC_DESCRIPTOR_RANK (b) == 1)
1941    {
1942      /* Treat it as a column matrix B[count,1] */
1943      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1944
1945      /* bystride should never be used for 1-dimensional b.
1946         The value is only used for calculation of the
1947         memory by the buffer.  */
1948      bystride = 256;
1949      ycount = 1;
1950    }
1951  else
1952    {
1953      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1954      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1955      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1956    }
1957
1958  abase = a->base_addr;
1959  bbase = b->base_addr;
1960  dest = retarray->base_addr;
1961
1962  /* Now that everything is set up, we perform the multiplication
1963     itself.  */
1964
1965#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1966#define min(a,b) ((a) <= (b) ? (a) : (b))
1967#define max(a,b) ((a) >= (b) ? (a) : (b))
1968
1969  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1970      && (bxstride == 1 || bystride == 1)
1971      && (((float) xcount) * ((float) ycount) * ((float) count)
1972          > POW3(blas_limit)))
1973    {
1974      const int m = xcount, n = ycount, k = count, ldc = rystride;
1975      const GFC_REAL_4 one = 1, zero = 0;
1976      const int lda = (axstride == 1) ? aystride : axstride,
1977		ldb = (bxstride == 1) ? bystride : bxstride;
1978
1979      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1980	{
1981	  assert (gemm != NULL);
1982	  const char *transa, *transb;
1983	  if (try_blas & 2)
1984	    transa = "C";
1985	  else
1986	    transa = axstride == 1 ? "N" : "T";
1987
1988	  if (try_blas & 4)
1989	    transb = "C";
1990	  else
1991	    transb = bxstride == 1 ? "N" : "T";
1992
1993	  gemm (transa, transb , &m,
1994		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
1995		&ldc, 1, 1);
1996	  return;
1997	}
1998    }
1999
2000  if (rxstride == 1 && axstride == 1 && bxstride == 1
2001      && GFC_DESCRIPTOR_RANK (b) != 1)
2002    {
2003      /* This block of code implements a tuned matmul, derived from
2004         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2005
2006               Bo Kagstrom and Per Ling
2007               Department of Computing Science
2008               Umea University
2009               S-901 87 Umea, Sweden
2010
2011	 from netlib.org, translated to C, and modified for matmul.m4.  */
2012
2013      const GFC_REAL_4 *a, *b;
2014      GFC_REAL_4 *c;
2015      const index_type m = xcount, n = ycount, k = count;
2016
2017      /* System generated locals */
2018      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2019		 i1, i2, i3, i4, i5, i6;
2020
2021      /* Local variables */
2022      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
2023		 f13, f14, f23, f24, f33, f34, f43, f44;
2024      index_type i, j, l, ii, jj, ll;
2025      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2026      GFC_REAL_4 *t1;
2027
2028      a = abase;
2029      b = bbase;
2030      c = retarray->base_addr;
2031
2032      /* Parameter adjustments */
2033      c_dim1 = rystride;
2034      c_offset = 1 + c_dim1;
2035      c -= c_offset;
2036      a_dim1 = aystride;
2037      a_offset = 1 + a_dim1;
2038      a -= a_offset;
2039      b_dim1 = bystride;
2040      b_offset = 1 + b_dim1;
2041      b -= b_offset;
2042
2043      /* Empty c first.  */
2044      for (j=1; j<=n; j++)
2045	for (i=1; i<=m; i++)
2046	  c[i + j * c_dim1] = (GFC_REAL_4)0;
2047
2048      /* Early exit if possible */
2049      if (m == 0 || n == 0 || k == 0)
2050	return;
2051
2052      /* Adjust size of t1 to what is needed.  */
2053      index_type t1_dim, a_sz;
2054      if (aystride == 1)
2055        a_sz = rystride;
2056      else
2057        a_sz = a_dim1;
2058
2059      t1_dim = a_sz * 256 + b_dim1;
2060      if (t1_dim > 65536)
2061	t1_dim = 65536;
2062
2063      t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
2064
2065      /* Start turning the crank. */
2066      i1 = n;
2067      for (jj = 1; jj <= i1; jj += 512)
2068	{
2069	  /* Computing MIN */
2070	  i2 = 512;
2071	  i3 = n - jj + 1;
2072	  jsec = min(i2,i3);
2073	  ujsec = jsec - jsec % 4;
2074	  i2 = k;
2075	  for (ll = 1; ll <= i2; ll += 256)
2076	    {
2077	      /* Computing MIN */
2078	      i3 = 256;
2079	      i4 = k - ll + 1;
2080	      lsec = min(i3,i4);
2081	      ulsec = lsec - lsec % 2;
2082
2083	      i3 = m;
2084	      for (ii = 1; ii <= i3; ii += 256)
2085		{
2086		  /* Computing MIN */
2087		  i4 = 256;
2088		  i5 = m - ii + 1;
2089		  isec = min(i4,i5);
2090		  uisec = isec - isec % 2;
2091		  i4 = ll + ulsec - 1;
2092		  for (l = ll; l <= i4; l += 2)
2093		    {
2094		      i5 = ii + uisec - 1;
2095		      for (i = ii; i <= i5; i += 2)
2096			{
2097			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2098					a[i + l * a_dim1];
2099			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2100					a[i + (l + 1) * a_dim1];
2101			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2102					a[i + 1 + l * a_dim1];
2103			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2104					a[i + 1 + (l + 1) * a_dim1];
2105			}
2106		      if (uisec < isec)
2107			{
2108			  t1[l - ll + 1 + (isec << 8) - 257] =
2109				    a[ii + isec - 1 + l * a_dim1];
2110			  t1[l - ll + 2 + (isec << 8) - 257] =
2111				    a[ii + isec - 1 + (l + 1) * a_dim1];
2112			}
2113		    }
2114		  if (ulsec < lsec)
2115		    {
2116		      i4 = ii + isec - 1;
2117		      for (i = ii; i<= i4; ++i)
2118			{
2119			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2120				    a[i + (ll + lsec - 1) * a_dim1];
2121			}
2122		    }
2123
2124		  uisec = isec - isec % 4;
2125		  i4 = jj + ujsec - 1;
2126		  for (j = jj; j <= i4; j += 4)
2127		    {
2128		      i5 = ii + uisec - 1;
2129		      for (i = ii; i <= i5; i += 4)
2130			{
2131			  f11 = c[i + j * c_dim1];
2132			  f21 = c[i + 1 + j * c_dim1];
2133			  f12 = c[i + (j + 1) * c_dim1];
2134			  f22 = c[i + 1 + (j + 1) * c_dim1];
2135			  f13 = c[i + (j + 2) * c_dim1];
2136			  f23 = c[i + 1 + (j + 2) * c_dim1];
2137			  f14 = c[i + (j + 3) * c_dim1];
2138			  f24 = c[i + 1 + (j + 3) * c_dim1];
2139			  f31 = c[i + 2 + j * c_dim1];
2140			  f41 = c[i + 3 + j * c_dim1];
2141			  f32 = c[i + 2 + (j + 1) * c_dim1];
2142			  f42 = c[i + 3 + (j + 1) * c_dim1];
2143			  f33 = c[i + 2 + (j + 2) * c_dim1];
2144			  f43 = c[i + 3 + (j + 2) * c_dim1];
2145			  f34 = c[i + 2 + (j + 3) * c_dim1];
2146			  f44 = c[i + 3 + (j + 3) * c_dim1];
2147			  i6 = ll + lsec - 1;
2148			  for (l = ll; l <= i6; ++l)
2149			    {
2150			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151				      * b[l + j * b_dim1];
2152			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153				      * b[l + j * b_dim1];
2154			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155				      * b[l + (j + 1) * b_dim1];
2156			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157				      * b[l + (j + 1) * b_dim1];
2158			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159				      * b[l + (j + 2) * b_dim1];
2160			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161				      * b[l + (j + 2) * b_dim1];
2162			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2163				      * b[l + (j + 3) * b_dim1];
2164			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2165				      * b[l + (j + 3) * b_dim1];
2166			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167				      * b[l + j * b_dim1];
2168			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169				      * b[l + j * b_dim1];
2170			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171				      * b[l + (j + 1) * b_dim1];
2172			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173				      * b[l + (j + 1) * b_dim1];
2174			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175				      * b[l + (j + 2) * b_dim1];
2176			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177				      * b[l + (j + 2) * b_dim1];
2178			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2179				      * b[l + (j + 3) * b_dim1];
2180			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2181				      * b[l + (j + 3) * b_dim1];
2182			    }
2183			  c[i + j * c_dim1] = f11;
2184			  c[i + 1 + j * c_dim1] = f21;
2185			  c[i + (j + 1) * c_dim1] = f12;
2186			  c[i + 1 + (j + 1) * c_dim1] = f22;
2187			  c[i + (j + 2) * c_dim1] = f13;
2188			  c[i + 1 + (j + 2) * c_dim1] = f23;
2189			  c[i + (j + 3) * c_dim1] = f14;
2190			  c[i + 1 + (j + 3) * c_dim1] = f24;
2191			  c[i + 2 + j * c_dim1] = f31;
2192			  c[i + 3 + j * c_dim1] = f41;
2193			  c[i + 2 + (j + 1) * c_dim1] = f32;
2194			  c[i + 3 + (j + 1) * c_dim1] = f42;
2195			  c[i + 2 + (j + 2) * c_dim1] = f33;
2196			  c[i + 3 + (j + 2) * c_dim1] = f43;
2197			  c[i + 2 + (j + 3) * c_dim1] = f34;
2198			  c[i + 3 + (j + 3) * c_dim1] = f44;
2199			}
2200		      if (uisec < isec)
2201			{
2202			  i5 = ii + isec - 1;
2203			  for (i = ii + uisec; i <= i5; ++i)
2204			    {
2205			      f11 = c[i + j * c_dim1];
2206			      f12 = c[i + (j + 1) * c_dim1];
2207			      f13 = c[i + (j + 2) * c_dim1];
2208			      f14 = c[i + (j + 3) * c_dim1];
2209			      i6 = ll + lsec - 1;
2210			      for (l = ll; l <= i6; ++l)
2211				{
2212				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213					  257] * b[l + j * b_dim1];
2214				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215					  257] * b[l + (j + 1) * b_dim1];
2216				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217					  257] * b[l + (j + 2) * b_dim1];
2218				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2219					  257] * b[l + (j + 3) * b_dim1];
2220				}
2221			      c[i + j * c_dim1] = f11;
2222			      c[i + (j + 1) * c_dim1] = f12;
2223			      c[i + (j + 2) * c_dim1] = f13;
2224			      c[i + (j + 3) * c_dim1] = f14;
2225			    }
2226			}
2227		    }
2228		  if (ujsec < jsec)
2229		    {
2230		      i4 = jj + jsec - 1;
2231		      for (j = jj + ujsec; j <= i4; ++j)
2232			{
2233			  i5 = ii + uisec - 1;
2234			  for (i = ii; i <= i5; i += 4)
2235			    {
2236			      f11 = c[i + j * c_dim1];
2237			      f21 = c[i + 1 + j * c_dim1];
2238			      f31 = c[i + 2 + j * c_dim1];
2239			      f41 = c[i + 3 + j * c_dim1];
2240			      i6 = ll + lsec - 1;
2241			      for (l = ll; l <= i6; ++l)
2242				{
2243				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2244					  257] * b[l + j * b_dim1];
2245				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2246					  257] * b[l + j * b_dim1];
2247				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2248					  257] * b[l + j * b_dim1];
2249				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2250					  257] * b[l + j * b_dim1];
2251				}
2252			      c[i + j * c_dim1] = f11;
2253			      c[i + 1 + j * c_dim1] = f21;
2254			      c[i + 2 + j * c_dim1] = f31;
2255			      c[i + 3 + j * c_dim1] = f41;
2256			    }
2257			  i5 = ii + isec - 1;
2258			  for (i = ii + uisec; i <= i5; ++i)
2259			    {
2260			      f11 = c[i + j * c_dim1];
2261			      i6 = ll + lsec - 1;
2262			      for (l = ll; l <= i6; ++l)
2263				{
2264				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2265					  257] * b[l + j * b_dim1];
2266				}
2267			      c[i + j * c_dim1] = f11;
2268			    }
2269			}
2270		    }
2271		}
2272	    }
2273	}
2274      free(t1);
2275      return;
2276    }
2277  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2278    {
2279      if (GFC_DESCRIPTOR_RANK (a) != 1)
2280	{
2281	  const GFC_REAL_4 *restrict abase_x;
2282	  const GFC_REAL_4 *restrict bbase_y;
2283	  GFC_REAL_4 *restrict dest_y;
2284	  GFC_REAL_4 s;
2285
2286	  for (y = 0; y < ycount; y++)
2287	    {
2288	      bbase_y = &bbase[y*bystride];
2289	      dest_y = &dest[y*rystride];
2290	      for (x = 0; x < xcount; x++)
2291		{
2292		  abase_x = &abase[x*axstride];
2293		  s = (GFC_REAL_4) 0;
2294		  for (n = 0; n < count; n++)
2295		    s += abase_x[n] * bbase_y[n];
2296		  dest_y[x] = s;
2297		}
2298	    }
2299	}
2300      else
2301	{
2302	  const GFC_REAL_4 *restrict bbase_y;
2303	  GFC_REAL_4 s;
2304
2305	  for (y = 0; y < ycount; y++)
2306	    {
2307	      bbase_y = &bbase[y*bystride];
2308	      s = (GFC_REAL_4) 0;
2309	      for (n = 0; n < count; n++)
2310		s += abase[n*axstride] * bbase_y[n];
2311	      dest[y*rystride] = s;
2312	    }
2313	}
2314    }
2315  else if (GFC_DESCRIPTOR_RANK (a) == 1)
2316    {
2317      const GFC_REAL_4 *restrict bbase_y;
2318      GFC_REAL_4 s;
2319
2320      for (y = 0; y < ycount; y++)
2321	{
2322	  bbase_y = &bbase[y*bystride];
2323	  s = (GFC_REAL_4) 0;
2324	  for (n = 0; n < count; n++)
2325	    s += abase[n*axstride] * bbase_y[n*bxstride];
2326	  dest[y*rxstride] = s;
2327	}
2328    }
2329  else if (axstride < aystride)
2330    {
2331      for (y = 0; y < ycount; y++)
2332	for (x = 0; x < xcount; x++)
2333	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2334
2335      for (y = 0; y < ycount; y++)
2336	for (n = 0; n < count; n++)
2337	  for (x = 0; x < xcount; x++)
2338	    /* dest[x,y] += a[x,n] * b[n,y] */
2339	    dest[x*rxstride + y*rystride] +=
2340					abase[x*axstride + n*aystride] *
2341					bbase[n*bxstride + y*bystride];
2342    }
2343  else
2344    {
2345      const GFC_REAL_4 *restrict abase_x;
2346      const GFC_REAL_4 *restrict bbase_y;
2347      GFC_REAL_4 *restrict dest_y;
2348      GFC_REAL_4 s;
2349
2350      for (y = 0; y < ycount; y++)
2351	{
2352	  bbase_y = &bbase[y*bystride];
2353	  dest_y = &dest[y*rystride];
2354	  for (x = 0; x < xcount; x++)
2355	    {
2356	      abase_x = &abase[x*axstride];
2357	      s = (GFC_REAL_4) 0;
2358	      for (n = 0; n < count; n++)
2359		s += abase_x[n*aystride] * bbase_y[n*bxstride];
2360	      dest_y[x*rxstride] = s;
2361	    }
2362	}
2363    }
2364}
2365#undef POW3
2366#undef min
2367#undef max
2368
2369
2370/* Compiling main function, with selection code for the processor.  */
2371
2372/* Currently, this is i386 only.  Adjust for other architectures.  */
2373
2374#include <config/i386/cpuinfo.h>
2375void matmul_r4 (gfc_array_r4 * const restrict retarray,
2376	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2377	int blas_limit, blas_call gemm)
2378{
2379  static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2380	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2381	int blas_limit, blas_call gemm);
2382
2383  void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
2384	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2385	int blas_limit, blas_call gemm);
2386
2387  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2388  if (matmul_fn == NULL)
2389    {
2390      matmul_fn = matmul_r4_vanilla;
2391      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2392	{
2393          /* Run down the available processors in order of preference.  */
2394#ifdef HAVE_AVX512F
2395      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2396	    {
2397	      matmul_fn = matmul_r4_avx512f;
2398	      goto store;
2399	    }
2400
2401#endif  /* HAVE_AVX512F */
2402
2403#ifdef HAVE_AVX2
2404      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2405	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2406	    {
2407	      matmul_fn = matmul_r4_avx2;
2408	      goto store;
2409	    }
2410
2411#endif
2412
2413#ifdef HAVE_AVX
2414      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2415 	    {
2416              matmul_fn = matmul_r4_avx;
2417	      goto store;
2418	    }
2419#endif  /* HAVE_AVX */
2420        }
2421    else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2422      {
2423#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2424        if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2425	    && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2426	  {
2427            matmul_fn = matmul_r4_avx128_fma3;
2428	    goto store;
2429	  }
2430#endif
2431#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2432        if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2433	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2434	  {
2435            matmul_fn = matmul_r4_avx128_fma4;
2436	    goto store;
2437	  }
2438#endif
2439
2440      }
2441   store:
2442      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2443   }
2444
2445   (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2446}
2447
2448#else  /* Just the vanilla function.  */
2449
2450void
2451matmul_r4 (gfc_array_r4 * const restrict retarray,
2452	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2453	int blas_limit, blas_call gemm)
2454{
2455  const GFC_REAL_4 * restrict abase;
2456  const GFC_REAL_4 * restrict bbase;
2457  GFC_REAL_4 * restrict dest;
2458
2459  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2460  index_type x, y, n, count, xcount, ycount;
2461
2462  assert (GFC_DESCRIPTOR_RANK (a) == 2
2463          || GFC_DESCRIPTOR_RANK (b) == 2);
2464
2465/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2466
2467   Either A or B (but not both) can be rank 1:
2468
2469   o One-dimensional argument A is implicitly treated as a row matrix
2470     dimensioned [1,count], so xcount=1.
2471
2472   o One-dimensional argument B is implicitly treated as a column matrix
2473     dimensioned [count, 1], so ycount=1.
2474*/
2475
2476  if (retarray->base_addr == NULL)
2477    {
2478      if (GFC_DESCRIPTOR_RANK (a) == 1)
2479        {
2480	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2481	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2482        }
2483      else if (GFC_DESCRIPTOR_RANK (b) == 1)
2484        {
2485	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2486	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2487        }
2488      else
2489        {
2490	  GFC_DIMENSION_SET(retarray->dim[0], 0,
2491	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2492
2493          GFC_DIMENSION_SET(retarray->dim[1], 0,
2494	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2495			    GFC_DESCRIPTOR_EXTENT(retarray,0));
2496        }
2497
2498      retarray->base_addr
2499	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2500      retarray->offset = 0;
2501    }
2502  else if (unlikely (compile_options.bounds_check))
2503    {
2504      index_type ret_extent, arg_extent;
2505
2506      if (GFC_DESCRIPTOR_RANK (a) == 1)
2507	{
2508	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2509	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2510	  if (arg_extent != ret_extent)
2511	    runtime_error ("Array bound mismatch for dimension 1 of "
2512	    		   "array (%ld/%ld) ",
2513			   (long int) ret_extent, (long int) arg_extent);
2514	}
2515      else if (GFC_DESCRIPTOR_RANK (b) == 1)
2516	{
2517	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2518	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2519	  if (arg_extent != ret_extent)
2520	    runtime_error ("Array bound mismatch for dimension 1 of "
2521	    		   "array (%ld/%ld) ",
2522			   (long int) ret_extent, (long int) arg_extent);
2523	}
2524      else
2525	{
2526	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2527	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2528	  if (arg_extent != ret_extent)
2529	    runtime_error ("Array bound mismatch for dimension 1 of "
2530	    		   "array (%ld/%ld) ",
2531			   (long int) ret_extent, (long int) arg_extent);
2532
2533	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2534	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2535	  if (arg_extent != ret_extent)
2536	    runtime_error ("Array bound mismatch for dimension 2 of "
2537	    		   "array (%ld/%ld) ",
2538			   (long int) ret_extent, (long int) arg_extent);
2539	}
2540    }
2541
2542
2543  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2544    {
2545      /* One-dimensional result may be addressed in the code below
2546	 either as a row or a column matrix. We want both cases to
2547	 work. */
2548      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2549    }
2550  else
2551    {
2552      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2553      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2554    }
2555
2556
2557  if (GFC_DESCRIPTOR_RANK (a) == 1)
2558    {
2559      /* Treat it as a a row matrix A[1,count]. */
2560      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2561      aystride = 1;
2562
2563      xcount = 1;
2564      count = GFC_DESCRIPTOR_EXTENT(a,0);
2565    }
2566  else
2567    {
2568      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2569      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2570
2571      count = GFC_DESCRIPTOR_EXTENT(a,1);
2572      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2573    }
2574
2575  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2576    {
2577      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2578	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2579		       "in dimension 1: is %ld, should be %ld",
2580		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2581    }
2582
2583  if (GFC_DESCRIPTOR_RANK (b) == 1)
2584    {
2585      /* Treat it as a column matrix B[count,1] */
2586      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2587
2588      /* bystride should never be used for 1-dimensional b.
2589         The value is only used for calculation of the
2590         memory by the buffer.  */
2591      bystride = 256;
2592      ycount = 1;
2593    }
2594  else
2595    {
2596      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2597      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2598      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2599    }
2600
2601  abase = a->base_addr;
2602  bbase = b->base_addr;
2603  dest = retarray->base_addr;
2604
2605  /* Now that everything is set up, we perform the multiplication
2606     itself.  */
2607
2608#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2609#define min(a,b) ((a) <= (b) ? (a) : (b))
2610#define max(a,b) ((a) >= (b) ? (a) : (b))
2611
2612  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2613      && (bxstride == 1 || bystride == 1)
2614      && (((float) xcount) * ((float) ycount) * ((float) count)
2615          > POW3(blas_limit)))
2616    {
2617      const int m = xcount, n = ycount, k = count, ldc = rystride;
2618      const GFC_REAL_4 one = 1, zero = 0;
2619      const int lda = (axstride == 1) ? aystride : axstride,
2620		ldb = (bxstride == 1) ? bystride : bxstride;
2621
2622      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2623	{
2624	  assert (gemm != NULL);
2625	  const char *transa, *transb;
2626	  if (try_blas & 2)
2627	    transa = "C";
2628	  else
2629	    transa = axstride == 1 ? "N" : "T";
2630
2631	  if (try_blas & 4)
2632	    transb = "C";
2633	  else
2634	    transb = bxstride == 1 ? "N" : "T";
2635
2636	  gemm (transa, transb , &m,
2637		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
2638		&ldc, 1, 1);
2639	  return;
2640	}
2641    }
2642
2643  if (rxstride == 1 && axstride == 1 && bxstride == 1
2644      && GFC_DESCRIPTOR_RANK (b) != 1)
2645    {
2646      /* This block of code implements a tuned matmul, derived from
2647         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2648
2649               Bo Kagstrom and Per Ling
2650               Department of Computing Science
2651               Umea University
2652               S-901 87 Umea, Sweden
2653
2654	 from netlib.org, translated to C, and modified for matmul.m4.  */
2655
2656      const GFC_REAL_4 *a, *b;
2657      GFC_REAL_4 *c;
2658      const index_type m = xcount, n = ycount, k = count;
2659
2660      /* System generated locals */
2661      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2662		 i1, i2, i3, i4, i5, i6;
2663
2664      /* Local variables */
2665      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
2666		 f13, f14, f23, f24, f33, f34, f43, f44;
2667      index_type i, j, l, ii, jj, ll;
2668      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2669      GFC_REAL_4 *t1;
2670
2671      a = abase;
2672      b = bbase;
2673      c = retarray->base_addr;
2674
2675      /* Parameter adjustments */
2676      c_dim1 = rystride;
2677      c_offset = 1 + c_dim1;
2678      c -= c_offset;
2679      a_dim1 = aystride;
2680      a_offset = 1 + a_dim1;
2681      a -= a_offset;
2682      b_dim1 = bystride;
2683      b_offset = 1 + b_dim1;
2684      b -= b_offset;
2685
2686      /* Empty c first.  */
2687      for (j=1; j<=n; j++)
2688	for (i=1; i<=m; i++)
2689	  c[i + j * c_dim1] = (GFC_REAL_4)0;
2690
2691      /* Early exit if possible */
2692      if (m == 0 || n == 0 || k == 0)
2693	return;
2694
2695      /* Adjust size of t1 to what is needed.  */
2696      index_type t1_dim, a_sz;
2697      if (aystride == 1)
2698        a_sz = rystride;
2699      else
2700        a_sz = a_dim1;
2701
2702      t1_dim = a_sz * 256 + b_dim1;
2703      if (t1_dim > 65536)
2704	t1_dim = 65536;
2705
2706      t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
2707
2708      /* Start turning the crank. */
2709      i1 = n;
2710      for (jj = 1; jj <= i1; jj += 512)
2711	{
2712	  /* Computing MIN */
2713	  i2 = 512;
2714	  i3 = n - jj + 1;
2715	  jsec = min(i2,i3);
2716	  ujsec = jsec - jsec % 4;
2717	  i2 = k;
2718	  for (ll = 1; ll <= i2; ll += 256)
2719	    {
2720	      /* Computing MIN */
2721	      i3 = 256;
2722	      i4 = k - ll + 1;
2723	      lsec = min(i3,i4);
2724	      ulsec = lsec - lsec % 2;
2725
2726	      i3 = m;
2727	      for (ii = 1; ii <= i3; ii += 256)
2728		{
2729		  /* Computing MIN */
2730		  i4 = 256;
2731		  i5 = m - ii + 1;
2732		  isec = min(i4,i5);
2733		  uisec = isec - isec % 2;
2734		  i4 = ll + ulsec - 1;
2735		  for (l = ll; l <= i4; l += 2)
2736		    {
2737		      i5 = ii + uisec - 1;
2738		      for (i = ii; i <= i5; i += 2)
2739			{
2740			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2741					a[i + l * a_dim1];
2742			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2743					a[i + (l + 1) * a_dim1];
2744			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2745					a[i + 1 + l * a_dim1];
2746			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2747					a[i + 1 + (l + 1) * a_dim1];
2748			}
2749		      if (uisec < isec)
2750			{
2751			  t1[l - ll + 1 + (isec << 8) - 257] =
2752				    a[ii + isec - 1 + l * a_dim1];
2753			  t1[l - ll + 2 + (isec << 8) - 257] =
2754				    a[ii + isec - 1 + (l + 1) * a_dim1];
2755			}
2756		    }
2757		  if (ulsec < lsec)
2758		    {
2759		      i4 = ii + isec - 1;
2760		      for (i = ii; i<= i4; ++i)
2761			{
2762			  t1[lsec + ((i - ii + 1) << 8) - 257] =
2763				    a[i + (ll + lsec - 1) * a_dim1];
2764			}
2765		    }
2766
2767		  uisec = isec - isec % 4;
2768		  i4 = jj + ujsec - 1;
2769		  for (j = jj; j <= i4; j += 4)
2770		    {
2771		      i5 = ii + uisec - 1;
2772		      for (i = ii; i <= i5; i += 4)
2773			{
2774			  f11 = c[i + j * c_dim1];
2775			  f21 = c[i + 1 + j * c_dim1];
2776			  f12 = c[i + (j + 1) * c_dim1];
2777			  f22 = c[i + 1 + (j + 1) * c_dim1];
2778			  f13 = c[i + (j + 2) * c_dim1];
2779			  f23 = c[i + 1 + (j + 2) * c_dim1];
2780			  f14 = c[i + (j + 3) * c_dim1];
2781			  f24 = c[i + 1 + (j + 3) * c_dim1];
2782			  f31 = c[i + 2 + j * c_dim1];
2783			  f41 = c[i + 3 + j * c_dim1];
2784			  f32 = c[i + 2 + (j + 1) * c_dim1];
2785			  f42 = c[i + 3 + (j + 1) * c_dim1];
2786			  f33 = c[i + 2 + (j + 2) * c_dim1];
2787			  f43 = c[i + 3 + (j + 2) * c_dim1];
2788			  f34 = c[i + 2 + (j + 3) * c_dim1];
2789			  f44 = c[i + 3 + (j + 3) * c_dim1];
2790			  i6 = ll + lsec - 1;
2791			  for (l = ll; l <= i6; ++l)
2792			    {
2793			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2794				      * b[l + j * b_dim1];
2795			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2796				      * b[l + j * b_dim1];
2797			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2798				      * b[l + (j + 1) * b_dim1];
2799			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2800				      * b[l + (j + 1) * b_dim1];
2801			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2802				      * b[l + (j + 2) * b_dim1];
2803			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2804				      * b[l + (j + 2) * b_dim1];
2805			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2806				      * b[l + (j + 3) * b_dim1];
2807			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2808				      * b[l + (j + 3) * b_dim1];
2809			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2810				      * b[l + j * b_dim1];
2811			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2812				      * b[l + j * b_dim1];
2813			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2814				      * b[l + (j + 1) * b_dim1];
2815			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2816				      * b[l + (j + 1) * b_dim1];
2817			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2818				      * b[l + (j + 2) * b_dim1];
2819			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2820				      * b[l + (j + 2) * b_dim1];
2821			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2822				      * b[l + (j + 3) * b_dim1];
2823			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2824				      * b[l + (j + 3) * b_dim1];
2825			    }
2826			  c[i + j * c_dim1] = f11;
2827			  c[i + 1 + j * c_dim1] = f21;
2828			  c[i + (j + 1) * c_dim1] = f12;
2829			  c[i + 1 + (j + 1) * c_dim1] = f22;
2830			  c[i + (j + 2) * c_dim1] = f13;
2831			  c[i + 1 + (j + 2) * c_dim1] = f23;
2832			  c[i + (j + 3) * c_dim1] = f14;
2833			  c[i + 1 + (j + 3) * c_dim1] = f24;
2834			  c[i + 2 + j * c_dim1] = f31;
2835			  c[i + 3 + j * c_dim1] = f41;
2836			  c[i + 2 + (j + 1) * c_dim1] = f32;
2837			  c[i + 3 + (j + 1) * c_dim1] = f42;
2838			  c[i + 2 + (j + 2) * c_dim1] = f33;
2839			  c[i + 3 + (j + 2) * c_dim1] = f43;
2840			  c[i + 2 + (j + 3) * c_dim1] = f34;
2841			  c[i + 3 + (j + 3) * c_dim1] = f44;
2842			}
2843		      if (uisec < isec)
2844			{
2845			  i5 = ii + isec - 1;
2846			  for (i = ii + uisec; i <= i5; ++i)
2847			    {
2848			      f11 = c[i + j * c_dim1];
2849			      f12 = c[i + (j + 1) * c_dim1];
2850			      f13 = c[i + (j + 2) * c_dim1];
2851			      f14 = c[i + (j + 3) * c_dim1];
2852			      i6 = ll + lsec - 1;
2853			      for (l = ll; l <= i6; ++l)
2854				{
2855				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2856					  257] * b[l + j * b_dim1];
2857				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2858					  257] * b[l + (j + 1) * b_dim1];
2859				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2860					  257] * b[l + (j + 2) * b_dim1];
2861				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2862					  257] * b[l + (j + 3) * b_dim1];
2863				}
2864			      c[i + j * c_dim1] = f11;
2865			      c[i + (j + 1) * c_dim1] = f12;
2866			      c[i + (j + 2) * c_dim1] = f13;
2867			      c[i + (j + 3) * c_dim1] = f14;
2868			    }
2869			}
2870		    }
2871		  if (ujsec < jsec)
2872		    {
2873		      i4 = jj + jsec - 1;
2874		      for (j = jj + ujsec; j <= i4; ++j)
2875			{
2876			  i5 = ii + uisec - 1;
2877			  for (i = ii; i <= i5; i += 4)
2878			    {
2879			      f11 = c[i + j * c_dim1];
2880			      f21 = c[i + 1 + j * c_dim1];
2881			      f31 = c[i + 2 + j * c_dim1];
2882			      f41 = c[i + 3 + j * c_dim1];
2883			      i6 = ll + lsec - 1;
2884			      for (l = ll; l <= i6; ++l)
2885				{
2886				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2887					  257] * b[l + j * b_dim1];
2888				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2889					  257] * b[l + j * b_dim1];
2890				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2891					  257] * b[l + j * b_dim1];
2892				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2893					  257] * b[l + j * b_dim1];
2894				}
2895			      c[i + j * c_dim1] = f11;
2896			      c[i + 1 + j * c_dim1] = f21;
2897			      c[i + 2 + j * c_dim1] = f31;
2898			      c[i + 3 + j * c_dim1] = f41;
2899			    }
2900			  i5 = ii + isec - 1;
2901			  for (i = ii + uisec; i <= i5; ++i)
2902			    {
2903			      f11 = c[i + j * c_dim1];
2904			      i6 = ll + lsec - 1;
2905			      for (l = ll; l <= i6; ++l)
2906				{
2907				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2908					  257] * b[l + j * b_dim1];
2909				}
2910			      c[i + j * c_dim1] = f11;
2911			    }
2912			}
2913		    }
2914		}
2915	    }
2916	}
2917      free(t1);
2918      return;
2919    }
2920  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2921    {
2922      if (GFC_DESCRIPTOR_RANK (a) != 1)
2923	{
2924	  const GFC_REAL_4 *restrict abase_x;
2925	  const GFC_REAL_4 *restrict bbase_y;
2926	  GFC_REAL_4 *restrict dest_y;
2927	  GFC_REAL_4 s;
2928
2929	  for (y = 0; y < ycount; y++)
2930	    {
2931	      bbase_y = &bbase[y*bystride];
2932	      dest_y = &dest[y*rystride];
2933	      for (x = 0; x < xcount; x++)
2934		{
2935		  abase_x = &abase[x*axstride];
2936		  s = (GFC_REAL_4) 0;
2937		  for (n = 0; n < count; n++)
2938		    s += abase_x[n] * bbase_y[n];
2939		  dest_y[x] = s;
2940		}
2941	    }
2942	}
2943      else
2944	{
2945	  const GFC_REAL_4 *restrict bbase_y;
2946	  GFC_REAL_4 s;
2947
2948	  for (y = 0; y < ycount; y++)
2949	    {
2950	      bbase_y = &bbase[y*bystride];
2951	      s = (GFC_REAL_4) 0;
2952	      for (n = 0; n < count; n++)
2953		s += abase[n*axstride] * bbase_y[n];
2954	      dest[y*rystride] = s;
2955	    }
2956	}
2957    }
2958  else if (GFC_DESCRIPTOR_RANK (a) == 1)
2959    {
2960      const GFC_REAL_4 *restrict bbase_y;
2961      GFC_REAL_4 s;
2962
2963      for (y = 0; y < ycount; y++)
2964	{
2965	  bbase_y = &bbase[y*bystride];
2966	  s = (GFC_REAL_4) 0;
2967	  for (n = 0; n < count; n++)
2968	    s += abase[n*axstride] * bbase_y[n*bxstride];
2969	  dest[y*rxstride] = s;
2970	}
2971    }
2972  else if (axstride < aystride)
2973    {
2974      for (y = 0; y < ycount; y++)
2975	for (x = 0; x < xcount; x++)
2976	  dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2977
2978      for (y = 0; y < ycount; y++)
2979	for (n = 0; n < count; n++)
2980	  for (x = 0; x < xcount; x++)
2981	    /* dest[x,y] += a[x,n] * b[n,y] */
2982	    dest[x*rxstride + y*rystride] +=
2983					abase[x*axstride + n*aystride] *
2984					bbase[n*bxstride + y*bystride];
2985    }
2986  else
2987    {
2988      const GFC_REAL_4 *restrict abase_x;
2989      const GFC_REAL_4 *restrict bbase_y;
2990      GFC_REAL_4 *restrict dest_y;
2991      GFC_REAL_4 s;
2992
2993      for (y = 0; y < ycount; y++)
2994	{
2995	  bbase_y = &bbase[y*bystride];
2996	  dest_y = &dest[y*rystride];
2997	  for (x = 0; x < xcount; x++)
2998	    {
2999	      abase_x = &abase[x*axstride];
3000	      s = (GFC_REAL_4) 0;
3001	      for (n = 0; n < count; n++)
3002		s += abase_x[n*aystride] * bbase_y[n*bxstride];
3003	      dest_y[x*rxstride] = s;
3004	    }
3005	}
3006    }
3007}
3008#undef POW3
3009#undef min
3010#undef max
3011
3012#endif
3013#endif
3014
3015