1/* Implementation of the MATMUL intrinsic
2   Copyright (C) 2002-2020 Free Software Foundation, Inc.
3   Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4
5This file is part of the GNU Fortran runtime library (libgfortran).
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
10version 3 of the License, or (at your option) any later version.
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
26#include "libgfortran.h"
27#include <string.h>
28#include <assert.h>
29
30
31/* These are the specific versions of matmul with -mprefer-avx128.  */
32
33#if defined (HAVE_GFC_INTEGER_1)
34
35/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36   passed to us by the front-end, in which case we call it for large
37   matrices.  */
38
39typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40                          const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *,
41                          const int *, const GFC_INTEGER_1 *, const int *,
42                          const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *,
43                          int, int);
44
45#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46void
47matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
48	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
49	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50internal_proto(matmul_i1_avx128_fma3);
51void
52matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
53	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
54	int blas_limit, blas_call gemm)
55{
56  const GFC_INTEGER_1 * restrict abase;
57  const GFC_INTEGER_1 * restrict bbase;
58  GFC_INTEGER_1 * restrict dest;
59
60  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61  index_type x, y, n, count, xcount, ycount;
62
63  assert (GFC_DESCRIPTOR_RANK (a) == 2
64          || GFC_DESCRIPTOR_RANK (b) == 2);
65
66/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67
68   Either A or B (but not both) can be rank 1:
69
70   o One-dimensional argument A is implicitly treated as a row matrix
71     dimensioned [1,count], so xcount=1.
72
73   o One-dimensional argument B is implicitly treated as a column matrix
74     dimensioned [count, 1], so ycount=1.
75*/
76
77  if (retarray->base_addr == NULL)
78    {
79      if (GFC_DESCRIPTOR_RANK (a) == 1)
80        {
81	  GFC_DIMENSION_SET(retarray->dim[0], 0,
82	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83        }
84      else if (GFC_DESCRIPTOR_RANK (b) == 1)
85        {
86	  GFC_DIMENSION_SET(retarray->dim[0], 0,
87	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88        }
89      else
90        {
91	  GFC_DIMENSION_SET(retarray->dim[0], 0,
92	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93
94          GFC_DIMENSION_SET(retarray->dim[1], 0,
95	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96			    GFC_DESCRIPTOR_EXTENT(retarray,0));
97        }
98
99      retarray->base_addr
100	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
101      retarray->offset = 0;
102    }
103  else if (unlikely (compile_options.bounds_check))
104    {
105      index_type ret_extent, arg_extent;
106
107      if (GFC_DESCRIPTOR_RANK (a) == 1)
108	{
109	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111	  if (arg_extent != ret_extent)
112	    runtime_error ("Array bound mismatch for dimension 1 of "
113	    		   "array (%ld/%ld) ",
114			   (long int) ret_extent, (long int) arg_extent);
115	}
116      else if (GFC_DESCRIPTOR_RANK (b) == 1)
117	{
118	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120	  if (arg_extent != ret_extent)
121	    runtime_error ("Array bound mismatch for dimension 1 of "
122	    		   "array (%ld/%ld) ",
123			   (long int) ret_extent, (long int) arg_extent);
124	}
125      else
126	{
127	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129	  if (arg_extent != ret_extent)
130	    runtime_error ("Array bound mismatch for dimension 1 of "
131	    		   "array (%ld/%ld) ",
132			   (long int) ret_extent, (long int) arg_extent);
133
134	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136	  if (arg_extent != ret_extent)
137	    runtime_error ("Array bound mismatch for dimension 2 of "
138	    		   "array (%ld/%ld) ",
139			   (long int) ret_extent, (long int) arg_extent);
140	}
141    }
142
143
144  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145    {
146      /* One-dimensional result may be addressed in the code below
147	 either as a row or a column matrix. We want both cases to
148	 work. */
149      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150    }
151  else
152    {
153      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155    }
156
157
158  if (GFC_DESCRIPTOR_RANK (a) == 1)
159    {
160      /* Treat it as a a row matrix A[1,count]. */
161      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162      aystride = 1;
163
164      xcount = 1;
165      count = GFC_DESCRIPTOR_EXTENT(a,0);
166    }
167  else
168    {
169      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171
172      count = GFC_DESCRIPTOR_EXTENT(a,1);
173      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174    }
175
176  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177    {
178      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180		       "in dimension 1: is %ld, should be %ld",
181		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182    }
183
184  if (GFC_DESCRIPTOR_RANK (b) == 1)
185    {
186      /* Treat it as a column matrix B[count,1] */
187      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188
189      /* bystride should never be used for 1-dimensional b.
190         The value is only used for calculation of the
191         memory by the buffer.  */
192      bystride = 256;
193      ycount = 1;
194    }
195  else
196    {
197      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200    }
201
202  abase = a->base_addr;
203  bbase = b->base_addr;
204  dest = retarray->base_addr;
205
206  /* Now that everything is set up, we perform the multiplication
207     itself.  */
208
209#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210#define min(a,b) ((a) <= (b) ? (a) : (b))
211#define max(a,b) ((a) >= (b) ? (a) : (b))
212
213  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214      && (bxstride == 1 || bystride == 1)
215      && (((float) xcount) * ((float) ycount) * ((float) count)
216          > POW3(blas_limit)))
217    {
218      const int m = xcount, n = ycount, k = count, ldc = rystride;
219      const GFC_INTEGER_1 one = 1, zero = 0;
220      const int lda = (axstride == 1) ? aystride : axstride,
221		ldb = (bxstride == 1) ? bystride : bxstride;
222
223      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224	{
225	  assert (gemm != NULL);
226	  const char *transa, *transb;
227	  if (try_blas & 2)
228	    transa = "C";
229	  else
230	    transa = axstride == 1 ? "N" : "T";
231
232	  if (try_blas & 4)
233	    transb = "C";
234	  else
235	    transb = bxstride == 1 ? "N" : "T";
236
237	  gemm (transa, transb , &m,
238		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
239		&ldc, 1, 1);
240	  return;
241	}
242    }
243
244  if (rxstride == 1 && axstride == 1 && bxstride == 1
245      && GFC_DESCRIPTOR_RANK (b) != 1)
246    {
247      /* This block of code implements a tuned matmul, derived from
248         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
249
250               Bo Kagstrom and Per Ling
251               Department of Computing Science
252               Umea University
253               S-901 87 Umea, Sweden
254
255	 from netlib.org, translated to C, and modified for matmul.m4.  */
256
257      const GFC_INTEGER_1 *a, *b;
258      GFC_INTEGER_1 *c;
259      const index_type m = xcount, n = ycount, k = count;
260
261      /* System generated locals */
262      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
263		 i1, i2, i3, i4, i5, i6;
264
265      /* Local variables */
266      GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
267		 f13, f14, f23, f24, f33, f34, f43, f44;
268      index_type i, j, l, ii, jj, ll;
269      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
270      GFC_INTEGER_1 *t1;
271
272      a = abase;
273      b = bbase;
274      c = retarray->base_addr;
275
276      /* Parameter adjustments */
277      c_dim1 = rystride;
278      c_offset = 1 + c_dim1;
279      c -= c_offset;
280      a_dim1 = aystride;
281      a_offset = 1 + a_dim1;
282      a -= a_offset;
283      b_dim1 = bystride;
284      b_offset = 1 + b_dim1;
285      b -= b_offset;
286
287      /* Empty c first.  */
288      for (j=1; j<=n; j++)
289	for (i=1; i<=m; i++)
290	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
291
292      /* Early exit if possible */
293      if (m == 0 || n == 0 || k == 0)
294	return;
295
296      /* Adjust size of t1 to what is needed.  */
297      index_type t1_dim, a_sz;
298      if (aystride == 1)
299        a_sz = rystride;
300      else
301        a_sz = a_dim1;
302
303      t1_dim = a_sz * 256 + b_dim1;
304      if (t1_dim > 65536)
305	t1_dim = 65536;
306
307      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
308
309      /* Start turning the crank. */
310      i1 = n;
311      for (jj = 1; jj <= i1; jj += 512)
312	{
313	  /* Computing MIN */
314	  i2 = 512;
315	  i3 = n - jj + 1;
316	  jsec = min(i2,i3);
317	  ujsec = jsec - jsec % 4;
318	  i2 = k;
319	  for (ll = 1; ll <= i2; ll += 256)
320	    {
321	      /* Computing MIN */
322	      i3 = 256;
323	      i4 = k - ll + 1;
324	      lsec = min(i3,i4);
325	      ulsec = lsec - lsec % 2;
326
327	      i3 = m;
328	      for (ii = 1; ii <= i3; ii += 256)
329		{
330		  /* Computing MIN */
331		  i4 = 256;
332		  i5 = m - ii + 1;
333		  isec = min(i4,i5);
334		  uisec = isec - isec % 2;
335		  i4 = ll + ulsec - 1;
336		  for (l = ll; l <= i4; l += 2)
337		    {
338		      i5 = ii + uisec - 1;
339		      for (i = ii; i <= i5; i += 2)
340			{
341			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
342					a[i + l * a_dim1];
343			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
344					a[i + (l + 1) * a_dim1];
345			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
346					a[i + 1 + l * a_dim1];
347			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
348					a[i + 1 + (l + 1) * a_dim1];
349			}
350		      if (uisec < isec)
351			{
352			  t1[l - ll + 1 + (isec << 8) - 257] =
353				    a[ii + isec - 1 + l * a_dim1];
354			  t1[l - ll + 2 + (isec << 8) - 257] =
355				    a[ii + isec - 1 + (l + 1) * a_dim1];
356			}
357		    }
358		  if (ulsec < lsec)
359		    {
360		      i4 = ii + isec - 1;
361		      for (i = ii; i<= i4; ++i)
362			{
363			  t1[lsec + ((i - ii + 1) << 8) - 257] =
364				    a[i + (ll + lsec - 1) * a_dim1];
365			}
366		    }
367
368		  uisec = isec - isec % 4;
369		  i4 = jj + ujsec - 1;
370		  for (j = jj; j <= i4; j += 4)
371		    {
372		      i5 = ii + uisec - 1;
373		      for (i = ii; i <= i5; i += 4)
374			{
375			  f11 = c[i + j * c_dim1];
376			  f21 = c[i + 1 + j * c_dim1];
377			  f12 = c[i + (j + 1) * c_dim1];
378			  f22 = c[i + 1 + (j + 1) * c_dim1];
379			  f13 = c[i + (j + 2) * c_dim1];
380			  f23 = c[i + 1 + (j + 2) * c_dim1];
381			  f14 = c[i + (j + 3) * c_dim1];
382			  f24 = c[i + 1 + (j + 3) * c_dim1];
383			  f31 = c[i + 2 + j * c_dim1];
384			  f41 = c[i + 3 + j * c_dim1];
385			  f32 = c[i + 2 + (j + 1) * c_dim1];
386			  f42 = c[i + 3 + (j + 1) * c_dim1];
387			  f33 = c[i + 2 + (j + 2) * c_dim1];
388			  f43 = c[i + 3 + (j + 2) * c_dim1];
389			  f34 = c[i + 2 + (j + 3) * c_dim1];
390			  f44 = c[i + 3 + (j + 3) * c_dim1];
391			  i6 = ll + lsec - 1;
392			  for (l = ll; l <= i6; ++l)
393			    {
394			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395				      * b[l + j * b_dim1];
396			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397				      * b[l + j * b_dim1];
398			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
399				      * b[l + (j + 1) * b_dim1];
400			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
401				      * b[l + (j + 1) * b_dim1];
402			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
403				      * b[l + (j + 2) * b_dim1];
404			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
405				      * b[l + (j + 2) * b_dim1];
406			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
407				      * b[l + (j + 3) * b_dim1];
408			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
409				      * b[l + (j + 3) * b_dim1];
410			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411				      * b[l + j * b_dim1];
412			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413				      * b[l + j * b_dim1];
414			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
415				      * b[l + (j + 1) * b_dim1];
416			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
417				      * b[l + (j + 1) * b_dim1];
418			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
419				      * b[l + (j + 2) * b_dim1];
420			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
421				      * b[l + (j + 2) * b_dim1];
422			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
423				      * b[l + (j + 3) * b_dim1];
424			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
425				      * b[l + (j + 3) * b_dim1];
426			    }
427			  c[i + j * c_dim1] = f11;
428			  c[i + 1 + j * c_dim1] = f21;
429			  c[i + (j + 1) * c_dim1] = f12;
430			  c[i + 1 + (j + 1) * c_dim1] = f22;
431			  c[i + (j + 2) * c_dim1] = f13;
432			  c[i + 1 + (j + 2) * c_dim1] = f23;
433			  c[i + (j + 3) * c_dim1] = f14;
434			  c[i + 1 + (j + 3) * c_dim1] = f24;
435			  c[i + 2 + j * c_dim1] = f31;
436			  c[i + 3 + j * c_dim1] = f41;
437			  c[i + 2 + (j + 1) * c_dim1] = f32;
438			  c[i + 3 + (j + 1) * c_dim1] = f42;
439			  c[i + 2 + (j + 2) * c_dim1] = f33;
440			  c[i + 3 + (j + 2) * c_dim1] = f43;
441			  c[i + 2 + (j + 3) * c_dim1] = f34;
442			  c[i + 3 + (j + 3) * c_dim1] = f44;
443			}
444		      if (uisec < isec)
445			{
446			  i5 = ii + isec - 1;
447			  for (i = ii + uisec; i <= i5; ++i)
448			    {
449			      f11 = c[i + j * c_dim1];
450			      f12 = c[i + (j + 1) * c_dim1];
451			      f13 = c[i + (j + 2) * c_dim1];
452			      f14 = c[i + (j + 3) * c_dim1];
453			      i6 = ll + lsec - 1;
454			      for (l = ll; l <= i6; ++l)
455				{
456				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
457					  257] * b[l + j * b_dim1];
458				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459					  257] * b[l + (j + 1) * b_dim1];
460				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461					  257] * b[l + (j + 2) * b_dim1];
462				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463					  257] * b[l + (j + 3) * b_dim1];
464				}
465			      c[i + j * c_dim1] = f11;
466			      c[i + (j + 1) * c_dim1] = f12;
467			      c[i + (j + 2) * c_dim1] = f13;
468			      c[i + (j + 3) * c_dim1] = f14;
469			    }
470			}
471		    }
472		  if (ujsec < jsec)
473		    {
474		      i4 = jj + jsec - 1;
475		      for (j = jj + ujsec; j <= i4; ++j)
476			{
477			  i5 = ii + uisec - 1;
478			  for (i = ii; i <= i5; i += 4)
479			    {
480			      f11 = c[i + j * c_dim1];
481			      f21 = c[i + 1 + j * c_dim1];
482			      f31 = c[i + 2 + j * c_dim1];
483			      f41 = c[i + 3 + j * c_dim1];
484			      i6 = ll + lsec - 1;
485			      for (l = ll; l <= i6; ++l)
486				{
487				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
488					  257] * b[l + j * b_dim1];
489				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
490					  257] * b[l + j * b_dim1];
491				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
492					  257] * b[l + j * b_dim1];
493				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
494					  257] * b[l + j * b_dim1];
495				}
496			      c[i + j * c_dim1] = f11;
497			      c[i + 1 + j * c_dim1] = f21;
498			      c[i + 2 + j * c_dim1] = f31;
499			      c[i + 3 + j * c_dim1] = f41;
500			    }
501			  i5 = ii + isec - 1;
502			  for (i = ii + uisec; i <= i5; ++i)
503			    {
504			      f11 = c[i + j * c_dim1];
505			      i6 = ll + lsec - 1;
506			      for (l = ll; l <= i6; ++l)
507				{
508				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
509					  257] * b[l + j * b_dim1];
510				}
511			      c[i + j * c_dim1] = f11;
512			    }
513			}
514		    }
515		}
516	    }
517	}
518      free(t1);
519      return;
520    }
521  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
522    {
523      if (GFC_DESCRIPTOR_RANK (a) != 1)
524	{
525	  const GFC_INTEGER_1 *restrict abase_x;
526	  const GFC_INTEGER_1 *restrict bbase_y;
527	  GFC_INTEGER_1 *restrict dest_y;
528	  GFC_INTEGER_1 s;
529
530	  for (y = 0; y < ycount; y++)
531	    {
532	      bbase_y = &bbase[y*bystride];
533	      dest_y = &dest[y*rystride];
534	      for (x = 0; x < xcount; x++)
535		{
536		  abase_x = &abase[x*axstride];
537		  s = (GFC_INTEGER_1) 0;
538		  for (n = 0; n < count; n++)
539		    s += abase_x[n] * bbase_y[n];
540		  dest_y[x] = s;
541		}
542	    }
543	}
544      else
545	{
546	  const GFC_INTEGER_1 *restrict bbase_y;
547	  GFC_INTEGER_1 s;
548
549	  for (y = 0; y < ycount; y++)
550	    {
551	      bbase_y = &bbase[y*bystride];
552	      s = (GFC_INTEGER_1) 0;
553	      for (n = 0; n < count; n++)
554		s += abase[n*axstride] * bbase_y[n];
555	      dest[y*rystride] = s;
556	    }
557	}
558    }
559  else if (GFC_DESCRIPTOR_RANK (a) == 1)
560    {
561      const GFC_INTEGER_1 *restrict bbase_y;
562      GFC_INTEGER_1 s;
563
564      for (y = 0; y < ycount; y++)
565	{
566	  bbase_y = &bbase[y*bystride];
567	  s = (GFC_INTEGER_1) 0;
568	  for (n = 0; n < count; n++)
569	    s += abase[n*axstride] * bbase_y[n*bxstride];
570	  dest[y*rxstride] = s;
571	}
572    }
573  else if (axstride < aystride)
574    {
575      for (y = 0; y < ycount; y++)
576	for (x = 0; x < xcount; x++)
577	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
578
579      for (y = 0; y < ycount; y++)
580	for (n = 0; n < count; n++)
581	  for (x = 0; x < xcount; x++)
582	    /* dest[x,y] += a[x,n] * b[n,y] */
583	    dest[x*rxstride + y*rystride] +=
584					abase[x*axstride + n*aystride] *
585					bbase[n*bxstride + y*bystride];
586    }
587  else
588    {
589      const GFC_INTEGER_1 *restrict abase_x;
590      const GFC_INTEGER_1 *restrict bbase_y;
591      GFC_INTEGER_1 *restrict dest_y;
592      GFC_INTEGER_1 s;
593
594      for (y = 0; y < ycount; y++)
595	{
596	  bbase_y = &bbase[y*bystride];
597	  dest_y = &dest[y*rystride];
598	  for (x = 0; x < xcount; x++)
599	    {
600	      abase_x = &abase[x*axstride];
601	      s = (GFC_INTEGER_1) 0;
602	      for (n = 0; n < count; n++)
603		s += abase_x[n*aystride] * bbase_y[n*bxstride];
604	      dest_y[x*rxstride] = s;
605	    }
606	}
607    }
608}
609#undef POW3
610#undef min
611#undef max
612
613#endif
614
615#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
616void
617matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
618	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
619	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
620internal_proto(matmul_i1_avx128_fma4);
621void
622matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
623	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
624	int blas_limit, blas_call gemm)
625{
626  const GFC_INTEGER_1 * restrict abase;
627  const GFC_INTEGER_1 * restrict bbase;
628  GFC_INTEGER_1 * restrict dest;
629
630  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
631  index_type x, y, n, count, xcount, ycount;
632
633  assert (GFC_DESCRIPTOR_RANK (a) == 2
634          || GFC_DESCRIPTOR_RANK (b) == 2);
635
636/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
637
638   Either A or B (but not both) can be rank 1:
639
640   o One-dimensional argument A is implicitly treated as a row matrix
641     dimensioned [1,count], so xcount=1.
642
643   o One-dimensional argument B is implicitly treated as a column matrix
644     dimensioned [count, 1], so ycount=1.
645*/
646
647  if (retarray->base_addr == NULL)
648    {
649      if (GFC_DESCRIPTOR_RANK (a) == 1)
650        {
651	  GFC_DIMENSION_SET(retarray->dim[0], 0,
652	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
653        }
654      else if (GFC_DESCRIPTOR_RANK (b) == 1)
655        {
656	  GFC_DIMENSION_SET(retarray->dim[0], 0,
657	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
658        }
659      else
660        {
661	  GFC_DIMENSION_SET(retarray->dim[0], 0,
662	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
663
664          GFC_DIMENSION_SET(retarray->dim[1], 0,
665	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
666			    GFC_DESCRIPTOR_EXTENT(retarray,0));
667        }
668
669      retarray->base_addr
670	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
671      retarray->offset = 0;
672    }
673  else if (unlikely (compile_options.bounds_check))
674    {
675      index_type ret_extent, arg_extent;
676
677      if (GFC_DESCRIPTOR_RANK (a) == 1)
678	{
679	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
680	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
681	  if (arg_extent != ret_extent)
682	    runtime_error ("Array bound mismatch for dimension 1 of "
683	    		   "array (%ld/%ld) ",
684			   (long int) ret_extent, (long int) arg_extent);
685	}
686      else if (GFC_DESCRIPTOR_RANK (b) == 1)
687	{
688	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
689	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
690	  if (arg_extent != ret_extent)
691	    runtime_error ("Array bound mismatch for dimension 1 of "
692	    		   "array (%ld/%ld) ",
693			   (long int) ret_extent, (long int) arg_extent);
694	}
695      else
696	{
697	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
698	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
699	  if (arg_extent != ret_extent)
700	    runtime_error ("Array bound mismatch for dimension 1 of "
701	    		   "array (%ld/%ld) ",
702			   (long int) ret_extent, (long int) arg_extent);
703
704	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
705	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
706	  if (arg_extent != ret_extent)
707	    runtime_error ("Array bound mismatch for dimension 2 of "
708	    		   "array (%ld/%ld) ",
709			   (long int) ret_extent, (long int) arg_extent);
710	}
711    }
712
713
714  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
715    {
716      /* One-dimensional result may be addressed in the code below
717	 either as a row or a column matrix. We want both cases to
718	 work. */
719      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
720    }
721  else
722    {
723      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
724      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
725    }
726
727
728  if (GFC_DESCRIPTOR_RANK (a) == 1)
729    {
730      /* Treat it as a a row matrix A[1,count]. */
731      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
732      aystride = 1;
733
734      xcount = 1;
735      count = GFC_DESCRIPTOR_EXTENT(a,0);
736    }
737  else
738    {
739      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
740      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
741
742      count = GFC_DESCRIPTOR_EXTENT(a,1);
743      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
744    }
745
746  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
747    {
748      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
749	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
750		       "in dimension 1: is %ld, should be %ld",
751		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
752    }
753
754  if (GFC_DESCRIPTOR_RANK (b) == 1)
755    {
756      /* Treat it as a column matrix B[count,1] */
757      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
758
759      /* bystride should never be used for 1-dimensional b.
760         The value is only used for calculation of the
761         memory by the buffer.  */
762      bystride = 256;
763      ycount = 1;
764    }
765  else
766    {
767      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
768      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
769      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
770    }
771
772  abase = a->base_addr;
773  bbase = b->base_addr;
774  dest = retarray->base_addr;
775
776  /* Now that everything is set up, we perform the multiplication
777     itself.  */
778
779#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
780#define min(a,b) ((a) <= (b) ? (a) : (b))
781#define max(a,b) ((a) >= (b) ? (a) : (b))
782
783  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
784      && (bxstride == 1 || bystride == 1)
785      && (((float) xcount) * ((float) ycount) * ((float) count)
786          > POW3(blas_limit)))
787    {
788      const int m = xcount, n = ycount, k = count, ldc = rystride;
789      const GFC_INTEGER_1 one = 1, zero = 0;
790      const int lda = (axstride == 1) ? aystride : axstride,
791		ldb = (bxstride == 1) ? bystride : bxstride;
792
793      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
794	{
795	  assert (gemm != NULL);
796	  const char *transa, *transb;
797	  if (try_blas & 2)
798	    transa = "C";
799	  else
800	    transa = axstride == 1 ? "N" : "T";
801
802	  if (try_blas & 4)
803	    transb = "C";
804	  else
805	    transb = bxstride == 1 ? "N" : "T";
806
807	  gemm (transa, transb , &m,
808		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
809		&ldc, 1, 1);
810	  return;
811	}
812    }
813
814  if (rxstride == 1 && axstride == 1 && bxstride == 1
815      && GFC_DESCRIPTOR_RANK (b) != 1)
816    {
817      /* This block of code implements a tuned matmul, derived from
818         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
819
820               Bo Kagstrom and Per Ling
821               Department of Computing Science
822               Umea University
823               S-901 87 Umea, Sweden
824
825	 from netlib.org, translated to C, and modified for matmul.m4.  */
826
827      const GFC_INTEGER_1 *a, *b;
828      GFC_INTEGER_1 *c;
829      const index_type m = xcount, n = ycount, k = count;
830
831      /* System generated locals */
832      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
833		 i1, i2, i3, i4, i5, i6;
834
835      /* Local variables */
836      GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
837		 f13, f14, f23, f24, f33, f34, f43, f44;
838      index_type i, j, l, ii, jj, ll;
839      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840      GFC_INTEGER_1 *t1;
841
842      a = abase;
843      b = bbase;
844      c = retarray->base_addr;
845
846      /* Parameter adjustments */
847      c_dim1 = rystride;
848      c_offset = 1 + c_dim1;
849      c -= c_offset;
850      a_dim1 = aystride;
851      a_offset = 1 + a_dim1;
852      a -= a_offset;
853      b_dim1 = bystride;
854      b_offset = 1 + b_dim1;
855      b -= b_offset;
856
857      /* Empty c first.  */
858      for (j=1; j<=n; j++)
859	for (i=1; i<=m; i++)
860	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
861
862      /* Early exit if possible */
863      if (m == 0 || n == 0 || k == 0)
864	return;
865
866      /* Adjust size of t1 to what is needed.  */
867      index_type t1_dim, a_sz;
868      if (aystride == 1)
869        a_sz = rystride;
870      else
871        a_sz = a_dim1;
872
873      t1_dim = a_sz * 256 + b_dim1;
874      if (t1_dim > 65536)
875	t1_dim = 65536;
876
877      t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
878
879      /* Start turning the crank. */
880      i1 = n;
881      for (jj = 1; jj <= i1; jj += 512)
882	{
883	  /* Computing MIN */
884	  i2 = 512;
885	  i3 = n - jj + 1;
886	  jsec = min(i2,i3);
887	  ujsec = jsec - jsec % 4;
888	  i2 = k;
889	  for (ll = 1; ll <= i2; ll += 256)
890	    {
891	      /* Computing MIN */
892	      i3 = 256;
893	      i4 = k - ll + 1;
894	      lsec = min(i3,i4);
895	      ulsec = lsec - lsec % 2;
896
897	      i3 = m;
898	      for (ii = 1; ii <= i3; ii += 256)
899		{
900		  /* Computing MIN */
901		  i4 = 256;
902		  i5 = m - ii + 1;
903		  isec = min(i4,i5);
904		  uisec = isec - isec % 2;
905		  i4 = ll + ulsec - 1;
906		  for (l = ll; l <= i4; l += 2)
907		    {
908		      i5 = ii + uisec - 1;
909		      for (i = ii; i <= i5; i += 2)
910			{
911			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912					a[i + l * a_dim1];
913			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914					a[i + (l + 1) * a_dim1];
915			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916					a[i + 1 + l * a_dim1];
917			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918					a[i + 1 + (l + 1) * a_dim1];
919			}
920		      if (uisec < isec)
921			{
922			  t1[l - ll + 1 + (isec << 8) - 257] =
923				    a[ii + isec - 1 + l * a_dim1];
924			  t1[l - ll + 2 + (isec << 8) - 257] =
925				    a[ii + isec - 1 + (l + 1) * a_dim1];
926			}
927		    }
928		  if (ulsec < lsec)
929		    {
930		      i4 = ii + isec - 1;
931		      for (i = ii; i<= i4; ++i)
932			{
933			  t1[lsec + ((i - ii + 1) << 8) - 257] =
934				    a[i + (ll + lsec - 1) * a_dim1];
935			}
936		    }
937
938		  uisec = isec - isec % 4;
939		  i4 = jj + ujsec - 1;
940		  for (j = jj; j <= i4; j += 4)
941		    {
942		      i5 = ii + uisec - 1;
943		      for (i = ii; i <= i5; i += 4)
944			{
945			  f11 = c[i + j * c_dim1];
946			  f21 = c[i + 1 + j * c_dim1];
947			  f12 = c[i + (j + 1) * c_dim1];
948			  f22 = c[i + 1 + (j + 1) * c_dim1];
949			  f13 = c[i + (j + 2) * c_dim1];
950			  f23 = c[i + 1 + (j + 2) * c_dim1];
951			  f14 = c[i + (j + 3) * c_dim1];
952			  f24 = c[i + 1 + (j + 3) * c_dim1];
953			  f31 = c[i + 2 + j * c_dim1];
954			  f41 = c[i + 3 + j * c_dim1];
955			  f32 = c[i + 2 + (j + 1) * c_dim1];
956			  f42 = c[i + 3 + (j + 1) * c_dim1];
957			  f33 = c[i + 2 + (j + 2) * c_dim1];
958			  f43 = c[i + 3 + (j + 2) * c_dim1];
959			  f34 = c[i + 2 + (j + 3) * c_dim1];
960			  f44 = c[i + 3 + (j + 3) * c_dim1];
961			  i6 = ll + lsec - 1;
962			  for (l = ll; l <= i6; ++l)
963			    {
964			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965				      * b[l + j * b_dim1];
966			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967				      * b[l + j * b_dim1];
968			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969				      * b[l + (j + 1) * b_dim1];
970			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971				      * b[l + (j + 1) * b_dim1];
972			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973				      * b[l + (j + 2) * b_dim1];
974			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975				      * b[l + (j + 2) * b_dim1];
976			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977				      * b[l + (j + 3) * b_dim1];
978			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979				      * b[l + (j + 3) * b_dim1];
980			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981				      * b[l + j * b_dim1];
982			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983				      * b[l + j * b_dim1];
984			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985				      * b[l + (j + 1) * b_dim1];
986			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987				      * b[l + (j + 1) * b_dim1];
988			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989				      * b[l + (j + 2) * b_dim1];
990			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991				      * b[l + (j + 2) * b_dim1];
992			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993				      * b[l + (j + 3) * b_dim1];
994			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995				      * b[l + (j + 3) * b_dim1];
996			    }
997			  c[i + j * c_dim1] = f11;
998			  c[i + 1 + j * c_dim1] = f21;
999			  c[i + (j + 1) * c_dim1] = f12;
1000			  c[i + 1 + (j + 1) * c_dim1] = f22;
1001			  c[i + (j + 2) * c_dim1] = f13;
1002			  c[i + 1 + (j + 2) * c_dim1] = f23;
1003			  c[i + (j + 3) * c_dim1] = f14;
1004			  c[i + 1 + (j + 3) * c_dim1] = f24;
1005			  c[i + 2 + j * c_dim1] = f31;
1006			  c[i + 3 + j * c_dim1] = f41;
1007			  c[i + 2 + (j + 1) * c_dim1] = f32;
1008			  c[i + 3 + (j + 1) * c_dim1] = f42;
1009			  c[i + 2 + (j + 2) * c_dim1] = f33;
1010			  c[i + 3 + (j + 2) * c_dim1] = f43;
1011			  c[i + 2 + (j + 3) * c_dim1] = f34;
1012			  c[i + 3 + (j + 3) * c_dim1] = f44;
1013			}
1014		      if (uisec < isec)
1015			{
1016			  i5 = ii + isec - 1;
1017			  for (i = ii + uisec; i <= i5; ++i)
1018			    {
1019			      f11 = c[i + j * c_dim1];
1020			      f12 = c[i + (j + 1) * c_dim1];
1021			      f13 = c[i + (j + 2) * c_dim1];
1022			      f14 = c[i + (j + 3) * c_dim1];
1023			      i6 = ll + lsec - 1;
1024			      for (l = ll; l <= i6; ++l)
1025				{
1026				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027					  257] * b[l + j * b_dim1];
1028				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029					  257] * b[l + (j + 1) * b_dim1];
1030				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031					  257] * b[l + (j + 2) * b_dim1];
1032				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033					  257] * b[l + (j + 3) * b_dim1];
1034				}
1035			      c[i + j * c_dim1] = f11;
1036			      c[i + (j + 1) * c_dim1] = f12;
1037			      c[i + (j + 2) * c_dim1] = f13;
1038			      c[i + (j + 3) * c_dim1] = f14;
1039			    }
1040			}
1041		    }
1042		  if (ujsec < jsec)
1043		    {
1044		      i4 = jj + jsec - 1;
1045		      for (j = jj + ujsec; j <= i4; ++j)
1046			{
1047			  i5 = ii + uisec - 1;
1048			  for (i = ii; i <= i5; i += 4)
1049			    {
1050			      f11 = c[i + j * c_dim1];
1051			      f21 = c[i + 1 + j * c_dim1];
1052			      f31 = c[i + 2 + j * c_dim1];
1053			      f41 = c[i + 3 + j * c_dim1];
1054			      i6 = ll + lsec - 1;
1055			      for (l = ll; l <= i6; ++l)
1056				{
1057				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058					  257] * b[l + j * b_dim1];
1059				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060					  257] * b[l + j * b_dim1];
1061				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062					  257] * b[l + j * b_dim1];
1063				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064					  257] * b[l + j * b_dim1];
1065				}
1066			      c[i + j * c_dim1] = f11;
1067			      c[i + 1 + j * c_dim1] = f21;
1068			      c[i + 2 + j * c_dim1] = f31;
1069			      c[i + 3 + j * c_dim1] = f41;
1070			    }
1071			  i5 = ii + isec - 1;
1072			  for (i = ii + uisec; i <= i5; ++i)
1073			    {
1074			      f11 = c[i + j * c_dim1];
1075			      i6 = ll + lsec - 1;
1076			      for (l = ll; l <= i6; ++l)
1077				{
1078				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079					  257] * b[l + j * b_dim1];
1080				}
1081			      c[i + j * c_dim1] = f11;
1082			    }
1083			}
1084		    }
1085		}
1086	    }
1087	}
1088      free(t1);
1089      return;
1090    }
1091  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092    {
1093      if (GFC_DESCRIPTOR_RANK (a) != 1)
1094	{
1095	  const GFC_INTEGER_1 *restrict abase_x;
1096	  const GFC_INTEGER_1 *restrict bbase_y;
1097	  GFC_INTEGER_1 *restrict dest_y;
1098	  GFC_INTEGER_1 s;
1099
1100	  for (y = 0; y < ycount; y++)
1101	    {
1102	      bbase_y = &bbase[y*bystride];
1103	      dest_y = &dest[y*rystride];
1104	      for (x = 0; x < xcount; x++)
1105		{
1106		  abase_x = &abase[x*axstride];
1107		  s = (GFC_INTEGER_1) 0;
1108		  for (n = 0; n < count; n++)
1109		    s += abase_x[n] * bbase_y[n];
1110		  dest_y[x] = s;
1111		}
1112	    }
1113	}
1114      else
1115	{
1116	  const GFC_INTEGER_1 *restrict bbase_y;
1117	  GFC_INTEGER_1 s;
1118
1119	  for (y = 0; y < ycount; y++)
1120	    {
1121	      bbase_y = &bbase[y*bystride];
1122	      s = (GFC_INTEGER_1) 0;
1123	      for (n = 0; n < count; n++)
1124		s += abase[n*axstride] * bbase_y[n];
1125	      dest[y*rystride] = s;
1126	    }
1127	}
1128    }
1129  else if (GFC_DESCRIPTOR_RANK (a) == 1)
1130    {
1131      const GFC_INTEGER_1 *restrict bbase_y;
1132      GFC_INTEGER_1 s;
1133
1134      for (y = 0; y < ycount; y++)
1135	{
1136	  bbase_y = &bbase[y*bystride];
1137	  s = (GFC_INTEGER_1) 0;
1138	  for (n = 0; n < count; n++)
1139	    s += abase[n*axstride] * bbase_y[n*bxstride];
1140	  dest[y*rxstride] = s;
1141	}
1142    }
1143  else if (axstride < aystride)
1144    {
1145      for (y = 0; y < ycount; y++)
1146	for (x = 0; x < xcount; x++)
1147	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
1148
1149      for (y = 0; y < ycount; y++)
1150	for (n = 0; n < count; n++)
1151	  for (x = 0; x < xcount; x++)
1152	    /* dest[x,y] += a[x,n] * b[n,y] */
1153	    dest[x*rxstride + y*rystride] +=
1154					abase[x*axstride + n*aystride] *
1155					bbase[n*bxstride + y*bystride];
1156    }
1157  else
1158    {
1159      const GFC_INTEGER_1 *restrict abase_x;
1160      const GFC_INTEGER_1 *restrict bbase_y;
1161      GFC_INTEGER_1 *restrict dest_y;
1162      GFC_INTEGER_1 s;
1163
1164      for (y = 0; y < ycount; y++)
1165	{
1166	  bbase_y = &bbase[y*bystride];
1167	  dest_y = &dest[y*rystride];
1168	  for (x = 0; x < xcount; x++)
1169	    {
1170	      abase_x = &abase[x*axstride];
1171	      s = (GFC_INTEGER_1) 0;
1172	      for (n = 0; n < count; n++)
1173		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174	      dest_y[x*rxstride] = s;
1175	    }
1176	}
1177    }
1178}
1179#undef POW3
1180#undef min
1181#undef max
1182
1183#endif
1184
1185#endif
1186
1187